bpp-seq3  3.0.0
MaseTools.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
6 #include <Bpp/Text/TextTools.h>
7 #include <iostream>
8 
9 #include "../Container/AlignedSequenceContainer.h"
10 #include "../Container/SequenceContainerTools.h"
11 #include "../Container/VectorSequenceContainer.h"
12 #include "MaseTools.h"
13 
14 using namespace std;
15 using namespace bpp;
16 
17 SiteSelection MaseTools::getSiteSet(const Comments& maseFileHeader, const string& setName)
18 {
19  SiteSelection selection;
20  for (size_t i = 0; i < maseFileHeader.size(); i++)
21  {
22  string current = maseFileHeader[i];
23  string::size_type index = current.find("# of");
24  if (index < current.npos)
25  {
26  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t=;");
27  st.nextToken(); // skip next word: may be 'regions' or 'segments' or else ;-)
28  size_t numberOfSegments = TextTools::to<size_t>(st.nextToken());
29  string name = st.unparseRemainingTokens();
30  if (name == setName)
31  {
32  // cout << numberOfSegments << " segments found." << endl;
33  // Then look for the set definition:
34  i++; // next line.
35  size_t counter = 0;
36  while (i < maseFileHeader.size())
37  {
38  current = maseFileHeader[i++];
39  StringTokenizer st2(current);
40  // st.nextToken(); //Skip ';;'
41  while (st2.hasMoreToken())
42  {
43  StringTokenizer st3(st2.nextToken(), ",");
44  size_t begin = TextTools::to<size_t>(st3.nextToken());
45  size_t end = TextTools::to<size_t>(st3.nextToken());
46  // WARNING!!! In the mase+ format, sites are numbered from 1 to nbSites,
47  // Whereas in SiteContainer the index begins at 0.
48  for (size_t j = begin; j <= end; j++)
49  {
50  selection.push_back(j - 1); // bounds included.
51  }
52  counter++;
53  if (counter == numberOfSegments)
54  return selection;
55  }
56  }
57  }
58  }
59  }
60  if (selection.size() == 0)
61  {
62  throw IOException("Site set " + setName + " has not been found in the sequence file.");
63  }
64  return selection;
65 }
66 
67 /******************************************************************************/
68 
69 SequenceSelection MaseTools::getSequenceSet(const Comments& maseFileHeader, const string& setName)
70 {
71  SequenceSelection selection;
72  for (size_t i = 0; i < maseFileHeader.size(); i++)
73  {
74  string current = maseFileHeader[i];
75 
76  string::size_type index = current.find("@ of");
77  if (index < current.npos)
78  {
79  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t=;");
80  st.nextToken(); // skip next word: may be 'sequences' or else ;-)
81  size_t numberOfSequences = TextTools::to<size_t>(st.nextToken());
82  string name = st.unparseRemainingTokens();
83  size_t counter = 0;
84  if (name == setName)
85  {
86  // cout << numberOfSequences << " segments found." << endl;
87  // Then look for the set definition:
88  i++; // next line.
89  while (i < maseFileHeader.size())
90  {
91  current = maseFileHeader[i++];
92  StringTokenizer st2(current, ",");
93  while (st2.hasMoreToken())
94  {
95  size_t seqIndex = TextTools::to<size_t>(st2.nextToken());
96  // WARNING!!! In the mase+ format, sequences are numbered from 1 to nbSequences,
97  // Whereas in SequenceContainer the index begins at 0.
98  selection.push_back(seqIndex - 1); // bounds included.
99  counter++;
100  if (counter == numberOfSequences)
101  return selection;
102  }
103  }
104  }
105  }
106  }
107  if (selection.size() == 0)
108  {
109  throw IOException("Sequence set " + setName + " has not been found in the sequence file.");
110  }
111  return selection;
112 }
113 
114 /******************************************************************************/
115 
116 map<string, size_t> MaseTools::getAvailableSiteSelections(const Comments& maseHeader)
117 {
118  map<string, size_t> selections;
119  for (size_t i = 0; i < maseHeader.size(); i++)
120  {
121  string current = maseHeader[i];
122 
123  string::size_type index = current.find("# of");
124  if (index < current.npos)
125  {
126  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t\n\f\r=;");
127  st.nextToken(); // skip next word: may be 'sequences' or else ;-)
128  size_t numberOfSegments = TextTools::to<size_t>(st.nextToken());
129  string name = st.nextToken();
130  while (st.hasMoreToken())
131  {
132  name += " " + st.nextToken();
133  }
134  size_t counter = 0;
135  size_t nbSites = 0;
136  while (i < maseHeader.size())
137  {
138  i++;
139  current = maseHeader[i];
140  StringTokenizer st2(current);
141  // st.nextToken(); //Skip ';;'
142  while (st2.hasMoreToken())
143  {
144  StringTokenizer st3(st2.nextToken(), ",");
145  size_t begin = TextTools::to<size_t>(st3.nextToken());
146  size_t end = TextTools::to<size_t>(st3.nextToken());
147  counter++;
148  nbSites += end - begin + 1;
149  }
150  if (counter == numberOfSegments)
151  {
152  selections[name] = nbSites;
153  break;
154  }
155  }
156  }
157  }
158  return selections;
159 }
160 
161 /******************************************************************************/
162 
163 map<string, size_t> MaseTools::getAvailableSequenceSelections(const Comments& maseHeader)
164 {
165  map<string, size_t> selections;
166  for (size_t i = 0; i < maseHeader.size(); i++)
167  {
168  string current = maseHeader[i];
169 
170  string::size_type index = current.find("@ of");
171  if (index < current.npos)
172  {
173  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 4), current.end()), " \t\n\f\r=;");
174  st.nextToken(); // skip next word: may be 'sequences' or else ;-)
175  size_t numberOfSequences = TextTools::fromString<size_t>(st.nextToken());
176  string name = st.nextToken();
177  while (st.hasMoreToken())
178  {
179  name += st.nextToken();
180  }
181  selections[name] = numberOfSequences;
182  }
183  }
184  return selections;
185 }
186 
187 /******************************************************************************/
188 
189 size_t MaseTools::getPhase(const Comments& maseFileHeader, const string& setName)
190 {
191  size_t phase = 0;
192  string::size_type index = 0;
193  for (size_t i = 0; i < maseFileHeader.size(); i++)
194  {
195  string current = maseFileHeader[i];
196 
197  index = current.find("# of");
198  if (index < current.npos)
199  {
200  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 12), current.end()), " \t\n\f\r=;");
201  // size_t numberOfSegments = TextTools::toInt(st.nextToken());
202  // cout << "Number of regions: " << st.nextToken() << endl;
203  string name;
204  while (st.hasMoreToken())
205  {
206  name = st.nextToken();
207  // cout << "Name of regions: " << name << endl;
208  }
209  if (name == setName)
210  {
211  return phase;
212  }
213  }
214 
215  index = current.find("/codon_start");
216  if (index < current.npos)
217  {
218  StringTokenizer st(string(current.begin() + static_cast<ptrdiff_t>(index + 12), current.end()), " \t\n\f\r=;");
219  phase = TextTools::to<size_t>(st.nextToken());
220  }
221  }
222  throw Exception("PolymorphismSequenceContainer::getPhase: no /codon_start found, or site selection missing.");
223 }
224 
225 /******************************************************************************/
const std::string & nextToken()
bool hasMoreToken() const
std::string unparseRemainingTokens() const
This alphabet is used to deal with NumericAlphabet.
std::vector< size_t > SiteSelection
std::vector< size_t > SequenceSelection
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Commentable.h:21