bpp-seq3  3.0.0
Pasta.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include <Bpp/Io/FileTools.h>
7 #include <Bpp/Text/TextTools.h>
8 #include <fstream>
9 
10 #include "../StringSequenceTools.h"
11 #include "../Container/SequenceContainer.h"
12 #include "Pasta.h"
13 
14 using namespace bpp;
15 using namespace std;
16 
17 /********************************************************************************/
18 
19 bool Pasta::nextSequence(istream& input, ProbabilisticSequence& seq, bool hasLabels, const vector<size_t>& permutationMap) const
20 {
21  if (!input)
22  throw IOException("Pasta::nextSequence : can't read from istream input");
23 
24  string seqname = "";
25  vector<double> tokens;
26  Comments seqcmts;
27  short seqcpt = 0;
28  string linebuffer = "";
29  char c;
30 
31  while (!input.eof())
32  {
33  c = static_cast<char>(input.peek());
34  if (input.eof())
35  c = '\n';
36 
37  // detect the beginning of a sequence
38  if (c == '>')
39  {
40  // stop if we find a new sequence
41  if (seqcpt++)
42  break;
43  }
44 
45  getline(input, linebuffer);
46 
47  if (c == '>')
48  {
49  // get the sequence name line
50  seqname = string(linebuffer.begin() + 1, linebuffer.end());
51  }
52 
53  if (c != '>' && !TextTools::isWhiteSpaceCharacter(c))
54  {
55  // sequence content : probabilities for each site are space-separated
56  StringTokenizer st(linebuffer, " \t\n", false, false);
57  while (st.hasMoreToken())
58  {
59  double t;
60  stringstream ss(st.nextToken());
61  ss >> t;
62  tokens.push_back(t);
63  }
64  }
65  }
66 
67  bool res = (!input.eof());
68 
69  // Sequence name and comments isolation (identical to that of Fasta)
70  if (strictNames_ || extended_)
71  {
72  size_t pos = seqname.find_first_of(" \t\n");
73  string seqcmt;
74 
75  if (pos != string::npos)
76  {
77  seqcmt = seqname.substr(pos + 1);
78  seqname = seqname.substr(0, pos);
79  }
80 
81  if (extended_)
82  {
83  StringTokenizer st(seqcmt, " \\", true, false);
84  while (st.hasMoreToken())
85  {
86  seqcmts.push_back(st.nextToken());
87  }
88  }
89  else
90  {
91  seqcmts.push_back(seqcmt);
92  }
93  seq.setComments(seqcmts);
94  }
95 
96  seq.setName(seqname);
97 
98  /* finally, deal with the content */
99 
100  // there is a header that specifies to which character each
101  // probability is associated
102  auto size = seq.getAlphabet()->getSize();
103  if (hasLabels)
104  {
105  DataTable content(size, 0);
106  vector<double>::const_iterator i = tokens.begin();
107  while (i != tokens.end())
108  {
109  // junk up the tokens into groups of alphabetsize, and permute
110  // according to how the header is permuted
111  vector<double> row(size, 0.);
112  for (const auto j:permutationMap)
113  {
114  if (i == tokens.end())
115  throw Exception("Pasta::nextSequence : input is incomplete");
116  row[(size_t)j] = *i;
117  ++i;
118  }
119 
120  content.addColumn(row);
121  }
122  // finally set the content
123  seq.setContent(content.getData());
124  }
125  // o.w., we assume that each probability is that a (binary)
126  // character is 1
127  else
128  {
129  DataTable content(2, 0);
130 
131  // fill in pairs of probabilities that (binary) character is 0,
132  // resp. 1
133  for (const auto& i : tokens)
134  {
135  // the following will throw an exception if v[i] is not a properly
136  // formatted double : a check that we want to have
137 
138  vector<double> pair_v{1. - i, i};
139  content.addColumn(pair_v);
140  }
141 
142  // finally, we set the content of the sequence to the above.
143  // Since any number format exception was taken care of above, as
144  // well as that each sequence must be of the same length by
145  // construction of a DataTable object, the only thing left that
146  // could go wrong is that p(0) + p(1) != 1 : a check that is done
147  // in the call of the function below
148  seq.setContent(content.getData());
149  }
150 
151  return res;
152 }
153 
154 /********************************************************************************/
155 
157 {
158  if (!input)
159  throw IOException("Pasta::appendAlignmentFromStream: can't read from istream input");
160 
161  char c = '\n';
162  char last_c;
163  bool header = false;
164  bool hasSeq = true;
165  string line = "";
166  Comments cmts;
167  auto alphaPtr = container.getAlphabet();
168 
169  // labels for the states
170  bool hasLabels = false;
171  vector<string> labels;
172  vector<size_t> permutationMap;
173 
174  while (!input.eof() && hasSeq)
175  {
176  last_c = c;
177  input.get(c);
178 
179  // detect the header
180  if (extended_ && c == '#')
181  {
182  header = true;
183  continue;
184  }
185 
186  // detect the end of the header
187  if (c == '\n')
188  {
189  if (extended_ && header)
190  {
191  if (line[0] == '\\')
192  {
193  line.erase(line.begin());
194  cmts.push_back(line);
195  }
196  line = "";
197  header = false;
198  }
199  continue;
200  }
201 
202  // capture the header
203  if (header)
204  {
205  line.append(1, c);
206  }
207 
208  // detect/get labels for the states
209  if (!header && c != '>')
210  {
211  hasLabels = true;
212  input.putback(c);
213  c = last_c;
214 
215  getline(input, line);
216  StringTokenizer st(line, " \t\n", false, false);
217 
218  while (st.hasMoreToken())
219  {
220  string s = st.nextToken();
221  labels.push_back(s);
222  }
223 
224  /* check labels against alphabet of the container */
225  vector<string> resolved_chars = container.getAlphabet()->getResolvedChars();
226 
227  // build permutation map on the content, error should one exist
228  for (const auto& i : labels)
229  {
230  bool found = false;
231 
232  for (size_t j = 0; j < resolved_chars.size(); ++j)
233  {
234  if (i == resolved_chars[j])
235  {
236  if (found)
237  throw Exception("Pasta::appendSequencesFromStream. Label " + i + " found twice.");
238 
239  permutationMap.push_back(j);
240  found = true;
241  }
242  }
243 
244  if (!found)
245  {
246  string states = "<";
247  for (const auto& i2 : resolved_chars)
248  {
249  states += " " + i2;
250  }
251  states += " >";
252  throw Exception("Pasta::appendSequencesFromStream. Label " + i + " is not found in alphabet " + states + ".");
253  }
254  }
255  }
256 
257  // detect the sequence
258  if (c == '>' && last_c == '\n')
259  {
260  input.putback(c);
261  c = last_c;
262  auto tmpseq = make_unique<ProbabilisticSequence>(alphaPtr); // add probabilistic sequences instead
263  hasSeq = nextSequence(input, *tmpseq, hasLabels, permutationMap);
264  container.addSequence(tmpseq->getName(), tmpseq);
265  }
266  }
267  if (extended_ && cmts.size())
268  {
269  container.setComments(cmts);
270  }
271 }
272 
273 
274 /********************************************************************************/
275 
276 void Pasta::writeSequence(ostream& output, const ProbabilisticSequence& seq, bool header) const
277 {
278  if (!output)
279  throw IOException("Pasta::writeSequence : can't write to ostream output");
280 
281  // The alphabet
282  if (header)
283  {
284  vector<string> resolved_chars = seq.getAlphabet()->getResolvedChars();
285  for (auto state : resolved_chars)
286  {
287  output << state << "\t";
288  }
289  output << endl;
290  }
291 
292  // sequence name
293  output << ">" << seq.getName();
294 
295  // sequence comments
296  if (extended_)
297  {
298  for (size_t i = 0; i < seq.getComments().size(); ++i)
299  {
300  output << " \\" << seq.getComments()[i];
301  }
302  }
303  output << endl;
304 
305  StlOutputStreamWrapper outs(&output);
306 
307  // sequence content
308  const vector<vector<double>>& data = seq.getContent();
309 
310  for (auto i : data)
311  {
312  VectorTools::print(i, outs, "\t");
313  }
314 }
315 
316 /********************************************************************************/
317 
318 void Pasta::writeSequence(ostream& output, const Sequence& seq, bool header) const
319 {
320  if (!output)
321  throw IOException("Pasta::writeSequence : can't write to ostream output");
322 
323  // The alphabet
324  if (header)
325  {
326  vector<string> resolved_chars = seq.getAlphabet()->getResolvedChars();
327  for (auto state : resolved_chars)
328  {
329  output << state << "\t";
330  }
331  output << endl;
332  }
333 
334  // sequence name
335  output << ">" << seq.getName();
336 
337  // sequence comments
338  if (extended_)
339  {
340  for (size_t i = 0; i < seq.getComments().size(); ++i)
341  {
342  output << " \\" << seq.getComments()[i];
343  }
344  }
345  output << endl;
346 
347  // sequence content
348 
349  int nbc = (int)seq.getAlphabet()->getResolvedChars().size();
350 
351  for (size_t i = 0; i < seq.size(); ++i)
352  {
353  for (int s = 0; s < nbc; ++s)
354  {
355  output << seq.getStateValueAt(i, (int)s);
356  output << "\t";
357  }
358  output << endl;
359  }
360 }
const std::string & getName() const override
Get the name of this sequence.
Definition: CoreSequence.h:170
void setName(const std::string &name) override
Set the name of this sequence.
Definition: CoreSequence.h:172
std::shared_ptr< const Alphabet > getAlphabet() const override
Get the alphabet associated to the list.
Definition: SymbolList.h:120
size_t size() const override
Get the number of elements in the list.
Definition: SymbolList.h:124
virtual void setComments(const Comments &comments)=0
Set the comments.
void writeSequence(std::ostream &output, const ProbabilisticSequence &seq, bool header) const
Definition: Pasta.cpp:276
void appendAlignmentFromStream(std::istream &input, ProbabilisticSequenceContainerInterface &psc) const override
Append sequences to a container from a stream.
Definition: Pasta.cpp:156
bool nextSequence(std::istream &input, ProbabilisticSequence &seq, bool hasLabels, const std::vector< size_t > &permutationMap) const
Definition: Pasta.cpp:19
A basic implementation of the ProbabilisticSequence interface.
std::shared_ptr< const Alphabet > getAlphabet() const override
Get the alphabet associated to the list.
const std::vector< std::vector< double > > & getContent() const override
void setContent(const std::vector< std::vector< double >> &list) override
A basic implementation of the Sequence interface.
Definition: Sequence.h:117
double getStateValueAt(size_t sitePosition, int state) const override
get value of a state at a position
Definition: Sequence.h:366
const Comments & getComments() const override
Get the comments.
Definition: Commentable.h:79
void setComments(const Comments &comments) override
Set the comments.
Definition: Commentable.h:86
const std::string & nextToken()
bool hasMoreToken() const
const std::vector< std::vector< T > > & getData() const
void addColumn(const std::vector< T > &newColumn, int pos=-1)
The SequenceContainer interface.
virtual void addSequence(const HashType &sequenceKey, std::unique_ptr< SequenceType > &sequencePtr)=0
Add a sequence to the container.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get a pointer toward the container's alphabet.
static void print(const std::vector< T > &v1, OutputStream &out=*ApplicationTools::message, const std::string &delim=" ")
bool isWhiteSpaceCharacter(char c)
This alphabet is used to deal NumericAlphabet.
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Commentable.h:21