bpp-seq3  3.0.0
Mase.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include "../StringSequenceTools.h"
6 #include "Mase.h"
7 
8 using namespace bpp;
9 using namespace std;
10 
11 /****************************************************************************************/
12 
13 void Mase::appendSequencesFromStream(std::istream& input, SequenceContainerInterface& vsc) const
14 {
15  if (!input)
16  {
17  throw IOException ("Mase::read : fail to open file");
18  }
19 
20  // Initialization
21  Comments seqComments, fileComments;
22  string temp, name, sequence = "";
23  bool comments = false;
24 
25  // Get current general comments is VectorSequenceContainer
26  fileComments = vsc.getComments();
27 
28  // Main loop : for all file lines
29  while (!input.eof())
30  {
31  getline(input, temp, '\n'); // Copy current line in temporary string
32 
33  // If first character is ;
34  if (temp[0] == ';')
35  {
36  // If second character is also ;
37  if (temp[1] == ';')
38  {
39  // File comments isolation
40  temp.erase(0, 2); // Characters ;; deletion
41  if (temp != "")
42  fileComments.push_back(temp);
43  }
44  else
45  {
46  // If a name and a sequence were founded
47  if ((name != "") && (sequence != ""))
48  {
49  // New sequence creation, and addition in existing VectorSequenceContainer
50  auto alphaPtr = vsc.getAlphabet();
51  auto seqPtr = make_unique<Sequence>(name, sequence, seqComments, alphaPtr);
52  vsc.addSequence(seqPtr->getName(), seqPtr);
53  name = "";
54  sequence = "";
55  seqComments.clear();
56  }
57 
58  // Sequence commentaries isolation
59  temp.erase(temp.begin()); // Character ; deletion
60  if (temp != "")
61  seqComments.push_back(temp);
62  comments = true;
63  }
64  }
65  else
66  {
67  // If sequence commentaries were just isolated
68  if (comments)
69  {
70  // Sequence name isolation
71  name = temp;
72  comments = false;
73  }
74  else
75  sequence += temp; // Sequence isolation
76  }
77  }
78 
79  // Addition of the last sequence in file
80  if ((name != "") && (sequence != ""))
81  {
82  auto alphaPtr = vsc.getAlphabet();
83  auto seqPtr = make_unique<Sequence>(name, sequence, seqComments, alphaPtr);
84  vsc.addSequence(seqPtr->getName(), seqPtr);
85  }
86 
87  // Set new general comments in VectorSequenceContainer (old + new comments)
88  vsc.setComments(fileComments);
89 }
90 
91 /****************************************************************************************/
92 
93 void Mase::writeSequences(ostream& output, const SequenceContainerInterface& sc) const
94 {
95  // Checking the existence of specified file, and possibility to open it in write mode
96  if (!output)
97  {
98  throw IOException ("Mase::write : failed to open file");
99  }
100 
101  Comments comments = sc.getComments();
102 
103  // Writing all general comments in file
104  if (comments.size() == 0)
105  {
106  output << ";;" << endl;
107  }
108  for (size_t i = 0; i < comments.size(); i++)
109  {
110  output << ";;" << comments[i] << endl;
111  }
112 
113  string seq, temp = ""; // Initialization
114 
115  // Main loop : for all sequences
116  for (const auto& seqKey: sc.getSequenceKeys())
117  {
118  comments = sc.sequence(seqKey).getComments();
119 
120  // Writing all sequence comments in file
121  // If no comments are associated with current sequence, an empty commentary line will be writed
122  if (comments.size() == 0)
123  {
124  output << ";" << endl;
125  }
126  else
127  {
128  for (size_t j = 0; j < comments.size(); j++)
129  {
130  output << ";" << comments[j] << endl;
131  }
132  }
133 
134  // Sequence name writing
135  output << sc.sequence(seqKey).getName() << endl;
136 
137  // Sequence cutting to specified characters number per line
138  seq = sc.sequence(seqKey).toString();
139  while (seq != "")
140  {
141  if (seq.size() > charsByLine_)
142  {
143  temp = seq;
144  temp.erase(temp.begin() + static_cast<ptrdiff_t>(charsByLine_), temp.end());
145  output << temp << endl;
146  seq.erase(seq.begin(), seq.begin() + static_cast<ptrdiff_t>(charsByLine_));
147  }
148  else
149  {
150  output << seq << endl;
151  seq = "";
152  }
153  }
154  }
155 }
156 
157 /****************************************************************************************/
158 
159 void Mase::readHeader_(std::istream& input, MaseHeader& header) const
160 {
161  do
162  {
163  // Check if the line is a header line:
164  if (input.peek() == ';')
165  {
166  char c;
167  input.get(c);
168  if (input.peek() == ';')
169  {
170  input.get(c);
171  string line = FileTools::getNextLine(input);
172 
173  // Check the type of line...
174 
175  // Site selection:
176  string::size_type index = line.find("# of");
177  if (index < line.npos)
178  {
179  StringTokenizer st(string(line.begin() + static_cast<ptrdiff_t>(index + 4), line.end()), " \t=;");
180  st.nextToken(); // skip next word: may be 'regions' or 'segments' or else ;-)
181  unsigned int numberOfSegments = TextTools::to<unsigned int>(st.nextToken());
182  string name = st.unparseRemainingTokens();
183  // Then look for the set definition:
184  MultiRange<size_t> siteSelection;
185  while (siteSelection.size() < numberOfSegments)
186  {
187  line = FileTools::getNextLine(input);
188  if (line[0] != ';' || line[1] != ';')
189  throw Exception("Mase::readHeader_(): corrupted file, site selection " + name + " is incomplete. Aborting.");
190  line = line.substr(2);
191  StringTokenizer st2(line);
192  while (st2.hasMoreToken())
193  {
194  StringTokenizer st3(st2.nextToken(), ",");
195  unsigned int begin = TextTools::to<unsigned int>(st3.nextToken());
196  unsigned int end = TextTools::to<unsigned int>(st3.nextToken());
197  // WARNING!!! In the mase+ format, sites numerotation is 1-based, including, while ranges are 0-based, [a, b[:
198  siteSelection.addRange(Range<size_t>(begin - 1, end));
199  }
200  if (siteSelection.size() > numberOfSegments)
201  throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(siteSelection.size()) + "segments while expected " + TextTools::toString(numberOfSegments));
202  }
203  header.setSiteSelection(name, siteSelection);
204  }
205  else
206  {
207  // Sequence selection:
208  index = line.find("@ of");
209  if (index < line.npos)
210  {
211  StringTokenizer st(line.substr(index + 4), " \t=;");
212  st.nextToken(); // skip next word: may be 'sequences' or else ;-)
213  unsigned int numberOfSequences = TextTools::to<unsigned int>(st.nextToken());
214  string name = st.unparseRemainingTokens();
215  // The look for the set definition:
216  vector<size_t> sequenceSelection;
217  while (sequenceSelection.size() < numberOfSequences)
218  {
219  line = FileTools::getNextLine(input);
220  if (line[0] != ';' || line[1] != ';')
221  throw Exception("Mase::readHeader_(): corrupted file, sequence selection " + name + " is incomplete. Aborting.");
222  line = line.substr(2);
223  StringTokenizer st2(line, ", ");
224  while (st2.hasMoreToken())
225  {
226  unsigned int pos = TextTools::to<unsigned int>(st2.nextToken());
227  // WARNING!!! In the mase+ format, sequence numerotation is 1-based
228  sequenceSelection.push_back(pos);
229  }
230  if (sequenceSelection.size() > numberOfSequences)
231  throw Exception("Mase::readHeader_(): incorrected file, found " + TextTools::toString(sequenceSelection.size()) + "sequences while expected " + TextTools::toString(numberOfSequences));
232  }
233  header.setSequenceSelection(name, sequenceSelection);
234  }
235  else
236  {
237  // Tree:
238  index = line.find("$");
239  if (index < line.npos)
240  {
241  string name = TextTools::removeSurroundingWhiteSpaces(line.substr(index + 1));
242  // Here we stop if the line ends with a ";"
243  string tree = "";
244  do
245  {
246  line = FileTools::getNextLine(input);
247  if (line[0] != ';' || line[1] != ';')
248  throw Exception("Mase::readHeader_(): corrupted file, tree " + name + " is incomplete. Aborting.");
249  line = TextTools::removeSurroundingWhiteSpaces(line.substr(2));
250  tree += line;
251  }
252  while (!TextTools::endsWith(line, ";"));
253  header.setTree(name, tree);
254  }
255  }
256  }
257  }
258  else
259  {
260  input.putback(c);
261  break;
262  }
263  }
264  }
265  while (true);
266 }
267 
268 /****************************************************************************************/
269 
270 void Mase::writeHeader_(std::ostream& output, const MaseHeader& header) const
271 {
272  // Write trees:
273  vector<string> treeNames = header.getTreeNames();
274  for (size_t i = 0; i < treeNames.size(); ++i)
275  {
276  output << ";;$ " + treeNames[i] << endl;
277  output << ";;" + header.getTree(treeNames[i]);
278  output << endl;
279  }
280 
281  // Write site selections:
282  vector<string> siteSelectionNames = header.getSiteSelectionNames();
283  for (size_t i = 0; i < siteSelectionNames.size(); ++i)
284  {
285  MultiRange<size_t> ranges = header.getSiteSelection(siteSelectionNames[i]);
286  output << ";;Site selection " << siteSelectionNames[i] << " (" << ranges.totalLength() << " sites)" << endl;
287  output << ";;# of segments=" << ranges.size() << " " << siteSelectionNames[i] << endl;
288  output << ";;";
289  for (size_t j = 0; j < ranges.size(); ++j)
290  {
291  output << " " << (ranges.getRange(j).begin() + 1) << "," << ranges.getRange(j).end();
292  if ((j + 1) % 10 == 0)
293  output << endl << ";;";
294  }
295  output << endl;
296  }
297 
298  // Write sequence selections:
299  vector<string> sequenceSelectionNames = header.getSequenceSelectionNames();
300  for (size_t i = 0; i < sequenceSelectionNames.size(); ++i)
301  {
302  vector<size_t> set = header.getSequenceSelection(sequenceSelectionNames[i]);
303  output << ";;@ of species=" << set.size() << " " << sequenceSelectionNames[i] << endl;
304  output << ";;";
305  for (unsigned int j = 0; j < set.size(); ++j)
306  {
307  output << " " << set[j];
308  if ((j + 1) % 10 == 0)
309  output << endl << ";;";
310  }
311  output << endl;
312  }
313 }
314 
315 /****************************************************************************************/
virtual const Comments & getComments() const =0
Get the comments.
virtual void setComments(const Comments &comments)=0
Set the comments.
static std::string getNextLine(std::istream &in)
A class to store information from the header of Mase files.
Definition: Mase.h:26
std::vector< std::string > getTreeNames() const
Definition: Mase.h:41
const std::vector< size_t > & getSequenceSelection(const std::string &name) const
Definition: Mase.h:67
void setSiteSelection(const std::string &name, const MultiRange< size_t > &ranges)
Definition: Mase.h:83
std::vector< std::string > getSiteSelectionNames() const
Definition: Mase.h:42
const MultiRange< size_t > & getSiteSelection(const std::string &name) const
Definition: Mase.h:56
const std::string & getTree(const std::string &name) const
Definition: Mase.h:45
void setSequenceSelection(const std::string &name, const std::vector< size_t > &set)
Definition: Mase.h:87
std::vector< std::string > getSequenceSelectionNames() const
Definition: Mase.h:43
void setTree(const std::string &name, const std::string &tree)
Definition: Mase.h:79
void appendSequencesFromStream(std::istream &input, SequenceContainerInterface &sc) const override
Append sequences to a container from a stream.
Definition: Mase.cpp:13
void writeSequences(std::ostream &output, const SequenceContainerInterface &sc) const override
Write a container to a stream.
Definition: Mase.cpp:93
void writeHeader_(std::ostream &output, const MaseHeader &header) const
Definition: Mase.cpp:270
void readHeader_(std::istream &input, MaseHeader &header) const
Definition: Mase.cpp:159
size_t size() const
void addRange(const Range< T > &r)
size_t totalLength() const
const Range< T > & getRange(size_t i) const
T end() const
T begin() const
const std::string & nextToken()
bool hasMoreToken() const
std::string unparseRemainingTokens() const
The SequenceContainer interface.
virtual void addSequence(const HashType &sequenceKey, std::unique_ptr< SequenceType > &sequencePtr)=0
Add a sequence to the container.
virtual const SequenceType & sequence(const HashType &sequenceKey) const override=0
Retrieve a sequence object from the container.
virtual std::vector< HashType > getSequenceKeys() const =0
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get a pointer to the container's alphabet.
std::string removeSurroundingWhiteSpaces(const std::string &s)
bool endsWith(const std::string &s, const std::string &pattern)
std::string toString(T t)
This alphabet is used to deal with NumericAlphabet.
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Commentable.h:21