bpp-seq3  3.0.0
Phylip.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include <Bpp/Io/FileTools.h>
7 #include <Bpp/Text/TextTools.h>
8 
9 #include "../Container/SequenceContainerTools.h"
10 #include "Phylip.h"
11 
12 using namespace bpp;
13 
14 // From the STL:
15 #include <sstream>
16 
17 using namespace std;
18 
19 /******************************************************************************/
20 
21 const std::vector<std::string> Phylip::splitNameAndSequence(const std::string& s) const
22 {
23  vector<string> v(2);
24  if (extended_)
25  {
26  string::size_type index = s.find(namesSplit_);
27  if (index == string::npos)
28  throw Exception("No sequence name found.");
29  v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, index));
30  v[1] = TextTools::removeFirstWhiteSpaces (s.substr(index + namesSplit_.size())); // There may be more than 2 white spaces.
31  }
32  else
33  {
34  v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, 10));
35  v[1] = s.substr(10);
36  }
37  return v;
38 }
39 
40 /******************************************************************************/
41 
42 void Phylip::readSequential(std::istream& in, SequenceContainerInterface& sc) const
43 {
44  auto alphaPtr = sc.getAlphabet();
45  string temp;
46 
47  // Ignore first line:
48  getline(in, temp, '\n'); // Copy current line in temporary string
50  string name = "";
51  string seq = "";
52 
53  while (!in.eof())
54  {
55  // Read each sequence:
56  vector<string> v;
57  bool hasName = true;
58  try
59  {
60  v = splitNameAndSequence(temp);
61  }
62  catch (Exception& e)
63  {
64  hasName = false;
65  }
66  if (hasName)
67  {
68  // a new sequence is found:
69  if (!TextTools::isEmpty(name)) // If this is not the first sequence!
70  {
71  // Add the previous sequence to the container:
72  auto seqPtr = make_unique<Sequence>(name, seq, alphaPtr);
73  sc.addSequence(name, seqPtr);
74  }
75  name = v[0];
76  seq = v[1];
77  }
78  else
79  {
80  // No sequence name found.
81  if (TextTools::isEmpty(name))
82  throw Exception("First sequence in file has no name!");
83  seq += TextTools::removeWhiteSpaces(temp);
84  }
85  // while(!TextTools::isEmpty(temp))
86  // {
87  // //Sequences are separated by at least one blank line:
88  // getline(in, temp, '\n'); // read next line in file.
89  // seq += TextTools::removeWhiteSpaces(temp);
90  // }
91  // end of this sequence:
93  }
94  // Add last sequence:
95  auto seqPtr = make_unique<Sequence>(name, seq, alphaPtr);
96  sc.addSequence(name, seqPtr);
97 }
98 
99 /******************************************************************************/
100 
101 void Phylip::readInterleaved(std::istream& in, SequenceContainerInterface& sc) const
102 {
103  auto alphaPtr = sc.getAlphabet();
104  string temp;
105 
106  // Read first line:
107  getline(in, temp, '\n'); // Copy current line in temporary string
108  StringTokenizer st(temp);
109  unsigned int nbSequences = TextTools::to<unsigned int>(st.nextToken());
110  // int nbSites = TextTools::toInt(st.nextToken());
111  temp = FileTools::getNextLine(in);
112 
113  vector<string> names, seqs;
114  // Read first block:
115  for (size_t i = 0; i < nbSequences && !in.eof() && !TextTools::isEmpty(temp); ++i)
116  {
117  vector<string> v = splitNameAndSequence(temp);
118  names.push_back(v[0]);
119  seqs.push_back(v[1]);
120  getline(in, temp, '\n'); // read next line in file.
121  }
122 
123  // Then read all other blocks:
124  temp = FileTools::getNextLine(in);
125  while (!in.eof())
126  {
127  for (size_t i = 0; i < names.size(); ++i)
128  {
129  if (TextTools::isEmpty(temp))
130  throw IOException("Phylip::readInterleaved. Bad file,there are not the same number of sequence in each block.");
131  seqs[i] += TextTools::removeWhiteSpaces(temp);
132  getline(in, temp, '\n'); // read next line in file.
133  }
134  temp = FileTools::getNextLine(in);
135  }
136  for (size_t i = 0; i < names.size(); ++i)
137  {
138  auto seqPtr = make_unique<Sequence>(names[i], seqs[i], alphaPtr);
139  sc.addSequence(names[i], seqPtr);
140  }
141 }
142 
143 /******************************************************************************/
144 
146 {
147  // Checking the existence of specified file
148  if (!input)
149  {
150  throw IOException ("Phylip::read: fail to open file");
151  }
152 
153  if (sequential_)
154  readSequential (input, sc);
155  else
156  readInterleaved(input, sc);
157 }
158 
159 /******************************************************************************/
160 
161 unsigned int Phylip::getNumberOfSequences(const std::string& path) const
162 {
163  // Checking the existence of specified file
164  ifstream file (path.c_str(), ios::in);
165  if (!file)
166  {
167  throw IOException ("Phylip::getNumberOfSequences: failed to open file");
168  }
169  string firstLine = FileTools::getNextLine(file);
170  StringTokenizer st(firstLine, " \t");
171  istringstream iss(st.nextToken());
172  unsigned int nb;
173  iss >> nb;
174  file.close();
175  return nb;
176 }
177 
178 /******************************************************************************/
179 
180 std::vector<std::string> Phylip::getSizedNames(const std::vector<std::string>& names) const
181 {
182  vector<string> sizedNames(names.size());
183  if (extended_)
184  {
185  // Add 6 white spaces to the larger name and align other names.
186  // First, determine the size of the wider name:
187  size_t sizeMax = 0;
188  for (size_t i = 0; i < names.size(); i++)
189  {
190  if (names[i].size() > sizeMax)
191  sizeMax = names[i].size();
192  }
193  // Quite easy ;-) Now update all lengths:
194  for (size_t i = 0; i < names.size(); i++)
195  {
196  sizedNames[i] = TextTools::resizeRight(names[i], sizeMax) + namesSplit_;
197  }
198  }
199  else
200  {
201  // We trunc all names to ten characters:
202  for (unsigned int i = 0; i < names.size(); i++)
203  {
204  sizedNames[i] = TextTools::resizeRight(names[i], 10);
205  }
206  cout << "Warning: names have been truncated to 10 characters. They may be ambiguous sequence names then." << endl;
207  }
208  return sizedNames;
209 }
210 
211 /******************************************************************************/
212 
213 void Phylip::writeSequential(std::ostream& out, const SiteContainerInterface& sc) const
214 {
215  // cout << "Write sequential" << endl;
216  size_t numberOfSites = sc.sequence(sc.getSequenceNames()[0]).size() * sc.getAlphabet()->getStateCodingSize();
217  out << sc.getNumberOfSequences() << " " << numberOfSites << endl;
218 
219  vector<string> seqNames = sc.getSequenceNames();
220  vector<string> names = getSizedNames(seqNames);
221  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i)
222  {
223  vector<string> seq = TextTools::split(sc.sequence(i).toString(), charsByLine_);
224  out << names[i] << seq[0] << endl;
225  for (size_t j = 1; j < seq.size(); ++j)
226  {
227  out << string(names[i].size(), ' ') << seq[j] << endl;
228  }
229  out << endl;
230  }
231 }
232 
233 void Phylip::writeInterleaved(std::ostream& out, const SiteContainerInterface& sc) const
234 {
235  // cout << "Write interleaved;" << endl;
236  size_t numberOfSites = sc.sequence(sc.getSequenceNames()[0]).size() * sc.getAlphabet()->getStateCodingSize();
237  out << sc.getNumberOfSequences() << " " << numberOfSites << endl;
238 
239  vector<string> seqNames = sc.getSequenceNames();
240  vector<string> names = getSizedNames(seqNames);
241  // Split sequences:
242  vector< vector<string>> seqs(sc.getNumberOfSequences());
243  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i)
244  {
245  seqs[i] = TextTools::split(sc.sequence(i).toString(), charsByLine_);
246  }
247  // Write first block:
248  for (size_t i = 0; i < names.size(); ++i)
249  {
250  out << names[i] << seqs[i][0] << endl;
251  }
252  out << endl;
253  // Write other blocks:
254  for (size_t j = 1; j < seqs[0].size(); ++j)
255  {
256  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i)
257  {
258  out << seqs[i][j] << endl;
259  }
260  out << endl;
261  }
262 }
263 
264 /******************************************************************************/
265 
266 void Phylip::writeAlignment(std::ostream& output, const SiteContainerInterface& sc) const
267 {
268  // First must check if all sequences are aligned:
269  if (sc.getNumberOfSequences() == 0)
270  throw Exception("Phylip::write. SequenceContainer appear to contain no sequence.");
271 
272  // Checking the existence of specified file, and possibility to open it in write mode
273  if (!output)
274  {
275  throw IOException ("Phylip::write : failed to open file");
276  }
277 
278  if (sequential_)
279  writeSequential (output, sc);
280  else
281  writeInterleaved(output, sc);
282 }
283 
284 /******************************************************************************/
285 
286 const std::string Phylip::getFormatName() const { return "Phylip file, " + string(extended_ ? "extended," : "") + string(sequential_ ? "sequential" : "interleaved"); }
287 
288 /******************************************************************************/
289 
290 const std::string Phylip::getFormatDescription() const
291 {
292  return "Phylip file format, sequential and interleaved. PAML extension also supported.";
293 }
294 
295 /******************************************************************************/
static std::string getNextLine(std::istream &in)
void writeSequential(std::ostream &out, const SiteContainerInterface &sc) const
Definition: Phylip.cpp:213
const std::string getFormatDescription() const override
Definition: Phylip.cpp:290
void readSequential(std::istream &in, SequenceContainerInterface &asc) const
Definition: Phylip.cpp:42
void writeInterleaved(std::ostream &out, const SiteContainerInterface &sc) const
Definition: Phylip.cpp:233
unsigned int getNumberOfSequences(const std::string &path) const
Definition: Phylip.cpp:161
void writeAlignment(std::ostream &output, const SiteContainerInterface &sc) const override
Write a container to a stream.
Definition: Phylip.cpp:266
const std::string getFormatName() const override
Definition: Phylip.cpp:286
std::vector< std::string > getSizedNames(const std::vector< std::string > &names) const
Definition: Phylip.cpp:180
const std::vector< std::string > splitNameAndSequence(const std::string &s) const
Definition: Phylip.cpp:21
void readInterleaved(std::istream &in, SequenceContainerInterface &asc) const
Definition: Phylip.cpp:101
void appendAlignmentFromStream(std::istream &input, SequenceContainerInterface &sc) const override
Append sequences to a container from a stream.
Definition: Phylip.cpp:145
const std::string & nextToken()
The SequenceContainer interface.
virtual void addSequence(const HashType &sequenceKey, std::unique_ptr< SequenceType > &sequencePtr)=0
Add a sequence to the container.
virtual const SequenceType & sequence(const HashType &sequenceKey) const override=0
Retrieve a sequence object from the container.
virtual std::vector< std::string > getSequenceNames() const =0
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get a pointer toward the container's alphabet.
std::string removeWhiteSpaces(const std::string &s)
std::string removeSurroundingWhiteSpaces(const std::string &s)
bool isEmpty(const std::string &s)
std::vector< std::string > split(const std::string &s, std::size_t n)
std::string removeFirstWhiteSpaces(const std::string &s)
std::string resizeRight(const std::string &s, std::size_t newSize, char fill)
This alphabet is used to deal NumericAlphabet.