bpp-seq3  3.0.0
Fasta.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include <Bpp/Io/FileTools.h>
7 #include <Bpp/Text/TextTools.h>
8 #include <fstream>
9 
10 #include "../StringSequenceTools.h"
11 #include "Fasta.h"
12 
13 using namespace bpp;
14 using namespace std;
15 
16 /******************************************************************************/
17 
18 bool Fasta::nextSequence(istream& input, Sequence& seq) const
19 {
20  if (!input)
21  throw IOException("Fasta::nextSequence: can't read from istream input");
22  string seqname = "";
23  string content = "";
24  Comments seqcmts;
25  short seqcpt = 0;
26  string linebuffer = "";
27  char c;
28  while (!input.eof())
29  {
30  c = static_cast<char>(input.peek());
31  if (input.eof())
32  c = '\n';
33 
34  // Sequence beginning detection
35  if (c == '>')
36  {
37  // Stop if find a new sequence
38  if (seqcpt++)
39  break;
40  }
41  getline(input, linebuffer);
42  if (c == '>')
43  {
44  // Get the sequence name line
45  seqname = string(linebuffer.begin() + 1, linebuffer.end());
46  }
47  if (c != '>' && !TextTools::isWhiteSpaceCharacter(c))
48  {
49  // Sequence content
50  content += TextTools::toUpper(TextTools::removeWhiteSpaces(linebuffer));
51  }
52  }
53 
54  seqname = TextTools::removeWhiteSpaces(seqname);
55 
56  bool res = (!input.eof());
57  // Sequence name and comments isolation
58  if (strictNames_ || extended_)
59  {
60  size_t pos = seqname.find_first_of(" \t\n");
61  string seqcmt;
62  if (pos != string::npos)
63  {
64  seqcmt = seqname.substr(pos + 1);
65  seqname = seqname.substr(0, pos);
66  }
67  if (extended_)
68  {
69  StringTokenizer st(seqcmt, " \\", true, false);
70  while (st.hasMoreToken())
71  {
72  seqcmts.push_back(st.nextToken());
73  }
74  }
75  else
76  {
77  seqcmts.push_back(seqcmt);
78  }
79  seq.setComments(seqcmts);
80  }
81  seq.setName(seqname);
82  seq.setContent(content);
83  return res;
84 }
85 
86 /******************************************************************************/
87 
88 void Fasta::writeSequence(ostream& output, const Sequence& seq) const
89 {
90  if (!output)
91  throw IOException("Fasta::writeSequence: can't write to ostream output");
92  // Sequence name
93  output << ">" << seq.getName();
94  // Sequence comments
95  if (extended_)
96  {
97  for (unsigned int i = 0; i < seq.getComments().size(); i++)
98  {
99  output << " \\" << seq.getComments()[i];
100  }
101  }
102  output << endl;
103  // Sequence content
104  string buffer; // use a buffer to format sequence with states > 1 char
105  for (size_t i = 0; i < seq.size(); ++i)
106  {
107  buffer += seq.getChar(i);
108  if (buffer.size() >= charsByLine_)
109  {
110  output << string(buffer.begin(), buffer.begin() + charsByLine_) << endl;
111  buffer.erase(0, charsByLine_);
112  }
113  }
114  output << string(buffer.begin(), buffer.end()) << endl;
115 }
116 
117 /******************************************************************************/
118 
120 {
121  if (!input)
122  throw IOException("Fasta::appendFromStream: can't read from istream input");
123  char c = '\n';
124  char last_c;
125  bool header = false;
126  bool hasSeq = true;
127  string line = "";
128  Comments cmts;
129  while (!input.eof() && hasSeq)
130  {
131  last_c = c;
132  input.get(c);
133  // Header detection
134  if (extended_ && c == '#')
135  {
136  header = true;
137  continue;
138  }
139  // Header end detection
140  if (c == '\n')
141  {
142  if (extended_ && header)
143  {
144  if (line[0] == '\\')
145  {
146  line.erase(line.begin());
147  cmts.push_back(line);
148  }
149  line = "";
150  header = false;
151  }
152  continue;
153  }
154  // Header capture
155  if (header)
156  {
157  line.append(1, c);
158  }
159  // Sequence detection
160  if (c == '>' && last_c == '\n')
161  {
162  input.putback(c);
163  c = last_c;
164  auto alphaPtr = vsc.getAlphabet();
165  auto tmpseq = make_unique<Sequence>("", "", alphaPtr);
166  hasSeq = nextSequence(input, *tmpseq);
167  vsc.addSequence(tmpseq->getName(), tmpseq);
168  }
169  }
170  if (extended_ && cmts.size())
171  {
172  vsc.setComments(cmts);
173  }
174 }
175 
176 /******************************************************************************/
177 
178 void Fasta::writeSequences(ostream& output, const SequenceContainerInterface& sc) const
179 {
180  if (!output)
181  throw IOException("Fasta::write: can't write to ostream output");
182 
183  if (extended_)
184  {
185  // Loop for all general comments
186  for (size_t i = 0; i < sc.getComments().size(); ++i)
187  {
188  output << "#\\" << sc.getComments()[i] << endl;
189  }
190  output << endl;
191  }
192 
193  // Main loop : for all sequences in vector container
194  vector<string> names = sc.getSequenceNames();
195  for (size_t i = 0; i < names.size(); ++i)
196  {
197  writeSequence(output, sc.sequence(names[i]));
198  }
199 }
200 
201 /******************************************************************************/
202 
203 // FileIndex class
204 
205 void Fasta::FileIndex::build(const std::string& path, const bool strictSequenceNames)
206 {
207  // open the file
208  std::ifstream f_in(path.c_str());
209  // get the size of the file
210  f_in.seekg(0, std::ios::end);
211  fileSize_ = f_in.tellg();
212  // feed the map
213  f_in.seekg(0, std::ios::beg);
214  streampos pos = f_in.tellg();
215  char ch;
216  std::string seq_id = "";
217  while (f_in.get(ch))
218  {
219  if (ch == '>')
220  {
221  pos = static_cast<int>(f_in.tellg()) - 1;
222  std::getline(f_in, seq_id);
223  if (strictSequenceNames)
224  {
225  seq_id = seq_id.substr(0, seq_id.find_first_of(" \t\n"));
226  }
227  index_[seq_id] = pos;
228  }
229  }
230  f_in.close();
231 }
232 
233 streampos Fasta::FileIndex::getSequencePosition(const std::string& id) const
234 {
235  std::map<std::string, streampos>::const_iterator it = index_.find(id);
236  if (it != index_.end())
237  {
238  return it->second;
239  }
240  throw Exception("Sequence not found: " + id);
241 }
242 
243 void Fasta::FileIndex::read(const std::string& path)
244 {
245  std::ifstream f_in(path.c_str());
246  std::string line_buffer = "";
247  while (!f_in.eof())
248  {
249  std::getline(f_in, line_buffer);
251  {
252  continue;
253  }
254  bpp::StringTokenizer tk(line_buffer, "\t");
255  index_[tk.getToken(0)] = bpp::TextTools::toInt(tk.getToken(1));
256  }
257  f_in.close();
258 }
259 
260 void Fasta::FileIndex::write(const std::string& path)
261 {
262  std::ofstream f_out(path.c_str());
263  for (std::map<std::string, streampos>::const_iterator it = index_.begin(); it != index_.end(); ++it)
264  {
265  f_out << it->first << "\t" << bpp::TextTools::toString(it->second) << std::endl;
266  }
267  f_out.close();
268 }
269 
270 void Fasta::FileIndex::getSequence(const std::string& seqid, Sequence& seq, const std::string& path) const
271 {
272  getSequence(seqid, seq, path, false);
273 }
274 
275 void Fasta::FileIndex::getSequence(const std::string& seqid, Sequence& seq, const std::string& path, const bool strictSequenceNames) const
276 {
277  Fasta fs(60);
278  fs.strictNames(strictSequenceNames);
279  streampos seq_pos = this->getSequencePosition(seqid);
280  std::ifstream fasta(path.c_str());
281  fasta.seekg(seq_pos);
282  fs.nextSequence(fasta, seq);
283  fasta.close();
284 }
285 
286 /******************************************************************************/
const std::string & getName() const override
Get the name of this sequence.
Definition: CoreSequence.h:170
void setName(const std::string &name) override
Set the name of this sequence.
Definition: CoreSequence.h:172
size_t size() const override
Get the number of elements in the list.
Definition: SymbolList.h:124
virtual const Comments & getComments() const =0
Get the comments.
virtual void setComments(const Comments &comments)=0
Set the comments.
void read(const std::string &path)
Read the index from a file.
Definition: Fasta.cpp:243
void build(const std::string &path)
Build the index given a path to the file.
Definition: Fasta.h:151
void write(const std::string &path)
Write the index to a file.
Definition: Fasta.cpp:260
void getSequence(const std::string &seqid, Sequence &seq, const std::string &path) const
Get a sequence given its ID.
Definition: Fasta.cpp:270
std::streampos getSequencePosition(const std::string &id) const
Get the position of a Sequence given its ID.
Definition: Fasta.cpp:233
The fasta sequence file format.
Definition: Fasta.h:32
bool strictNames() const
Definition: Fasta.h:133
bool nextSequence(std::istream &input, Sequence &seq) const override
Definition: Fasta.cpp:18
void appendSequencesFromStream(std::istream &input, SequenceContainerInterface &sc) const override
Append sequences to a container from a stream.
Definition: Fasta.cpp:119
void writeSequence(std::ostream &output, const Sequence &seq) const override
Definition: Fasta.cpp:88
void writeSequences(std::ostream &output, const SequenceContainerInterface &sc) const override
Write a container to a stream.
Definition: Fasta.cpp:178
A basic implementation of the Sequence interface.
Definition: Sequence.h:117
void setContent(const std::string &sequence) override
Set the whole content of the sequence.
Definition: Sequence.cpp:20
std::string getChar(size_t pos) const override
Get the element at position 'pos' as a character.
Definition: Sequence.h:346
const Comments & getComments() const override
Get the comments.
Definition: Commentable.h:79
void setComments(const Comments &comments) override
Set the comments.
Definition: Commentable.h:86
const std::string & nextToken()
bool hasMoreToken() const
const std::string & getToken(size_t pos) const
The SequenceContainer interface.
virtual void addSequence(const HashType &sequenceKey, std::unique_ptr< SequenceType > &sequencePtr)=0
Add a sequence to the container.
virtual const SequenceType & sequence(const HashType &sequenceKey) const override=0
Retrieve a sequence object from the container.
virtual std::vector< std::string > getSequenceNames() const =0
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get a pointer toward the container's alphabet.
int toInt(const std::string &s, char scientificNotation='e')
std::string removeWhiteSpaces(const std::string &s)
std::string removeSurroundingWhiteSpaces(const std::string &s)
std::string toUpper(const std::string &s)
bool isWhiteSpaceCharacter(char c)
bool isEmpty(const std::string &s)
std::string toString(T t)
This alphabet is used to deal NumericAlphabet.
std::vector< std::string > Comments
Declaration of Comments type.
Definition: Commentable.h:21