bpp-seq-omics  2.4.1
AlignmentFilterMafIterator.cpp
Go to the documentation of this file.
1 //
2 // File: AlignmentFilterMafIterator.cpp
3 // Authors: Julien Dutheil
4 // Created: Tue Sep 07 2010
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
41 
42 using namespace bpp;
43 
44 //From the STL:
45 #include <string>
46 #include <numeric>
47 
48 using namespace std;
49 
51 {
52  if (blockBuffer_.size() == 0) {
53  //Else there is no more block in the buffer, we need to parse more:
54  do {
55  MafBlock* block = iterator_->nextBlock();
56  if (!block) return 0; //No more block.
57 
58  //Parse block.
61  size_t nr;
62  size_t nc = static_cast<size_t>(block->getNumberOfSites());
63  if (nc < windowSize_)
64  throw Exception("AlignmentFilterMafIterator::analyseCurrentBlock_. Block is smaller than window size: " + TextTools::toString(nc));
65 
66  vector< vector<int> > aln;
67  if (missingAsGap_) {
68  nr = species_.size();
69  aln.resize(nr);
70  for (size_t i = 0; i < nr; ++i) {
71  if (block->hasSequenceForSpecies(species_[i]))
72  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
73  else {
74  aln[i].resize(nc);
75  fill(aln[i].begin(), aln[i].end(), gap);
76  }
77  }
78  } else {
79  vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->getSpeciesList());
80  for (size_t i = 0; i < species_.size(); ++i) {
81  if (block->hasSequenceForSpecies(species_[i])) {
82  aln.push_back(block->getSequenceForSpecies(species_[i]).getContent());
83  } else {
84  if (!relative_) {
85  throw Exception("AlignmentFilterMafIterator::analyseCurrentBlock_. Block does not include selected species '" + species_[i] + "' and threshold are absolutes, leading to an undefined behavior. Consider selecting blocks first, or use relative thresholds.");
86  }
87  }
88  }
89  nr = aln.size();
90  }
91 
92  //First we create a mask:
93  vector<size_t> pos;
94  vector<int> col(nr);
95  //Reset window:
96  window_.clear();
97  //Init window:
98  size_t i;
99  for (i = 0; i < windowSize_; ++i) {
100  for (size_t j = 0; j < nr; ++j) {
101  col[j] = aln[j][i];
102  }
103  window_.push_back(col);
104  }
105  //Slide window:
106  if (verbose_) {
107  ApplicationTools::message->endLine();
108  ApplicationTools::displayTask("Sliding window for alignment filter", true);
109  }
110  while (i + step_ < nc) {
111  if (verbose_)
112  ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
113  //Evaluate current window:
114  unsigned int sumGap = 0;
115  double sumEnt = 0;
116  for (size_t u = 0; u < window_.size(); ++u) {
117  for (size_t v = 0; v < window_[u].size(); ++v) {
118  if (window_[u][v] == gap || window_[u][v] == unk) {
119  sumGap++;
120  }
121  col[v] = window_[u][v];
122  if (col[v] == unk) col[v] = gap;
123  }
124  sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
125  }
126  bool test = (sumEnt / static_cast<double>(windowSize_)) > maxEnt_;
127  if (relative_) {
128  double propGap = static_cast<double>(sumGap) / static_cast<double>(nr * windowSize_);
129  test = test && (propGap > maxPropGap_);
130  } else {
131  test = test && (sumGap > maxGap_);
132  }
133  if (test) {
134  if (pos.size() == 0) {
135  pos.push_back(i - windowSize_);
136  pos.push_back(i);
137  } else {
138  if (i - windowSize_ <= pos[pos.size() - 1]) {
139  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
140  } else { //This is a new region
141  pos.push_back(i - windowSize_);
142  pos.push_back(i);
143  }
144  }
145  }
146 
147  //Move forward:
148  for (size_t k = 0; k < step_; ++k) {
149  for (size_t j = 0; j < nr; ++j) {
150  col[j] = aln[j][i];
151  }
152  window_.push_back(col);
153  window_.pop_front();
154  ++i;
155  }
156  }
157 
158  //Evaluate last window:
159  unsigned int sumGap = 0;
160  double sumEnt = 0;
161  for (size_t u = 0; u < window_.size(); ++u) {
162  for (size_t v = 0; v < window_[u].size(); ++v) {
163  if (window_[u][v] == gap || window_[u][v] == unk) {
164  sumGap++;
165  }
166  col[v] = window_[u][v];
167  if (col[v] == unk) col[v] = gap;
168  }
169  sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
170  }
171  bool test = (sumEnt / static_cast<double>(windowSize_)) > maxEnt_;
172  if (relative_) {
173  double propGap = static_cast<double>(sumGap) / static_cast<double>(nr * windowSize_);
174  test = test && (propGap > maxPropGap_);
175  } else {
176  test = test && (sumGap > maxGap_);
177  }
178  if (test) {
179  if (pos.size() == 0) {
180  pos.push_back(i - windowSize_);
181  pos.push_back(i);
182  } else {
183  if (i - windowSize_ <= pos[pos.size() - 1]) {
184  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
185  } else { //This is a new region
186  pos.push_back(i - windowSize_);
187  pos.push_back(i);
188  }
189  }
190  }
191  if (verbose_)
193 
194  //Now we remove regions with two many gaps, using a sliding window:
195  if (pos.size() == 0) {
196  blockBuffer_.push_back(block);
197  if (logstream_) {
198  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
199  }
200  } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
201  //Everything is removed:
202  if (logstream_) {
203  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
204  }
205  } else {
206  if (logstream_) {
207  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
208  }
209  if (verbose_) {
210  ApplicationTools::message->endLine();
211  ApplicationTools::displayTask("Spliting block", true);
212  }
213  for (i = 0; i < pos.size(); i+=2) {
214  if (verbose_)
215  ApplicationTools::displayGauge(i, pos.size() - 2, '=');
216  if (logstream_) {
217  (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
218  }
219  if (pos[i] > 0) {
220  MafBlock* newBlock = new MafBlock();
221  newBlock->setScore(block->getScore());
222  newBlock->setPass(block->getPass());
223  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
224  MafSequence* subseq;
225  if (i == 0) {
226  subseq = block->getSequence(j).subSequence(0, pos[i]);
227  } else {
228  subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
229  }
230  newBlock->addSequence(*subseq);
231  delete subseq;
232  }
233  blockBuffer_.push_back(newBlock);
234  }
235 
236  if (keepTrashedBlocks_) {
237  MafBlock* outBlock = new MafBlock();
238  outBlock->setScore(block->getScore());
239  outBlock->setPass(block->getPass());
240  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
241  MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
242  outBlock->addSequence(*outseq);
243  delete outseq;
244  }
245  trashBuffer_.push_back(outBlock);
246  }
247  }
248  //Add last block:
249  if (pos[pos.size() - 1] < block->getNumberOfSites()) {
250  MafBlock* newBlock = new MafBlock();
251  newBlock->setScore(block->getScore());
252  newBlock->setPass(block->getPass());
253  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
254  MafSequence* subseq;
255  subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
256  newBlock->addSequence(*subseq);
257  delete subseq;
258  }
259  blockBuffer_.push_back(newBlock);
260  }
261  if (verbose_)
263 
264  delete block;
265  }
266  } while (blockBuffer_.size() == 0);
267  }
268 
269  MafBlock* block = blockBuffer_.front();
270  blockBuffer_.pop_front();
271  return block;
272 }
273 
275 {
276  if (blockBuffer_.size() == 0) {
277  //Else there is no more block in the buffer, we need to parse more:
278  do {
279  MafBlock* block = iterator_->nextBlock();
280  if (!block) return 0; //No more block.
281 
282  //Parse block.
285  size_t nr;
286  size_t nc = static_cast<size_t>(block->getNumberOfSites());
287  if (nc < windowSize_)
288  throw Exception("AlignmentFilter2MafIterator::analyseCurrentBlock_. Block is smaller than window size: " + TextTools::toString(nc));
289 
290  vector< vector<int> > aln;
291  if (missingAsGap_) {
292  nr = species_.size();
293  aln.resize(nr);
294  for (size_t i = 0; i < nr; ++i) {
295  if (block->hasSequenceForSpecies(species_[i]))
296  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
297  else {
298  aln[i].resize(nc);
299  fill(aln[i].begin(), aln[i].end(), gap);
300  }
301  }
302  } else {
303  vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->getSpeciesList());
304  for (size_t i = 0; i < species_.size(); ++i) {
305  if (block->hasSequenceForSpecies(species_[i])) {
306  aln.push_back(block->getSequenceForSpecies(species_[i]).getContent());
307  } else {
308  if (!relative_) {
309  throw Exception("AlignmentFilter2MafIterator::analyseCurrentBlock_. Block does not include selected species '" + species_[i] + "' and threshold are absolutes, leading to an undefined behavior. Consider selecting blocks first, or use relative thresholds.");
310  }
311  }
312  }
313  nr = aln.size();
314  }
315 
316  //First we create a mask:
317  vector<size_t> pos;
318  vector<bool> col(nr);
319  //Reset window:
320  window_.clear();
321  //Init window:
322  size_t i;
323  for (i = 0; i < windowSize_; ++i) {
324  for (size_t j = 0; j < nr; ++j) {
325  col[j] = (aln[j][i] == gap || aln[j][i] == unk);
326  }
327  window_.push_back(col);
328  }
329  //Slide window:
330  if (verbose_) {
331  ApplicationTools::message->endLine();
332  ApplicationTools::displayTask("Sliding window for alignment filter", true);
333  }
334  while (i + step_ < nc) {
335  if (verbose_)
336  ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
337  //Evaluate current window:
338  unsigned int count = 0;
339  bool posIsGap = false;
340  for (size_t u = 0; u < window_.size(); ++u) {
341  unsigned int partialCount = 0;
342  if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
343  for (size_t v = 0; v < window_[u].size(); ++v)
344  if (window_[u][v]) partialCount++;
345  bool test;
346  if (relative_) {
347  test = (static_cast<double>(partialCount) / static_cast<double>(nr) > maxPropGap_);
348  } else {
349  test = (partialCount > maxGap_);
350  }
351  if (test) {
352  count++;
353  posIsGap = true;
354  } else {
355  posIsGap = false;
356  }
357  }
358  }
359  if (count > maxPos_) {
360  if (pos.size() == 0) {
361  pos.push_back(i - windowSize_);
362  pos.push_back(i);
363  } else {
364  if (i - windowSize_ <= pos[pos.size() - 1]) {
365  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
366  } else { //This is a new region
367  pos.push_back(i - windowSize_);
368  pos.push_back(i);
369  }
370  }
371  }
372 
373  //Move forward:
374  for (size_t k = 0; k < step_; ++k) {
375  for (size_t j = 0; j < nr; ++j) {
376  col[j] = (aln[j][i] == gap || aln[j][i] == unk);
377  }
378  window_.push_back(col);
379  window_.pop_front();
380  ++i;
381  }
382  }
383 
384  //Evaluate last window:
385  unsigned int count = 0;
386  bool posIsGap = false;
387  for (size_t u = 0; u < window_.size(); ++u) {
388  unsigned int partialCount = 0;
389  if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
390  for (size_t v = 0; v < window_[u].size(); ++v)
391  if (window_[u][v]) partialCount++;
392  bool test;
393  if (relative_) {
394  test = (static_cast<double>(partialCount) / static_cast<double>(nr) > maxPropGap_);
395  } else {
396  test = (partialCount > maxGap_);
397  }
398  if (test) {
399  count++;
400  posIsGap = true;
401  } else {
402  posIsGap = false;
403  }
404  }
405  }
406  if (count > maxPos_) {
407  if (pos.size() == 0) {
408  pos.push_back(i - windowSize_);
409  pos.push_back(i);
410  } else {
411  if (i - windowSize_ <= pos[pos.size() - 1]) {
412  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
413  } else { //This is a new region
414  pos.push_back(i - windowSize_);
415  pos.push_back(i);
416  }
417  }
418  }
419  if (verbose_)
421 
422  //Now we remove regions with two many gaps, using a sliding window:
423  if (pos.size() == 0) {
424  blockBuffer_.push_back(block);
425  if (logstream_) {
426  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
427  }
428  } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
429  //Everything is removed:
430  if (logstream_) {
431  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
432  }
433  } else {
434  if (logstream_) {
435  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
436  }
437  if (verbose_) {
438  ApplicationTools::message->endLine();
439  ApplicationTools::displayTask("Spliting block", true);
440  }
441  for (i = 0; i < pos.size(); i+=2) {
442  if (verbose_)
443  ApplicationTools::displayGauge(i, pos.size() - 2, '=');
444  if (logstream_) {
445  (*logstream_ << "ALN CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
446  }
447  if (pos[i] > 0) {
448  MafBlock* newBlock = new MafBlock();
449  newBlock->setScore(block->getScore());
450  newBlock->setPass(block->getPass());
451  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
452  MafSequence* subseq;
453  if (i == 0) {
454  subseq = block->getSequence(j).subSequence(0, pos[i]);
455  } else {
456  subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
457  }
458  newBlock->addSequence(*subseq);
459  delete subseq;
460  }
461  blockBuffer_.push_back(newBlock);
462  }
463 
464  if (keepTrashedBlocks_) {
465  MafBlock* outBlock = new MafBlock();
466  outBlock->setScore(block->getScore());
467  outBlock->setPass(block->getPass());
468  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
469  MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
470  outBlock->addSequence(*outseq);
471  delete outseq;
472  }
473  trashBuffer_.push_back(outBlock);
474  }
475  }
476  //Add last block:
477  if (pos[pos.size() - 1] < block->getNumberOfSites()) {
478  MafBlock* newBlock = new MafBlock();
479  newBlock->setScore(block->getScore());
480  newBlock->setPass(block->getPass());
481  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
482  MafSequence* subseq;
483  subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
484  newBlock->addSequence(*subseq);
485  delete subseq;
486  }
487  blockBuffer_.push_back(newBlock);
488  }
489  if (verbose_)
491 
492  delete block;
493  }
494  } while (blockBuffer_.size() == 0);
495  }
496 
497  MafBlock* block = blockBuffer_.front();
498  blockBuffer_.pop_front();
499  return block;
500 }
501 
int getGapCharacterCode() const
static const DNA DNA_ALPHABET
static void displayTask(const std::string &text, bool eof=false)
static std::shared_ptr< OutputStream > message
static void displayTaskDone()
static void displayGauge(size_t iter, size_t total, char symbol='>', const std::string &mes="")
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:57
unsigned int getPass() const
Definition: MafBlock.h:106
void setScore(double score)
Definition: MafBlock.h:102
void setPass(unsigned int pass)
Definition: MafBlock.h:103
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
double getScore() const
Definition: MafBlock.h:105
size_t getNumberOfSites() const
Definition: MafBlock.h:113
const MafSequence & getSequence(const std::string &name) const
Definition: MafBlock.h:121
std::string getDescription() const
Definition: MafBlock.h:177
void addSequence(const MafSequence &sequence)
Definition: MafBlock.h:115
bool hasSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:129
std::vector< std::string > getSpeciesList() const
Definition: MafBlock.h:162
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:64
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
Definition: MafSequence.cpp:48
int getUnknownCharacterCode() const
virtual const std::vector< int > & getContent() const
static std::vector< T > vectorIntersection(const std::vector< T > &vec1, const std::vector< T > &vec2)
std::string toString(T t)
std::size_t count(const std::string &s, const std::string &pattern)