bpp-seq-omics  2.4.1
EntropyFilterMafIterator.cpp
Go to the documentation of this file.
1 //
2 // File: EntropyFilterMafIterator.cpp
3 // Authors: Julien Dutheil
4 // Created: Tue Sep 07 2010
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
41 
42 using namespace bpp;
43 
44 //From the STL:
45 #include <string>
46 #include <numeric>
47 
48 using namespace std;
49 
51 {
52  if (blockBuffer_.size() == 0) {
53  //Else there is no more block in the buffer, we need to parse more:
54  do {
55  MafBlock* block = iterator_->nextBlock();
56  if (!block) return 0; //No more block.
57 
58  //Parse block.
60  //unused for now
61  //int unk = AlphabetTools::DNA_ALPHABET.getUnknownCharacterCode();
62  size_t nr;
63  size_t nc = static_cast<size_t>(block->getNumberOfSites());
64  if (nc < windowSize_)
65  throw Exception("EntropyFilterMafIterator::analyseCurrentBlock_. Block is smaller than window size: " + TextTools::toString(nc));
66 
67  vector< vector<int> > aln;
68  if (missingAsGap_ && !ignoreGaps_) {
69  nr = species_.size();
70  aln.resize(nr);
71  for (size_t i = 0; i < nr; ++i) {
72  if (block->hasSequenceForSpecies(species_[i]))
73  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
74  else {
75  aln[i].resize(nc);
76  fill(aln[i].begin(), aln[i].end(), gap);
77  }
78  }
79  } else {
80  vector<string> speciesSet = VectorTools::vectorIntersection(species_, block->getSpeciesList());
81  nr = speciesSet.size();
82  aln.resize(nr);
83  for (size_t i = 0; i < nr; ++i) {
84  aln[i] = block->getSequenceForSpecies(species_[i]).getContent();
85  }
86  }
87  //First we create a mask:
88  vector<size_t> pos;
89  //Reset window:
90  window_.clear();
91  //Init window:
92  size_t i;
93  for (i = 0; i < windowSize_; ++i) {
94  vector<int> col;
95  for (size_t j = 0; j < nr; ++j) {
96  int x = aln[j][i];
97  if (x != gap || !ignoreGaps_)
98  col.push_back(x);
99  }
100  double entropy = VectorTools::shannonDiscrete<int, double>(col) / log(5.);
101  window_.push_back(entropy > maxEnt_ ? 1 : 0);
102  }
103  //Slide window:
104  if (verbose_) {
105  ApplicationTools::message->endLine();
106  ApplicationTools::displayTask("Sliding window for entropy filter", true);
107  }
108  while (i + step_ < nc) {
109  if (verbose_)
110  ApplicationTools::displayGauge(i - windowSize_, nc - windowSize_ - 1, '>');
111  //Evaluate current window:
112  unsigned int count = std::accumulate(window_.begin(), window_.end(), 0u);
113  if (count > maxPos_) {
114  if (pos.size() == 0) {
115  pos.push_back(i - windowSize_);
116  pos.push_back(i);
117  } else {
118  if (i - windowSize_ < pos[pos.size() - 1]) {
119  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
120  } else { //This is a new region
121  pos.push_back(i - windowSize_);
122  pos.push_back(i);
123  }
124  }
125  }
126 
127  //Move forward:
128  for (size_t k = 0; k < step_; ++k) {
129  vector<int> col;
130  for (size_t j = 0; j < nr; ++j) {
131  int x = aln[j][i];
132  if (x != gap || !ignoreGaps_)
133  col.push_back(x);
134  }
135  double entropy = VectorTools::shannonDiscrete<int, double>(col) / log(5.);
136  window_.push_back(entropy > maxEnt_ ? 1 : 0);
137  window_.pop_front();
138  ++i;
139  }
140  }
141 
142  //Evaluate last window:
143  unsigned int count = std::accumulate(window_.begin(), window_.end(), 0u);
144  if (count > maxPos_) {
145  if (pos.size() == 0) {
146  pos.push_back(i - windowSize_);
147  pos.push_back(i);
148  } else {
149  if (i - windowSize_ <= pos[pos.size() - 1]) {
150  pos[pos.size() - 1] = i; //Windows are overlapping and we extend previous region
151  } else { //This is a new region
152  pos.push_back(i - windowSize_);
153  pos.push_back(i);
154  }
155  }
156  }
157  if (verbose_)
159 
160  //Now we remove regions with two many gaps, using a sliding window:
161  if (pos.size() == 0) {
162  blockBuffer_.push_back(block);
163  if (logstream_) {
164  (*logstream_ << "ENTROPY CLEANER: block " << block->getDescription() << " is clean and kept as is.").endLine();
165  }
166  } else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
167  //Everything is removed:
168  if (logstream_) {
169  (*logstream_ << "ENTROPY CLEANER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
170  }
171  } else {
172  if (logstream_) {
173  (*logstream_ << "ALN CLEANER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
174  }
175  if (verbose_) {
176  ApplicationTools::message->endLine();
177  ApplicationTools::displayTask("Spliting block", true);
178  }
179  for (i = 0; i < pos.size(); i+=2) {
180  if (verbose_)
181  ApplicationTools::displayGauge(i, pos.size() - 2, '=');
182  if (logstream_) {
183  (*logstream_ << "ENTROPY CLEANER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
184  }
185  if (pos[i] > 0) {
186  MafBlock* newBlock = new MafBlock();
187  newBlock->setScore(block->getScore());
188  newBlock->setPass(block->getPass());
189  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
190  MafSequence* subseq;
191  if (i == 0) {
192  subseq = block->getSequence(j).subSequence(0, pos[i]);
193  } else {
194  subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
195  }
196  newBlock->addSequence(*subseq);
197  delete subseq;
198  }
199  blockBuffer_.push_back(newBlock);
200  }
201 
202  if (keepTrashedBlocks_) {
203  MafBlock* outBlock = new MafBlock();
204  outBlock->setScore(block->getScore());
205  outBlock->setPass(block->getPass());
206  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
207  MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
208  outBlock->addSequence(*outseq);
209  delete outseq;
210  }
211  trashBuffer_.push_back(outBlock);
212  }
213  }
214  //Add last block:
215  if (pos[pos.size() - 1] < block->getNumberOfSites()) {
216  MafBlock* newBlock = new MafBlock();
217  newBlock->setScore(block->getScore());
218  newBlock->setPass(block->getPass());
219  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
220  MafSequence* subseq;
221  subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
222  newBlock->addSequence(*subseq);
223  delete subseq;
224  }
225  blockBuffer_.push_back(newBlock);
226  }
227  if (verbose_)
229 
230  delete block;
231  }
232  } while (blockBuffer_.size() == 0);
233  }
234 
235  MafBlock* block = blockBuffer_.front();
236  blockBuffer_.pop_front();
237  return block;
238 }
239 
int getGapCharacterCode() const
static const DNA DNA_ALPHABET
static void displayTask(const std::string &text, bool eof=false)
static std::shared_ptr< OutputStream > message
static void displayTaskDone()
static void displayGauge(size_t iter, size_t total, char symbol='>', const std::string &mes="")
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:57
unsigned int getPass() const
Definition: MafBlock.h:106
void setScore(double score)
Definition: MafBlock.h:102
void setPass(unsigned int pass)
Definition: MafBlock.h:103
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
double getScore() const
Definition: MafBlock.h:105
size_t getNumberOfSites() const
Definition: MafBlock.h:113
const MafSequence & getSequence(const std::string &name) const
Definition: MafBlock.h:121
std::string getDescription() const
Definition: MafBlock.h:177
void addSequence(const MafSequence &sequence)
Definition: MafBlock.h:115
bool hasSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:129
std::vector< std::string > getSpeciesList() const
Definition: MafBlock.h:162
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:64
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
Definition: MafSequence.cpp:48
virtual const std::vector< int > & getContent() const
static std::vector< T > vectorIntersection(const std::vector< T > &vec1, const std::vector< T > &vec2)
std::string toString(T t)
std::size_t count(const std::string &s, const std::string &pattern)