bpp-seq-omics  2.4.1
FeatureFilterMafIterator.cpp
Go to the documentation of this file.
1 //
2 // File: FeatureFilterMafIterator.cpp
3 // Authors: Julien Dutheil
4 // Created: Tue Sep 07 2010
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
41 
42 using namespace bpp;
43 
44 //From the STL:
45 #include <string>
46 #include <numeric>
47 
48 using namespace std;
49 
51 {
52  if (blockBuffer_.size() == 0) {
53  //Unless there is no more block in the buffer, we need to parse more:
54  do {
55  MafBlock* block = iterator_->nextBlock();
56  if (!block) return 0; //No more block.
57 
58  //Check if the block contains the reference species:
59  if (!block->hasSequenceForSpecies(refSpecies_)) {
60  if (logstream_) {
61  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain the reference species and was kept as is.").endLine();
62  }
63  return block;
64  }
65 
66  //Get the feature ranges for this block:
67  const MafSequence& refSeq = block->getSequenceForSpecies(refSpecies_);
68  //first check if there is one (for now we assume that features refer to the chromosome or contig name, with implicit species):
69  std::map<std::string, MultiRange<size_t> >::iterator mr = ranges_.find(refSeq.getChromosome());
70  if (mr == ranges_.end()) {
71  if (logstream_) {
72  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine();
73  }
74  return block;
75  }
76  //else
77  MultiRange<size_t> mRange = mr->second;
78  //mRange.restrictTo(Range<size_t>(refSeq.start(), refSeq.stop() + 1)); jdutheil on 17/04/13: do we really need the +1 here?
79  mRange.restrictTo(refSeq.getRange(true));
80  if (mRange.isEmpty()) {
81  if (logstream_) {
82  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " does not contain any feature and was kept as is.").endLine();
83  }
84  return block;
85  }
86  std::vector<size_t> tmp = mRange.getBounds();
87 
88  //If the reference sequence is on the negative strand, then we have to correct the coordinates:
89  std::deque<size_t> refBounds;
90  if (refSeq.getStrand() == '-') {
91  for (size_t i = 0; i < tmp.size(); ++i)
92  {
93  refBounds.push_front(refSeq.getSrcSize() - tmp[i]);
94  }
95  } else {
96  refBounds = deque<size_t>(tmp.begin(), tmp.end());
97  }
98 
99  //Now extract corresponding alignments. We use the range to split the original block.
100  //Only thing to watch out is the coordinates, refering to the ref species...
101  //A good idea is then to convert those with respect to the given block:
102 
103  int gap = refSeq.getAlphabet()->getGapCharacterCode();
104  long int refPos = static_cast<long int>(refSeq.start()) - 1;
105  //long int refPos = refSeq.getStrand() == '-' ? static_cast<long int>(refSeq.getSrcSize() - refSeq.start()) - 1 : static_cast<long int>(refSeq.start()) - 1;
106  std::vector<size_t> pos;
107  if (verbose_) {
108  ApplicationTools::message->endLine();
109  ApplicationTools::displayTask("Removing features", true);
110  }
111  for (size_t alnPos = 0; alnPos < refSeq.size() && refBounds.size() > 0; ++alnPos) {
112  if (verbose_)
113  ApplicationTools::displayGauge(static_cast<size_t>(refPos + 1), refBounds.back() + 1, '>');
114  if (refSeq[alnPos] != gap) {
115  refPos++;
116  //check if this position is a bound:
117  while (refBounds.front() == static_cast<size_t>(refPos)) {
118  pos.push_back(alnPos);
119  refBounds.pop_front();
120  }
121  }
122  }
123  if (verbose_)
125 
126  //Check if the last bound matches the end of the alignment:
127  if (refBounds.size() > 0 && refBounds.front() == refSeq.stop()) {
128  pos.push_back(refSeq.size());
129  refBounds.pop_front();
130  }
131 
132  if (refBounds.size() > 0) {
133  VectorTools::print(vector<size_t>(refBounds.begin(), refBounds.end()));
134  throw Exception("FeatureFilterMafIterator::nextBlock(). An error occurred here, " + TextTools::toString(refBounds.size()) + " coordinates are left, in sequence " + refSeq.getDescription() + "... this is most likely a bug, please report!");
135  }
136 
137  //Next step is simply to split the block according to the translated coordinates:
138  if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->getNumberOfSites()) {
139  //Everything is removed:
140  if (logstream_) {
141  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " was entirely removed. Tried to get the next one.").endLine();
142  }
143  } else {
144  if (logstream_) {
145  (*logstream_ << "FEATURE FILTER: block " << block->getDescription() << " with size "<< block->getNumberOfSites() << " will be split into " << (pos.size() / 2 + 1) << " blocks.").endLine();
146  }
147  if (verbose_) {
148  ApplicationTools::displayTask("Spliting block", true);
149  }
150  for (size_t i = 0; i < pos.size(); i+=2) {
151  if (verbose_)
152  ApplicationTools::displayGauge(i, pos.size() - 2, '=');
153  if (logstream_) {
154  (*logstream_ << "FEATURE FILTER: removing region (" << pos[i] << ", " << pos[i+1] << ") from block " << block->getDescription() << ".").endLine();
155  }
156  if (pos[i] > 0) {
157  MafBlock* newBlock = new MafBlock();
158  newBlock->setScore(block->getScore());
159  newBlock->setPass(block->getPass());
160  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
161  MafSequence* subseq;
162  if (i == 0) {
163  subseq = block->getSequence(j).subSequence(0, pos[i]);
164  } else {
165  subseq = block->getSequence(j).subSequence(pos[i - 1], pos[i] - pos[i - 1]);
166  }
167  newBlock->addSequence(*subseq);
168  delete subseq;
169  }
170  if (newBlock->getNumberOfSites() > 0)
171  blockBuffer_.push_back(newBlock);
172  else
173  delete newBlock;
174  }
175 
176  if (keepTrashedBlocks_) {
177  MafBlock* outBlock = new MafBlock();
178  outBlock->setScore(block->getScore());
179  outBlock->setPass(block->getPass());
180  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
181  MafSequence* outseq = block->getSequence(j).subSequence(pos[i], pos[i + 1] - pos[i]);
182  outBlock->addSequence(*outseq);
183  delete outseq;
184  }
185  trashBuffer_.push_back(outBlock);
186  }
187  }
188  //Add last block:
189  if (pos.back() < block->getNumberOfSites()) {
190  MafBlock* newBlock = new MafBlock();
191  newBlock->setScore(block->getScore());
192  newBlock->setPass(block->getPass());
193  for (size_t j = 0; j < block->getNumberOfSequences(); ++j) {
194  MafSequence* subseq;
195  subseq = block->getSequence(j).subSequence(pos[pos.size() - 1], block->getNumberOfSites() - pos[pos.size() - 1]);
196  newBlock->addSequence(*subseq);
197  delete subseq;
198  }
199  blockBuffer_.push_back(newBlock);
200  }
201  if (verbose_)
203 
204  delete block;
205  }
206  } while (blockBuffer_.size() == 0);
207  }
208 
209  MafBlock* nxtBlock = blockBuffer_.front();
210  blockBuffer_.pop_front();
211  return nxtBlock;
212 }
213 
virtual int getGapCharacterCode() const=0
static void displayTask(const std::string &text, bool eof=false)
static std::shared_ptr< OutputStream > message
static void displayTaskDone()
static void displayGauge(size_t iter, size_t total, char symbol='>', const std::string &mes="")
virtual const Alphabet * getAlphabet() const=0
virtual size_t size() const=0
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:57
unsigned int getPass() const
Definition: MafBlock.h:106
void setScore(double score)
Definition: MafBlock.h:102
void setPass(unsigned int pass)
Definition: MafBlock.h:103
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
double getScore() const
Definition: MafBlock.h:105
size_t getNumberOfSites() const
Definition: MafBlock.h:113
const MafSequence & getSequence(const std::string &name) const
Definition: MafBlock.h:121
std::string getDescription() const
Definition: MafBlock.h:177
void addSequence(const MafSequence &sequence)
Definition: MafBlock.h:115
bool hasSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:129
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:64
std::string getDescription() const
Definition: MafSequence.h:188
const std::string & getChromosome() const
Definition: MafSequence.h:164
size_t start() const
Definition: MafSequence.h:116
size_t stop() const
Definition: MafSequence.h:121
char getStrand() const
Definition: MafSequence.h:166
size_t getSrcSize() const
Definition: MafSequence.h:170
Range< size_t > getRange(bool origin=true) const
Definition: MafSequence.h:131
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
Definition: MafSequence.cpp:48
void restrictTo(const Range< T > &r)
bool isEmpty() const
std::vector< T > getBounds() const
static void print(const std::vector< T > &v1, OutputStream &out=*ApplicationTools::message, const std::string &delim=" ")
std::string toString(T t)