52 if (blockBuffer_.size() == 0) {
55 MafBlock* block = iterator_->nextBlock();
66 vector< vector<int> > aln;
70 for (
size_t i = 0; i < nr; ++i) {
75 fill(aln[i].begin(), aln[i].end(), gap);
80 for (
size_t i = 0; i < species_.size(); ++i) {
85 throw Exception(
"AlignmentFilterMafIterator::analyseCurrentBlock_. Block does not include selected species '" + species_[i] +
"' and threshold are absolutes, leading to an undefined behavior. Consider selecting blocks first, or use relative thresholds.");
99 for (i = 0; i < windowSize_; ++i) {
100 for (
size_t j = 0; j < nr; ++j) {
103 window_.push_back(col);
110 while (i + step_ < nc) {
114 unsigned int sumGap = 0;
116 for (
size_t u = 0; u < window_.size(); ++u) {
117 for (
size_t v = 0; v < window_[u].size(); ++v) {
118 if (window_[u][v] == gap || window_[u][v] == unk) {
121 col[v] = window_[u][v];
122 if (col[v] == unk) col[v] = gap;
124 sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
126 bool test = (sumEnt /
static_cast<double>(windowSize_)) > maxEnt_;
128 double propGap =
static_cast<double>(sumGap) /
static_cast<double>(nr * windowSize_);
129 test = test && (propGap > maxPropGap_);
131 test = test && (sumGap > maxGap_);
134 if (pos.size() == 0) {
135 pos.push_back(i - windowSize_);
138 if (i - windowSize_ <= pos[pos.size() - 1]) {
139 pos[pos.size() - 1] = i;
141 pos.push_back(i - windowSize_);
148 for (
size_t k = 0; k < step_; ++k) {
149 for (
size_t j = 0; j < nr; ++j) {
152 window_.push_back(col);
159 unsigned int sumGap = 0;
161 for (
size_t u = 0; u < window_.size(); ++u) {
162 for (
size_t v = 0; v < window_[u].size(); ++v) {
163 if (window_[u][v] == gap || window_[u][v] == unk) {
166 col[v] = window_[u][v];
167 if (col[v] == unk) col[v] = gap;
169 sumEnt += VectorTools::shannonDiscrete<int, double>(col) / log(5.);
171 bool test = (sumEnt /
static_cast<double>(windowSize_)) > maxEnt_;
173 double propGap =
static_cast<double>(sumGap) /
static_cast<double>(nr * windowSize_);
174 test = test && (propGap > maxPropGap_);
176 test = test && (sumGap > maxGap_);
179 if (pos.size() == 0) {
180 pos.push_back(i - windowSize_);
183 if (i - windowSize_ <= pos[pos.size() - 1]) {
184 pos[pos.size() - 1] = i;
186 pos.push_back(i - windowSize_);
195 if (pos.size() == 0) {
196 blockBuffer_.push_back(block);
198 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" is clean and kept as is.").endLine();
200 }
else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->
getNumberOfSites()) {
203 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" was entirely removed. Tried to get the next one.").endLine();
207 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" with size "<< block->
getNumberOfSites() <<
" will be split into " << (pos.size() / 2 + 1) <<
" blocks.").endLine();
213 for (i = 0; i < pos.size(); i+=2) {
217 (*logstream_ <<
"ALN CLEANER: removing region (" << pos[i] <<
", " << pos[i+1] <<
") from block " << block->
getDescription() <<
".").endLine();
233 blockBuffer_.push_back(newBlock);
236 if (keepTrashedBlocks_) {
245 trashBuffer_.push_back(outBlock);
259 blockBuffer_.push_back(newBlock);
266 }
while (blockBuffer_.size() == 0);
269 MafBlock* block = blockBuffer_.front();
270 blockBuffer_.pop_front();
276 if (blockBuffer_.size() == 0) {
279 MafBlock* block = iterator_->nextBlock();
280 if (!block)
return 0;
287 if (nc < windowSize_)
290 vector< vector<int> > aln;
292 nr = species_.size();
294 for (
size_t i = 0; i < nr; ++i) {
299 fill(aln[i].begin(), aln[i].end(), gap);
304 for (
size_t i = 0; i < species_.size(); ++i) {
309 throw Exception(
"AlignmentFilter2MafIterator::analyseCurrentBlock_. Block does not include selected species '" + species_[i] +
"' and threshold are absolutes, leading to an undefined behavior. Consider selecting blocks first, or use relative thresholds.");
318 vector<bool> col(nr);
323 for (i = 0; i < windowSize_; ++i) {
324 for (
size_t j = 0; j < nr; ++j) {
325 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
327 window_.push_back(col);
334 while (i + step_ < nc) {
338 unsigned int count = 0;
339 bool posIsGap =
false;
340 for (
size_t u = 0; u < window_.size(); ++u) {
341 unsigned int partialCount = 0;
342 if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
343 for (
size_t v = 0; v < window_[u].size(); ++v)
344 if (window_[u][v]) partialCount++;
347 test = (
static_cast<double>(partialCount) /
static_cast<double>(nr) > maxPropGap_);
349 test = (partialCount > maxGap_);
359 if (
count > maxPos_) {
360 if (pos.size() == 0) {
361 pos.push_back(i - windowSize_);
364 if (i - windowSize_ <= pos[pos.size() - 1]) {
365 pos[pos.size() - 1] = i;
367 pos.push_back(i - windowSize_);
374 for (
size_t k = 0; k < step_; ++k) {
375 for (
size_t j = 0; j < nr; ++j) {
376 col[j] = (aln[j][i] == gap || aln[j][i] == unk);
378 window_.push_back(col);
385 unsigned int count = 0;
386 bool posIsGap =
false;
387 for (
size_t u = 0; u < window_.size(); ++u) {
388 unsigned int partialCount = 0;
389 if (!posIsGap || (u > 0 && window_[u] != window_[u - 1])) {
390 for (
size_t v = 0; v < window_[u].size(); ++v)
391 if (window_[u][v]) partialCount++;
394 test = (
static_cast<double>(partialCount) /
static_cast<double>(nr) > maxPropGap_);
396 test = (partialCount > maxGap_);
406 if (
count > maxPos_) {
407 if (pos.size() == 0) {
408 pos.push_back(i - windowSize_);
411 if (i - windowSize_ <= pos[pos.size() - 1]) {
412 pos[pos.size() - 1] = i;
414 pos.push_back(i - windowSize_);
423 if (pos.size() == 0) {
424 blockBuffer_.push_back(block);
426 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" is clean and kept as is.").endLine();
428 }
else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->
getNumberOfSites()) {
431 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" was entirely removed. Tried to get the next one.").endLine();
435 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" with size "<< block->
getNumberOfSites() <<
" will be split into " << (pos.size() / 2 + 1) <<
" blocks.").endLine();
441 for (i = 0; i < pos.size(); i+=2) {
445 (*logstream_ <<
"ALN CLEANER: removing region (" << pos[i] <<
", " << pos[i+1] <<
") from block " << block->
getDescription() <<
".").endLine();
461 blockBuffer_.push_back(newBlock);
464 if (keepTrashedBlocks_) {
473 trashBuffer_.push_back(outBlock);
487 blockBuffer_.push_back(newBlock);
494 }
while (blockBuffer_.size() == 0);
497 MafBlock* block = blockBuffer_.front();
498 blockBuffer_.pop_front();
int getGapCharacterCode() const
MafBlock * analyseCurrentBlock_()
MafBlock * analyseCurrentBlock_()
A synteny block data structure, the basic unit of a MAF alignement file.
unsigned int getPass() const
void setScore(double score)
void setPass(unsigned int pass)
size_t getNumberOfSequences() const
size_t getNumberOfSites() const
const MafSequence & getSequence(const std::string &name) const
std::string getDescription() const
void addSequence(const MafSequence &sequence)
bool hasSequenceForSpecies(const std::string &species) const
std::vector< std::string > getSpeciesList() const
const MafSequence & getSequenceForSpecies(const std::string &species) const
A sequence class which is used to store data from MAF files.
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
int getUnknownCharacterCode() const
virtual const std::vector< int > & getContent() const
std::string toString(T t)
std::size_t count(const std::string &s, const std::string &pattern)