52 if (blockBuffer_.size() == 0) {
55 MafBlock* block = iterator_->nextBlock();
67 vector< vector<int> > aln;
68 if (missingAsGap_ && !ignoreGaps_) {
71 for (
size_t i = 0; i < nr; ++i) {
76 fill(aln[i].begin(), aln[i].end(), gap);
81 nr = speciesSet.size();
83 for (
size_t i = 0; i < nr; ++i) {
93 for (i = 0; i < windowSize_; ++i) {
95 for (
size_t j = 0; j < nr; ++j) {
97 if (x != gap || !ignoreGaps_)
100 double entropy = VectorTools::shannonDiscrete<int, double>(col) / log(5.);
101 window_.push_back(entropy > maxEnt_ ? 1 : 0);
108 while (i + step_ < nc) {
112 unsigned int count = std::accumulate(window_.begin(), window_.end(), 0u);
113 if (
count > maxPos_) {
114 if (pos.size() == 0) {
115 pos.push_back(i - windowSize_);
118 if (i - windowSize_ < pos[pos.size() - 1]) {
119 pos[pos.size() - 1] = i;
121 pos.push_back(i - windowSize_);
128 for (
size_t k = 0; k < step_; ++k) {
130 for (
size_t j = 0; j < nr; ++j) {
132 if (x != gap || !ignoreGaps_)
135 double entropy = VectorTools::shannonDiscrete<int, double>(col) / log(5.);
136 window_.push_back(entropy > maxEnt_ ? 1 : 0);
143 unsigned int count = std::accumulate(window_.begin(), window_.end(), 0u);
144 if (
count > maxPos_) {
145 if (pos.size() == 0) {
146 pos.push_back(i - windowSize_);
149 if (i - windowSize_ <= pos[pos.size() - 1]) {
150 pos[pos.size() - 1] = i;
152 pos.push_back(i - windowSize_);
161 if (pos.size() == 0) {
162 blockBuffer_.push_back(block);
164 (*logstream_ <<
"ENTROPY CLEANER: block " << block->
getDescription() <<
" is clean and kept as is.").endLine();
166 }
else if (pos.size() == 2 && pos.front() == 0 && pos.back() == block->
getNumberOfSites()) {
169 (*logstream_ <<
"ENTROPY CLEANER: block " << block->
getDescription() <<
" was entirely removed. Tried to get the next one.").endLine();
173 (*logstream_ <<
"ALN CLEANER: block " << block->
getDescription() <<
" with size "<< block->
getNumberOfSites() <<
" will be split into " << (pos.size() / 2 + 1) <<
" blocks.").endLine();
179 for (i = 0; i < pos.size(); i+=2) {
183 (*logstream_ <<
"ENTROPY CLEANER: removing region (" << pos[i] <<
", " << pos[i+1] <<
") from block " << block->
getDescription() <<
".").endLine();
199 blockBuffer_.push_back(newBlock);
202 if (keepTrashedBlocks_) {
211 trashBuffer_.push_back(outBlock);
225 blockBuffer_.push_back(newBlock);
232 }
while (blockBuffer_.size() == 0);
235 MafBlock* block = blockBuffer_.front();
236 blockBuffer_.pop_front();
int getGapCharacterCode() const
MafBlock * analyseCurrentBlock_()
A synteny block data structure, the basic unit of a MAF alignment file.
unsigned int getPass() const
void setScore(double score)
void setPass(unsigned int pass)
size_t getNumberOfSequences() const
size_t getNumberOfSites() const
const MafSequence & getSequence(const std::string &name) const
std::string getDescription() const
void addSequence(const MafSequence &sequence)
bool hasSequenceForSpecies(const std::string &species) const
std::vector< std::string > getSpeciesList() const
const MafSequence & getSequenceForSpecies(const std::string &species) const
A sequence class which is used to store data from MAF files.
MafSequence * subSequence(size_t startAt, size_t length) const
Extract a sub-sequence.
virtual const std::vector< int > & getContent() const
std::string toString(T t)
std::size_t count(const std::string &s, const std::string &pattern)