bpp-seq-omics  2.4.1
CoordinateTranslatorMafIterator.cpp
Go to the documentation of this file.
1 //
2 // File: CoordinateTranslatorMafIterator.cpp
3 // Authors: Julien Dutheil
4 // Created: Thu Jan 28 2016
5 //
6 
7 
8 /*
9 Copyright or © or Copr. Bio++ Development Team, (2010)
10 
11 This software is a computer program whose purpose is to provide classes
12 for sequences analysis.
13 
14 This software is governed by the CeCILL license under French law and
15 abiding by the rules of distribution of free software. You can use,
16 modify and/ or redistribute the software under the terms of the CeCILL
17 license as circulated by CEA, CNRS and INRIA at the following URL
18 "http://www.cecill.info".
19 
20 As a counterpart to the access to the source code and rights to copy,
21 modify and redistribute granted by the license, users are provided only
22 with a limited warranty and the software's author, the holder of the
23 economic rights, and the successive licensors have only limited
24 liability.
25 
26 In this respect, the user's attention is drawn to the risks associated
27 with loading, using, modifying and/or developing or reproducing the
28 software by the user in light of its specific status of free software,
29 that may mean that it is complicated to manipulate, and that also
30 therefore means that it is reserved for developers and experienced
31 professionals having in-depth computer knowledge. Users are therefore
32 encouraged to load and test the software's suitability as regards their
33 requirements in conditions enabling the security of their systems and/or
34 data to be ensured and, more generally, to use and operate it in the
35 same conditions as regards security.
36 
37 The fact that you are presently reading this means that you have had
38 knowledge of the CeCILL license and that you accept its terms.
39 */
40 
42 
43 //From bpp-seq:
44 #include <Bpp/Seq/SequenceWalker.h>
45 
46 using namespace bpp;
47 
48 //From the STL:
49 #include <string>
50 #include <numeric>
51 
52 using namespace std;
53 
55 {
56  unique_ptr<MafBlock> block(iterator_->nextBlock());
57  if (!block.get()) return 0; //No more block.
58 
59  //Check if the block contains the reference and target species:
60  if (!block->hasSequenceForSpecies(referenceSpecies_))
61  return block.release();
62  if (!block->hasSequenceForSpecies(targetSpecies_))
63  return block.release();
64 
65  //Get the feature ranges for this block:
66  const MafSequence& refSeq = block->getSequenceForSpecies(referenceSpecies_);
67  const MafSequence& targetSeq = block->getSequenceForSpecies(targetSpecies_);
68 
69  //first check if there is one (for now we assume that features refer to the chromosome or contig name, with implicit species):
70  std::map<std::string, SequenceFeatureSet*>::iterator mr = inputFeaturesPerChr_.find(refSeq.getChromosome());
71  if (mr == inputFeaturesPerChr_.end())
72  return block.release();
73 
74  //second get only features within this block:
75  unique_ptr<SequenceFeatureSet> selectedFeatures(mr->second->getSubsetForRange(SeqRange(refSeq.getRange(true)), true));
76 
77  //test if there are some features to translate here:
78  if (selectedFeatures->isEmpty())
79  return block.release();
80 
81  //Get coordinate range sets:
82  RangeSet<size_t> ranges;
83  selectedFeatures->fillRangeCollection(ranges);
84 
85  //If the reference sequence is on the negative strand, then we have to correct the coordinates:
86  if (refSeq.getStrand() == '-') {
87  RangeSet<size_t> cRanges;
88  for (set<Range<size_t>*>::iterator it = ranges.getSet().begin();
89  it != ranges.getSet().end();
90  ++it)
91  {
92  cRanges.addRange(SeqRange(refSeq.getSrcSize() - (**it).end(), refSeq.getSrcSize() - (**it).begin(), dynamic_cast<SeqRange*>(*it)->getStrand()));
93  }
94  ranges = cRanges;
95  }
96 
97  //We will need to convert to alignment positions, using a sequence walker:
98  SequenceWalker referenceWalker(refSeq);
99  SequenceWalker targetWalker(targetSeq);
100 
101  //Now creates all blocks for all ranges:
102  if (verbose_) {
103  ApplicationTools::message->endLine();
104  ApplicationTools::displayTask("Extracting annotations", true);
105  }
106  if (logstream_) {
107  (*logstream_ << "COORDINATE CONVERTOR: lifting over " << ranges.getSet().size() << " features from block " << block->getDescription() << ".").endLine();
108  }
109 
110  const Alphabet* alphabet = refSeq.getAlphabet();
111  size_t i = 0;
112  for (set<Range<size_t>*>::iterator it = ranges.getSet().begin();
113  it != ranges.getSet().end();
114  ++it)
115  {
116  if (verbose_) {
117  ApplicationTools::displayGauge(i++, ranges.getSet().size() - 1, '=');
118  }
119  size_t a = referenceWalker.getAlignmentPosition((**it).begin() - refSeq.start());
120  size_t b = referenceWalker.getAlignmentPosition((**it).end() - refSeq.start() - 1);
121  string targetPos1 = "NA", targetPos2 = "NA";
122  if (!alphabet->isGap(targetSeq[a]) || outputClosestCoordinate_) {
123  size_t a2 = targetWalker.getSequencePosition(a) + targetSeq.start();
124  if (targetSeq.getStrand() == '-') {
125  a2 = targetSeq.getSrcSize() - a2;
126  }
127  targetPos1 = TextTools::toString(a2);
128  }
129  if (!alphabet->isGap(targetSeq[b]) || outputClosestCoordinate_) {
130  size_t b2 = targetWalker.getSequencePosition(b) + targetSeq.start() + 1;
131  if (targetSeq.getStrand() == '-') {
132  b2 = targetSeq.getSrcSize() - b2;
133  }
134  targetPos2 = TextTools::toString(b2);
135  }
136  output_ << refSeq.getChromosome() << "\t" << refSeq.getStrand() << "\t" << (**it).begin() << "\t" << (**it).end() << "\t";
137  output_ << targetSeq.getChromosome() << "\t" << targetSeq.getStrand() << "\t" << targetPos1 << "\t" << targetPos2 << endl;
138  }
139 
140  if (verbose_)
142 
143  //Block is simply forwarded:
144  return block.release();
145 }
146 
virtual bool isGap(int state) const=0
static void displayTask(const std::string &text, bool eof=false)
static std::shared_ptr< OutputStream > message
static void displayTaskDone()
static void displayGauge(size_t iter, size_t total, char symbol='>', const std::string &mes="")
virtual const Alphabet * getAlphabet() const=0
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:57
A sequence class which is used to store data from MAF files.
Definition: MafSequence.h:64
const std::string & getChromosome() const
Definition: MafSequence.h:164
size_t start() const
Definition: MafSequence.h:116
char getStrand() const
Definition: MafSequence.h:166
size_t getSrcSize() const
Definition: MafSequence.h:170
Range< size_t > getRange(bool origin=true) const
Definition: MafSequence.h:131
void addRange(const Range< T > &r)
const std::set< Range< T > *, rangeComp_< T > > & getSet() const
a coordinate range on a sequence. Stores coordinates as a Range<size_t> object, but also keep the str...
virtual char getStrand() const
size_t getSequencePosition(size_t alnPos)
size_t getAlignmentPosition(size_t seqPos)
std::string toString(T t)