bpp-seq-omics  2.4.1
MafStatistics.cpp
Go to the documentation of this file.
1 //
2 // File: MafStatistics.cpp
3 // Authors: Julien Dutheil
4 // Created: Mon Jun 25 2012
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2010)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #include "MafStatistics.h"
44 #include <Bpp/Seq/SiteTools.h>
45 
46 //From bpp-core:
48 
49 //From the STL:
50 #include <cmath>
51 #include <map>
52 
53 using namespace bpp;
54 using namespace std;
55 
57 {
58  vector<const MafSequence*> seqs1 = block.getSequencesForSpecies(species1_);
59  vector<const MafSequence*> seqs2 = block.getSequencesForSpecies(species2_);
60  if (seqs1.size() > 1 || seqs2.size() > 1)
61  throw Exception("PairwiseDivergenceMafStatistics::compute. Duplicated sequence for species " + species1_ + "or " + species2_ + ".");
62  if (seqs1.size() == 0 || seqs2.size() == 0)
63  result_.setValue(NumConstants::NaN());
64  else
65  result_.setValue(100. - SequenceTools::getPercentIdentity(*seqs1[0], *seqs2[0], true));
66 }
67 
69 {
70  if (noSpeciesMeansAllSpecies_ && species_.size() == 0) {
71  return new VectorSiteContainer(block.getAlignment());
72  }
73  //Otherwise, we select species:
75  for (size_t i = 0; i < species_.size(); ++i) {
76  if (block.hasSequenceForSpecies(species_[i])) {
77  vector<const MafSequence*> selection = block.getSequencesForSpecies(species_[i]);
78  for (size_t j = 0; j < selection.size(); ++j) {
79  alignment->addSequence(*selection[j]);
80  }
81  }
82  }
83  return alignment;
84 }
85 
87  species_(species)
88 {
89  size_t n = VectorTools::vectorUnion(species).size();
90  size_t m = 0;
91  for (size_t i = 0; i < species.size(); ++i)
92  m += species_[i].size();
93  if (m != n)
94  throw Exception("AbstractSpeciesMultipleSelectionMafStatistics (constructor). Species selections must be fully distinct.");
95 }
96 
98 {
99  vector<SiteContainer*> alignments;
100  for (size_t k = 0; k < species_.size(); ++k) {
102  for (size_t i = 0; i < species_[k].size(); ++i) {
103  if (block.hasSequenceForSpecies(species_[k][i])) {
104  vector<const MafSequence*> selection = block.getSequencesForSpecies(species_[k][i]);
105  for (size_t j = 0; j < selection.size(); ++j) {
106  alignment->addSequence(*selection[j]);
107  }
108  }
109  }
110  alignments.push_back(alignment);
111  }
112  return alignments;
113 }
114 
116 {
117  vector<string> tags;
118  for (int i = 0; i < static_cast<int>(alphabet_->getSize()); ++i) {
119  tags.push_back(alphabet_->intToChar(i));
120  }
121  tags.push_back("Gap");
122  tags.push_back("Unresolved");
123 
124  return tags;
125 }
126 
128 {
129  std::map<int, int> counts;
130  unique_ptr<SiteContainer> sites(getSiteContainer_(block));
131  SequenceContainerTools::getCounts(*sites, counts);
132  for (int i = 0; i < static_cast<int>(alphabet_->getSize()); ++i) {
133  result_.setValue(alphabet_->intToChar(i), counts[i]);
134  }
135  result_.setValue("Gap", counts[alphabet_->getGapCharacterCode()]);
136  double countUnres = 0;
137  for (map<int, int>::iterator it = counts.begin(); it != counts.end(); ++it) {
138  if (alphabet_->isUnresolved(it->first))
139  countUnres += it->second;
140  }
141  result_.setValue("Unresolved", countUnres);
142 }
143 
145 {
146  vector<string> tags;
147  for (size_t i = 0; i < categorizer_.getNumberOfCategories(); ++i) {
148  tags.push_back("Bin" + TextTools::toString(i + 1));
149  }
150  tags.push_back("Unresolved");
151  tags.push_back("Saturated");
152  tags.push_back("Ignored");
153  return tags;
154 }
155 
157 {
158  unsigned int nbUnresolved = 0;
159  unsigned int nbSaturated = 0;
160  unsigned int nbIgnored = 0;
162  int state;
163  bool hasOutgroup = (outgroup_ != "");
164  bool isAnalyzable;
165  unique_ptr<SiteContainer> alignment;
166  const Sequence* outgroupSeq = 0;
167  if (hasOutgroup) {
168  isAnalyzable = (block.hasSequenceForSpecies(outgroup_) && block.getNumberOfSequences() > 1);
169  if (isAnalyzable) {
170  //We need to extract the outgroup sequence:
171  outgroupSeq = &block.getSequenceForSpecies(outgroup_); //Here we assume there is only one! Otherwise we take the first one...
172  alignment.reset(getSiteContainer_(block));
173  }
174  } else {
175  isAnalyzable = (block.getNumberOfSequences() > 0);
176  if (isAnalyzable)
177  alignment.reset(getSiteContainer_(block));
178  }
179  if (isAnalyzable) {
180  for (size_t i = 0; i < alignment->getNumberOfSites(); ++i) {
181  //Note: we do not rely on SiteTool::getCounts as it would be unefficient to count everything.
182  const Site& site = alignment->getSite(i);
183  map<int, unsigned int> counts;
184  bool isUnresolved = false;
185  bool isSaturated = false;
186  for (size_t j = 0; !isUnresolved && !isSaturated && j < site.size(); ++j) {
187  state = site[j];
188  if (alphabet_->isGap(state) || alphabet_->isUnresolved(state)) {
189  isUnresolved = true;
190  } else {
191  counts[state]++;
192  if (counts.size() > 2) {
193  isSaturated = true;
194  }
195  }
196  }
197  if (isUnresolved) {
198  nbUnresolved++;
199  } else if (isSaturated) {
200  nbSaturated++;
201  } else if (hasOutgroup && (
202  alignment->getAlphabet()->isGap((*outgroupSeq)[i]) ||
203  alignment->getAlphabet()->isUnresolved((*outgroupSeq)[i]))) {
204  nbUnresolved++;
205  } else {
206  //Determine frequency class:
207  double count;
208  if (counts.size() == 1) {
209  if (hasOutgroup) {
210  if (counts.begin()->first == (*outgroupSeq)[i])
211  count = 0; //This is the ancestral state.
212  else
213  count = counts.begin()->second; //This is a derived state.
214  } else {
215  count = 0; //In this case we do not know, so we put 0.
216  }
217  } else {
218  map<int, unsigned int>::iterator it = counts.begin();
219  unsigned int count1 = it->second;
220  it++;
221  unsigned int count2 = it->second;
222  if (hasOutgroup) {
223  if (counts.begin()->first == (*outgroupSeq)[i])
224  count = counts.rbegin()->second; //This is the ancestral state, therefore we other one is the derived state.
225  else if (counts.rbegin()->first == (*outgroupSeq)[i])
226  count = counts.begin()->second; //The second state is the ancestral one, therefore the first one is the derived state.
227  else {
228  //None of the two states are ancestral! The position is therefore discarded.
229  isSaturated = true;
230  }
231  } else {
232  count = min(count1, count2); //In this case we do not know, so we take the minimum of the two values.
233  }
234  }
235  if (isSaturated)
236  nbSaturated++;
237  else {
238  try {
240  } catch (OutOfRangeException& oof) {
241  nbIgnored++;
242  }
243  }
244  }
245  }
246  }
247  result_.setValue("Unresolved", nbUnresolved);
248  result_.setValue("Saturated", nbSaturated);
249  result_.setValue("Ignored", nbIgnored);
250  for (size_t i = 0; i < counts_.size(); ++i) {
251  result_.setValue("Bin" + TextTools::toString(i + 1), counts_[i]);
252  }
253 }
254 
256 {
257  vector<string> tags;
258  tags.push_back("f1100");
259  tags.push_back("f0110");
260  tags.push_back("f1010");
261  tags.push_back("Ignored");
262  return tags;
263 }
264 
266 {
267  counts_.assign(6, 0);
268  unique_ptr<SiteContainer> alignment(getSiteContainer_(block));
269  if (alignment->getNumberOfSequences() == 4) {
270  unsigned int nbIgnored = 0;
271  for (size_t i = 0; i < block.getNumberOfSites(); ++i) {
272  const Site& site = alignment->getSite(i);
273  if (SiteTools::isComplete(site)) {
274  if (site[0] == site[1] &&
275  site[2] != site[1] &&
276  site[3] == site[2])
277  counts_[0]++;
278  else if (site[1] == site[2] &&
279  site[1] != site[0] &&
280  site[3] == site[0])
281  counts_[1]++;
282  else if (site[0] == site[2] &&
283  site[1] != site[0] &&
284  site[3] == site[1])
285  counts_[2]++;
286  } else {
287  nbIgnored++;
288  }
289  }
290  result_.setValue("f1100", counts_[0]);
291  result_.setValue("f0110", counts_[1]);
292  result_.setValue("f1010", counts_[2]);
293  result_.setValue("Ignored", nbIgnored);
294  } else {
295  result_.setValue("f1100", 0);
296  result_.setValue("f0110", 0);
297  result_.setValue("f1010", 0);
298  result_.setValue("Ignored", static_cast<double>(block.getNumberOfSites()));
299  }
300 }
301 
303 {
304  vector<string> tags;
305  tags.push_back("NbWithoutGap");
306  tags.push_back("NbComplete");
307  tags.push_back("NbConstant");
308  tags.push_back("NbBiallelic");
309  tags.push_back("NbTriallelic");
310  tags.push_back("NbQuadriallelic");
311  tags.push_back("NbParsimonyInformative");
312  return tags;
313 }
314 
316 {
317  unique_ptr<SiteContainer> alignment(getSiteContainer_(block));
318  unsigned int nbNg = 0;
319  unsigned int nbCo = 0;
320  unsigned int nbPi = 0;
321  unsigned int nbP1 = 0;
322  unsigned int nbP2 = 0;
323  unsigned int nbP3 = 0;
324  unsigned int nbP4 = 0;
325  if (alignment->getNumberOfSequences() > 0) {
326  for (size_t i = 0; i < alignment->getNumberOfSites(); ++i) {
327  if (!SiteTools::hasGap(alignment->getSite(i)))
328  nbNg++;
329  if (SiteTools::isComplete(alignment->getSite(i))) {
330  nbCo++;
331  map<int, size_t> counts;
332  SiteTools::getCounts(alignment->getSite(i), counts);
333  switch (counts.size()) {
334  case 1: nbP1++; break;
335  case 2: nbP2++; break;
336  case 3: nbP3++; break;
337  case 4: nbP4++; break;
338  default: throw Exception("The impossible happened. Probably a distortion in the Minkowski space.");
339  }
340  }
341  if (SiteTools::isParsimonyInformativeSite(alignment->getSite(i)))
342  nbPi++;
343  }
344  }
345  result_.setValue("NbWithoutGap", nbNg);
346  result_.setValue("NbComplete", nbCo);
347  result_.setValue("NbConstant", nbP1);
348  result_.setValue("NbBiallelic", nbP2);
349  result_.setValue("NbTriallelic", nbP3);
350  result_.setValue("NbQuadriallelic", nbP4);
351  result_.setValue("NbParsimonyInformative", nbPi);
352 }
353 
355 {
356  vector<string> tags;
357  tags.push_back("F");
358  tags.push_back("P");
359  tags.push_back("FP");
360  tags.push_back("PF");
361  tags.push_back("FF");
362  tags.push_back("X");
363  tags.push_back("FX");
364  tags.push_back("PX");
365  tags.push_back("XF");
366  tags.push_back("XP");
367  return tags;
368 }
369 
371 {
372  vector<int> patterns(sites.getNumberOfSites());
373  for (size_t i = 0; i < sites.getNumberOfSites(); ++i) {
374  const Site& site = sites.getSite(i);
375  int s = -1; //Unresolved
376  if (SiteTools::isComplete(site)) {
377  if (SiteTools::isConstant(site)) {
378  s = site[0]; //The fixed state
379  } else {
380  s = -10; //Polymorphic.
381  }
382  }
383  patterns[i] = s;
384  }
385  return patterns;
386 }
387 
389 {
390  vector<SiteContainer*> alignments(getSiteContainers_(block));
391  unsigned int nbF = 0;
392  unsigned int nbP = 0;
393  unsigned int nbFF = 0;
394  unsigned int nbFP = 0;
395  unsigned int nbPF = 0;
396  unsigned int nbX = 0;
397  unsigned int nbFX = 0;
398  unsigned int nbPX = 0;
399  unsigned int nbXF = 0;
400  unsigned int nbXP = 0;
401  //Get all patterns:
402  vector<int> patterns1(block.getNumberOfSites(), -1);
403  vector<int> patterns2(block.getNumberOfSites(), -1);
404  if (alignments[0]->getNumberOfSequences() > 0) {
405  patterns1 = getPatterns_(*alignments[0]);
406  }
407  if (alignments[1]->getNumberOfSequences() > 0) {
408  patterns2 = getPatterns_(*alignments[1]);
409  }
410  //Compare patterns:
411  for (size_t i = 0; i < block.getNumberOfSites(); ++i) {
412  int p1 = patterns1[i];
413  int p2 = patterns2[i];
414  switch (p1) {
415  case -1 :
416  switch (p2) {
417  case -1 :
418  nbX++;
419  break;
420  case -10 :
421  nbXP++;
422  break;
423  default :
424  nbXF++;
425  }
426  break;
427 
428  case -10 :
429  switch (p2) {
430  case -1 :
431  nbPX++;
432  break;
433  case -10 :
434  nbP++;
435  break;
436  default :
437  nbPF++;
438  }
439  break;
440 
441  default :
442  switch (p2) {
443  case -1 :
444  nbFX++;
445  break;
446  case -10 :
447  nbFP++;
448  break;
449  default :
450  if (p1 == p2)
451  nbF++;
452  else
453  nbFF++;
454  }
455  }
456  }
457 
458  //Set results:
459  result_.setValue("F", nbF);
460  result_.setValue("P", nbP);
461  result_.setValue("FF", nbFF);
462  result_.setValue("FP", nbFP);
463  result_.setValue("PF", nbPF);
464  result_.setValue("X", nbX);
465  result_.setValue("FX", nbFX);
466  result_.setValue("PX", nbPX);
467  result_.setValue("XF", nbXF);
468  result_.setValue("XP", nbXP);
469 }
470 
472 {
473  vector<string> tags;
474  tags.push_back("NbSeggregating");
475  tags.push_back("WattersonTheta");
476  tags.push_back("TajimaPi");
477  tags.push_back("TajimaD");
478  return tags;
479 }
480 
482 {
483  unique_ptr<SiteContainer> alignment(getSiteContainer_(block));
484  //Get only complete sites:
485  unique_ptr<VectorSiteContainer> alignment2(dynamic_cast<VectorSiteContainer*>(SiteContainerTools::getCompleteSites(*alignment)));
486  if (alignment2==0)
487  throw Exception("SequenceDiversityMafStatistics::compute : alignment2 not plain VectorSiteContainer.");
488 
489  double S = 0;
490  size_t nbTot = 0;
491  size_t n = alignment2->getNumberOfSequences();
492  if (n > 0) {
493  for (size_t i = 0; i < alignment2->getNumberOfSites(); ++i) {
494  const Site& site = alignment2->getSite(i);
495  if (SiteTools::isComplete(site)) {
496  nbTot++;
497  if (!SiteTools::isConstant(site))
498  S++;
499  }
500  }
501  } else {
502  nbTot = alignment2->getNumberOfSites();
503  }
504 
505  double a1 = 0;
506  double a2 = 0;
507  double dn = static_cast<double>(n);
508  for (double i = 1; i < dn; ++i) {
509  a1 += 1. / i;
510  a2 += 1. / (i * i);
511  }
512  double wt = S / (static_cast<double>(nbTot) * a1);
513  double b1 = (dn + 1) / (3 * (dn - 1));
514  double b2 = 2 * (dn * dn + dn + 3) / (9 * dn * (dn - 1));
515  double c1 = b1 - 1. / a1;
516  double c2 = b2 - (dn + 2) / (a1 * dn) + a2 / (a1 * a1);
517  double e1 = c1 / a1;
518  double e2 = c2 / (a1 * a1 + a2);
519 
520  //Compute pairwise heterozigosity:
521  double pi = 0;
522  for (size_t i = 0; i < n - 1; ++i) {
523  for (size_t j = i + 1; j < n; ++j) {
525  alignment2->getSequence(i),
526  alignment2->getSequence(j),
527  true,
529  true);
530  }
531  }
532  pi /= static_cast<double>((n - 1) * n / 2);
533 
534  //Compute Tajima's D:
535  double tajd = static_cast<double>(nbTot) * (pi - wt) / sqrt(e1 * S + e2 * S * (S - 1));
536 
537  result_.setValue("NbSeggregating", S);
538  result_.setValue("WattersonTheta", wt);
539  result_.setValue("TajimaPi", pi);
540  result_.setValue("TajimaD", tajd);
541 }
542 
MafStatisticsResult result_
std::vector< SiteContainer * > getSiteContainers_(const MafBlock &block)
AbstractSpeciesMultipleSelectionMafStatistics(const std::vector< std::vector< std::string > > &species)
std::vector< std::vector< std::string > > species_
SiteContainer * getSiteContainer_(const MafBlock &block)
virtual size_t getNumberOfSites() const=0
virtual std::string intToChar(int state) const=0
virtual bool isUnresolved(int state) const=0
virtual bool isGap(int state) const=0
virtual int getGapCharacterCode() const=0
virtual unsigned int getSize() const=0
std::vector< std::string > getSupportedTags() const
void compute(const MafBlock &block)
virtual size_t size() const=0
std::vector< unsigned int > counts_
std::vector< std::string > getSupportedTags() const
A synteny block data structure, the basic unit of a MAF alignement file.
Definition: MafBlock.h:57
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
std::vector< const MafSequence * > getSequencesForSpecies(const std::string &species) const
Definition: MafBlock.h:149
size_t getNumberOfSites() const
Definition: MafBlock.h:113
AlignedSequenceContainer & getAlignment()
Definition: MafBlock.h:108
bool hasSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:129
const MafSequence & getSequenceForSpecies(const std::string &species) const
Definition: MafBlock.h:139
virtual void setValue(const std::string &tag, double value)
Associate a value to a certain tag. Any existing tag will be overwritten.
Definition: MafStatistics.h:95
static double NaN()
void compute(const MafBlock &block)
static std::vector< int > getPatterns_(const SiteContainer &sites)
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
static void getCounts(const SequenceContainer &sequences, std::map< int, int > &)
std::vector< std::string > getSupportedTags() const
void compute(const MafBlock &block)
static double getPercentIdentity(const Sequence &seq1, const Sequence &seq2, bool ignoreGaps=false)
virtual const Alphabet * getAlphabet() const=0
static double computeSimilarity(const Sequence &seq1, const Sequence &seq2, bool dist=false, const std::string &gapOption=SIMILARITY_NODOUBLEGAP, bool unresolvedAsGap=true)
static const std::string SIMILARITY_NOGAP
static AlignedValuesContainer * getCompleteSites(const AlignedValuesContainer &sites)
virtual const Site & getSite(size_t siteIndex) const=0
std::vector< std::string > getSupportedTags() const
std::vector< unsigned int > counts_
void compute(const MafBlock &block)
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
static bool hasGap(const IntCoreSymbolList &site)
static bool isComplete(const IntCoreSymbolList &site)
static bool isConstant(const IntCoreSymbolList &site, bool ignoreUnknown=false, bool unresolvedRaisesException=true)
static bool isParsimonyInformativeSite(const IntCoreSymbolList &site)
static void getCounts(const IntCoreSymbolList &list, std::map< int, size_t > &counts)
void addSequence(const Sequence &sequence, bool checkName=true)
static std::vector< T > vectorUnion(const std::vector< T > &vec1, const std::vector< T > &vec2)
std::string toString(T t)
std::size_t count(const std::string &s, const std::string &pattern)