bpp-seq-omics  2.4.1
MafStatistics.h
Go to the documentation of this file.
1 //
2 // File: MafStatistics.h
3 // Authors: Julien Dutheil
4 // Created: Mon Jun 25 2012
5 //
6 
7 /*
8 Copyright or © or Copr. Bio++ Development Team, (2012)
9 
10 This software is a computer program whose purpose is to provide classes
11 for sequences analysis.
12 
13 This software is governed by the CeCILL license under French law and
14 abiding by the rules of distribution of free software. You can use,
15 modify and/ or redistribute the software under the terms of the CeCILL
16 license as circulated by CEA, CNRS and INRIA at the following URL
17 "http://www.cecill.info".
18 
19 As a counterpart to the access to the source code and rights to copy,
20 modify and redistribute granted by the license, users are provided only
21 with a limited warranty and the software's author, the holder of the
22 economic rights, and the successive licensors have only limited
23 liability.
24 
25 In this respect, the user's attention is drawn to the risks associated
26 with loading, using, modifying and/or developing or reproducing the
27 software by the user in light of its specific status of free software,
28 that may mean that it is complicated to manipulate, and that also
29 therefore means that it is reserved for developers and experienced
30 professionals having in-depth computer knowledge. Users are therefore
31 encouraged to load and test the software's suitability as regards their
32 requirements in conditions enabling the security of their systems and/or
33 data to be ensured and, more generally, to use and operate it in the
34 same conditions as regards security.
35 
36 The fact that you are presently reading this means that you have had
37 knowledge of the CeCILL license and that you accept its terms.
38 */
39 
40 #ifndef _MAFSTATISTICS_H_
41 #define _MAFSTATISTICS_H_
42 
43 #include "MafBlock.h"
44 
45 //From bpp-core:
46 #include <Bpp/Utils/MapTools.h>
48 #include <Bpp/Numeric/Number.h>
49 
50 //From the STL:
51 #include <map>
52 #include <string>
53 
54 namespace bpp {
55 
63 {
64  protected:
65  mutable std::map<std::string, BppNumberI*> values_;
66 
67  public:
69  virtual ~MafStatisticsResult() {}
70 
72  {
73  for (std::map<std::string, BppNumberI*>::const_iterator it = msr.values_.begin();
74  it != msr.values_.end();
75  ++it) {
76  values_[it->first] = it->second->clone();
77  }
78  }
79 
80  public:
81  virtual const BppNumberI& getValue(const std::string& tag) const {
82  std::map<std::string, BppNumberI*>::iterator it = values_.find(tag);
83  if (it != values_.end())
84  return *it->second;
85  else
86  throw Exception("MafStatisticsResult::getValue(). No value found for tag: " + tag + ".");
87  }
88 
95  virtual void setValue(const std::string& tag, double value) {
96  if (values_[tag]) {
97  delete values_[tag];
98  }
99  values_[tag] = new BppDouble(value);
100  }
101 
108  virtual void setValue(const std::string& tag, int value) {
109  if (values_[tag]) {
110  delete values_[tag];
111  }
112  values_[tag] = new BppInteger(value);
113  }
114 
121  virtual void setValue(const std::string& tag, unsigned int value) {
122  if (values_[tag]) {
123  delete values_[tag];
124  }
125  values_[tag] = new BppUnsignedInteger(value);
126  }
127 
132  virtual bool hasValue(const std::string& tag) const {
133  return (values_.find(tag) != values_.end());
134  }
135 
139  std::vector<std::string> getAvailableTags() const { return MapTools::getKeys(values_); }
140 };
141 
146  public virtual MafStatisticsResult
147 {
148  private:
149  std::string name_;
150 
151  public:
152  SimpleMafStatisticsResult(const std::string& name): MafStatisticsResult(), name_(name) {
153  setValue(name, 0);
154  }
156 
157  public:
158  virtual const BppNumberI& getValue(const std::string& tag) const {
159  return MafStatisticsResult::getValue(tag);
160  }
161 
162  virtual const BppNumberI& getValue() const { return *values_[name_]; }
163 
164  virtual void setValue(const std::string& tag, double value) {
165  if (tag == name_)
166  setValue(value);
167  else
168  throw Exception("SimpleMafStatisticsResult::setValue(). Unvalid tag name: " + tag + ".");
169  }
170 
171  virtual void setValue(const std::string& tag, int value) {
172  if (tag == name_)
173  setValue(value);
174  else
175  throw Exception("SimpleMafStatisticsResult::setValue(). Unvalid tag name: " + tag + ".");
176  }
177 
178  virtual void setValue(const std::string& tag, unsigned int value) {
179  if (tag == name_)
180  setValue(value);
181  else
182  throw Exception("SimpleMafStatisticsResult::setValue(). Unvalid tag name: " + tag + ".");
183  }
184 
185  virtual void setValue(double value) {
186  if (values_[name_]) delete values_[name_];
187  values_[name_] = new BppDouble(value);
188  }
189 
190  virtual void setValue(int value) {
191  if (values_[name_]) delete values_[name_];
192  values_[name_] = new BppInteger(value);
193  }
194 
195  virtual void setValue(unsigned int value) {
196  if (values_[name_]) delete values_[name_];
197  values_[name_] = new BppUnsignedInteger(value);
198  }
199 
200 };
201 
209 {
210  public:
212  virtual ~MafStatistics() {}
213 
214  public:
215  virtual std::string getShortName() const = 0;
216  virtual std::string getFullName() const = 0;
217  virtual const MafStatisticsResult& getResult() const = 0;
218  virtual void compute(const MafBlock& block) = 0;
219 
223  virtual std::vector<std::string> getSupportedTags() const = 0;
224 
225 };
226 
231  public virtual MafStatistics
232 {
233  protected:
235 
236  public:
239 
240  public:
241  const MafStatisticsResult& getResult() const { return result_; }
242 };
243 
248  public MafStatistics
249 {
250  protected:
252 
253  public:
254  AbstractMafStatisticsSimple(const std::string& name): result_(name) {}
256 
257  public:
258  const SimpleMafStatisticsResult& getResult() const { return result_; }
259  std::vector<std::string> getSupportedTags() const { return result_.getAvailableTags(); }
260 };
261 
267 {
268  private:
269  std::string species1_;
270  std::string species2_;
271 
272  public:
273  PairwiseDivergenceMafStatistics(const std::string& species1, const std::string& species2):
274  AbstractMafStatisticsSimple("Divergence"), species1_(species1), species2_(species2) {}
275 
277 
278  public:
279  std::string getShortName() const { return "Div." + species1_ + "-" + species2_; }
280  std::string getFullName() const { return "Pairwise divergence between " + species1_ + " and " + species2_ + "."; }
281  void compute(const MafBlock& block);
282 
283 };
284 
290 {
291  public:
294 
295  public:
296  std::string getShortName() const { return "BlockSize"; }
297  std::string getFullName() const { return "Number of sequences."; }
298  void compute(const MafBlock& block) {
299  result_.setValue(static_cast<double>(block.getNumberOfSequences()));
300  }
301 };
302 
308 {
309  public:
312 
313  public:
314  std::string getShortName() const { return "BlockLength"; }
315  std::string getFullName() const { return "Number of sites."; }
316  void compute(const MafBlock& block) {
317  result_.setValue(static_cast<double>(block.getNumberOfSites()));
318  }
319 };
320 
329 {
330  private:
331  std::string species_;
332 
333  public:
334  SequenceLengthMafStatistics(const std::string& species): AbstractMafStatisticsSimple("BlockSize"), species_(species) {}
336 
337  public:
338  std::string getShortName() const { return "SequenceLengthFor" + species_; }
339  std::string getFullName() const { return "Sequence length for species " + species_; }
340  void compute(const MafBlock& block) {
341  std::vector<const MafSequence*> seqs = block.getSequencesForSpecies(species_);
342  if (seqs.size() == 0)
343  result_.setValue(0.);
344  else if (seqs.size() == 1)
345  result_.setValue(static_cast<double>(SequenceTools::getNumberOfSites(*seqs[0])));
346  else
347  throw Exception("SequenceLengthMafStatistics::compute. More than one sequence found for species " + species_ + " in current block.");
348  }
349 };
350 
351 
357 {
358  public:
361 
362  public:
363  std::string getShortName() const { return "AlnScore"; }
364  std::string getFullName() const { return "Alignment score."; }
365  void compute(const MafBlock& block) {
366  result_.setValue(block.getScore());
367  }
368 };
369 
370 
371 
378  public virtual MafStatistics
379 {
380  private:
381  std::vector<std::string> species_;
383 
384  protected:
385  std::string suffix_;
386 
387  public:
389  const std::vector<std::string>& species,
390  bool noSpeciesMeansAllSpecies = false,
391  const std::string& suffix = ""):
392  species_(species),
393  noSpeciesMeansAllSpecies_(noSpeciesMeansAllSpecies),
394  suffix_(suffix)
395  {}
396 
397  protected:
399 
400 };
401 
402 
409  public virtual MafStatistics
410 {
411  private:
412  std::vector< std::vector<std::string> > species_;
413 
414  public:
415  AbstractSpeciesMultipleSelectionMafStatistics(const std::vector< std::vector<std::string> >& species);
416 
417  protected:
418  std::vector<SiteContainer*> getSiteContainers_(const MafBlock& block);
419 
420 };
421 
422 
423 
437  public AbstractMafStatistics,
439 {
440  private:
442 
443  public:
444  CharacterCountsMafStatistics(const Alphabet* alphabet, const std::vector<std::string>& species, const std::string suffix):
446  AbstractSpeciesSelectionMafStatistics(species, true, suffix),
447  alphabet_(alphabet) {}
448 
450  AbstractMafStatistics(stats),
452  alphabet_(stats.alphabet_) {}
453 
455  AbstractMafStatistics::operator=(stats);
456  AbstractSpeciesSelectionMafStatistics::operator=(stats);
457  alphabet_ = stats.alphabet_;
458  return *this;
459  }
460 
462 
463  public:
464  std::string getShortName() const { return "Counts" + suffix_; }
465  std::string getFullName() const { return "Character counts (" + suffix_ + ")."; }
466  void compute(const MafBlock& block);
467  std::vector<std::string> getSupportedTags() const;
468 };
469 
470 
471 
479  public AbstractMafStatistics,
481 {
482  private:
483  class Categorizer {
484  private:
485  std::vector<double> bounds_;
486 
487  public:
488  Categorizer(const std::vector<double>& bounds):
489  bounds_(bounds) {
490  std::sort(bounds_.begin(), bounds_.end());
491  }
492 
493  public:
494  size_t getNumberOfCategories() const { return (bounds_.size() - 1); }
495 
496  //Category numbers start at 1!
497  size_t getCategory(double value) const {
498  if (value < bounds_[0])
499  throw OutOfRangeException("SiteFrequencySpectrumMafStatistics::Categorizer::getCategory.", value, *bounds_.begin(), *bounds_.rbegin());
500  for (size_t i = 1; i < bounds_.size(); ++i) {
501  if (value < bounds_[i])
502  return i;
503  }
504  throw OutOfRangeException("SiteFrequencySpectrumMafStatistics::Categorizer::getCategory.", value, *bounds_.begin(), *bounds_.rbegin());
505  }
506  };
507 
508  private:
511  std::vector<unsigned int> counts_;
512  std::string outgroup_;
513 
514  public:
515  SiteFrequencySpectrumMafStatistics(const Alphabet* alphabet, const std::vector<double>& bounds, const std::vector<std::string>& ingroup, const std::string outgroup = ""):
518  alphabet_(alphabet),
519  categorizer_(bounds),
520  counts_(bounds.size() - 1),
521  outgroup_(outgroup)
522  {}
523 
527  alphabet_(stats.alphabet_),
528  categorizer_(stats.categorizer_),
529  counts_(stats.counts_),
530  outgroup_(stats.outgroup_)
531  {}
532 
534  AbstractMafStatistics::operator=(stats);
535  AbstractSpeciesSelectionMafStatistics::operator=(stats);
536  alphabet_ = stats.alphabet_;
537  categorizer_ = stats.categorizer_;
538  counts_ = stats.counts_;
539  outgroup_ = stats.outgroup_;
540  return *this;
541  }
542 
544 
545  public:
546  std::string getShortName() const { return "SiteFrequencySpectrum"; }
547  std::string getFullName() const { return "Site frequency spectrum."; }
548  void compute(const MafBlock& block);
549  std::vector<std::string> getSupportedTags() const;
550 };
551 
552 
564  public AbstractMafStatistics,
566 {
567  private:
569  std::vector<unsigned int> counts_;
570 
571  public:
573  const Alphabet* alphabet,
574  const std::vector<std::string>& species):
577  alphabet_(alphabet),
578  counts_(6)
579  {
580  if (species.size() != 4)
581  throw Exception("FourSpeciesPatternCountsMafStatistics, constructor: 4 species should be provided.");
582  if (VectorTools::unique(species).size() != 4)
583  throw Exception("FourSpeciesPatternCountsMafStatistics, constructor: duplicated species name!");
584  }
585 
589  alphabet_(stats.alphabet_),
590  counts_(stats.counts_)
591  {}
592 
594  AbstractMafStatistics::operator=(stats);
595  AbstractSpeciesSelectionMafStatistics::operator=(stats);
596  alphabet_ = stats.alphabet_;
597  counts_ = stats.counts_;
598  return *this;
599  }
600 
602 
603  public:
604  std::string getShortName() const { return "FourSpeciesPatternCounts"; }
605  std::string getFullName() const { return "FourSpecies pattern counts."; }
606  void compute(const MafBlock& block);
607  std::vector<std::string> getSupportedTags() const;
608 };
609 
610 
611 
625  public AbstractMafStatistics,
627 {
628  public:
629  SiteMafStatistics(const std::vector<std::string>& species):
632  {}
633 
634  virtual ~SiteMafStatistics() {}
635 
636  public:
637  std::string getShortName() const { return "SiteStatistics"; }
638  std::string getFullName() const { return "Site statistics."; }
639  void compute(const MafBlock& block);
640  std::vector<std::string> getSupportedTags() const;
641 };
642 
643 
657  public AbstractMafStatistics,
659 {
660  public:
661  PolymorphismMafStatistics(const std::vector< std::vector<std::string> >& species):
664  {
665  if (species.size() != 2)
666  throw Exception("PolymorphismStatistics: exactly two species selection should be provided.");
667  }
668 
670 
671  public:
672  std::string getShortName() const { return "PolymorphismStatistics"; }
673  std::string getFullName() const { return "Polymorphism statistics."; }
674  void compute(const MafBlock& block);
675  std::vector<std::string> getSupportedTags() const;
676 
677  private:
678  static std::vector<int> getPatterns_(const SiteContainer& sites);
679 };
680 
681 
682 
694  public AbstractMafStatistics,
696 {
697  public:
698  SequenceDiversityMafStatistics(const std::vector<std::string>& ingroup):
701  {}
702 
704 
705  public:
706  std::string getShortName() const { return "SequenceDiversityStatistics"; }
707  std::string getFullName() const { return "Sequence diversity statistics."; }
708  void compute(const MafBlock& block);
709  std::vector<std::string> getSupportedTags() const;
710 
711  private:
712  static std::vector<int> getPatterns_(const SiteContainer& sites);
713 };
714 
715 
716 } // end of namespace bpp
717 
718 #endif //_MAFSTATISTICS_H_
719 
Partial implementation of MafStatistics, for convenience.
const SimpleMafStatisticsResult & getResult() const
AbstractMafStatisticsSimple(const std::string &name)
SimpleMafStatisticsResult result_
std::vector< std::string > getSupportedTags() const
Partial implementation of MafStatistics, for convenience.
const MafStatisticsResult & getResult() const
MafStatisticsResult result_
Partial implementation of MafStatistics for method working on multiple distinct subsets of species,...
std::vector< SiteContainer * > getSiteContainers_(const MafBlock &block)
AbstractSpeciesMultipleSelectionMafStatistics(const std::vector< std::vector< std::string > > &species)
std::vector< std::vector< std::string > > species_
Partial implementation of MafStatistics for method working on a subset of species,...
SiteContainer * getSiteContainer_(const MafBlock &block)
AbstractSpeciesSelectionMafStatistics(const std::vector< std::string > &species, bool noSpeciesMeansAllSpecies=false, const std::string &suffix="")
Retrieves the alignment score of a maf block.
std::string getFullName() const
std::string getShortName() const
void compute(const MafBlock &block)
Computes the number of columns in a maf block.
void compute(const MafBlock &block)
std::string getShortName() const
std::string getFullName() const
Computes the number of sequences in a maf block.
void compute(const MafBlock &block)
std::string getShortName() const
std::string getFullName() const
Compute the base frequencies of a maf block.
std::string getShortName() const
std::vector< std::string > getSupportedTags() const
CharacterCountsMafStatistics(const CharacterCountsMafStatistics &stats)
void compute(const MafBlock &block)
CharacterCountsMafStatistics(const Alphabet *alphabet, const std::vector< std::string > &species, const std::string suffix)
CharacterCountsMafStatistics & operator=(const CharacterCountsMafStatistics &stats)
Compute the frequency of site patterns for a quadruplet of species.
std::vector< unsigned int > counts_
FourSpeciesPatternCountsMafStatistics(const FourSpeciesPatternCountsMafStatistics &stats)
FourSpeciesPatternCountsMafStatistics & operator=(const FourSpeciesPatternCountsMafStatistics &stats)
std::vector< std::string > getSupportedTags() const
FourSpeciesPatternCountsMafStatistics(const Alphabet *alphabet, const std::vector< std::string > &species)
A synteny block data structure, the basic unit of a MAF alignment file.
Definition: MafBlock.h:57
size_t getNumberOfSequences() const
Definition: MafBlock.h:111
std::vector< const MafSequence * > getSequencesForSpecies(const std::string &species) const
Definition: MafBlock.h:149
double getScore() const
Definition: MafBlock.h:105
size_t getNumberOfSites() const
Definition: MafBlock.h:113
General interface for storing statistical results.
Definition: MafStatistics.h:63
virtual void setValue(const std::string &tag, unsigned int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
virtual void setValue(const std::string &tag, int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
std::map< std::string, BppNumberI * > values_
Definition: MafStatistics.h:65
std::vector< std::string > getAvailableTags() const
virtual void setValue(const std::string &tag, double value)
Associate a value to a certain tag. Any existing tag will be overwritten.
Definition: MafStatistics.h:95
MafStatisticsResult(const MafStatisticsResult &msr)
Definition: MafStatistics.h:71
virtual const BppNumberI & getValue(const std::string &tag) const
Definition: MafStatistics.h:81
virtual bool hasValue(const std::string &tag) const
General interface for computing statistics based on a Maf block.
virtual void compute(const MafBlock &block)=0
virtual const MafStatisticsResult & getResult() const =0
virtual std::vector< std::string > getSupportedTags() const =0
virtual ~MafStatistics()
virtual std::string getFullName() const =0
virtual std::string getShortName() const =0
static std::vector< Key > getKeys(const std::map< Key, T, Cmp > &myMap)
Computes the pairwise divergence for a pair of sequences in a maf block.
void compute(const MafBlock &block)
PairwiseDivergenceMafStatistics(const std::string &species1, const std::string &species2)
Counts number of polymorphic / fixed sites in two populations.
static std::vector< int > getPatterns_(const SiteContainer &sites)
PolymorphismMafStatistics(const std::vector< std::vector< std::string > > &species)
std::string getFullName() const
std::string getShortName() const
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
Provide estimates of sequence diversity.
std::vector< std::string > getSupportedTags() const
static std::vector< int > getPatterns_(const SiteContainer &sites)
void compute(const MafBlock &block)
SequenceDiversityMafStatistics(const std::vector< std::string > &ingroup)
Retrieve the sequence length (number of nucleotides) for a given species in a maf block.
std::string getFullName() const
void compute(const MafBlock &block)
std::string getShortName() const
SequenceLengthMafStatistics(const std::string &species)
static size_t getNumberOfSites(const Sequence &seq)
A simple maf statistics result, with only one value.
virtual const BppNumberI & getValue(const std::string &tag) const
SimpleMafStatisticsResult(const std::string &name)
virtual void setValue(int value)
virtual void setValue(const std::string &tag, double value)
Associate a value to a certain tag. Any existing tag will be overwritten.
virtual void setValue(double value)
virtual void setValue(const std::string &tag, unsigned int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
virtual const BppNumberI & getValue() const
virtual void setValue(unsigned int value)
virtual void setValue(const std::string &tag, int value)
Associate a value to a certain tag. Any existing tag will be overwritten.
Categorizer(const std::vector< double > &bounds)
Compute the Site Frequency Spectrum of a maf block.
std::vector< std::string > getSupportedTags() const
std::vector< unsigned int > counts_
SiteFrequencySpectrumMafStatistics(const SiteFrequencySpectrumMafStatistics &stats)
SiteFrequencySpectrumMafStatistics & operator=(const SiteFrequencySpectrumMafStatistics &stats)
void compute(const MafBlock &block)
SiteFrequencySpectrumMafStatistics(const Alphabet *alphabet, const std::vector< double > &bounds, const std::vector< std::string > &ingroup, const std::string outgroup="")
Compute a few site statistics in a maf block.
std::string getShortName() const
void compute(const MafBlock &block)
std::vector< std::string > getSupportedTags() const
SiteMafStatistics(const std::vector< std::string > &species)
std::string getFullName() const
static std::vector< T > unique(const std::vector< T > &v)