bpp-popgen  3.0.0
SequenceStatistics.h
Go to the documentation of this file.
1 //
2 // File SequenceStatistics.h
3 // Authors: Eric Bazin
4 // Sylvain Gaillard
5 // Khalid Belkhir
6 // Benoit Nabholz
7 // Created on: Wed Aug 04 2004
8 //
9 
10 /*
11  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
12 
13  This software is a computer program whose purpose is to provide classes
14  for population genetics analysis.
15 
16  This software is governed by the CeCILL license under French law and
17  abiding by the rules of distribution of free software. You can use,
18  modify and/ or redistribute the software under the terms of the CeCILL
19  license as circulated by CEA, CNRS and INRIA at the following URL
20  "http://www.cecill.info".
21 
22  As a counterpart to the access to the source code and rights to copy,
23  modify and redistribute granted by the license, users are provided only
24  with a limited warranty and the software's author, the holder of the
25  economic rights, and the successive licensors have only limited
26  liability.
27 
28  In this respect, the user's attention is drawn to the risks associated
29  with loading, using, modifying and/or developing or reproducing the
30  software by the user in light of its specific status of free software,
31  that may mean that it is complicated to manipulate, and that also
32  therefore means that it is reserved for developers and experienced
33  professionals having in-depth computer knowledge. Users are therefore
34  encouraged to load and test the software's suitability as regards their
35  requirements in conditions enabling the security of their systems and/or
36  data to be ensured and, more generally, to use and operate it in the
37  same conditions as regards security.
38 
39  The fact that you are presently reading this means that you have had
40  knowledge of the CeCILL license and that you accept its terms.
41  */
42 
43 // Secured inclusion of header's file
44 #ifndef _SEQUENCESTATISTICS_H_
45 #define _SEQUENCESTATISTICS_H_
46 
47 // From the bpp-seq library
54 
56 
57 // From the STL
58 #include <string>
59 #include <map>
60 #include <vector>
61 
62 namespace bpp
63 {
70 {
71 public:
87  static unsigned int numberOfPolymorphicSites(
89  bool gapflag = true,
90  bool ignoreUnknown = true);
91 
108  static double frequencyOfPolymorphicSites(
110  bool gapflag = true,
111  bool ignoreUnknown = true);
112 
120  static unsigned int numberOfParsimonyInformativeSites(
122  bool gapflag = true);
123 
132  static unsigned int numberOfSingletons(
134  bool gapflag = true);
135 
146  static unsigned int totalNumberOfMutations(
148  bool gapflag = true);
149 
162  static unsigned int totalNumberOfMutationsOnExternalBranches(
164  const PolymorphismSequenceContainer& outg);
165 
173  static unsigned int numberOfTriplets(
175  bool gapflag = true);
176 
183  static double heterozygosity(
185  bool gapflag = true);
186 
194  static double squaredHeterozygosity(
196  bool gapflag = true);
197 
203  static double gcContent(
204  const PolymorphismSequenceContainer& psc);
205 
220  static std::vector<unsigned int> gcPolymorphism(
222  bool gapflag = true);
223 
242  static double watterson75(
244  bool gapflag = true,
245  bool ignoreUnknown = true,
246  bool scaled = false);
247 
269  static double tajima83(
271  bool gapflag = true,
272  bool ignoreUnknown = true,
273  bool scaled = false);
274 
283  static double fayWu2000(
285  const Sequence& ancestralSites);
286 
299  static unsigned int dvk(
301  bool gapflag = true);
302 
315  static double dvh(
317  bool gapflag = true);
318 
325  static unsigned int numberOfTransitions(
326  const PolymorphismSequenceContainer& psc);
327 
334  static unsigned int numberOfTransversions(
335  const PolymorphismSequenceContainer& psc);
336 
343  static double ratioOfTransitionsTransversions(
344  const PolymorphismSequenceContainer& psc);
345 
355  static unsigned int numberOfSitesWithStopCodon(
357  const GeneticCode& gCode,
358  bool gapflag = true);
359 
372  static unsigned int numberOfMonoSitePolymorphicCodons(
374  bool stopflag = true,
375  bool gapflag = true);
376 
387  static unsigned int numberOfSynonymousPolymorphicCodons(
389  const GeneticCode& gc);
390 
405  static double watterson75Synonymous(
407  const GeneticCode& gc);
408 
423  static double watterson75NonSynonymous(
425  const GeneticCode& gc);
426 
442  static double piSynonymous(
444  const GeneticCode& gc,
445  bool minchange = false);
446 
462  static double piNonSynonymous(
464  const GeneticCode& gc,
465  bool minchange = false);
466 
481  static double meanNumberOfSynonymousSites(
483  const GeneticCode& gc,
484  double ratio = 1.);
485 
499  static double meanNumberOfNonSynonymousSites(
501  const GeneticCode& gc,
502  double ratio = 1.);
503 
519  static unsigned int numberOfSynonymousSubstitutions(
521  const GeneticCode& gc,
522  double freqmin = 0.);
523 
539  static unsigned int numberOfNonSynonymousSubstitutions(
541  const GeneticCode& gc,
542  double freqmin = 0.);
543 
561  static std::vector<unsigned int> fixedDifferences(
562  const PolymorphismSequenceContainer& pscin,
563  const PolymorphismSequenceContainer& pscout,
565  const GeneticCode& gc);
566 
578  static std::vector<unsigned int> mkTable(
579  const PolymorphismSequenceContainer& ingroup,
580  const PolymorphismSequenceContainer& outgroup,
581  const GeneticCode& gc,
582  double freqmin = 0.);
583 
597  static double neutralityIndex(
598  const PolymorphismSequenceContainer& ingroup,
599  const PolymorphismSequenceContainer& outgroup,
600  const GeneticCode& gc,
601  double freqmin = 0.);
602 
620  static double tajimaDss(
622  bool gapflag = true,
623  bool ignoreUnknown = true);
624 
640  static double tajimaDtnm(
642  bool gapflag = true,
643  bool ignoreUnknown = true);
644 
663  static double fuLiD(
664  const PolymorphismSequenceContainer& ingroup,
665  const PolymorphismSequenceContainer& outgroup,
666  bool useNbSingletons = true,
667  bool useNbSegregatingSites = false);
668 
678  static double fuLiDStar(
679  const PolymorphismSequenceContainer& group,
680  bool useNbSegregatingSites = false);
681 
700  static double fuLiF(
701  const PolymorphismSequenceContainer& ingroup,
702  const PolymorphismSequenceContainer& outgroup,
703  bool useNbSingletons = true,
704  bool useNbSegregatingSites = false);
705 
715  static double fuLiFStar(
716  const PolymorphismSequenceContainer& group,
717  bool useNbSegregatingSites);
718 
737  static double fstHudson92(
739  size_t id1,
740  size_t id2);
741 
742 
771  bool keepsingleton = true,
772  double freqmin = 0.);
773 
789  bool keepsingleton = true,
790  double freqmin = 0.);
791 
808  bool keepsingleton = true,
809  double freqmin = 0.);
810 
823  static Vdouble pairwiseD(
825  bool keepsingleton = true,
826  double freqmin = 0.);
827 
840  static Vdouble pairwiseDprime(
842  bool keepsingleton = true,
843  double freqmin = 0.);
844 
857  static Vdouble pairwiseR2(
859  bool keepsingleton = true,
860  double freqmin = 0.);
861 
874  static double meanD(
876  bool keepsingleton = true,
877  double freqmin = 0.);
878 
891  static double meanDprime(
893  bool keepsingleton = true,
894  double freqmin = 0.);
895 
908  static double meanR2(
910  bool keepsingleton = true,
911  double freqmin = 0.);
912 
924  static double meanDistance1(
926  bool keepsingleton = true,
927  double freqmin = 0.);
928 
940  static double meanDistance2(
942  bool keepsingleton = true,
943  double freqmin = 0.);
944 
961  static double originRegressionD(
963  bool distance1 = false,
964  bool keepsingleton = true,
965  double freqmin = 0.);
966 
983  static double originRegressionDprime(
985  bool distance1 = false,
986  bool keepsingleton = true,
987  double freqmin = 0.);
988 
1005  static double originRegressionR2(
1006  const PolymorphismSequenceContainer& psc,
1007  bool distance1 = false,
1008  bool keepsingleton = true,
1009  double freqmin = 0.);
1010 
1027  static Vdouble linearRegressionD(
1028  const PolymorphismSequenceContainer& psc,
1029  bool distance1 = false,
1030  bool keepsingleton = true,
1031  double freqmin = 0.);
1032 
1050  const PolymorphismSequenceContainer& psc,
1051  bool distance1 = false,
1052  bool keepsingleton = true,
1053  double freqmin = 0.);
1054 
1071  static Vdouble linearRegressionR2(
1072  const PolymorphismSequenceContainer& psc,
1073  bool distance1 = false,
1074  bool keepsingleton = true,
1075  double freqmin = 0.);
1076 
1094  static double inverseRegressionR2(
1095  const PolymorphismSequenceContainer& psc,
1096  bool distance1 = false,
1097  bool keepsingleton = true,
1098  double freqmin = 0.);
1099 
1109  static double hudson87(
1110  const PolymorphismSequenceContainer& psc,
1111  double precision = 0.000001,
1112  double cinf = 0.001,
1113  double csup = 10000.);
1114 
1121  static void testUsefulValues(
1122  std::ostream& s,
1123  size_t n);
1124 
1125 private:
1129  static unsigned int getNumberOfMutations_(const Site& site);
1130 
1134  static unsigned int getNumberOfSingletons_(const Site& site);
1135 
1143  static unsigned getNumberOfDerivedSingletons_(
1144  const Site& site_in,
1145  const Site& site_out);
1146 
1180  static std::map<std::string, double> getUsefulValues_(
1181  size_t n);
1182 
1195  static double getVD_(
1196  size_t n,
1197  double a1,
1198  double a2,
1199  double cn);
1200 
1211  static double getUD_(
1212  double a1,
1213  double vD);
1214 
1227  static double getVDstar_(
1228  size_t n,
1229  double a1,
1230  double a2,
1231  double dn);
1232 
1244  static double getUDstar_(
1245  size_t n,
1246  double a1,
1247  double vDs);
1248 
1254  static double leftHandHudson_(
1255  const PolymorphismSequenceContainer& psc);
1256 
1261  static double rightHandHudson_(
1262  double c,
1263  size_t n);
1264 
1265  /************************************************************************/
1266 };
1267 } // end of namespace bpp;
1268 
1269 #endif // _SEQUENCESTATISTICS_H_
1270 
The PolymorphismSequenceContainer class.
Static class providing methods to compute statistics on sequences data.
static double fuLiDStar(const PolymorphismSequenceContainer &group, bool useNbSegregatingSites=false)
Return the Fu and Li D* test (Fu & Li 1993, Genetics, 133 pp693-709).
static unsigned int numberOfSitesWithStopCodon(const PolymorphismSequenceContainer &psc, const GeneticCode &gCode, bool gapflag=true)
Compute the number of codon sites with stop codon.
static double meanDistance1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 1: differences between sequences are not taken in...
static double meanR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean R² over all pairwise comparisons
static unsigned int dvk(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of haplotype in the sample. Depaulis and Veuille (1998, Mol Biol Evol,...
static unsigned getNumberOfDerivedSingletons_(const Site &site_in, const Site &site_out)
Count the number of singleton for a site.
static double hudson87(const PolymorphismSequenceContainer &psc, double precision=0.000001, double cinf=0.001, double csup=10000.)
give estimate of C=4Nr using Hudson method (Hudson 1987, Genet. Res., 50 pp245-250)
static double meanDistance2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 2: differences between sequences are taken into a...
static unsigned int numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer &psc, bool stopflag=true, bool gapflag=true)
Compute the number of polymorphic codon with only one mutated site.
static double tajimaDtnm(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static unsigned int numberOfTriplets(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of triplet in an alignment.
static double watterson75Synonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975,Theor Popul Biol, 7 pp256-276) estimator for synonymous positions.
static unsigned int numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
compute the number of synonymous subsitutions in an alignment
static double dvh(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the haplotype diversity of a sample. Depaulis and Veuille (1998, Mol Biol Evol,...
static double fstHudson92(const PolymorphismSequenceContainer &psc, size_t id1, size_t id2)
static double fayWu2000(const PolymorphismSequenceContainer &psc, const Sequence &ancestralSites)
Compute diversity estimator Theta H (eq. 3) of Fay and Wu (2000, Genetics, 155: 1405-1413)
static double fuLiD(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool useNbSingletons=true, bool useNbSegregatingSites=false)
Return the Fu and Li D test (Fu & Li 1993, Genetics, 133 pp693-709).
static double tajima83(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true, bool scaled=false)
Compute diversity estimator Theta of Tajima (1983, Genetics, 105 pp437-460)
static Vdouble pairwiseR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise R² value between two sites (Hill & Robertson 1968,...
static Vdouble pairwiseD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise D value between two sites (Lewontin & Kojima 1964,...
static Vdouble linearRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D| = a*distance+b
static std::vector< unsigned int > fixedDifferences(const PolymorphismSequenceContainer &pscin, const PolymorphismSequenceContainer &pscout, PolymorphismSequenceContainer &psccons, const GeneticCode &gc)
compute the number of fixed differences between two alignements
static double getUDstar_(size_t n, double a1, double vDs)
Get the uD* value of D* equation in Fu & Li 1993, Genetics, 133 pp693-709)
static double squaredHeterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site squared heterozygosity in an alignment.
static double originRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D| = 1+a*distance
static double leftHandHudson_(const PolymorphismSequenceContainer &psc)
give the left hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term ...
static unsigned int getNumberOfSingletons_(const Site &site)
Count the number of singleton for a site.
static unsigned int numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of parsimony informative sites in an alignment.
static double meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
compute the mean number of non-synonymous site in an alignment
static double watterson75(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true, bool scaled=false)
Compute diversity estimator Theta of Watterson (1975, Theor Popul Biol, 7 pp256-276)
static std::map< std::string, double > getUsefulValues_(size_t n)
Get useful values for theta estimators.
static unsigned int getNumberOfMutations_(const Site &site)
Count the number of mutation for a site.
static unsigned int numberOfTransitions(const PolymorphismSequenceContainer &psc)
Return the number of transitions.
static double frequencyOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the frequency of polymorphic site in an alignment.
static double watterson75NonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975, Theor Popul Biol, 7 pp256-276) estimator for non synonymous positions.
static unsigned int totalNumberOfMutationsOnExternalBranches(const PolymorphismSequenceContainer &ing, const PolymorphismSequenceContainer &outg)
Count the total number of mutations in external branchs.
static PolymorphismSequenceContainer * generateLdContainer(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
generate a special PolymorphismSequenceContainer for linkage disequilbrium analysis
static std::vector< unsigned int > mkTable(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return a vector containing Pa, Ps, Da, Ds
static double heterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site heterozygosity in an alignment.
static void testUsefulValues(std::ostream &s, size_t n)
Test useful values.
static unsigned int numberOfSingletons(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the number of singleton nucleotides in an alignment.
static double originRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1+a*distance
static double piSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the synonymous nucleotide diversity, pi.
static double meanDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D' over all pairwise comparisons
static Vdouble linearRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression R² = a*distance+b
static std::vector< unsigned int > gcPolymorphism(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of GC alleles and the total number of alleles at polymorphic sites only.
static Vdouble linearRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D'| = a*distance+b
static unsigned int totalNumberOfMutations(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the total number of mutations in an alignment.
static unsigned int numberOfTransversions(const PolymorphismSequenceContainer &psc)
Return the number of transversions.
static Vdouble pairwiseDistances2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise distance between two sites to a LD SequencePolymorphismContainer
static double getVDstar_(size_t n, double a1, double a2, double dn)
Get the vD* value of D* equation in Fu & Li 1993, Genetics, 133 pp693-709)
static double tajimaDss(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double rightHandHudson_(double c, size_t n)
give the right hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term...
static unsigned int numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
compute the number of non synonymous subsitutions in an alignment
static double fuLiFStar(const PolymorphismSequenceContainer &group, bool useNbSegregatingSites)
Return the Fu and Li F* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double piNonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the non-synonymous nucleotide diversity, pi.
static double gcContent(const PolymorphismSequenceContainer &psc)
Compute the mean GC content in an alignment.
static unsigned int numberOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the number of polymorphic site in an alignment.
static double inverseRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1/(1+a*distance)
static Vdouble pairwiseDistances1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of the pairwise distances between site positions corresponding to a LD SequencePolymo...
static double getUD_(double a1, double vD)
Get the uD value of equation (32) in Fu & Li 1993, Genetics, 133 pp693-709)
static double ratioOfTransitionsTransversions(const PolymorphismSequenceContainer &psc)
Return the ratio of transitions/transversions.
static Vdouble pairwiseDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise D' value between two sites (Lewontin 1964, Genetics 49 pp49-67))
static double getVD_(size_t n, double a1, double a2, double cn)
Get the vD value of equation (32) in Fu & Li 1993, Genetics, 133 pp693-709)
static unsigned int numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the number of synonymous polymorphic codon sites.
static double meanNumberOfSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
compute the mean number of synonymous site in an alignment
static double originRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D'| = 1+a*distance
static double fuLiF(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool useNbSingletons=true, bool useNbSegregatingSites=false)
Return the Fu and Li F test (Fu & Li 1993, Genetics, 133 pp693-709).
static double meanD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D over all pairwise comparisons
static double neutralityIndex(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return the neutrality index NI = (Pa/Ps)/(Da/Ds) (Rand & Kann 1996, Mol. Biol. Evol....
std::vector< double > Vdouble