bpp-core3  3.0.0
SequenceStatistics.h
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 // Secured inclusion of header's file
6 #ifndef _SEQUENCESTATISTICS_H_
7 #define _SEQUENCESTATISTICS_H_
8 
9 // From the bpp-seq library
10 #include <Bpp/Seq/SymbolListTools.h>
11 #include <Bpp/Seq/Alphabet/CodonAlphabet.h>
12 #include <Bpp/Seq/GeneticCode/GeneticCode.h>
13 #include <Bpp/Seq/Container/SiteContainerIterator.h>
14 #include <Bpp/Seq/Container/SiteContainer.h>
15 #include <Bpp/Seq/Container/SiteContainerTools.h>
16 
19 
20 // From the STL
21 #include <string>
22 #include <map>
23 #include <vector>
24 
25 namespace bpp
26 {
27 using ConstSiteIterator = TemplateSiteIteratorInterface<const Site>;
28 
35 {
36 public:
52  static unsigned int numberOfPolymorphicSites(
54  bool gapflag = true,
55  bool ignoreUnknown = true);
56 
73  static double frequencyOfPolymorphicSites(
75  bool gapflag = true,
76  bool ignoreUnknown = true);
77 
85  static unsigned int numberOfParsimonyInformativeSites(
87  bool gapflag = true);
88 
97  static unsigned int numberOfSingletons(
99  bool gapflag = true);
100 
111  static unsigned int totalNumberOfMutations(
113  bool gapflag = true);
114 
127  static unsigned int totalNumberOfMutationsOnExternalBranches(
129  const PolymorphismSequenceContainer& outg);
130 
138  static unsigned int numberOfTriplets(
140  bool gapflag = true);
141 
148  static double heterozygosity(
150  bool gapflag = true);
151 
159  static double squaredHeterozygosity(
161  bool gapflag = true);
162 
168  static double gcContent(
169  const PolymorphismSequenceContainer& psc);
170 
185  static std::vector<unsigned int> gcPolymorphism(
187  bool gapflag = true);
188 
207  static double watterson75(
209  bool gapflag = true,
210  bool ignoreUnknown = true,
211  bool scaled = false);
212 
234  static double tajima83(
236  bool gapflag = true,
237  bool ignoreUnknown = true,
238  bool scaled = false);
239 
248  static double fayWu2000(
250  const Sequence& ancestralSites);
251 
264  static unsigned int dvk(
266  bool gapflag = true);
267 
280  static double dvh(
282  bool gapflag = true);
283 
290  static unsigned int numberOfTransitions(
291  const PolymorphismSequenceContainer& psc);
292 
299  static unsigned int numberOfTransversions(
300  const PolymorphismSequenceContainer& psc);
301 
308  static double ratioOfTransitionsTransversions(
309  const PolymorphismSequenceContainer& psc);
310 
320  static unsigned int numberOfSitesWithStopCodon(
322  const GeneticCode& gCode,
323  bool gapflag = true);
324 
337  static unsigned int numberOfMonoSitePolymorphicCodons(
339  bool stopflag = true,
340  bool gapflag = true);
341 
352  static unsigned int numberOfSynonymousPolymorphicCodons(
354  const GeneticCode& gc);
355 
370  static double watterson75Synonymous(
372  const GeneticCode& gc);
373 
388  static double watterson75NonSynonymous(
390  const GeneticCode& gc);
391 
407  static double piSynonymous(
409  const GeneticCode& gc,
410  bool minchange = false);
411 
427  static double piNonSynonymous(
429  const GeneticCode& gc,
430  bool minchange = false);
431 
446  static double meanNumberOfSynonymousSites(
448  const GeneticCode& gc,
449  double ratio = 1.);
450 
464  static double meanNumberOfNonSynonymousSites(
466  const GeneticCode& gc,
467  double ratio = 1.);
468 
484  static unsigned int numberOfSynonymousSubstitutions(
486  const GeneticCode& gc,
487  double freqmin = 0.);
488 
504  static unsigned int numberOfNonSynonymousSubstitutions(
506  const GeneticCode& gc,
507  double freqmin = 0.);
508 
526  static std::vector<unsigned int> fixedDifferences(
527  const PolymorphismSequenceContainer& pscin,
528  const PolymorphismSequenceContainer& pscout,
530  const GeneticCode& gc);
531 
543  static std::vector<unsigned int> mkTable(
544  const PolymorphismSequenceContainer& ingroup,
545  const PolymorphismSequenceContainer& outgroup,
546  const GeneticCode& gc,
547  double freqmin = 0.);
548 
562  static double neutralityIndex(
563  const PolymorphismSequenceContainer& ingroup,
564  const PolymorphismSequenceContainer& outgroup,
565  const GeneticCode& gc,
566  double freqmin = 0.);
567 
585  static double tajimaDss(
587  bool gapflag = true,
588  bool ignoreUnknown = true);
589 
605  static double tajimaDtnm(
607  bool gapflag = true,
608  bool ignoreUnknown = true);
609 
628  static double fuLiD(
629  const PolymorphismSequenceContainer& ingroup,
630  const PolymorphismSequenceContainer& outgroup,
631  bool useNbSingletons = true,
632  bool useNbSegregatingSites = false);
633 
643  static double fuLiDStar(
644  const PolymorphismSequenceContainer& group,
645  bool useNbSegregatingSites = false);
646 
665  static double fuLiF(
666  const PolymorphismSequenceContainer& ingroup,
667  const PolymorphismSequenceContainer& outgroup,
668  bool useNbSingletons = true,
669  bool useNbSegregatingSites = false);
670 
680  static double fuLiFStar(
681  const PolymorphismSequenceContainer& group,
682  bool useNbSegregatingSites);
683 
702  static double fstHudson92(
704  size_t id1,
705  size_t id2);
706 
707 
734  static std::unique_ptr<PolymorphismSequenceContainer> generateLdContainer(
736  bool keepsingleton = true,
737  double freqmin = 0.);
738 
752  static Vdouble pairwiseDistances1(
754  bool keepsingleton = true,
755  double freqmin = 0.);
756 
771  static Vdouble pairwiseDistances2(
773  bool keepsingleton = true,
774  double freqmin = 0.);
775 
788  static Vdouble pairwiseD(
790  bool keepsingleton = true,
791  double freqmin = 0.);
792 
805  static Vdouble pairwiseDprime(
807  bool keepsingleton = true,
808  double freqmin = 0.);
809 
822  static Vdouble pairwiseR2(
824  bool keepsingleton = true,
825  double freqmin = 0.);
826 
839  static double meanD(
841  bool keepsingleton = true,
842  double freqmin = 0.);
843 
856  static double meanDprime(
858  bool keepsingleton = true,
859  double freqmin = 0.);
860 
873  static double meanR2(
875  bool keepsingleton = true,
876  double freqmin = 0.);
877 
889  static double meanDistance1(
891  bool keepsingleton = true,
892  double freqmin = 0.);
893 
905  static double meanDistance2(
907  bool keepsingleton = true,
908  double freqmin = 0.);
909 
926  static double originRegressionD(
928  bool distance1 = false,
929  bool keepsingleton = true,
930  double freqmin = 0.);
931 
948  static double originRegressionDprime(
950  bool distance1 = false,
951  bool keepsingleton = true,
952  double freqmin = 0.);
953 
970  static double originRegressionR2(
972  bool distance1 = false,
973  bool keepsingleton = true,
974  double freqmin = 0.);
975 
992  static Vdouble linearRegressionD(
994  bool distance1 = false,
995  bool keepsingleton = true,
996  double freqmin = 0.);
997 
1014  static Vdouble linearRegressionDprime(
1015  const PolymorphismSequenceContainer& psc,
1016  bool distance1 = false,
1017  bool keepsingleton = true,
1018  double freqmin = 0.);
1019 
1036  static Vdouble linearRegressionR2(
1037  const PolymorphismSequenceContainer& psc,
1038  bool distance1 = false,
1039  bool keepsingleton = true,
1040  double freqmin = 0.);
1041 
1059  static double inverseRegressionR2(
1060  const PolymorphismSequenceContainer& psc,
1061  bool distance1 = false,
1062  bool keepsingleton = true,
1063  double freqmin = 0.);
1064 
1074  static double hudson87(
1075  const PolymorphismSequenceContainer& psc,
1076  double precision = 0.000001,
1077  double cinf = 0.001,
1078  double csup = 10000.);
1079 
1086  static void testUsefulValues(
1087  std::ostream& s,
1088  size_t n);
1089 
1090 private:
1094  static unsigned int getNumberOfMutations_(const Site& site);
1095 
1099  static unsigned int getNumberOfSingletons_(const Site& site);
1100 
1108  static unsigned getNumberOfDerivedSingletons_(
1109  const Site& site_in,
1110  const Site& site_out);
1111 
1145  static std::map<std::string, double> getUsefulValues_(
1146  size_t n);
1147 
1160  static double getVD_(
1161  size_t n,
1162  double a1,
1163  double a2,
1164  double cn);
1165 
1176  static double getUD_(
1177  double a1,
1178  double vD);
1179 
1192  static double getVDstar_(
1193  size_t n,
1194  double a1,
1195  double a2,
1196  double dn);
1197 
1209  static double getUDstar_(
1210  size_t n,
1211  double a1,
1212  double vDs);
1213 
1219  static double leftHandHudson_(
1220  const PolymorphismSequenceContainer& psc);
1221 
1226  static double rightHandHudson_(
1227  double c,
1228  size_t n);
1229 
1230  /************************************************************************/
1231 };
1232 } // end of namespace bpp;
1233 
1234 #endif // _SEQUENCESTATISTICS_H_
static unsigned int getNumberOfSingletons_(const Site &site)
Count the number of singletons for a site.
static double watterson75(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true, bool scaled=false)
Compute diversity estimator Theta of Watterson (1975, Theor Popul Biol, 7 pp256-276) ...
static unsigned int totalNumberOfMutationsOnExternalBranches(const PolymorphismSequenceContainer &ing, const PolymorphismSequenceContainer &outg)
Count the total number of mutations in external branches.
static unsigned int dvk(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of haplotypes in the sample. Depaulis and Veuille (1998, Mol Biol Evol...
static double getUD_(double a1, double vD)
Get the uD value of equation (32) in Fu & Li (1993, Genetics, 133 pp693-709).
static double fuLiFStar(const PolymorphismSequenceContainer &group, bool useNbSegregatingSites)
Return the Fu and Li F* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double meanNumberOfSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
Compute the mean number of synonymous sites in an alignment.
static double neutralityIndex(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return the neutrality index NI = (Pa/Ps)/(Da/Ds) (Rand & Kann 1996, Mol. Biol. Evol. 13 pp735-748)
static double originRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
Give the slope of the regression |D'| = 1+a*distance
static std::vector< unsigned int > fixedDifferences(const PolymorphismSequenceContainer &pscin, const PolymorphismSequenceContainer &pscout, PolymorphismSequenceContainer &psccons, const GeneticCode &gc)
Compute the number of fixed differences between two alignments.
static double meanD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D over all pairwise comparisons
Definition: AlleleInfo.h:13
static double watterson75Synonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson (1975, Theor Popul Biol, 7 pp256-276) estimator for synonymous positions...
static double leftHandHudson_(const PolymorphismSequenceContainer &psc)
give the left hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term ...
static Vdouble pairwiseD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise D value between two sites (Lewontin & Kojima 1964...
static double frequencyOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the frequency of polymorphic sites in an alignment.
static double fuLiF(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool useNbSingletons=true, bool useNbSegregatingSites=false)
Return the Fu and Li F test (Fu & Li 1993, Genetics, 133 pp693-709).
static double tajimaDtnm(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double meanDistance2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 2: differences between sequences are taken into a...
static unsigned int numberOfSitesWithStopCodon(const PolymorphismSequenceContainer &psc, const GeneticCode &gCode, bool gapflag=true)
Compute the number of codon sites with stop codon.
static void testUsefulValues(std::ostream &s, size_t n)
Test useful values.
static unsigned int totalNumberOfMutations(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the total number of mutations in an alignment.
TemplateSiteIteratorInterface< const Site > ConstSiteIterator
static Vdouble linearRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D| = a*distance+b
static double piSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the synonymous nucleotide diversity, pi.
static double gcContent(const PolymorphismSequenceContainer &psc)
Compute the mean GC content in an alignment.
static double meanDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Give the mean D' over all pairwise comparisons.
static std::map< std::string, double > getUsefulValues_(size_t n)
Get useful values for theta estimators.
static double ratioOfTransitionsTransversions(const PolymorphismSequenceContainer &psc)
Return the ratio of transitions/transversions.
static double fstHudson92(const PolymorphismSequenceContainer &psc, size_t id1, size_t id2)
static unsigned int numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
compute the number of synonymous substitutions in an alignment
static Vdouble linearRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
Give the slope and the origin of the regression |D'| = a*distance+b
static double tajimaDss(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double originRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1+a*distance
static Vdouble pairwiseDistances1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of the pairwise distances between site positions corresponding to a LD SequencePolymo...
static double originRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D| = 1+a*distance
static double getVDstar_(size_t n, double a1, double a2, double dn)
Get the vD* value of the D* equation in Fu & Li (1993, Genetics, 133 pp693-709).
static double heterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site heterozygosity in an alignment.
static std::vector< unsigned int > gcPolymorphism(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of GC alleles and the total number of alleles at polymorphic sites only...
Static class providing methods to compute statistics on sequence data.
static double hudson87(const PolymorphismSequenceContainer &psc, double precision=0.000001, double cinf=0.001, double csup=10000.)
give estimate of C=4Nr using Hudson method (Hudson 1987, Genet. Res., 50 pp245-250) ...
static double piNonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the non-synonymous nucleotide diversity, pi.
static unsigned int numberOfTransversions(const PolymorphismSequenceContainer &psc)
Return the number of transversions.
static double fayWu2000(const PolymorphismSequenceContainer &psc, const Sequence &ancestralSites)
Compute diversity estimator Theta H (eq. 3) of Fay and Wu (2000, Genetics, 155: 1405-1413) ...
static double meanDistance1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 1: differences between sequences are not taken in...
static double meanR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean R² over all pairwise comparisons
static unsigned int numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer &psc, bool stopflag=true, bool gapflag=true)
Compute the number of polymorphic codons with only one mutated site.
static Vdouble pairwiseR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise R² value between two sites (Hill & Robertson 1968...
static unsigned int numberOfTransitions(const PolymorphismSequenceContainer &psc)
Return the number of transitions.
static double dvh(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the haplotype diversity of a sample. Depaulis and Veuille (1998, Mol Biol Evol, 12 pp1788-1790)
static unsigned int numberOfTriplets(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of triplets in an alignment.
static unsigned int numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
compute the number of non synonymous substitutions in an alignment
static Vdouble pairwiseDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Give the vector of all mean pairwise D' values between two sites (Lewontin 1964, Genetics 49 pp49-67)...
static unsigned getNumberOfDerivedSingletons_(const Site &site_in, const Site &site_out)
Count the number of singletons for a site.
static std::vector< unsigned int > mkTable(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return a vector containing Pa, Ps, Da, Ds
static std::unique_ptr< PolymorphismSequenceContainer > generateLdContainer(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
Generate a special PolymorphismSequenceContainer for linkage disequilibrium analysis ...
The PolymorphismSequenceContainer class.
static double tajima83(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true, bool scaled=false)
Compute diversity estimator Theta of Tajima (1983, Genetics, 105 pp437-460)
static Vdouble pairwiseDistances2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise distance between two sites to a LD SequencePolymorphismContainer...
static unsigned int numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of parsimony informative sites in an alignment.
static double watterson75NonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson (1975, Theor Popul Biol, 7 pp256-276) estimator for non synonymous positions...
static Vdouble linearRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression R² = a*distance+b
static double fuLiD(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool useNbSingletons=true, bool useNbSegregatingSites=false)
Return the Fu and Li D test (Fu & Li 1993, Genetics, 133 pp693-709).
static double fuLiDStar(const PolymorphismSequenceContainer &group, bool useNbSegregatingSites=false)
Return the Fu and Li D* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double squaredHeterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site squared heterozygosity in an alignment.
static double rightHandHudson_(double c, size_t n)
give the right hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term...
static double getVD_(size_t n, double a1, double a2, double cn)
Get the vD value of equation (32) in Fu & Li (1993, Genetics, 133 pp693-709).
static double getUDstar_(size_t n, double a1, double vDs)
Get the uD* value of the D* equation in Fu & Li (1993, Genetics, 133 pp693-709).
static double meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
Compute the mean number of non-synonymous sites in an alignment.
static unsigned int numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the number of synonymous polymorphic codon sites.
static unsigned int getNumberOfMutations_(const Site &site)
Count the number of mutations for a site.
static double inverseRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1/(1+a*distance)
static unsigned int numberOfSingletons(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the number of singleton nucleotides in an alignment.
static unsigned int numberOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the number of polymorphic sites in an alignment.