6 #ifndef _SEQUENCESTATISTICS_H_ 7 #define _SEQUENCESTATISTICS_H_ 10 #include <Bpp/Seq/SymbolListTools.h> 11 #include <Bpp/Seq/Alphabet/CodonAlphabet.h> 12 #include <Bpp/Seq/GeneticCode/GeneticCode.h> 13 #include <Bpp/Seq/Container/SiteContainerIterator.h> 14 #include <Bpp/Seq/Container/SiteContainer.h> 15 #include <Bpp/Seq/Container/SiteContainerTools.h> 55 bool ignoreUnknown =
true);
76 bool ignoreUnknown =
true);
113 bool gapflag =
true);
140 bool gapflag =
true);
150 bool gapflag =
true);
161 bool gapflag =
true);
187 bool gapflag =
true);
210 bool ignoreUnknown =
true,
211 bool scaled =
false);
237 bool ignoreUnknown =
true,
238 bool scaled =
false);
250 const Sequence& ancestralSites);
264 static unsigned int dvk(
266 bool gapflag =
true);
282 bool gapflag =
true);
322 const GeneticCode& gCode,
323 bool gapflag =
true);
339 bool stopflag =
true,
340 bool gapflag =
true);
354 const GeneticCode& gc);
372 const GeneticCode& gc);
390 const GeneticCode& gc);
409 const GeneticCode& gc,
410 bool minchange =
false);
429 const GeneticCode& gc,
430 bool minchange =
false);
448 const GeneticCode& gc,
466 const GeneticCode& gc,
486 const GeneticCode& gc,
487 double freqmin = 0.);
506 const GeneticCode& gc,
507 double freqmin = 0.);
530 const GeneticCode& gc);
543 static std::vector<unsigned int>
mkTable(
546 const GeneticCode& gc,
547 double freqmin = 0.);
565 const GeneticCode& gc,
566 double freqmin = 0.);
588 bool ignoreUnknown =
true);
608 bool ignoreUnknown =
true);
631 bool useNbSingletons =
true,
632 bool useNbSegregatingSites =
false);
645 bool useNbSegregatingSites =
false);
668 bool useNbSingletons =
true,
669 bool useNbSegregatingSites =
false);
682 bool useNbSegregatingSites);
736 bool keepsingleton =
true,
737 double freqmin = 0.);
754 bool keepsingleton =
true,
755 double freqmin = 0.);
773 bool keepsingleton =
true,
774 double freqmin = 0.);
790 bool keepsingleton =
true,
791 double freqmin = 0.);
807 bool keepsingleton =
true,
808 double freqmin = 0.);
824 bool keepsingleton =
true,
825 double freqmin = 0.);
841 bool keepsingleton =
true,
842 double freqmin = 0.);
858 bool keepsingleton =
true,
859 double freqmin = 0.);
875 bool keepsingleton =
true,
876 double freqmin = 0.);
891 bool keepsingleton =
true,
892 double freqmin = 0.);
907 bool keepsingleton =
true,
908 double freqmin = 0.);
928 bool distance1 =
false,
929 bool keepsingleton =
true,
930 double freqmin = 0.);
950 bool distance1 =
false,
951 bool keepsingleton =
true,
952 double freqmin = 0.);
972 bool distance1 =
false,
973 bool keepsingleton =
true,
974 double freqmin = 0.);
994 bool distance1 =
false,
995 bool keepsingleton =
true,
996 double freqmin = 0.);
1016 bool distance1 =
false,
1017 bool keepsingleton =
true,
1018 double freqmin = 0.);
1038 bool distance1 =
false,
1039 bool keepsingleton =
true,
1040 double freqmin = 0.);
1061 bool distance1 =
false,
1062 bool keepsingleton =
true,
1063 double freqmin = 0.);
1076 double precision = 0.000001,
1077 double cinf = 0.001,
1078 double csup = 10000.);
1109 const Site& site_in,
1110 const Site& site_out);
1234 #endif // _SEQUENCESTATISTICS_H_ static unsigned int getNumberOfSingletons_(const Site &site)
Count the number of singletons for a site.
static double watterson75(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true, bool scaled=false)
Compute diversity estimator Theta of Watterson (1975, Theor Popul Biol, 7 pp256-276) ...
static unsigned int totalNumberOfMutationsOnExternalBranches(const PolymorphismSequenceContainer &ing, const PolymorphismSequenceContainer &outg)
Count the total number of mutations in external branches.
static unsigned int dvk(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of haplotypes in the sample. Depaulis and Veuille (1998, Mol Biol Evol...
static double getUD_(double a1, double vD)
Get the uD value of equation (32) in Fu & Li (1993, Genetics, 133 pp693-709)
static double fuLiFStar(const PolymorphismSequenceContainer &group, bool useNbSegregatingSites)
Return the Fu and Li F* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double meanNumberOfSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
compute the mean number of synonymous sites in an alignment
static double neutralityIndex(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return the neutrality index NI = (Pa/Ps)/(Da/Ds) (Rand & Kann 1996, Mol. Biol. Evol. 13 pp735-748)
static double originRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D'| = 1+a*distance
static std::vector< unsigned int > fixedDifferences(const PolymorphismSequenceContainer &pscin, const PolymorphismSequenceContainer &pscout, PolymorphismSequenceContainer &psccons, const GeneticCode &gc)
compute the number of fixed differences between two alignments
static double meanD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D over all pairwise comparisons
static double watterson75Synonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975,Theor Popul Biol, 7 pp256-276) estimator for synonymous positions...
static double leftHandHudson_(const PolymorphismSequenceContainer &psc)
give the left hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term ...
static Vdouble pairwiseD(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise D value between two sites (Lewontin & Kojima 1964...
static double frequencyOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the frequency of polymorphic sites in an alignment.
static double fuLiF(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool useNbSingletons=true, bool useNbSegregatingSites=false)
Return the Fu and Li F test (Fu & Li 1993, Genetics, 133 pp693-709).
static double tajimaDtnm(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double meanDistance2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 2: differences between sequences are taken into a...
static unsigned int numberOfSitesWithStopCodon(const PolymorphismSequenceContainer &psc, const GeneticCode &gCode, bool gapflag=true)
Compute the number of codon sites with stop codon.
static void testUsefulValues(std::ostream &s, size_t n)
Test useful values.
static unsigned int totalNumberOfMutations(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the total number of mutations in an alignment.
TemplateSiteIteratorInterface< const Site > ConstSiteIterator
static Vdouble linearRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D| = a*distance+b
static double piSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the synonymous nucleotide diversity, pi.
static double gcContent(const PolymorphismSequenceContainer &psc)
Compute the mean GC content in an alignment.
static double meanDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean D' over all pairwise comparisons
static std::map< std::string, double > getUsefulValues_(size_t n)
Get useful values for theta estimators.
static double ratioOfTransitionsTransversions(const PolymorphismSequenceContainer &psc)
Return the ratio of transitions/transversions.
static double fstHudson92(const PolymorphismSequenceContainer &psc, size_t id1, size_t id2)
static unsigned int numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
compute the number of synonymous substitutions in an alignment
static Vdouble linearRegressionDprime(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression |D'| = a*distance+b
static double tajimaDss(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Return the Tajima's D test (Tajima 1989, Genetics 123 pp 585-595).
static double originRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1+a*distance
static Vdouble pairwiseDistances1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of the pairwise distances between site positions corresponding to a LD SequencePolymo...
static double originRegressionD(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression |D| = 1+a*distance
static double getVDstar_(size_t n, double a1, double a2, double dn)
Get the vD* value of the D* equation in Fu & Li (1993, Genetics, 133 pp693-709)
static double heterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site heterozygosity in an alignment.
static std::vector< unsigned int > gcPolymorphism(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the number of GC alleles and the total number of alleles at polymorphic sites only...
Static class providing methods to compute statistics on sequences data.
static double hudson87(const PolymorphismSequenceContainer &psc, double precision=0.000001, double cinf=0.001, double csup=10000.)
give estimate of C=4Nr using Hudson method (Hudson 1987, Genet. Res., 50 pp245-250) ...
static double piNonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, bool minchange=false)
Compute the non-synonymous nucleotide diversity, pi.
static unsigned int numberOfTransversions(const PolymorphismSequenceContainer &psc)
Return the number of transversions.
static double fayWu2000(const PolymorphismSequenceContainer &psc, const Sequence &ancestralSites)
Compute diversity estimator Theta H (eq. 3) of Fay and Wu (2000, Genetics, 155: 1405-1413) ...
static double meanDistance1(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean pairwise distances between sites / method 1: differences between sequences are not taken in...
static double meanR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give mean R² over all pairwise comparisons
static unsigned int numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer &psc, bool stopflag=true, bool gapflag=true)
Compute the number of polymorphic codons with only one mutated site.
static Vdouble pairwiseR2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise R² value between two sites (Hill & Robertson 1968...
static unsigned int numberOfTransitions(const PolymorphismSequenceContainer &psc)
Return the number of transitions.
static double dvh(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Return the haplotype diversity of a sample. Depaulis and Veuille (1998, Mol Biol Evol, 12 pp1788-1790)
static unsigned int numberOfTriplets(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of triplets in an alignment.
static unsigned int numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double freqmin=0.)
compute the number of non synonymous substitutions in an alignment
static Vdouble pairwiseDprime(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise D' values between two sites (Lewontin 1964, Genetics 49 pp49-67)...
static unsigned getNumberOfDerivedSingletons_(const Site &site_in, const Site &site_out)
Count the number of derived singletons for a site.
static std::vector< unsigned int > mkTable(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, const GeneticCode &gc, double freqmin=0.)
return a vector containing Pa, Ps, Da, Ds
static std::unique_ptr< PolymorphismSequenceContainer > generateLdContainer(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
generate a special PolymorphismSequenceContainer for linkage disequilibrium analysis ...
The PolymorphismSequenceContainer class.
static double tajima83(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true, bool scaled=false)
Compute diversity estimator Theta of Tajima (1983, Genetics, 105 pp437-460)
static Vdouble pairwiseDistances2(const PolymorphismSequenceContainer &psc, bool keepsingleton=true, double freqmin=0.)
give the vector of all mean pairwise distance between two sites to a LD SequencePolymorphismContainer...
static unsigned int numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the number of parsimony informative sites in an alignment.
static double watterson75NonSynonymous(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the Watterson(1975, Theor Popul Biol, 7 pp256-276) estimator for non synonymous positions...
static Vdouble linearRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope and the origin of the regression R² = a*distance+b
static double fuLiD(const PolymorphismSequenceContainer &ingroup, const PolymorphismSequenceContainer &outgroup, bool useNbSingletons=true, bool useNbSegregatingSites=false)
Return the Fu and Li D test (Fu & Li 1993, Genetics, 133 pp693-709).
static double fuLiDStar(const PolymorphismSequenceContainer &group, bool useNbSegregatingSites=false)
Return the Fu and Li D* test (Fu & Li 1993, Genetics, 133 pp693-709).
static double squaredHeterozygosity(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Compute the sum of per site squared heterozygosity in an alignment.
static double rightHandHudson_(double c, size_t n)
give the right hand term of equation (4) in Hudson (Hudson 1987, Genet. Res., 50 pp245-250) This term...
static double getVD_(size_t n, double a1, double a2, double cn)
Get the vD value of equation (32) in Fu & Li (1993, Genetics, 133 pp693-709)
static double getUDstar_(size_t n, double a1, double vDs)
Get the uD* value of the D* equation in Fu & Li (1993, Genetics, 133 pp693-709)
static double meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer &psc, const GeneticCode &gc, double ratio=1.)
compute the mean number of non-synonymous sites in an alignment
static unsigned int numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer &psc, const GeneticCode &gc)
Compute the number of synonymous polymorphic codon sites.
static unsigned int getNumberOfMutations_(const Site &site)
Count the number of mutations for a site.
static double inverseRegressionR2(const PolymorphismSequenceContainer &psc, bool distance1=false, bool keepsingleton=true, double freqmin=0.)
give the slope of the regression R² = 1/(1+a*distance)
static unsigned int numberOfSingletons(const PolymorphismSequenceContainer &psc, bool gapflag=true)
Count the number of singleton nucleotides in an alignment.
static unsigned int numberOfPolymorphicSites(const PolymorphismSequenceContainer &psc, bool gapflag=true, bool ignoreUnknown=true)
Compute the number of polymorphic sites in an alignment.