34 unsigned int SequenceStatistics::numberOfPolymorphicSites(
40 unique_ptr<ConstSiteIterator> si;
45 while (si->hasMoreSites())
47 auto site = si->nextSite();
48 if (!SiteTools::isConstant(site, ignoreUnknown))
60 unique_ptr<ConstSiteIterator> si;
65 while (si->hasMoreSites())
67 auto& site = si->nextSite();
69 if (!SiteTools::isConstant(site, ignoreUnknown))
79 unique_ptr<ConstSiteIterator> si;
85 while (si->hasMoreSites())
87 auto& site = si->nextSite();
88 if (SiteTools::isParsimonyInformativeSite(site))
98 unique_ptr<ConstSiteIterator> si;
103 unsigned int nus = 0;
104 while (si->hasMoreSites())
106 auto& site = si->nextSite();
107 nus += getNumberOfSingletons_(site);
114 unique_ptr<ConstSiteIterator> si;
120 while (si->hasMoreSites())
122 auto& site = si->nextSite();
123 if (SiteTools::isTriplet(site))
133 unique_ptr<ConstSiteIterator> si;
138 unsigned int tnm = 0;
139 while (si->hasMoreSites())
141 auto& site = si->nextSite();
142 tnm += getNumberOfMutations_(site);
147 unsigned int SequenceStatistics::totalNumberOfMutationsOnExternalBranches(
152 throw Exception(
"ing and outg must have the same size");
153 unsigned int nmuts = 0;
154 auto si = make_unique<SimpleSiteContainerIterator>(ing);
155 auto so = make_unique<SimpleSiteContainerIterator>(outg);
156 while (si->hasMoreSites())
158 auto& siteIn = si->nextSite();
159 auto& siteOut = so->nextSite();
161 if (SiteTools::isComplete(siteIn) && SiteTools::isComplete(siteOut))
162 nmuts += getNumberOfDerivedSingletons_(siteIn, siteOut);
169 unique_ptr<ConstSiteIterator> si;
175 while (si->hasMoreSites())
177 auto& site = si->nextSite();
178 s += SiteTools::heterozygosity(site);
185 unique_ptr<ConstSiteIterator> si;
191 while (si->hasMoreSites())
193 auto& site = si->nextSite();
194 double h = SiteTools::heterozygosity(site);
206 map<int, double> freqs;
207 SequenceContainerTools::getFrequencies(psc, freqs);
209 return (freqs[alpha.charToInt(
"C")] + freqs[alpha.charToInt(
"G")]) / (freqs[alpha.charToInt(
"A")] + freqs[alpha.charToInt(
"C")] + freqs[alpha.charToInt(
"G")] + freqs[alpha.charToInt(
"T")]);
214 unsigned int nbMut = 0;
215 unsigned int nbGC = 0;
217 vector<unsigned int> vect(2);
218 unique_ptr<ConstSiteIterator> si;
223 while (si->hasMoreSites())
225 auto& site = si->nextSite();
226 if (!SiteTools::isConstant(site))
228 long double freqGC = SymbolListTools::getGCContent(site);
229 if (freqGC > 0 && freqGC < 1)
231 nbMut +=
static_cast<unsigned int>(nbSeq);
232 long double adGC = freqGC * nbSeq;
233 nbGC +=
static_cast<unsigned int>(adGC);
250 map<string, double> values = getUsefulValues_(n);
253 s = frequencyOfPolymorphicSites(psc, gapflag, ignoreUnknown);
255 s =
static_cast<double>(numberOfPolymorphicSites(psc, gapflag, ignoreUnknown));
256 ThetaW = s / values[
"a1"];
262 size_t alphabetSize = psc.
getAlphabet()->getSize();
263 unique_ptr<ConstSiteIterator> si;
270 while (si->hasMoreSites())
272 auto& site = si->nextSite();
274 if (!SiteTools::isConstant(site, ignoreUnknown))
277 map<int, size_t>
count;
278 SymbolListTools::getCounts(site,
count);
279 map<int, size_t> tmp_k;
281 for (
auto& it :
count)
283 if (it.first >= 0 && it.first <
static_cast<int>(alphabetSize))
285 tmp_k[it.first] = it.second * (it.second - 1);
289 if (tmp_n == 0 || tmp_n == 1)
291 for (
auto& it : tmp_k)
293 value +=
static_cast<double>(it.second) /
static_cast<double>(tmp_n * (tmp_n - 1));
295 value2 += 1. - value;
298 return scaled ? value2 / l : value2;
304 throw Exception(
"SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
313 string ancB = ancestralSites.
getChar(i);
314 int ancV = ancestralSites.
getValue(i);
316 if (!SiteTools::isConstant(site) || tmps.
getChar(i) != ancB)
321 map<int, size_t>
count;
322 SymbolListTools::getCounts(site,
count);
323 map<int, size_t> tmp_k;
325 for (
auto& it :
count)
327 if (it.first >= 0 && it.first <
static_cast<int>(alphabetSize))
330 if (it.first != ancV)
332 tmp_k[it.first] = 2 * it.second * it.second;
337 if (tmp_n == 0 || tmp_n == 1)
339 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
341 value +=
static_cast<double>(it->second) /
static_cast<double>(tmp_n * (tmp_n - 1));
355 unique_ptr<PolymorphismSequenceContainer> sc;
357 sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
359 sc = make_unique<PolymorphismSequenceContainer>(psc);
361 vector<string> pscvector;
362 pscvector.push_back(sc->sequence(0).toString());
364 for (
size_t i = 1; i < sc->getNumberOfSequences(); ++i)
367 string query = sc->sequence(i).toString();
368 for (
auto& it : pscvector)
370 if (query.compare(it) == 0)
379 pscvector.push_back(query);
383 return static_cast<unsigned int>(pscvector.size());
393 unique_ptr<PolymorphismSequenceContainer> sc;
395 sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
397 sc = make_unique<PolymorphismSequenceContainer>(psc);
400 vector<string> pscvector;
401 vector<size_t> effvector;
402 pscvector.push_back(sc->sequence(0).toString());
403 effvector.push_back(sc->getSequenceCount(0));
404 nbSeq = sc->getSequenceCount(0);
405 for (
size_t i = 1; i < sc->getNumberOfSequences(); ++i)
407 nbSeq += sc->getSequenceCount(i);
409 string query = sc->sequence(i).toString();
410 for (
size_t j = 0; j < pscvector.size(); ++j)
412 if (query.compare(pscvector[j]) == 0)
414 effvector[j] += sc->getSequenceCount(i);
421 pscvector.push_back(query);
422 effvector.push_back(sc->getSequenceCount(i));
425 for (
size_t i = 0; i < effvector.size(); ++i)
427 H -= (
static_cast<double>(effvector[i]) /
static_cast<double>(nbSeq)) * (
static_cast<double>(effvector[i]) /
static_cast<double>(nbSeq));
435 unsigned int nbT = 0;
437 while (si->hasMoreSites())
439 const auto& site = si->nextSite();
441 if (SiteTools::getNumberOfDistinctCharacters(site) != 2)
443 int state1 = site[0];
444 int state2 = site[0];
445 for (
size_t i = 1; i < site.size(); ++i)
447 if (state1 != site[i])
453 if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
454 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
464 unsigned int nbTv = 0;
466 while (si->hasMoreSites())
468 const auto& site = si->nextSite();
470 if (SiteTools::getNumberOfDistinctCharacters(site) != 2)
472 int state1 = site[0];
473 int state2 = site[0];
474 for (
size_t i = 1; i < site.size(); ++i)
476 if (state1 != site[i])
482 if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
483 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
497 vector<int> state(2);
498 while (si->hasMoreSites())
500 map<int, size_t>
count;
501 const auto& site = si->nextSite();
502 SymbolListTools::getCounts(site,
count);
503 if (
count.size() != 2)
506 for (
const auto& it :
count)
511 if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
512 ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
530 unsigned int SequenceStatistics::numberOfSitesWithStopCodon(
535 if (!AlphabetTools::isCodonAlphabet(psc.
alphabet()))
536 throw AlphabetMismatchException(
"SequenceStatistics::stopCodonSiteNumber(). PolymorphismSequenceContainer must be with a codon alphabet.", &psc.
alphabet(), AlphabetTools::DNA_CODON_ALPHABET.get());
538 unique_ptr<ConstSiteIterator> si;
544 while (si->hasMoreSites())
546 const auto& site = si->nextSite();
547 if (CodonSiteTools::hasStop(site, gCode))
555 unique_ptr<ConstSiteIterator> si;
566 while (si->hasMoreSites())
568 const auto& site = si->nextSite();
569 if (CodonSiteTools::isMonoSitePolymorphic(site))
579 while (si->hasMoreSites())
581 const auto& site = si->nextSite();
582 if (CodonSiteTools::isSynonymousPolymorphic(site, gc))
592 unsigned int s = numberOfSynonymousSubstitutions(psc, gc);
593 map<string, double> values = getUsefulValues_(n);
594 ThetaW =
static_cast<double>(s) / values[
"a1"];
598 double SequenceStatistics::watterson75NonSynonymous(
604 unsigned int s = numberOfNonSynonymousSubstitutions(psc, gc);
605 map<string, double> values = getUsefulValues_(n);
606 ThetaW =
static_cast<double>(s) / values[
"a1"];
610 double SequenceStatistics::piSynonymous(
620 S += CodonSiteTools::piSynonymous(site, gc, minchange);
625 double SequenceStatistics::piNonSynonymous(
635 S += CodonSiteTools::piNonSynonymous(site, gc, minchange);
640 double SequenceStatistics::meanNumberOfSynonymousSites(
650 S += CodonSiteTools::meanNumberOfSynonymousPositions(site, gc, ratio);
664 S += CodonSiteTools::meanNumberOfSynonymousPositions(site, gc, ratio);
666 return static_cast<double>(n - S);
671 size_t st = 0, sns = 0;
676 st += CodonSiteTools::numberOfSubstitutions(site, gc, freqmin);
677 sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(site, gc, freqmin);
679 return static_cast<unsigned int>(st - sns);
684 unsigned int sns = 0;
689 sns +=
static_cast<unsigned int>(CodonSiteTools::numberOfNonSynonymousSubstitutions(site, gc, freqmin));
694 vector<unsigned int> SequenceStatistics::fixedDifferences(
707 const auto& siteIn = siIn.
nextSite();
708 const auto& siteOut = siOut.
nextSite();
709 const auto& siteCons = siConst.
nextSite();
710 vector<size_t> v = CodonSiteTools::fixedDifferences(siteIn, siteOut, siteCons.getValue(0), siteCons.getValue(1), gc);
714 vector<unsigned int> v(2);
715 v[0] =
static_cast<unsigned int>(NfixS);
716 v[1] =
static_cast<unsigned int>(NfixA);
720 vector<unsigned int> SequenceStatistics::mkTable(
729 auto tmpSeq = make_unique<Sequence>(outgroup.
sequence(i));
733 auto pscComplete = PolymorphismSequenceContainerTools::getCompleteSites(pscTot);
734 auto pscIn = PolymorphismSequenceContainerTools::extractIngroup(*pscComplete);
735 auto pscOut = PolymorphismSequenceContainerTools::extractOutgroup(*pscComplete);
736 auto consensusIn = SiteContainerTools::getConsensus(*pscIn,
"consensusIn");
737 auto consensusOut = SiteContainerTools::getConsensus(*pscOut,
"consensusOut");
738 auto consensus = make_unique<PolymorphismSequenceContainer>(ingroup.
getAlphabet());
739 consensus->addSequence(
"consensusIn", consensusIn);
740 consensus->addSequence(
"consensusOut", consensusOut);
741 vector<unsigned int> u = SequenceStatistics::fixedDifferences(*pscIn, *pscOut, *consensus, gc);
742 vector<unsigned int> v(4);
743 v[0] = SequenceStatistics::numberOfNonSynonymousSubstitutions(*pscIn, gc, freqmin);
744 v[1] = SequenceStatistics::numberOfSynonymousSubstitutions(*pscIn, gc, freqmin);
752 vector<unsigned int> v = SequenceStatistics::mkTable(ingroup, outgroup, gc, freqmin);
753 if (v[1] != 0 && v[2] != 0)
754 return static_cast<double>(v[0] * v[3]) /
static_cast<double>(v[1] * v[2]);
765 unsigned int Sp = numberOfPolymorphicSites(psc, gapflag, ignoreUnknown);
768 double S =
static_cast<double>(Sp);
769 double tajima = tajima83(psc, gapflag, ignoreUnknown);
770 double watterson = watterson75(psc, gapflag, ignoreUnknown);
772 map<string, double> values = getUsefulValues_(n);
773 return (tajima - watterson) / sqrt((values[
"e1"] * S) + (values[
"e2"] * S * (S - 1)));
778 unsigned int etaP = totalNumberOfMutations(psc, gapflag);
781 double eta =
static_cast<double>(etaP);
782 double tajima = tajima83(psc, gapflag, ignoreUnknown);
784 map<string, double> values = getUsefulValues_(n);
785 double eta_a1 = eta / values[
"a1"];
786 return (tajima - eta_a1) / sqrt((values[
"e1"] * eta) + (values[
"e2"] * eta * (eta - 1)));
789 double SequenceStatistics::fuLiD(
792 bool useNbSingletons,
793 bool useNbSegregatingSites)
796 map<string, double> values = getUsefulValues_(n);
797 double vD = getVD_(n, values[
"a1"], values[
"a2"], values[
"cn"]);
798 double uD = getUD_(values[
"a1"], vD);
799 unsigned int etaP = 0;
800 if (useNbSegregatingSites)
801 etaP = numberOfPolymorphicSites(ingroup);
803 etaP = totalNumberOfMutations(ingroup);
806 double eta =
static_cast<double>(etaP);
809 etae =
static_cast<double>(numberOfSingletons(outgroup));
811 etae =
static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup));
812 return (eta - (values[
"a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
815 double SequenceStatistics::fuLiDStar(
817 bool useNbSegregatingSites)
820 double nn =
static_cast<double>(n);
821 double _n = nn / (nn - 1.);
822 map<string, double> values = getUsefulValues_(n);
823 double vDs = getVDstar_(n, values[
"a1"], values[
"a2"], values[
"dn"]);
824 double uDs = getUDstar_(n, values[
"a1"], vDs);
825 unsigned int etaP = 0;
826 if (useNbSegregatingSites)
827 etaP = numberOfPolymorphicSites(group);
829 etaP = totalNumberOfMutations(group);
832 double eta =
static_cast<double>(etaP);
833 double etas =
static_cast<double>(numberOfSingletons(group));
836 return ((_n * eta) - (values[
"a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
844 double SequenceStatistics::fuLiF(
847 bool useNbSingletons,
848 bool useNbSegregatingSites)
851 double nn =
static_cast<double>(n);
852 map<string, double> values = getUsefulValues_(n);
853 double pi = tajima83(ingroup,
true);
854 double vF = (values[
"cn"] + values[
"b2"] - 2. / (nn - 1.)) / (pow(values[
"a1"], 2) + values[
"a2"]);
855 double uF = ((1. + values[
"b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values[
"a1n"] - (2. * nn) / (nn + 1.))) / values[
"a1"]) - vF;
856 unsigned int etaP = 0;
857 if (useNbSegregatingSites)
858 etaP = numberOfPolymorphicSites(ingroup);
860 etaP = totalNumberOfMutations(ingroup);
863 double eta =
static_cast<double>(etaP);
866 etae =
static_cast<double>(numberOfSingletons(outgroup));
868 etae =
static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup));
869 return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
872 double SequenceStatistics::fuLiFStar(
874 bool useNbSegregatingSites)
878 double pi = tajima83(group,
true);
885 double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values[
"a1"]) / (n * n)) - 8 * values[
"a2"] / n) / (pow(values[
"a1"], 2) + values[
"a2"]);
886 double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values[
"a1n"]) / (3 * n * (n - 1))) / values[
"a1"]) - vFs;
887 unsigned int etaP = 0;
888 if (useNbSegregatingSites)
889 etaP = numberOfPolymorphicSites(group);
891 etaP = totalNumberOfMutations(group);
894 double eta =
static_cast<double>(etaP);
895 double etas =
static_cast<double>(numberOfSingletons(group));
898 return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
901 double SequenceStatistics::fstHudson92(
906 vector<double> vdiff;
907 double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
909 auto Pop1 = PolymorphismSequenceContainerTools::extractGroup(psc, id1);
910 auto Pop2 = PolymorphismSequenceContainerTools::extractGroup(psc, id2);
912 piIntra1 = SequenceStatistics::tajima83(*Pop1,
true);
913 piIntra2 = SequenceStatistics::tajima83(*Pop2,
true);
915 meanPiIntra = (piIntra1 + piIntra2) / 2;
918 for (
size_t i = 0; i < Pop1->getNumberOfSequences(); ++i)
920 const Sequence& s1 = Pop1->sequence(i);
921 for (
size_t j = 0; j < Pop2->getNumberOfSequences(); ++j)
924 const auto& s2 = Pop2->sequence(j);
925 vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2,
true,
"no gap",
true));
928 piInter = (VectorTools::sum(vdiff) / n) *
static_cast<double>(psc.
getNumberOfSites());
930 Fst = 1.0 - meanPiIntra / piInter;
943 unique_ptr<PolymorphismSequenceContainer> SequenceStatistics::generateLdContainer(
954 if (SiteTools::isComplete(psc.
site(i)) && !SiteTools::isConstant(psc.
site(i)) && !SiteTools::isTriplet(psc.
site(i)))
961 if (SiteTools::isComplete(psc.
site(i)) && !SiteTools::isConstant(psc.
site(i)) && !SiteTools::isTriplet(psc.
site(i)) && !SiteTools::hasSingleton(psc.
site(i)))
968 auto sc = SiteContainerTools::getSelectedSites(psc, ss);
970 auto ldpsc = make_unique<PolymorphismSequenceContainer>(sc->getNumberOfSequences(), alpha);
972 for (
size_t i = 0; i < sc->getNumberOfSites(); i++)
974 const Site& site = sc->site(i);
975 auto siteClone = make_unique<Site>(site);
976 bool deleteSite =
false;
977 map<int, double> freqs;
978 SymbolListTools::getFrequencies(*siteClone, freqs);
980 for (
const auto& it : freqs)
982 if (it.second >= 0.5)
985 for (
size_t j = 0; j < sc->getNumberOfSequences(); ++j)
989 if (freqs[site.
getValue(j)] <= 1 - freqmin)
991 siteClone->setElement(j, 1);
999 if (freqs[site.
getValue(j)] >= freqmin)
1000 siteClone->setElement(j, 0);
1006 ldpsc->addSite(siteClone);
1027 if (SiteTools::isComplete(site) &&
1028 !SiteTools::isConstant(site) &&
1029 !SiteTools::isTriplet(site))
1031 bool deleteSite =
false;
1032 map<int, double> freqs;
1033 SymbolListTools::getFrequencies(site, freqs);
1036 if (freqs[j] >= 1 - freqmin)
1045 if (SiteTools::isComplete(site) &&
1046 !SiteTools::isConstant(site) &&
1047 !SiteTools::isTriplet(site) &&
1048 !SiteTools::hasSingleton(site))
1051 bool deleteSite =
false;
1052 map<int, double> freqs;
1053 SymbolListTools::getFrequencies(site, freqs);
1056 if (freqs[j] >= 1 - freqmin)
1066 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1068 for (
size_t i = 0; i < ss.size() - 1; ++i)
1070 for (
size_t j = i + 1; j < ss.size(); ++j)
1072 dist.push_back(
static_cast<double>(ss[j] - ss[i]));
1086 if (SiteTools::isComplete(site) &&
1087 !SiteTools::isConstant(site) &&
1088 !SiteTools::isTriplet(site))
1090 bool deleteSite =
false;
1091 map<int, double> freqs;
1092 SymbolListTools::getFrequencies(site, freqs);
1095 if (freqs[j] >= 1 - freqmin)
1104 if (SiteTools::isComplete(site) &&
1105 !SiteTools::isConstant(site) &&
1106 !SiteTools::isTriplet(site) &&
1107 !SiteTools::hasSingleton(site))
1110 bool deleteSite =
false;
1111 map<int, double> freqs;
1112 SymbolListTools::getFrequencies(site, freqs);
1115 if (freqs[j] >= 1 - freqmin)
1123 size_t n = ss.size();
1125 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1126 Vdouble distance(n * (n - 1) / 2, 0);
1133 for (
size_t i = 0; i < nbSite; ++i)
1139 for (
size_t i = 0; i < gap.size(); ++i)
1141 for (
size_t j = 0; j < ss.size(); ++j)
1147 for (
size_t i = 0; i < n - 1; ++i)
1149 for (
size_t j = i + 1; j < n; ++j)
1151 dist.push_back(
static_cast<double>(newss[j] - newss[i]));
1165 auto newpsc = generateLdContainer(psc, keepsingleton, freqmin);
1167 size_t nbsite = newpsc->getNumberOfSites();
1168 size_t nbseq = newpsc->getNumberOfSequences();
1170 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1172 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1173 for (
size_t i = 0; i < nbsite - 1; ++i)
1175 for (
size_t j = i + 1; j < nbsite; ++j)
1178 const Site& site1 = newpsc->site(i);
1179 const Site& site2 = newpsc->site(j);
1180 map<int, double> freq1;
1181 map<int, double> freq2;
1182 SymbolListTools::getFrequencies(site1, freq1);
1183 SymbolListTools::getFrequencies(site2, freq2);
1184 for (
size_t k = 0; k < nbseq; ++k)
1189 haplo = haplo /
static_cast<double>(nbseq);
1190 D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1201 auto newpsc = generateLdContainer(psc, keepsingleton, freqmin);
1203 size_t nbsite = newpsc->getNumberOfSites();
1204 size_t nbseq = newpsc->getNumberOfSequences();
1206 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1208 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1209 for (
size_t i = 0; i < nbsite - 1; i++)
1211 for (
size_t j = i + 1; j < nbsite; j++)
1214 const Site& site1 = newpsc->site(i);
1215 const Site& site2 = newpsc->site(j);
1216 map<int, double> freq1;
1217 map<int, double> freq2;
1218 SymbolListTools::getFrequencies(site1, freq1);
1219 SymbolListTools::getFrequencies(site2, freq2);
1220 for (
size_t k = 0; k < nbseq; ++k)
1225 haplo = haplo /
static_cast<double>(nbseq);
1226 double d, D = (haplo - freq1[1] * freq2[1]);
1229 if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1231 d = std::abs(D) / (freq1[1] * freq2[0]);
1235 d = std::abs(D) / (freq1[0] * freq2[1]);
1240 if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1242 d = std::abs(D) / (freq1[1] * freq2[1]);
1246 d = std::abs(D) / (freq1[0] * freq2[0]);
1249 Dprime.push_back(d);
1260 auto newpsc = generateLdContainer(psc, keepsingleton, freqmin);
1262 size_t nbsite = newpsc->getNumberOfSites();
1263 size_t nbseq = newpsc->getNumberOfSequences();
1265 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1267 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1268 for (
size_t i = 0; i < nbsite - 1; ++i)
1270 for (
size_t j = i + 1; j < nbsite; ++j)
1273 const Site& site1 = newpsc->site(i);
1274 const Site& site2 = newpsc->site(j);
1275 map<int, double> freq1;
1276 map<int, double> freq2;
1277 SymbolListTools::getFrequencies(site1, freq1);
1278 SymbolListTools::getFrequencies(site2, freq2);
1279 for (
size_t k = 0; k < nbseq; ++k)
1284 haplo = haplo /
static_cast<double>(nbseq);
1285 double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1298 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1299 return VectorTools::mean<double, double>(D);
1304 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1305 return VectorTools::mean<double, double>(Dprime);
1310 Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1311 return VectorTools::mean<double, double>(R2);
1316 Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1317 return VectorTools::mean<double, double>(dist);
1322 Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1323 return VectorTools::mean<double, double>(dist);
1332 Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1335 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1337 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1338 return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1343 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1346 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1348 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1349 return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1354 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1357 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1359 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1360 return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1365 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1369 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1371 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1372 reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1373 reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1379 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1383 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1385 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1386 reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1387 reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1393 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1397 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1399 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1400 reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1401 reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1407 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1409 Vdouble R2transformed = unit / R2 - 1;
1412 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1414 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1415 return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1424 double left = leftHandHudson_(psc);
1429 if (SequenceStatistics::numberOfPolymorphicSites(psc) < 2)
1431 if (rightHandHudson_(c1, n) < left)
1433 if (rightHandHudson_(c2, n) > left)
1435 while (dif > precision)
1437 if (rightHandHudson_((c1 + c2) / 2, n) > left)
1441 dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1443 return (c1 + c2) / 2;
1450 void SequenceStatistics::testUsefulValues(std::ostream& s,
size_t n)
1452 map<string, double> v = getUsefulValues_(n);
1453 double vD = getVD_(n, v[
"a1"], v[
"a2"], v[
"cn"]);
1454 double uD = getUD_(v[
"a1"], vD);
1455 double vDs = getVDstar_(n, v[
"a1"], v[
"a2"], v[
"dn"]);
1456 double uDs = getUDstar_(n, v[
"a1"], vDs);
1459 s << v[
"a1"] <<
"\t";
1460 s << v[
"a2"] <<
"\t";
1461 s << v[
"a1n"] <<
"\t";
1462 s << v[
"b1"] <<
"\t";
1463 s << v[
"b2"] <<
"\t";
1464 s << v[
"c1"] <<
"\t";
1465 s << v[
"c2"] <<
"\t";
1466 s << v[
"cn"] <<
"\t";
1467 s << v[
"dn"] <<
"\t";
1468 s << v[
"e1"] <<
"\t";
1469 s << v[
"e2"] <<
"\t";
1480 unsigned int SequenceStatistics::getNumberOfMutations_(
const Site& site)
1483 unsigned int tmp_count = 0;
1484 map<int, size_t> states_count;
1485 SymbolListTools::getCounts(site, states_count);
1487 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1497 unsigned int SequenceStatistics::getNumberOfSingletons_(
const Site& site)
1499 unsigned int nus = 0;
1500 map<int, size_t> states_count;
1501 SymbolListTools::getCounts(site, states_count);
1502 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1504 if (it->second == 1)
1510 unsigned int SequenceStatistics::getNumberOfDerivedSingletons_(
const Site& site_in,
const Site& site_out)
1512 unsigned int nus = 0;
1513 map<int, size_t> states_count;
1514 map<int, size_t> outgroup_states_count;
1515 SymbolListTools::getCounts(site_in, states_count);
1516 SymbolListTools::getCounts(site_out, outgroup_states_count);
1518 if (outgroup_states_count.size() == 1)
1520 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1522 if (it->second == 1)
1524 if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1532 std::map<std::string, double> SequenceStatistics::getUsefulValues_(
size_t n)
1534 double nn =
static_cast<double>(n);
1535 map<string, double> values;
1549 for (
double i = 1; i < nn; i++)
1551 values[
"a1"] += 1. / i;
1552 values[
"a2"] += 1. / (i * i);
1554 values[
"a1n"] = values[
"a1"] + (1. / nn);
1555 values[
"b1"] = (nn + 1.) / (3. * (nn - 1.));
1556 values[
"b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1557 values[
"c1"] = values[
"b1"] - (1. / values[
"a1"]);
1558 values[
"c2"] = values[
"b2"] - ((nn + 2.) / (values[
"a1"] * nn)) + (values[
"a2"] / (values[
"a1"] * values[
"a1"]));
1566 values[
"cn"] = 2. * ((nn * values[
"a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1569 + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1571 * ((3. / 2.) - (((2. * values[
"a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1573 values[
"e1"] = values[
"c1"] / values[
"a1"];
1574 values[
"e2"] = values[
"c2"] / ((values[
"a1"] * values[
"a1"]) + values[
"a2"]);
1579 double SequenceStatistics::getVD_(
size_t n,
double a1,
double a2,
double cn)
1581 double nn =
static_cast<double>(n);
1584 double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1588 double SequenceStatistics::getUD_(
double a1,
double vD)
1590 return a1 - 1. - vD;
1593 double SequenceStatistics::getVDstar_(
size_t n,
double a1,
double a2,
double dn)
1595 double denom = (a1 * a1) + a2;
1596 if (n < 3 || denom == 0.)
1598 double nn =
static_cast<double>(n);
1599 double nnn = nn / (nn - 1.);
1604 - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1621 double SequenceStatistics::getUDstar_(
size_t n,
double a1,
double vDs)
1625 double nn =
static_cast<double>(n);
1626 double nnn = nn / (nn - 1.);
1628 double uDs = (nnn * (a1 - nnn)) - vDs;
1638 auto newpsc = PolymorphismSequenceContainerTools::getCompleteSites(psc);
1639 size_t nbseq = newpsc->getNumberOfSequences();
1642 for (
size_t i = 0; i < nbseq - 1; ++i)
1644 for (
size_t j = i + 1; j < nbseq; ++j)
1649 auto psc2 = PolymorphismSequenceContainerTools::getSelectedSequences(*newpsc, ss);
1650 S1 += SequenceStatistics::watterson75(*psc2,
true);
1651 S2 += SequenceStatistics::watterson75(*psc2,
true) * SequenceStatistics::watterson75(*psc2,
true);
1654 double Sk = (2 * S2 - pow(2 * S1 /
static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1655 double H = SequenceStatistics::heterozygosity(*newpsc);
1656 double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1657 return static_cast<double>(Sk - H + H2) / pow(H *
static_cast<double>(nbseq) /
static_cast<double>(nbseq - 1), 2.);
1660 double SequenceStatistics::rightHandHudson_(
double c,
size_t n)
1662 double nn =
static_cast<double>(n);
1663 return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
const Alphabet & alphabet() const override
std::shared_ptr< const Alphabet > getAlphabet() const override
const int & getValue(size_t pos) const override
virtual unsigned int getSize() const=0
bool hasMoreSites() const override
const SiteType & nextSite() override
virtual size_t size() const=0
virtual const Alphabet & alphabet() const=0
The PolymorphismSequenceContainer class.
void setAsOutgroupMember(size_t index)
Set a sequence as outgroup member by index.
void addSequence(const std::string &sequenceKey, std::unique_ptr< Sequence > &sequence) override
std::string getChar(size_t pos) const override
const SequenceType & sequence(const std::string &sequenceKey) const override
const SiteType & site(size_t sitePosition) const override
size_t getNumberOfSequences() const override
size_t getNumberOfSites() const override
std::size_t count(const std::string &s, const std::string &pattern)
std::vector< size_t > SiteSelection
std::vector< double > Vdouble
std::vector< size_t > SequenceSelection
CompleteTemplateSiteContainerIterator< Site, Sequence, std::string > CompleteSiteContainerIterator
NoGapTemplateSiteContainerIterator< Site, Sequence, std::string > NoGapSiteContainerIterator