76 unique_ptr<ConstSiteIterator> si;
81 while (si->hasMoreSites())
83 site = si->nextSite();
84 if (!SiteTools::isConstant(*site, ignoreUnknown))
97 unique_ptr<ConstSiteIterator> si;
102 while (si->hasMoreSites())
104 site = si->nextSite();
106 if (!SiteTools::isConstant(*site, ignoreUnknown))
116 unique_ptr<ConstSiteIterator> si;
122 const Site* site = 0;
123 while (si->hasMoreSites())
125 site = si->nextSite();
126 if (SiteTools::isParsimonyInformativeSite(*site))
136 unique_ptr<ConstSiteIterator> si;
141 unsigned int nus = 0;
142 const Site* site = 0;
143 while (si->hasMoreSites())
145 site = si->nextSite();
146 nus += getNumberOfSingletons_(*site);
153 unique_ptr<ConstSiteIterator> si;
159 const Site* site = 0;
160 while (si->hasMoreSites())
162 site = si->nextSite();
163 if (SiteTools::isTriplet(*site))
173 unique_ptr<ConstSiteIterator> si;
178 unsigned int tnm = 0;
179 const Site* site = 0;
180 while (si->hasMoreSites())
182 site = si->nextSite();
183 tnm += getNumberOfMutations_(*site);
188 unsigned int SequenceStatistics::totalNumberOfMutationsOnExternalBranches(
193 throw Exception(
"ing and outg must have the same size");
194 unsigned int nmuts = 0;
195 const Site* site_in = 0;
196 const Site* site_out = 0;
199 while (si->hasMoreSites())
201 site_in = si->nextSite();
202 site_out = so->nextSite();
204 if (SiteTools::isComplete(*site_in) && SiteTools::isComplete(*site_out))
205 nmuts += getNumberOfDerivedSingletons_(*site_in, *site_out);
212 unique_ptr<ConstSiteIterator> si;
217 const Site* site = 0;
219 while (si->hasMoreSites())
221 site = si->nextSite();
222 s += SiteTools::heterozygosity(*site);
229 unique_ptr<ConstSiteIterator> si;
234 const Site* site = 0;
236 while (si->hasMoreSites())
238 site = si->nextSite();
239 double h = SiteTools::heterozygosity(*site);
251 map<int, double> freqs;
252 SequenceContainerTools::getFrequencies(psc, freqs);
259 unsigned int nbMut = 0;
260 unsigned int nbGC = 0;
262 vector<unsigned int> vect(2);
263 const Site* site = 0;
264 unique_ptr<ConstSiteIterator> si;
269 while (si->hasMoreSites())
271 site = si->nextSite();
272 if (!SiteTools::isConstant(*site))
274 long double freqGC = SymbolListTools::getGCContent(*site);
281 if (freqGC > 0 && freqGC < 1)
283 nbMut +=
static_cast<unsigned int>(nbSeq);
284 long double adGC = freqGC * nbSeq;
285 nbGC +=
static_cast<unsigned int>(adGC);
302 map<string, double> values = getUsefulValues_(n);
305 s = frequencyOfPolymorphicSites(psc, gapflag, ignoreUnknown);
307 s =
static_cast<double>(numberOfPolymorphicSites(psc, gapflag, ignoreUnknown));
308 ThetaW = s / values[
"a1"];
315 const Site* site = 0;
316 unique_ptr<ConstSiteIterator> si;
323 while (si->hasMoreSites())
325 site = si->nextSite();
327 if (!SiteTools::isConstant(*site, ignoreUnknown))
330 map<int, size_t>
count;
331 SymbolListTools::getCounts(*site,
count);
332 map<int, size_t> tmp_k;
334 for (map<int, size_t>::iterator it =
count.begin(); it !=
count.end(); it++)
336 if (it->first >= 0 && it->first <
static_cast<int>(alphabet_size))
338 tmp_k[it->first] = it->second * (it->second - 1);
342 if (tmp_n == 0 || tmp_n == 1)
344 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
346 value +=
static_cast<double>(it->second) /
static_cast<double>(tmp_n * (tmp_n - 1));
348 value2 += 1. - value;
351 return (scaled ? value2 / l : value2);
357 throw Exception(
"SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
361 size_t alphabet_size = (psc.
getAlphabet())->getSize();
366 string ancB = ancestralSites.
getChar(i);
367 int ancV = ancestralSites.
getValue(i);
369 if (!SiteTools::isConstant(site) || tmps.
getChar(i) != ancB)
374 map<int, size_t>
count;
375 SymbolListTools::getCounts(site,
count);
376 map<int, size_t> tmp_k;
378 for (map<int, size_t>::iterator it =
count.begin(); it !=
count.end(); it++)
380 if (it->first >= 0 && it->first <
static_cast<int>(alphabet_size))
383 if (it->first != ancV)
385 tmp_k[it->first] = 2 * it->second * it->second;
390 if (tmp_n == 0 || tmp_n == 1)
392 for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
394 value +=
static_cast<double>(it->second) /
static_cast<double>(tmp_n * (tmp_n - 1));
408 unique_ptr<PolymorphismSequenceContainer> sc;
410 sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
414 vector<string> pscvector;
415 pscvector.push_back(sc->toString(0));
417 for (
size_t i = 1; i < sc->getNumberOfSequences(); i++)
420 string query = sc->toString(i);
421 for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++)
423 if (query.compare(*it) == 0)
432 pscvector.push_back(query);
436 return static_cast<unsigned int>(pscvector.size());
446 unique_ptr<PolymorphismSequenceContainer> sc;
448 sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
453 vector<string> pscvector;
454 vector<size_t> effvector;
455 pscvector.push_back(sc->toString(0));
456 effvector.push_back(sc->getSequenceCount(0));
457 nbSeq = sc->getSequenceCount(0);
458 for (
size_t i = 1; i < sc->getNumberOfSequences(); i++)
460 nbSeq += sc->getSequenceCount(i);
462 string query = sc->toString(i);
463 for (
size_t j = 0; j < pscvector.size(); j++)
465 if (query.compare(pscvector[j]) == 0)
467 effvector[j] += sc->getSequenceCount(i);
474 pscvector.push_back(query);
475 effvector.push_back(sc->getSequenceCount(i));
478 for (
size_t i = 0; i < effvector.size(); i++)
480 H -= (
static_cast<double>(effvector[i]) /
static_cast<double>(nbSeq)) * (
static_cast<double>(effvector[i]) /
static_cast<double>(nbSeq));
488 unsigned int nbT = 0;
490 const Site* site = 0;
491 while (si->hasMoreSites())
493 site = si->nextSite();
495 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
497 int state1 = (*site)[0];
498 int state2 = (*site)[0];
499 for (
size_t i = 1; i < site->
size(); i++)
501 if (state1 != (*site)[i])
507 if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
508 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
518 unsigned int nbTv = 0;
520 const Site* site = 0;
521 while (si->hasMoreSites())
523 site = si->nextSite();
525 if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
527 int state1 = (*site)[0];
528 int state2 = (*site)[0];
529 for (
size_t i = 1; i < site->
size(); i++)
531 if (state1 != (*site)[i])
537 if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
538 ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
552 const Site* site = 0;
553 vector<int> state(2);
554 while (si->hasMoreSites())
556 map<int, size_t>
count;
557 site = si->nextSite();
558 SymbolListTools::getCounts(*site,
count);
559 if (
count.size() != 2)
562 for (map<int, size_t>::iterator it =
count.begin(); it !=
count.end(); it++)
564 state[i] = it->first;
567 if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
568 ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
592 if (!AlphabetTools::isCodonAlphabet(psc.
getAlphabet()))
595 unique_ptr<ConstSiteIterator> si;
601 const Site* site = 0;
602 while (si->hasMoreSites())
604 site = si->nextSite();
605 if (CodonSiteTools::hasStop(*site, gCode))
613 unique_ptr<ConstSiteIterator> si;
625 while (si->hasMoreSites())
627 site = si->nextSite();
628 if (CodonSiteTools::isMonoSitePolymorphic(*site))
639 while (si->hasMoreSites())
641 site = si->nextSite();
642 if (CodonSiteTools::isSynonymousPolymorphic(*site, gc))
652 unsigned int s = numberOfSynonymousSubstitutions(psc, gc);
653 map<string, double> values = getUsefulValues_(n);
654 ThetaW =
static_cast<double>(s) / values[
"a1"];
662 unsigned int s = numberOfNonSynonymousSubstitutions(psc, gc);
663 map<string, double> values = getUsefulValues_(n);
664 ThetaW =
static_cast<double>(s) / values[
"a1"];
672 const Site* site = 0;
676 S += CodonSiteTools::piSynonymous(*site, gc, minchange);
686 const Site* site = 0;
690 S += CodonSiteTools::piNonSynonymous(*site, gc, minchange);
700 const Site* site = 0;
704 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
715 const Site* site = 0;
720 S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
723 return static_cast<double>(n - S);
728 size_t st = 0, sns = 0;
730 const Site* site = 0;
731 while (si->hasMoreSites())
733 site = si->nextSite();
734 st += CodonSiteTools::numberOfSubstitutions(*site, gc, freqmin);
735 sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
737 return static_cast<unsigned int>(st - sns);
742 unsigned int sns = 0;
744 const Site* site = 0;
745 while (si->hasMoreSites())
747 site = si->nextSite();
748 sns +=
static_cast<unsigned int>(CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin));
758 const Site* siteIn = 0;
759 const Site* siteOut = 0;
760 const Site* siteCons = 0;
763 while (siIn->hasMoreSites())
765 siteIn = siIn->nextSite();
766 siteOut = siOut->nextSite();
767 siteCons = siCons->nextSite();
768 vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->
getValue(0), siteCons->
getValue(1), gc);
772 vector<unsigned int> v(2);
773 v[0] =
static_cast<unsigned int>(NfixS);
774 v[1] =
static_cast<unsigned int>(NfixA);
786 unique_ptr<const PolymorphismSequenceContainer> psccomplet(PolymorphismSequenceContainerTools::getCompleteSites(psctot));
787 unique_ptr<const PolymorphismSequenceContainer> pscin (PolymorphismSequenceContainerTools::extractIngroup(*psccomplet));
788 unique_ptr<const PolymorphismSequenceContainer> pscout (PolymorphismSequenceContainerTools::extractOutgroup(*psccomplet));
789 unique_ptr<const Sequence> consensusIn (SiteContainerTools::getConsensus(*pscin,
"consensusIn"));
790 unique_ptr<const Sequence> consensusOut(SiteContainerTools::getConsensus(*pscout,
"consensusOut"));
792 consensus->addSequence(*consensusIn);
793 consensus->addSequence(*consensusOut);
794 vector<unsigned int> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc);
795 vector<unsigned int> v(4);
796 v[0] = SequenceStatistics::numberOfNonSynonymousSubstitutions(*pscin, gc, freqmin);
797 v[1] = SequenceStatistics::numberOfSynonymousSubstitutions(*pscin, gc, freqmin);
805 vector<unsigned int> v = SequenceStatistics::mkTable(ingroup, outgroup, gc, freqmin);
806 if (v[1] != 0 && v[2] != 0)
807 return static_cast<double>(v[0] * v[3]) /
static_cast<double>(v[1] * v[2]);
818 unsigned int Sp = numberOfPolymorphicSites(psc, gapflag, ignoreUnknown);
821 double S =
static_cast<double>(Sp);
822 double tajima = tajima83(psc, gapflag, ignoreUnknown);
823 double watterson = watterson75(psc, gapflag, ignoreUnknown);
825 map<string, double> values = getUsefulValues_(n);
826 return (tajima - watterson) / sqrt((values[
"e1"] * S) + (values[
"e2"] * S * (S - 1)));
831 unsigned int etaP = totalNumberOfMutations(psc, gapflag);
834 double eta =
static_cast<double>(etaP);
835 double tajima = tajima83(psc, gapflag, ignoreUnknown);
837 map<string, double> values = getUsefulValues_(n);
838 double eta_a1 = eta / values[
"a1"];
839 return (tajima - eta_a1) / sqrt((values[
"e1"] * eta) + (values[
"e2"] * eta * (eta - 1)));
842 double SequenceStatistics::fuLiD(
845 bool useNbSingletons,
846 bool useNbSegregatingSites)
849 map<string, double> values = getUsefulValues_(n);
850 double vD = getVD_(n, values[
"a1"], values[
"a2"], values[
"cn"]);
851 double uD = getUD_(values[
"a1"], vD);
852 unsigned int etaP = 0;
853 if (useNbSegregatingSites)
854 etaP = numberOfPolymorphicSites(ingroup);
856 etaP = totalNumberOfMutations(ingroup);
859 double eta =
static_cast<double>(etaP);
862 etae =
static_cast<double>(numberOfSingletons(outgroup));
864 etae =
static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup));
865 return (eta - (values[
"a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
868 double SequenceStatistics::fuLiDStar(
870 bool useNbSegregatingSites)
873 double nn =
static_cast<double>(n);
874 double _n = nn / (nn - 1.);
875 map<string, double> values = getUsefulValues_(n);
876 double vDs = getVDstar_(n, values[
"a1"], values[
"a2"], values[
"dn"]);
877 double uDs = getUDstar_(n, values[
"a1"], vDs);
878 unsigned int etaP = 0;
879 if (useNbSegregatingSites)
880 etaP = numberOfPolymorphicSites(group);
882 etaP = totalNumberOfMutations(group);
885 double eta =
static_cast<double>(etaP);
886 double etas =
static_cast<double>(numberOfSingletons(group));
889 return ((_n * eta) - (values[
"a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
897 double SequenceStatistics::fuLiF(
900 bool useNbSingletons,
901 bool useNbSegregatingSites)
904 double nn =
static_cast<double>(n);
905 map<string, double> values = getUsefulValues_(n);
906 double pi = tajima83(ingroup,
true);
907 double vF = (values[
"cn"] + values[
"b2"] - 2. / (nn - 1.)) / (pow(values[
"a1"], 2) + values[
"a2"]);
908 double uF = ((1. + values[
"b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values[
"a1n"] - (2. * nn) / (nn + 1.))) / values[
"a1"]) - vF;
909 unsigned int etaP = 0;
910 if (useNbSegregatingSites)
911 etaP = numberOfPolymorphicSites(ingroup);
913 etaP = totalNumberOfMutations(ingroup);
916 double eta =
static_cast<double>(etaP);
919 etae =
static_cast<double>(numberOfSingletons(outgroup));
921 etae =
static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup));
922 return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
925 double SequenceStatistics::fuLiFStar(
927 bool useNbSegregatingSites)
931 double pi = tajima83(group,
true);
938 double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values[
"a1"]) / (n * n)) - 8 * values[
"a2"] / n) / (pow(values[
"a1"], 2) + values[
"a2"]);
939 double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values[
"a1n"]) / (3 * n * (n - 1))) / values[
"a1"]) - vFs;
940 unsigned int etaP = 0;
941 if (useNbSegregatingSites)
942 etaP = numberOfPolymorphicSites(group);
944 etaP = totalNumberOfMutations(group);
947 double eta =
static_cast<double>(etaP);
948 double etas =
static_cast<double>(numberOfSingletons(group));
951 return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
956 vector<double> vdiff;
957 double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
962 piIntra1 = SequenceStatistics::tajima83(*Pop1,
true);
963 piIntra2 = SequenceStatistics::tajima83(*Pop2,
true);
965 meanPiIntra = (piIntra1 + piIntra2) / 2;
975 vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2,
true,
"no gap",
true));
978 piInter = (VectorTools::sum(vdiff) / n) *
static_cast<double>(psc.
getNumberOfSites());
981 Fst = 1.0 - meanPiIntra / piInter;
1005 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)))
1012 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)) && !SiteTools::hasSingleton(psc.
getSite(i)))
1019 const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
1026 Site siteclone(site);
1027 bool deletesite =
false;
1028 map<int, double> freqs;
1029 SymbolListTools::getFrequencies(siteclone, freqs);
1031 for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++)
1033 if (it->second >= 0.5)
1040 if (freqs[site.
getValue(j)] <= 1 - freqmin)
1050 if (freqs[site.
getValue(j)] >= freqmin)
1075 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)))
1078 bool deletesite =
false;
1079 map<int, double> freqs;
1080 SymbolListTools::getFrequencies(site, freqs);
1083 if (freqs[j] >= 1 - freqmin)
1092 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)) && !SiteTools::hasSingleton(psc.
getSite(i)))
1096 bool deletesite =
false;
1097 map<int, double> freqs;
1098 SymbolListTools::getFrequencies(site, freqs);
1101 if (freqs[j] >= 1 - freqmin)
1111 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1113 for (
size_t i = 0; i < ss.size() - 1; i++)
1115 for (
size_t j = i + 1; j < ss.size(); j++)
1117 dist.push_back(
static_cast<double>(ss[j] - ss[i]));
1130 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)))
1133 bool deletesite =
false;
1134 map<int, double> freqs;
1135 SymbolListTools::getFrequencies(site, freqs);
1138 if (freqs[j] >= 1 - freqmin)
1147 if (SiteTools::isComplete(psc.
getSite(i)) && !SiteTools::isConstant(psc.
getSite(i)) && !SiteTools::isTriplet(psc.
getSite(i)) && !SiteTools::hasSingleton(psc.
getSite(i)))
1151 bool deletesite =
false;
1152 map<int, double> freqs;
1153 SymbolListTools::getFrequencies(site, freqs);
1156 if (freqs[j] >= 1 - freqmin)
1164 size_t n = ss.size();
1166 throw DimensionException(
"SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1167 Vdouble distance(n * (n - 1) / 2, 0);
1174 for (
size_t i = 0; i < nbsite; i++)
1180 for (
size_t i = 0; i < gap.size(); i++)
1182 for (
size_t j = 0; j < ss.size(); j++)
1188 for (
size_t i = 0; i < n - 1; i++)
1190 for (
size_t j = i + 1; j < n; j++)
1192 dist.push_back(
static_cast<double>(newss[j] - newss[i]));
1208 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1210 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1211 for (
size_t i = 0; i < nbsite - 1; i++)
1213 for (
size_t j = i + 1; j < nbsite; j++)
1218 map<int, double> freq1;
1219 map<int, double> freq2;
1220 SymbolListTools::getFrequencies(site1, freq1);
1221 SymbolListTools::getFrequencies(site2, freq2);
1222 for (
size_t k = 0; k < nbseq; k++)
1227 haplo = haplo /
static_cast<double>(nbseq);
1228 D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1241 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1243 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1244 for (
size_t i = 0; i < nbsite - 1; i++)
1246 for (
size_t j = i + 1; j < nbsite; j++)
1251 map<int, double> freq1;
1252 map<int, double> freq2;
1253 SymbolListTools::getFrequencies(site1, freq1);
1254 SymbolListTools::getFrequencies(site2, freq2);
1255 for (
size_t k = 0; k < nbseq; k++)
1260 haplo = haplo /
static_cast<double>(nbseq);
1261 double d, D = (haplo - freq1[1] * freq2[1]);
1264 if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1266 d = std::abs(D) / (freq1[1] * freq2[0]);
1270 d = std::abs(D) / (freq1[0] * freq2[1]);
1275 if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1277 d = std::abs(D) / (freq1[1] * freq2[1]);
1281 d = std::abs(D) / (freq1[0] * freq2[0]);
1284 Dprime.push_back(d);
1297 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1299 throw DimensionException(
"SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1300 for (
size_t i = 0; i < nbsite - 1; i++)
1302 for (
size_t j = i + 1; j < nbsite; j++)
1307 map<int, double> freq1;
1308 map<int, double> freq2;
1309 SymbolListTools::getFrequencies(site1, freq1);
1310 SymbolListTools::getFrequencies(site2, freq2);
1311 for (
size_t k = 0; k < nbseq; k++)
1316 haplo = haplo /
static_cast<double>(nbseq);
1317 double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1330 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1331 return VectorTools::mean<double, double>(D);
1336 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1337 return VectorTools::mean<double, double>(Dprime);
1342 Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1343 return VectorTools::mean<double, double>(R2);
1348 Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1349 return VectorTools::mean<double, double>(dist);
1354 Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1355 return VectorTools::mean<double, double>(dist);
1364 Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1367 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1369 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1370 return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1375 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1378 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1380 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1381 return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1386 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1389 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1391 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1392 return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1397 Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1401 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1403 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1404 reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1405 reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1411 Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1415 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1417 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1418 reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1419 reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1425 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1429 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1431 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1432 reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1433 reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1439 Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1441 Vdouble R2transformed = unit / R2 - 1;
1444 dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1446 dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1447 return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1456 double left = leftHandHudson_(psc);
1461 if (SequenceStatistics::numberOfPolymorphicSites(psc) < 2)
1463 if (rightHandHudson_(c1, n) < left)
1465 if (rightHandHudson_(c2, n) > left)
1467 while (dif > precision)
1469 if (rightHandHudson_((c1 + c2) / 2, n) > left)
1473 dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1475 return (c1 + c2) / 2;
1482 void SequenceStatistics::testUsefulValues(std::ostream& s,
size_t n)
1484 map<string, double> v = getUsefulValues_(n);
1485 double vD = getVD_(n, v[
"a1"], v[
"a2"], v[
"cn"]);
1486 double uD = getUD_(v[
"a1"], vD);
1487 double vDs = getVDstar_(n, v[
"a1"], v[
"a2"], v[
"dn"]);
1488 double uDs = getUDstar_(n, v[
"a1"], vDs);
1491 s << v[
"a1"] <<
"\t";
1492 s << v[
"a2"] <<
"\t";
1493 s << v[
"a1n"] <<
"\t";
1494 s << v[
"b1"] <<
"\t";
1495 s << v[
"b2"] <<
"\t";
1496 s << v[
"c1"] <<
"\t";
1497 s << v[
"c2"] <<
"\t";
1498 s << v[
"cn"] <<
"\t";
1499 s << v[
"dn"] <<
"\t";
1500 s << v[
"e1"] <<
"\t";
1501 s << v[
"e2"] <<
"\t";
1512 unsigned int SequenceStatistics::getNumberOfMutations_(
const Site& site)
1515 unsigned int tmp_count = 0;
1516 map<int, size_t> states_count;
1517 SymbolListTools::getCounts(site, states_count);
1519 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1529 unsigned int SequenceStatistics::getNumberOfSingletons_(
const Site& site)
1531 unsigned int nus = 0;
1532 map<int, size_t> states_count;
1533 SymbolListTools::getCounts(site, states_count);
1534 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1536 if (it->second == 1)
1542 unsigned int SequenceStatistics::getNumberOfDerivedSingletons_(
const Site& site_in,
const Site& site_out)
1544 unsigned int nus = 0;
1545 map<int, size_t> states_count;
1546 map<int, size_t> outgroup_states_count;
1547 SymbolListTools::getCounts(site_in, states_count);
1548 SymbolListTools::getCounts(site_out, outgroup_states_count);
1550 if (outgroup_states_count.size() == 1)
1552 for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1554 if (it->second == 1)
1556 if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1564 std::map<std::string, double> SequenceStatistics::getUsefulValues_(
size_t n)
1566 double nn =
static_cast<double>(n);
1567 map<string, double> values;
1581 for (
double i = 1; i < nn; i++)
1583 values[
"a1"] += 1. / i;
1584 values[
"a2"] += 1. / (i * i);
1586 values[
"a1n"] = values[
"a1"] + (1. / nn);
1587 values[
"b1"] = (nn + 1.) / (3. * (nn - 1.));
1588 values[
"b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1589 values[
"c1"] = values[
"b1"] - (1. / values[
"a1"]);
1590 values[
"c2"] = values[
"b2"] - ((nn + 2.) / (values[
"a1"] * nn)) + (values[
"a2"] / (values[
"a1"] * values[
"a1"]));
1598 values[
"cn"] = 2. * ((nn * values[
"a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1601 + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1603 * ((3. / 2.) - (((2. * values[
"a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1605 values[
"e1"] = values[
"c1"] / values[
"a1"];
1606 values[
"e2"] = values[
"c2"] / ((values[
"a1"] * values[
"a1"]) + values[
"a2"]);
1611 double SequenceStatistics::getVD_(
size_t n,
double a1,
double a2,
double cn)
1613 double nn =
static_cast<double>(n);
1616 double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1620 double SequenceStatistics::getUD_(
double a1,
double vD)
1622 return a1 - 1. - vD;
1625 double SequenceStatistics::getVDstar_(
size_t n,
double a1,
double a2,
double dn)
1627 double denom = (a1 * a1) + a2;
1628 if (n < 3 || denom == 0.)
1630 double nn =
static_cast<double>(n);
1631 double nnn = nn / (nn - 1.);
1636 - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1653 double SequenceStatistics::getUDstar_(
size_t n,
double a1,
double vDs)
1657 double nn =
static_cast<double>(n);
1658 double nnn = nn / (nn - 1.);
1660 double uDs = (nnn * (a1 - nnn)) - vDs;
1674 for (
size_t i = 0; i < nbseq - 1; i++)
1676 for (
size_t j = i + 1; j < nbseq; j++)
1682 S1 += SequenceStatistics::watterson75(*psc2,
true);
1683 S2 += SequenceStatistics::watterson75(*psc2,
true) * SequenceStatistics::watterson75(*psc2,
true);
1687 double Sk = (2 * S2 - pow(2 * S1 /
static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1688 double H = SequenceStatistics::heterozygosity(*newpsc);
1689 double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1691 return static_cast<double>(Sk - H + H2) / pow(H *
static_cast<double>(nbseq) /
static_cast<double>(nbseq - 1), 2.);
1694 double SequenceStatistics::rightHandHudson_(
double c,
size_t n)
1696 double nn =
static_cast<double>(n);
1697 return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
virtual size_t getNumberOfSites() const=0
virtual int charToInt(const std::string &state) const=0
virtual unsigned int getSize() const=0
virtual void setElement(size_t pos, const std::string &c)
virtual bool hasMoreSites() const=0
virtual const Site * nextSite()=0
virtual const T & getValue(size_t pos) const=0
virtual const Alphabet * getAlphabet() const=0
virtual size_t size() const=0
virtual size_t getNumberOfSequences() const=0
The PolymorphismSequenceContainer class.
void setAsOutgroupMember(size_t index)
Set a sequence as outgroup member by index.
void addSequence(const Sequence &sequence, bool checkName=true)
virtual std::string getChar(size_t pos) const=0
virtual const Alphabet * getAlphabet() const=0
virtual const Site & getSite(size_t siteIndex) const=0
virtual const int & getValue(size_t pos) const
const Sequence & getSequence(size_t sequenceIndex) const
void addSite(const Site &site, bool checkPosition=true)
const Site & getSite(size_t siteIndex) const
size_t getNumberOfSites() const
size_t getNumberOfSequences() const
std::size_t count(const std::string &s, const std::string &pattern)
std::vector< size_t > SiteSelection
std::vector< double > Vdouble
std::vector< size_t > SequenceSelection