bpp-popgen3  3.0.0
SequenceStatistics.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include "SequenceStatistics.h" // class's header file
8 
9 // From the STL:
10 #include <ctype.h>
11 #include <cmath>
12 #include <iostream>
13 #include <vector>
14 
15 using namespace std;
16 
17 // From SeqLib:
18 #include <Bpp/Seq/Site.h>
19 #include <Bpp/Seq/SiteTools.h>
21 #include <Bpp/Seq/CodonSiteTools.h>
22 #include <Bpp/Seq/Alphabet/DNA.h>
24 
27 
28 using namespace bpp;
29 
30 // ******************************************************************************
31 // Basic statistics
32 // ******************************************************************************
33 
34 unsigned int SequenceStatistics::numberOfPolymorphicSites(
36  bool gapflag,
37  bool ignoreUnknown)
38 {
39  unsigned int s = 0;
40  unique_ptr<ConstSiteIterator> si;
41  if (gapflag)
42  si.reset(new CompleteSiteContainerIterator(psc));
43  else
44  si.reset(new SimpleSiteContainerIterator(psc));
45  while (si->hasMoreSites())
46  {
47  auto site = si->nextSite();
48  if (!SiteTools::isConstant(site, ignoreUnknown))
49  {
50  s++;
51  }
52  }
53  return s;
54 }
55 
56 double SequenceStatistics::frequencyOfPolymorphicSites(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
57 {
58  double s = 0;
59  double n = 0;
60  unique_ptr<ConstSiteIterator> si;
61  if (gapflag)
62  si.reset(new CompleteSiteContainerIterator(psc));
63  else
64  si.reset(new SimpleSiteContainerIterator(psc));
65  while (si->hasMoreSites())
66  {
67  auto& site = si->nextSite();
68  n++;
69  if (!SiteTools::isConstant(site, ignoreUnknown))
70  {
71  s++;
72  }
73  }
74  return s / n;
75 }
76 
77 unsigned int SequenceStatistics::numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer& psc, bool gapflag)
78 {
79  unique_ptr<ConstSiteIterator> si;
80  if (gapflag)
81  si.reset(new CompleteSiteContainerIterator(psc));
82  else
83  si.reset(new SimpleSiteContainerIterator(psc));
84  unsigned int s = 0;
85  while (si->hasMoreSites())
86  {
87  auto& site = si->nextSite();
88  if (SiteTools::isParsimonyInformativeSite(site))
89  {
90  s++;
91  }
92  }
93  return s;
94 }
95 
96 unsigned int SequenceStatistics::numberOfSingletons(const PolymorphismSequenceContainer& psc, bool gapflag)
97 {
98  unique_ptr<ConstSiteIterator> si;
99  if (gapflag)
100  si.reset(new CompleteSiteContainerIterator(psc));
101  else
102  si.reset(new SimpleSiteContainerIterator(psc));
103  unsigned int nus = 0;
104  while (si->hasMoreSites())
105  {
106  auto& site = si->nextSite();
107  nus += getNumberOfSingletons_(site);
108  }
109  return nus;
110 }
111 
112 unsigned int SequenceStatistics::numberOfTriplets(const PolymorphismSequenceContainer& psc, bool gapflag)
113 {
114  unique_ptr<ConstSiteIterator> si;
115  if (gapflag)
116  si.reset(new CompleteSiteContainerIterator(psc));
117  else
118  si.reset(new SimpleSiteContainerIterator(psc));
119  unsigned int s = 0;
120  while (si->hasMoreSites())
121  {
122  auto& site = si->nextSite();
123  if (SiteTools::isTriplet(site))
124  {
125  s++;
126  }
127  }
128  return s;
129 }
130 
131 unsigned int SequenceStatistics::totalNumberOfMutations(const PolymorphismSequenceContainer& psc, bool gapflag)
132 {
133  unique_ptr<ConstSiteIterator> si;
134  if (gapflag)
135  si.reset(new CompleteSiteContainerIterator(psc));
136  else
137  si.reset(new SimpleSiteContainerIterator(psc));
138  unsigned int tnm = 0;
139  while (si->hasMoreSites())
140  {
141  auto& site = si->nextSite();
142  tnm += getNumberOfMutations_(site);
143  }
144  return tnm;
145 }
146 
147 unsigned int SequenceStatistics::totalNumberOfMutationsOnExternalBranches(
149  const PolymorphismSequenceContainer& outg)
150 {
151  if (ing.getNumberOfSites() != outg.getNumberOfSites())
152  throw Exception("ing and outg must have the same size");
153  unsigned int nmuts = 0;
154  auto si = make_unique<SimpleSiteContainerIterator>(ing);
155  auto so = make_unique<SimpleSiteContainerIterator>(outg);
156  while (si->hasMoreSites())
157  {
158  auto& siteIn = si->nextSite();
159  auto& siteOut = so->nextSite();
160  // use fully resolved sites
161  if (SiteTools::isComplete(siteIn) && SiteTools::isComplete(siteOut))
162  nmuts += getNumberOfDerivedSingletons_(siteIn, siteOut); // singletons that are not in outgroup
163  }
164  return nmuts;
165 }
166 
167 double SequenceStatistics::heterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
168 {
169  unique_ptr<ConstSiteIterator> si;
170  if (gapflag)
171  si.reset(new CompleteSiteContainerIterator(psc));
172  else
173  si.reset(new SimpleSiteContainerIterator(psc));
174  double s = 0;
175  while (si->hasMoreSites())
176  {
177  auto& site = si->nextSite();
178  s += SiteTools::heterozygosity(site);
179  }
180  return s;
181 }
182 
183 double SequenceStatistics::squaredHeterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
184 {
185  unique_ptr<ConstSiteIterator> si;
186  if (gapflag)
187  si.reset(new CompleteSiteContainerIterator(psc));
188  else
189  si.reset(new SimpleSiteContainerIterator(psc));
190  double s = 0;
191  while (si->hasMoreSites())
192  {
193  auto& site = si->nextSite();
194  double h = SiteTools::heterozygosity(site);
195  s += h * h;
196  }
197  return s;
198 }
199 
200 // ******************************************************************************
201 // GC statistics
202 // ******************************************************************************
203 
204 double SequenceStatistics::gcContent(const PolymorphismSequenceContainer& psc)
205 {
206  map<int, double> freqs;
207  SequenceContainerTools::getFrequencies(psc, freqs);
208  auto& alpha = psc.alphabet();
209  return (freqs[alpha.charToInt("C")] + freqs[alpha.charToInt("G")]) / (freqs[alpha.charToInt("A")] + freqs[alpha.charToInt("C")] + freqs[alpha.charToInt("G")] + freqs[alpha.charToInt("T")]);
210 }
211 
212 std::vector<unsigned int> SequenceStatistics::gcPolymorphism(const PolymorphismSequenceContainer& psc, bool gapflag)
213 {
214  unsigned int nbMut = 0;
215  unsigned int nbGC = 0;
216  size_t nbSeq = psc.getNumberOfSequences();
217  vector<unsigned int> vect(2);
218  unique_ptr<ConstSiteIterator> si;
219  if (gapflag)
220  si.reset(new CompleteSiteContainerIterator(psc));
221  else
222  si.reset(new NoGapSiteContainerIterator(psc));
223  while (si->hasMoreSites())
224  {
225  auto& site = si->nextSite();
226  if (!SiteTools::isConstant(site))
227  {
228  long double freqGC = SymbolListTools::getGCContent(site);
229  if (freqGC > 0 && freqGC < 1) // Not 100% AT or GC
230  {
231  nbMut += static_cast<unsigned int>(nbSeq);
232  long double adGC = freqGC * nbSeq;
233  nbGC += static_cast<unsigned int>(adGC);
234  }
235  }
236  }
237  vect[0] = nbMut;
238  vect[1] = nbGC;
239  return vect;
240 }
241 
242 // ******************************************************************************
243 // Diversity statistics
244 // ******************************************************************************
245 
246 double SequenceStatistics::watterson75(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown, bool scaled)
247 {
248  double ThetaW;
249  size_t n = psc.getNumberOfSequences();
250  map<string, double> values = getUsefulValues_(n);
251  double s = 0;
252  if (scaled)
253  s = frequencyOfPolymorphicSites(psc, gapflag, ignoreUnknown);
254  else
255  s = static_cast<double>(numberOfPolymorphicSites(psc, gapflag, ignoreUnknown));
256  ThetaW = s / values["a1"];
257  return ThetaW;
258 }
259 
260 double SequenceStatistics::tajima83(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown, bool scaled)
261 {
262  size_t alphabetSize = psc.getAlphabet()->getSize();
263  unique_ptr<ConstSiteIterator> si;
264  double value2 = 0.;
265  double l = 0;
266  if (gapflag)
267  si.reset(new CompleteSiteContainerIterator(psc));
268  else
269  si.reset(new SimpleSiteContainerIterator(psc));
270  while (si->hasMoreSites())
271  {
272  auto& site = si->nextSite();
273  l++;
274  if (!SiteTools::isConstant(site, ignoreUnknown))
275  {
276  double value = 0.;
277  map<int, size_t> count;
278  SymbolListTools::getCounts(site, count);
279  map<int, size_t> tmp_k;
280  size_t tmp_n = 0;
281  for (auto& it : count)
282  {
283  if (it.first >= 0 && it.first < static_cast<int>(alphabetSize))
284  {
285  tmp_k[it.first] = it.second * (it.second - 1);
286  tmp_n += it.second;
287  }
288  }
289  if (tmp_n == 0 || tmp_n == 1)
290  continue;
291  for (auto& it : tmp_k)
292  {
293  value += static_cast<double>(it.second) / static_cast<double>(tmp_n * (tmp_n - 1));
294  }
295  value2 += 1. - value;
296  }
297  }
298  return scaled ? value2 / l : value2;
299 }
300 
301 double SequenceStatistics::fayWu2000(const PolymorphismSequenceContainer& psc, const Sequence& ancestralSites)
302 {
303  if (psc.getNumberOfSites() != ancestralSites.size())
304  throw Exception("SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
305 
306  const Sequence& tmps = psc.sequence(0);
307 
308  size_t alphabetSize = psc.alphabet().getSize();
309  double value = 0.;
310  for (size_t i = 0; i < psc.getNumberOfSites(); ++i)
311  {
312  const Site& site = psc.site(i);
313  string ancB = ancestralSites.getChar(i);
314  int ancV = ancestralSites.getValue(i);
315 
316  if (!SiteTools::isConstant(site) || tmps.getChar(i) != ancB)
317  {
318  if (ancV < 0)
319  continue;
320 
321  map<int, size_t> count;
322  SymbolListTools::getCounts(site, count);
323  map<int, size_t> tmp_k;
324  size_t tmp_n = 0;
325  for (auto& it : count)
326  {
327  if (it.first >= 0 && it.first < static_cast<int>(alphabetSize))
328  {
329  /* if derived allele */
330  if (it.first != ancV)
331  {
332  tmp_k[it.first] = 2 * it.second * it.second;
333  }
334  tmp_n += it.second;
335  }
336  }
337  if (tmp_n == 0 || tmp_n == 1)
338  continue;
339  for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
340  {
341  value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
342  }
343  }
344  }
345  return value;
346 }
347 
348 unsigned int SequenceStatistics::dvk(const PolymorphismSequenceContainer& psc, bool gapflag)
349 {
350  /*
351  * Sylvain Gaillard 17/03/2010:
352  * This implementation uses unneeded SequenceContainer recopy and works on
353  * string. It needs to be improved.
354  */
355  unique_ptr<PolymorphismSequenceContainer> sc;
356  if (gapflag)
357  sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
358  else
359  sc = make_unique<PolymorphismSequenceContainer>(psc);
360  // int K = 0;
361  vector<string> pscvector;
362  pscvector.push_back(sc->sequence(0).toString());
363  // K++;
364  for (size_t i = 1; i < sc->getNumberOfSequences(); ++i)
365  {
366  bool uniq = true;
367  string query = sc->sequence(i).toString();
368  for (auto& it : pscvector)
369  {
370  if (query.compare(it) == 0)
371  {
372  uniq = false;
373  break;
374  }
375  }
376  if (uniq)
377  {
378  // K++;
379  pscvector.push_back(query);
380  }
381  }
382  // return K;
383  return static_cast<unsigned int>(pscvector.size());
384 }
385 
386 double SequenceStatistics::dvh(const PolymorphismSequenceContainer& psc, bool gapflag)
387 {
388  /*
389  * Sylvain Gaillard 17/03/2010:
390  * This implementation uses unneeded SequenceContainer recopy and works on
391  * string. It needs to be improved.
392  */
393  unique_ptr<PolymorphismSequenceContainer> sc;
394  if (gapflag)
395  sc = PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc);
396  else
397  sc = make_unique<PolymorphismSequenceContainer>(psc);
398  double H = 0.;
399  size_t nbSeq;
400  vector<string> pscvector;
401  vector<size_t> effvector;
402  pscvector.push_back(sc->sequence(0).toString());
403  effvector.push_back(sc->getSequenceCount(0));
404  nbSeq = sc->getSequenceCount(0);
405  for (size_t i = 1; i < sc->getNumberOfSequences(); ++i)
406  {
407  nbSeq += sc->getSequenceCount(i);
408  bool uniq = true;
409  string query = sc->sequence(i).toString();
410  for (size_t j = 0; j < pscvector.size(); ++j)
411  {
412  if (query.compare(pscvector[j]) == 0)
413  {
414  effvector[j] += sc->getSequenceCount(i);
415  uniq = false;
416  break;
417  }
418  }
419  if (uniq)
420  {
421  pscvector.push_back(query);
422  effvector.push_back(sc->getSequenceCount(i));
423  }
424  }
425  for (size_t i = 0; i < effvector.size(); ++i)
426  {
427  H -= (static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) / static_cast<double>(nbSeq));
428  }
429  H += 1.;
430  return H;
431 }
432 
433 unsigned int SequenceStatistics::numberOfTransitions(const PolymorphismSequenceContainer& psc)
434 {
435  unsigned int nbT = 0;
436  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
437  while (si->hasMoreSites())
438  {
439  const auto& site = si->nextSite();
440  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
441  if (SiteTools::getNumberOfDistinctCharacters(site) != 2)
442  continue;
443  int state1 = site[0];
444  int state2 = site[0];
445  for (size_t i = 1; i < site.size(); ++i)
446  {
447  if (state1 != site[i])
448  {
449  state2 = site[i];
450  break;
451  }
452  }
453  if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
454  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
455  {
456  nbT++;
457  }
458  }
459  return nbT;
460 }
461 
462 unsigned int SequenceStatistics::numberOfTransversions(const PolymorphismSequenceContainer& psc)
463 {
464  unsigned int nbTv = 0;
465  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
466  while (si->hasMoreSites())
467  {
468  const auto& site = si->nextSite();
469  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
470  if (SiteTools::getNumberOfDistinctCharacters(site) != 2)
471  continue;
472  int state1 = site[0];
473  int state2 = site[0];
474  for (size_t i = 1; i < site.size(); ++i)
475  {
476  if (state1 != site[i])
477  {
478  state2 = site[i];
479  break;
480  }
481  }
482  if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
483  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
484  {
485  nbTv++;
486  }
487  }
488  return nbTv;
489 }
490 
491 double SequenceStatistics::ratioOfTransitionsTransversions(const PolymorphismSequenceContainer& psc)
492 {
493  // return (double) getNumberOfTransitions(psc)/getNumberOfTransversions(psc);
494  double nbTs = 0;
495  double nbTv = 0;
496  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
497  vector<int> state(2);
498  while (si->hasMoreSites())
499  {
500  map<int, size_t> count;
501  const auto& site = si->nextSite();
502  SymbolListTools::getCounts(site, count);
503  if (count.size() != 2)
504  continue;
505  size_t i = 0;
506  for (const auto& it : count)
507  {
508  state[i] = it.first;
509  i++;
510  }
511  if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
512  ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
513  {
514  nbTs++; // transitions
515  }
516  else
517  {
518  nbTv++; // transversion
519  }
520  }
521  if (nbTv == 0)
522  throw ZeroDivisionException("SequenceStatistics::getTransitionsTransversionsRatio.");
523  return nbTs / nbTv;
524 }
525 
526 // ******************************************************************************
527 // Synonymous and non-synonymous polymorphism
528 // ******************************************************************************
529 
530 unsigned int SequenceStatistics::numberOfSitesWithStopCodon(
532  const GeneticCode& gCode,
533  bool gapflag)
534 {
535  if (!AlphabetTools::isCodonAlphabet(psc.alphabet()))
536  throw AlphabetMismatchException("SequenceStatistics::stopCodonSiteNumber(). PolymorphismSequenceContainer must be with a codon alphabet.", &psc.alphabet(), AlphabetTools::DNA_CODON_ALPHABET.get());
537 
538  unique_ptr<ConstSiteIterator> si;
539  if (gapflag)
540  si.reset(new NoGapSiteContainerIterator(psc));
541  else
542  si.reset(new SimpleSiteContainerIterator(psc));
543  unsigned int s = 0;
544  while (si->hasMoreSites())
545  {
546  const auto& site = si->nextSite();
547  if (CodonSiteTools::hasStop(site, gCode))
548  s++;
549  }
550  return s;
551 }
552 
553 unsigned int SequenceStatistics::numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer& psc, bool stopflag, bool gapflag)
554 {
555  unique_ptr<ConstSiteIterator> si;
556  if (stopflag)
557  si.reset(new CompleteSiteContainerIterator(psc));
558  else
559  {
560  if (gapflag)
561  si.reset(new NoGapSiteContainerIterator(psc));
562  else
563  si.reset(new SimpleSiteContainerIterator(psc));
564  }
565  unsigned int s = 0;
566  while (si->hasMoreSites())
567  {
568  const auto& site = si->nextSite();
569  if (CodonSiteTools::isMonoSitePolymorphic(site))
570  s++;
571  }
572  return s;
573 }
574 
575 unsigned int SequenceStatistics::numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
576 {
577  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
578  unsigned int s = 0;
579  while (si->hasMoreSites())
580  {
581  const auto& site = si->nextSite();
582  if (CodonSiteTools::isSynonymousPolymorphic(site, gc))
583  s++;
584  }
585  return s;
586 }
587 
588 double SequenceStatistics::watterson75Synonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
589 {
590  double ThetaW = 0.;
591  size_t n = psc.getNumberOfSequences();
592  unsigned int s = numberOfSynonymousSubstitutions(psc, gc);
593  map<string, double> values = getUsefulValues_(n);
594  ThetaW = static_cast<double>(s) / values["a1"];
595  return ThetaW;
596 }
597 
598 double SequenceStatistics::watterson75NonSynonymous(
600  const GeneticCode& gc)
601 {
602  double ThetaW;
603  size_t n = psc.getNumberOfSequences();
604  unsigned int s = numberOfNonSynonymousSubstitutions(psc, gc);
605  map<string, double> values = getUsefulValues_(n);
606  ThetaW = static_cast<double>(s) / values["a1"];
607  return ThetaW;
608 }
609 
610 double SequenceStatistics::piSynonymous(
612  const GeneticCode& gc,
613  bool minchange)
614 {
615  double S = 0.;
617  while (si.hasMoreSites())
618  {
619  const auto& site = si.nextSite();
620  S += CodonSiteTools::piSynonymous(site, gc, minchange);
621  }
622  return S;
623 }
624 
625 double SequenceStatistics::piNonSynonymous(
627  const GeneticCode& gc,
628  bool minchange)
629 {
630  double S = 0.;
632  while (si.hasMoreSites())
633  {
634  const auto& site = si.nextSite();
635  S += CodonSiteTools::piNonSynonymous(site, gc, minchange);
636  }
637  return S;
638 }
639 
640 double SequenceStatistics::meanNumberOfSynonymousSites(
642  const GeneticCode& gc,
643  double ratio)
644 {
645  double S = 0.;
647  while (si.hasMoreSites())
648  {
649  const auto& site = si.nextSite();
650  S += CodonSiteTools::meanNumberOfSynonymousPositions(site, gc, ratio);
651  }
652  return S;
653 }
654 
655 double SequenceStatistics::meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
656 {
657  double S = 0.;
658  int n = 0;
660  while (si.hasMoreSites())
661  {
662  const auto& site = si.nextSite();
663  n = n + 3;
664  S += CodonSiteTools::meanNumberOfSynonymousPositions(site, gc, ratio);
665  }
666  return static_cast<double>(n - S);
667 }
668 
669 unsigned int SequenceStatistics::numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
670 {
671  size_t st = 0, sns = 0;
673  while (si.hasMoreSites())
674  {
675  const auto& site = si.nextSite();
676  st += CodonSiteTools::numberOfSubstitutions(site, gc, freqmin);
677  sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(site, gc, freqmin);
678  }
679  return static_cast<unsigned int>(st - sns);
680 }
681 
682 unsigned int SequenceStatistics::numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
683 {
684  unsigned int sns = 0;
686  while (si.hasMoreSites())
687  {
688  const auto& site = si.nextSite();
689  sns += static_cast<unsigned int>(CodonSiteTools::numberOfNonSynonymousSubstitutions(site, gc, freqmin));
690  }
691  return sns;
692 }
693 
694 vector<unsigned int> SequenceStatistics::fixedDifferences(
695  const PolymorphismSequenceContainer& pscin,
696  const PolymorphismSequenceContainer& pscout,
698  const GeneticCode& gc)
699 {
700  CompleteSiteContainerIterator siIn(pscin);
701  CompleteSiteContainerIterator siOut (pscout);
702  CompleteSiteContainerIterator siConst(psccons);
703  size_t NfixS = 0;
704  size_t NfixA = 0;
705  while (siIn.hasMoreSites())
706  {
707  const auto& siteIn = siIn.nextSite();
708  const auto& siteOut = siOut.nextSite();
709  const auto& siteCons = siConst.nextSite();
710  vector<size_t> v = CodonSiteTools::fixedDifferences(siteIn, siteOut, siteCons.getValue(0), siteCons.getValue(1), gc);
711  NfixS += v[0];
712  NfixA += v[1];
713  }
714  vector<unsigned int> v(2);
715  v[0] = static_cast<unsigned int>(NfixS);
716  v[1] = static_cast<unsigned int>(NfixA);
717  return v;
718 }
719 
720 vector<unsigned int> SequenceStatistics::mkTable(
721  const PolymorphismSequenceContainer& ingroup,
722  const PolymorphismSequenceContainer& outgroup,
723  const GeneticCode& gc,
724  double freqmin)
725 {
726  PolymorphismSequenceContainer pscTot(ingroup);
727  for (size_t i = 0; i < outgroup.getNumberOfSequences(); ++i)
728  {
729  auto tmpSeq = make_unique<Sequence>(outgroup.sequence(i));
730  pscTot.addSequence(tmpSeq->getName(), tmpSeq);
731  pscTot.setAsOutgroupMember(i + ingroup.getNumberOfSequences());
732  }
733  auto pscComplete = PolymorphismSequenceContainerTools::getCompleteSites(pscTot);
734  auto pscIn = PolymorphismSequenceContainerTools::extractIngroup(*pscComplete);
735  auto pscOut = PolymorphismSequenceContainerTools::extractOutgroup(*pscComplete);
736  auto consensusIn = SiteContainerTools::getConsensus(*pscIn, "consensusIn");
737  auto consensusOut = SiteContainerTools::getConsensus(*pscOut, "consensusOut");
738  auto consensus = make_unique<PolymorphismSequenceContainer>(ingroup.getAlphabet());
739  consensus->addSequence("consensusIn", consensusIn);
740  consensus->addSequence("consensusOut", consensusOut);
741  vector<unsigned int> u = SequenceStatistics::fixedDifferences(*pscIn, *pscOut, *consensus, gc);
742  vector<unsigned int> v(4);
743  v[0] = SequenceStatistics::numberOfNonSynonymousSubstitutions(*pscIn, gc, freqmin);
744  v[1] = SequenceStatistics::numberOfSynonymousSubstitutions(*pscIn, gc, freqmin);
745  v[2] = u[1];
746  v[3] = u[0];
747  return v;
748 }
749 
750 double SequenceStatistics::neutralityIndex(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
751 {
752  vector<unsigned int> v = SequenceStatistics::mkTable(ingroup, outgroup, gc, freqmin);
753  if (v[1] != 0 && v[2] != 0)
754  return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]);
755  else
756  return -1;
757 }
758 
759 // ******************************************************************************
760 // Statistical tests
761 // ******************************************************************************
762 
763 double SequenceStatistics::tajimaDss(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
764 {
765  unsigned int Sp = numberOfPolymorphicSites(psc, gapflag, ignoreUnknown);
766  if (Sp == 0)
767  throw ZeroDivisionException("SequenceStatistics::tajimaDss. S should not be 0.");
768  double S = static_cast<double>(Sp);
769  double tajima = tajima83(psc, gapflag, ignoreUnknown);
770  double watterson = watterson75(psc, gapflag, ignoreUnknown);
771  size_t n = psc.getNumberOfSequences();
772  map<string, double> values = getUsefulValues_(n);
773  return (tajima - watterson) / sqrt((values["e1"] * S) + (values["e2"] * S * (S - 1)));
774 }
775 
776 double SequenceStatistics::tajimaDtnm(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
777 {
778  unsigned int etaP = totalNumberOfMutations(psc, gapflag);
779  if (etaP == 0)
780  throw ZeroDivisionException("SequenceStatistics::tajimaDtnm. Eta should not be 0.");
781  double eta = static_cast<double>(etaP);
782  double tajima = tajima83(psc, gapflag, ignoreUnknown);
783  size_t n = psc.getNumberOfSequences();
784  map<string, double> values = getUsefulValues_(n);
785  double eta_a1 = eta / values["a1"];
786  return (tajima - eta_a1) / sqrt((values["e1"] * eta) + (values["e2"] * eta * (eta - 1)));
787 }
788 
789 double SequenceStatistics::fuLiD(
790  const PolymorphismSequenceContainer& ingroup,
791  const PolymorphismSequenceContainer& outgroup,
792  bool useNbSingletons,
793  bool useNbSegregatingSites)
794 {
795  size_t n = ingroup.getNumberOfSequences();
796  map<string, double> values = getUsefulValues_(n);
797  double vD = getVD_(n, values["a1"], values["a2"], values["cn"]);
798  double uD = getUD_(values["a1"], vD);
799  unsigned int etaP = 0;
800  if (useNbSegregatingSites)
801  etaP = numberOfPolymorphicSites(ingroup);
802  else
803  etaP = totalNumberOfMutations(ingroup);
804  if (etaP == 0)
805  throw ZeroDivisionException("SequenceStatistics::fuLiD. Eta should not be 0.");
806  double eta = static_cast<double>(etaP);
807  double etae = 0.;
808  if (useNbSingletons)
809  etae = static_cast<double>(numberOfSingletons(outgroup));
810  else
811  etae = static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup)); // added by Khalid 13/07/2005
812  return (eta - (values["a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
813 }
814 
815 double SequenceStatistics::fuLiDStar(
816  const PolymorphismSequenceContainer& group,
817  bool useNbSegregatingSites)
818 {
819  size_t n = group.getNumberOfSequences();
820  double nn = static_cast<double>(n);
821  double _n = nn / (nn - 1.);
822  map<string, double> values = getUsefulValues_(n);
823  double vDs = getVDstar_(n, values["a1"], values["a2"], values["dn"]);
824  double uDs = getUDstar_(n, values["a1"], vDs);
825  unsigned int etaP = 0;
826  if (useNbSegregatingSites)
827  etaP = numberOfPolymorphicSites(group);
828  else
829  etaP = totalNumberOfMutations(group);
830  if (etaP == 0)
831  throw ZeroDivisionException("eta should not be null");
832  double eta = static_cast<double>(etaP);
833  double etas = static_cast<double>(numberOfSingletons(group));
834 
835  // Fu & Li 1993
836  return ((_n * eta) - (values["a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
837 
838  // Simonsen et al. 1995
839  /*
840  return ((eta / values["a1"]) - (etas * ((n - 1) / n))) / sqrt(uDs * eta + vDs * eta * eta);
841  */
842 }
843 
844 double SequenceStatistics::fuLiF(
845  const PolymorphismSequenceContainer& ingroup,
846  const PolymorphismSequenceContainer& outgroup,
847  bool useNbSingletons,
848  bool useNbSegregatingSites)
849 {
850  size_t n = ingroup.getNumberOfSequences();
851  double nn = static_cast<double>(n);
852  map<string, double> values = getUsefulValues_(n);
853  double pi = tajima83(ingroup, true);
854  double vF = (values["cn"] + values["b2"] - 2. / (nn - 1.)) / (pow(values["a1"], 2) + values["a2"]);
855  double uF = ((1. + values["b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values["a1n"] - (2. * nn) / (nn + 1.))) / values["a1"]) - vF;
856  unsigned int etaP = 0;
857  if (useNbSegregatingSites)
858  etaP = numberOfPolymorphicSites(ingroup);
859  else
860  etaP = totalNumberOfMutations(ingroup);
861  if (etaP == 0)
862  throw ZeroDivisionException("eta should not be null");
863  double eta = static_cast<double>(etaP);
864  double etae = 0.;
865  if (useNbSingletons)
866  etae = static_cast<double>(numberOfSingletons(outgroup));
867  else
868  etae = static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup)); // added by Khalid 13/07/2005
869  return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
870 }
871 
872 double SequenceStatistics::fuLiFStar(
873  const PolymorphismSequenceContainer& group,
874  bool useNbSegregatingSites)
875 {
876  double n = static_cast<double>(group.getNumberOfSequences());
877  map<string, double> values = getUsefulValues_(group.getNumberOfSequences());
878  double pi = tajima83(group, true);
879 
880  // Fu & Li 1993
881  // double vFs = (values["dn"] + values["b2"] - (2. / (nn - 1.)) * (4. * values["a2"] - 6. + 8. / nn)) / (pow(values["a1"], 2) + values["a2"]);
882  // double uFs = (((nn / (nn - 1.)) + values["b1"] - (4. / (nn * (nn - 1.))) + 2. * ((nn + 1.) / (pow((nn - 1.), 2))) * (values["a1n"] - 2. * nn / (nn + 1.))) / values["a1"]) - vFs;
883 
884  // Simonsen et al. 1995
885  double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values["a1"]) / (n * n)) - 8 * values["a2"] / n) / (pow(values["a1"], 2) + values["a2"]);
886  double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values["a1n"]) / (3 * n * (n - 1))) / values["a1"]) - vFs;
887  unsigned int etaP = 0;
888  if (useNbSegregatingSites)
889  etaP = numberOfPolymorphicSites(group);
890  else
891  etaP = totalNumberOfMutations(group);
892  if (etaP == 0)
893  throw ZeroDivisionException("eta should not be null");
894  double eta = static_cast<double>(etaP);
895  double etas = static_cast<double>(numberOfSingletons(group));
896  // Fu & Li 1993
897  // Simonsen et al. 1995
898  return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
899 }
900 
901 double SequenceStatistics::fstHudson92(
903  size_t id1,
904  size_t id2)
905 {
906  vector<double> vdiff;
907  double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
908 
909  auto Pop1 = PolymorphismSequenceContainerTools::extractGroup(psc, id1);
910  auto Pop2 = PolymorphismSequenceContainerTools::extractGroup(psc, id2);
911 
912  piIntra1 = SequenceStatistics::tajima83(*Pop1, true);
913  piIntra2 = SequenceStatistics::tajima83(*Pop2, true);
914 
915  meanPiIntra = (piIntra1 + piIntra2) / 2;
916 
917  double n = 0;
918  for (size_t i = 0; i < Pop1->getNumberOfSequences(); ++i)
919  {
920  const Sequence& s1 = Pop1->sequence(i);
921  for (size_t j = 0; j < Pop2->getNumberOfSequences(); ++j)
922  {
923  n++;
924  const auto& s2 = Pop2->sequence(j);
925  vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2, true, "no gap", true));
926  }
927  }
928  piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.getNumberOfSites());
929 
930  Fst = 1.0 - meanPiIntra / piInter;
931 
932  return Fst;
933 }
934 
935 // ******************************************************************************
936 // Linkage disequilibrium statistics
937 // ******************************************************************************
938 
939 /**********************/
940 /* Preliminary method */
941 /**********************/
942 
943 unique_ptr<PolymorphismSequenceContainer> SequenceStatistics::generateLdContainer(
945  bool keepsingleton,
946  double freqmin)
947 {
948  SiteSelection ss;
949  // Extract polymorphic site with only two alleles
950  for (size_t i = 0; i < psc.getNumberOfSites(); ++i)
951  {
952  if (keepsingleton)
953  {
954  if (SiteTools::isComplete(psc.site(i)) && !SiteTools::isConstant(psc.site(i)) && !SiteTools::isTriplet(psc.site(i)))
955  {
956  ss.push_back(i);
957  }
958  }
959  else
960  {
961  if (SiteTools::isComplete(psc.site(i)) && !SiteTools::isConstant(psc.site(i)) && !SiteTools::isTriplet(psc.site(i)) && !SiteTools::hasSingleton(psc.site(i)))
962  {
963  ss.push_back(i);
964  }
965  }
966  }
967 
968  auto sc = SiteContainerTools::getSelectedSites(psc, ss);
969  auto alpha = psc.getAlphabet();
970  auto ldpsc = make_unique<PolymorphismSequenceContainer>(sc->getNumberOfSequences(), alpha);
971  // Assign 1 to the more frequent and 0 to the less frequent alleles
972  for (size_t i = 0; i < sc->getNumberOfSites(); i++)
973  {
974  const Site& site = sc->site(i);
975  auto siteClone = make_unique<Site>(site);
976  bool deleteSite = false;
977  map<int, double> freqs;
978  SymbolListTools::getFrequencies(*siteClone, freqs);
979  int first = 0;
980  for (const auto& it : freqs)
981  {
982  if (it.second >= 0.5)
983  first = it.first;
984  }
985  for (size_t j = 0; j < sc->getNumberOfSequences(); ++j)
986  {
987  if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first)
988  {
989  if (freqs[site.getValue(j)] <= 1 - freqmin)
990  {
991  siteClone->setElement(j, 1);
992  first = site.getValue(j);
993  }
994  else
995  deleteSite = true;
996  }
997  else
998  {
999  if (freqs[site.getValue(j)] >= freqmin)
1000  siteClone->setElement(j, 0);
1001  else
1002  deleteSite = true;
1003  }
1004  }
1005  if (!deleteSite)
1006  ldpsc->addSite(siteClone);
1007  }
1008  return ldpsc;
1009 }
1010 
1011 /*************************************/
1012 /* Pairwise LD and distance measures */
1013 /*************************************/
1014 
1015 Vdouble SequenceStatistics::pairwiseDistances1(
1016  const PolymorphismSequenceContainer& psc,
1017  bool keepsingleton,
1018  double freqmin)
1019 {
1020  // get Positions with sites of interest
1021  SiteSelection ss;
1022  for (size_t i = 0; i < psc.getNumberOfSites(); ++i)
1023  {
1024  const Site& site = psc.site(i);
1025  if (keepsingleton)
1026  {
1027  if (SiteTools::isComplete(site) &&
1028  !SiteTools::isConstant(site) &&
1029  !SiteTools::isTriplet(site))
1030  {
1031  bool deleteSite = false;
1032  map<int, double> freqs;
1033  SymbolListTools::getFrequencies(site, freqs);
1034  for (int j = 0; j < static_cast<int>(site.alphabet().getSize()); ++j)
1035  {
1036  if (freqs[j] >= 1 - freqmin)
1037  deleteSite = true;
1038  }
1039  if (!deleteSite)
1040  ss.push_back(i);
1041  }
1042  }
1043  else
1044  {
1045  if (SiteTools::isComplete(site) &&
1046  !SiteTools::isConstant(site) &&
1047  !SiteTools::isTriplet(site) &&
1048  !SiteTools::hasSingleton(site))
1049  {
1050  ss.push_back(i);
1051  bool deleteSite = false;
1052  map<int, double> freqs;
1053  SymbolListTools::getFrequencies(site, freqs);
1054  for (int j = 0; j < static_cast<int>(site.alphabet().getSize()); ++j)
1055  {
1056  if (freqs[j] >= 1 - freqmin)
1057  deleteSite = true;
1058  }
1059  if (!deleteSite)
1060  ss.push_back(i);
1061  }
1062  }
1063  }
1064  // compute pairwise distances
1065  if (ss.size() < 2)
1066  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1067  Vdouble dist;
1068  for (size_t i = 0; i < ss.size() - 1; ++i)
1069  {
1070  for (size_t j = i + 1; j < ss.size(); ++j)
1071  {
1072  dist.push_back(static_cast<double>(ss[j] - ss[i]));
1073  }
1074  }
1075  return dist;
1076 }
1077 
1078 Vdouble SequenceStatistics::pairwiseDistances2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1079 {
1080  SiteSelection ss;
1081  for (size_t i = 0; i < psc.getNumberOfSites(); ++i)
1082  {
1083  const Site& site = psc.site(i);
1084  if (keepsingleton)
1085  {
1086  if (SiteTools::isComplete(site) &&
1087  !SiteTools::isConstant(site) &&
1088  !SiteTools::isTriplet(site))
1089  {
1090  bool deleteSite = false;
1091  map<int, double> freqs;
1092  SymbolListTools::getFrequencies(site, freqs);
1093  for (int j = 0; j < static_cast<int>(site.alphabet().getSize()); ++i)
1094  {
1095  if (freqs[j] >= 1 - freqmin)
1096  deleteSite = true;
1097  }
1098  if (!deleteSite)
1099  ss.push_back(i);
1100  }
1101  }
1102  else
1103  {
1104  if (SiteTools::isComplete(site) &&
1105  !SiteTools::isConstant(site) &&
1106  !SiteTools::isTriplet(site) &&
1107  !SiteTools::hasSingleton(site))
1108  {
1109  ss.push_back(i);
1110  bool deleteSite = false;
1111  map<int, double> freqs;
1112  SymbolListTools::getFrequencies(site, freqs);
1113  for (int j = 0; j < static_cast<int>(site.alphabet().getSize()); ++j)
1114  {
1115  if (freqs[j] >= 1 - freqmin)
1116  deleteSite = true;
1117  }
1118  if (!deleteSite)
1119  ss.push_back(i);
1120  }
1121  }
1122  }
1123  size_t n = ss.size();
1124  if (n < 2)
1125  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1126  Vdouble distance(n * (n - 1) / 2, 0);
1127  size_t nbSite = psc.getNumberOfSites();
1128  for (size_t k = 0; k < psc.getNumberOfSequences(); k++)
1129  {
1130  const Sequence& seq = psc.sequence(k);
1131  SiteSelection gap, newss = ss;
1132  Vdouble dist;
1133  for (size_t i = 0; i < nbSite; ++i)
1134  {
1135  if (seq.getValue(i) == -1)
1136  gap.push_back(i);
1137  }
1138  // Site positions are re-numbered to take gaps into account
1139  for (size_t i = 0; i < gap.size(); ++i)
1140  {
1141  for (size_t j = 0; j < ss.size(); ++j)
1142  {
1143  if (ss[j] > gap[i])
1144  newss[j]--;
1145  }
1146  }
1147  for (size_t i = 0; i < n - 1; ++i)
1148  {
1149  for (size_t j = i + 1; j < n; ++j)
1150  {
1151  dist.push_back(static_cast<double>(newss[j] - newss[i]));
1152  }
1153  }
1154  distance += dist;
1155  }
1156  distance = distance / static_cast<double>(psc.getNumberOfSequences());
1157  return distance;
1158 }
1159 
1160 Vdouble SequenceStatistics::pairwiseD(
1161  const PolymorphismSequenceContainer& psc,
1162  bool keepsingleton,
1163  double freqmin)
1164 {
1165  auto newpsc = generateLdContainer(psc, keepsingleton, freqmin);
1166  Vdouble D;
1167  size_t nbsite = newpsc->getNumberOfSites();
1168  size_t nbseq = newpsc->getNumberOfSequences();
1169  if (nbsite < 2)
1170  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1171  if (nbseq < 2)
1172  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1173  for (size_t i = 0; i < nbsite - 1; ++i)
1174  {
1175  for (size_t j = i + 1; j < nbsite; ++j)
1176  {
1177  double haplo = 0;
1178  const Site& site1 = newpsc->site(i);
1179  const Site& site2 = newpsc->site(j);
1180  map<int, double> freq1;
1181  map<int, double> freq2;
1182  SymbolListTools::getFrequencies(site1, freq1);
1183  SymbolListTools::getFrequencies(site2, freq2);
1184  for (size_t k = 0; k < nbseq; ++k)
1185  {
1186  if (site1.getValue(k) + site2.getValue(k) == 2)
1187  haplo++;
1188  }
1189  haplo = haplo / static_cast<double>(nbseq);
1190  D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1191  }
1192  }
1193  return D;
1194 }
1195 
1196 Vdouble SequenceStatistics::pairwiseDprime(
1197  const PolymorphismSequenceContainer& psc,
1198  bool keepsingleton,
1199  double freqmin)
1200 {
1201  auto newpsc = generateLdContainer(psc, keepsingleton, freqmin);
1202  Vdouble Dprime;
1203  size_t nbsite = newpsc->getNumberOfSites();
1204  size_t nbseq = newpsc->getNumberOfSequences();
1205  if (nbsite < 2)
1206  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1207  if (nbseq < 2)
1208  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1209  for (size_t i = 0; i < nbsite - 1; i++)
1210  {
1211  for (size_t j = i + 1; j < nbsite; j++)
1212  {
1213  double haplo = 0;
1214  const Site& site1 = newpsc->site(i);
1215  const Site& site2 = newpsc->site(j);
1216  map<int, double> freq1;
1217  map<int, double> freq2;
1218  SymbolListTools::getFrequencies(site1, freq1);
1219  SymbolListTools::getFrequencies(site2, freq2);
1220  for (size_t k = 0; k < nbseq; ++k)
1221  {
1222  if (site1.getValue(k) + site2.getValue(k) == 2)
1223  haplo++;
1224  }
1225  haplo = haplo / static_cast<double>(nbseq);
1226  double d, D = (haplo - freq1[1] * freq2[1]);
1227  if (D > 0)
1228  {
1229  if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1230  {
1231  d = std::abs(D) / (freq1[1] * freq2[0]);
1232  }
1233  else
1234  {
1235  d = std::abs(D) / (freq1[0] * freq2[1]);
1236  }
1237  }
1238  else
1239  {
1240  if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1241  {
1242  d = std::abs(D) / (freq1[1] * freq2[1]);
1243  }
1244  else
1245  {
1246  d = std::abs(D) / (freq1[0] * freq2[0]);
1247  }
1248  }
1249  Dprime.push_back(d);
1250  }
1251  }
1252  return Dprime;
1253 }
1254 
1255 Vdouble SequenceStatistics::pairwiseR2(
1256  const PolymorphismSequenceContainer& psc,
1257  bool keepsingleton,
1258  double freqmin)
1259 {
1260  auto newpsc = generateLdContainer(psc, keepsingleton, freqmin);
1261  Vdouble R2;
1262  size_t nbsite = newpsc->getNumberOfSites();
1263  size_t nbseq = newpsc->getNumberOfSequences();
1264  if (nbsite < 2)
1265  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1266  if (nbseq < 2)
1267  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1268  for (size_t i = 0; i < nbsite - 1; ++i)
1269  {
1270  for (size_t j = i + 1; j < nbsite; ++j)
1271  {
1272  double haplo = 0;
1273  const Site& site1 = newpsc->site(i);
1274  const Site& site2 = newpsc->site(j);
1275  map<int, double> freq1;
1276  map<int, double> freq2;
1277  SymbolListTools::getFrequencies(site1, freq1);
1278  SymbolListTools::getFrequencies(site2, freq2);
1279  for (size_t k = 0; k < nbseq; ++k)
1280  {
1281  if (site1.getValue(k) + site2.getValue(k) == 2)
1282  haplo++;
1283  }
1284  haplo = haplo / static_cast<double>(nbseq);
1285  double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1286  R2.push_back(r);
1287  }
1288  }
1289  return R2;
1290 }
1291 
1292 /***********************************/
1293 /* Global LD and distance measures */
1294 /***********************************/
1295 
1296 double SequenceStatistics::meanD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1297 {
1298  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1299  return VectorTools::mean<double, double>(D);
1300 }
1301 
1302 double SequenceStatistics::meanDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1303 {
1304  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1305  return VectorTools::mean<double, double>(Dprime);
1306 }
1307 
1308 double SequenceStatistics::meanR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1309 {
1310  Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1311  return VectorTools::mean<double, double>(R2);
1312 }
1313 
1314 double SequenceStatistics::meanDistance1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1315 {
1316  Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1317  return VectorTools::mean<double, double>(dist);
1318 }
1319 
1320 double SequenceStatistics::meanDistance2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1321 {
1322  Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1323  return VectorTools::mean<double, double>(dist);
1324 }
1325 
1326 /**********************/
1327 /* Regression methods */
1328 /**********************/
1329 
1330 double SequenceStatistics::originRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1331 {
1332  Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1333  Vdouble dist;
1334  if (distance1)
1335  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1336  else
1337  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1338  return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1339 }
1340 
1341 double SequenceStatistics::originRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1342 {
1343  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1344  Vdouble dist;
1345  if (distance1)
1346  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1347  else
1348  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1349  return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1350 }
1351 
1352 double SequenceStatistics::originRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1353 {
1354  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1355  Vdouble dist;
1356  if (distance1)
1357  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1358  else
1359  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1360  return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1361 }
1362 
1363 Vdouble SequenceStatistics::linearRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1364 {
1365  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1366  Vdouble dist;
1367  Vdouble reg(2);
1368  if (distance1)
1369  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1370  else
1371  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1372  reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1373  reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1374  return reg;
1375 }
1376 
1377 Vdouble SequenceStatistics::linearRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1378 {
1379  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1380  Vdouble dist;
1381  Vdouble reg(2);
1382  if (distance1)
1383  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1384  else
1385  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1386  reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1387  reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1388  return reg;
1389 }
1390 
1391 Vdouble SequenceStatistics::linearRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1392 {
1393  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1394  Vdouble dist;
1395  Vdouble reg(2);
1396  if (distance1)
1397  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1398  else
1399  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1400  reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1401  reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1402  return reg;
1403 }
1404 
1405 double SequenceStatistics::inverseRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1406 {
1407  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1408  Vdouble unit(R2.size(), 1);
1409  Vdouble R2transformed = unit / R2 - 1;
1410  Vdouble dist;
1411  if (distance1)
1412  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1413  else
1414  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1415  return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1416 }
1417 
1418 /**********************/
1419 /* Hudson method */
1420 /**********************/
1421 
1422 double SequenceStatistics::hudson87(const PolymorphismSequenceContainer& psc, double precision, double cinf, double csup)
1423 {
1424  double left = leftHandHudson_(psc);
1425  size_t n = psc.getNumberOfSequences();
1426  double dif = 1;
1427  double c1 = cinf;
1428  double c2 = csup;
1429  if (SequenceStatistics::numberOfPolymorphicSites(psc) < 2)
1430  return -1;
1431  if (rightHandHudson_(c1, n) < left)
1432  return cinf;
1433  if (rightHandHudson_(c2, n) > left)
1434  return csup;
1435  while (dif > precision)
1436  {
1437  if (rightHandHudson_((c1 + c2) / 2, n) > left)
1438  c1 = (c1 + c2) / 2;
1439  else
1440  c2 = (c1 + c2) / 2;
1441  dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1442  }
1443  return (c1 + c2) / 2;
1444 }
1445 
1446 /*****************/
1447 /* Tests methods */
1448 /*****************/
1449 
1450 void SequenceStatistics::testUsefulValues(std::ostream& s, size_t n)
1451 {
1452  map<string, double> v = getUsefulValues_(n);
1453  double vD = getVD_(n, v["a1"], v["a2"], v["cn"]);
1454  double uD = getUD_(v["a1"], vD);
1455  double vDs = getVDstar_(n, v["a1"], v["a2"], v["dn"]);
1456  double uDs = getUDstar_(n, v["a1"], vDs);
1457 
1458  s << n << "\t";
1459  s << v["a1"] << "\t";
1460  s << v["a2"] << "\t";
1461  s << v["a1n"] << "\t";
1462  s << v["b1"] << "\t";
1463  s << v["b2"] << "\t";
1464  s << v["c1"] << "\t";
1465  s << v["c2"] << "\t";
1466  s << v["cn"] << "\t";
1467  s << v["dn"] << "\t";
1468  s << v["e1"] << "\t";
1469  s << v["e2"] << "\t";
1470  s << uD << "\t";
1471  s << vD << "\t";
1472  s << uDs << "\t";
1473  s << vDs << endl;
1474 }
1475 
1476 // ******************************************************************************
1477 // Private methods
1478 // ******************************************************************************
1479 
1480 unsigned int SequenceStatistics::getNumberOfMutations_(const Site& site)
1481 {
1482  // jdutheil 27/06/15: does not work if gaps and unknown!!!
1483  unsigned int tmp_count = 0;
1484  map<int, size_t> states_count;
1485  SymbolListTools::getCounts(site, states_count);
1486 
1487  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1488  {
1489  if (it->first >= 0)
1490  tmp_count++;
1491  }
1492  if (tmp_count > 0)
1493  tmp_count--;
1494  return tmp_count;
1495 }
1496 
1497 unsigned int SequenceStatistics::getNumberOfSingletons_(const Site& site)
1498 {
1499  unsigned int nus = 0;
1500  map<int, size_t> states_count;
1501  SymbolListTools::getCounts(site, states_count);
1502  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1503  {
1504  if (it->second == 1)
1505  nus++;
1506  }
1507  return nus;
1508 }
1509 
1510 unsigned int SequenceStatistics::getNumberOfDerivedSingletons_(const Site& site_in, const Site& site_out)
1511 {
1512  unsigned int nus = 0;
1513  map<int, size_t> states_count;
1514  map<int, size_t> outgroup_states_count;
1515  SymbolListTools::getCounts(site_in, states_count);
1516  SymbolListTools::getCounts(site_out, outgroup_states_count);
1517  // if there is more than one variant in the outgroup we will not be able to recover the ancestral state
1518  if (outgroup_states_count.size() == 1)
1519  {
1520  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1521  {
1522  if (it->second == 1)
1523  {
1524  if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1525  nus++;
1526  }
1527  }
1528  }
1529  return nus;
1530 }
1531 
1532 std::map<std::string, double> SequenceStatistics::getUsefulValues_(size_t n)
1533 {
1534  double nn = static_cast<double>(n);
1535  map<string, double> values;
1536  values["a1"] = 0.;
1537  values["a2"] = 0.;
1538  values["a1n"] = 0.;
1539  values["b1"] = 0.;
1540  values["b2"] = 0.;
1541  values["c1"] = 0.;
1542  values["c2"] = 0.;
1543  values["cn"] = 0.;
1544  values["dn"] = 0.;
1545  values["e1"] = 0.;
1546  values["e2"] = 0.;
1547  if (n > 1)
1548  {
1549  for (double i = 1; i < nn; i++)
1550  {
1551  values["a1"] += 1. / i;
1552  values["a2"] += 1. / (i * i);
1553  }
1554  values["a1n"] = values["a1"] + (1. / nn);
1555  values["b1"] = (nn + 1.) / (3. * (nn - 1.));
1556  values["b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1557  values["c1"] = values["b1"] - (1. / values["a1"]);
1558  values["c2"] = values["b2"] - ((nn + 2.) / (values["a1"] * nn)) + (values["a2"] / (values["a1"] * values["a1"]));
1559  if (n == 2)
1560  {
1561  values["cn"] = 1.;
1562  values["dn"] = 2.;
1563  }
1564  else
1565  {
1566  values["cn"] = 2. * ((nn * values["a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1567  values["dn"] =
1568  values["cn"]
1569  + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1570  + (2. / (nn - 1.))
1571  * ((3. / 2.) - (((2. * values["a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1572  }
1573  values["e1"] = values["c1"] / values["a1"];
1574  values["e2"] = values["c2"] / ((values["a1"] * values["a1"]) + values["a2"]);
1575  }
1576  return values;
1577 }
1578 
1579 double SequenceStatistics::getVD_(size_t n, double a1, double a2, double cn)
1580 {
1581  double nn = static_cast<double>(n);
1582  if (n < 3)
1583  return 0.;
1584  double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1585  return vD;
1586 }
1587 
1588 double SequenceStatistics::getUD_(double a1, double vD)
1589 {
1590  return a1 - 1. - vD;
1591 }
1592 
1593 double SequenceStatistics::getVDstar_(size_t n, double a1, double a2, double dn)
1594 {
1595  double denom = (a1 * a1) + a2;
1596  if (n < 3 || denom == 0.)
1597  return 0.;
1598  double nn = static_cast<double>(n);
1599  double nnn = nn / (nn - 1.);
1600  // Fu & Li 1993
1601  double vDs = (
1602  (nnn * nnn * a2)
1603  + (a1 * a1 * dn)
1604  - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1605  )
1606  /
1607  denom;
1608  // Simonsen et al. 1995
1609  /*
1610  double vDs = (
1611  (values["a2"] / pow(values["a1"], 2))
1612  - (2./nn) * (1. + 1./values["a1"] - values["a1"] + values["a1"]/nn)
1613  - 1./(nn*nn)
1614  )
1615  /
1616  (pow(values["a1"], 2) + values["a2"]);
1617  */
1618  return vDs;
1619 }
1620 
1621 double SequenceStatistics::getUDstar_(size_t n, double a1, double vDs)
1622 {
1623  if (n < 3)
1624  return 0.;
1625  double nn = static_cast<double>(n);
1626  double nnn = nn / (nn - 1.);
1627  // Fu & Li 1993
1628  double uDs = (nnn * (a1 - nnn)) - vDs;
1629  // Simonsen et al. 1995
1630  /*
1631  double uDs = (((nn - 1.)/nn - 1./values["a1"]) / values["a1"]) - vDs;
1632  */
1633  return uDs;
1634 }
1635 
1636 double SequenceStatistics::leftHandHudson_(const PolymorphismSequenceContainer& psc)
1637 {
1638  auto newpsc = PolymorphismSequenceContainerTools::getCompleteSites(psc);
1639  size_t nbseq = newpsc->getNumberOfSequences();
1640  double S1 = 0;
1641  double S2 = 0;
1642  for (size_t i = 0; i < nbseq - 1; ++i)
1643  {
1644  for (size_t j = i + 1; j < nbseq; ++j)
1645  {
1646  SequenceSelection ss(2);
1647  ss[0] = i;
1648  ss[1] = j;
1649  auto psc2 = PolymorphismSequenceContainerTools::getSelectedSequences(*newpsc, ss);
1650  S1 += SequenceStatistics::watterson75(*psc2, true);
1651  S2 += SequenceStatistics::watterson75(*psc2, true) * SequenceStatistics::watterson75(*psc2, true);
1652  }
1653  }
1654  double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1655  double H = SequenceStatistics::heterozygosity(*newpsc);
1656  double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1657  return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) / static_cast<double>(nbseq - 1), 2.);
1658 }
1659 
1660 double SequenceStatistics::rightHandHudson_(double c, size_t n)
1661 {
1662  double nn = static_cast<double>(n);
1663  return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
1664 }
const Alphabet & alphabet() const override
std::shared_ptr< const Alphabet > getAlphabet() const override
const int & getValue(size_t pos) const override
virtual unsigned int getSize() const=0
const SiteType & nextSite() override
virtual size_t size() const=0
virtual const Alphabet & alphabet() const=0
The PolymorphismSequenceContainer class.
void setAsOutgroupMember(size_t index)
Set a sequence as outgroup member by index.
void addSequence(const std::string &sequenceKey, std::unique_ptr< Sequence > &sequence) override
std::string getChar(size_t pos) const override
const SequenceType & sequence(const std::string &sequenceKey) const override
const SiteType & site(size_t sitePosition) const override
size_t getNumberOfSequences() const override
size_t getNumberOfSites() const override
std::size_t count(const std::string &s, const std::string &pattern)
std::vector< size_t > SiteSelection
std::vector< double > Vdouble
std::vector< size_t > SequenceSelection
CompleteTemplateSiteContainerIterator< Site, Sequence, std::string > CompleteSiteContainerIterator
NoGapTemplateSiteContainerIterator< Site, Sequence, std::string > NoGapSiteContainerIterator