bpp-popgen  3.0.0
SequenceStatistics.cpp
Go to the documentation of this file.
1 //
2 // File SequenceStatistics.cpp
3 // Authors: Eric Bazin
4 // Sylvain Gailard
5 // Khalid Belkhir
6 // Benoit Nabholz
7 // Created on: Wed Aug 04 2004
8 //
9 
10 /*
11  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
12 
13  This software is a computer program whose purpose is to provide classes
14  for population genetics analysis.
15 
16  This software is governed by the CeCILL license under French law and
17  abiding by the rules of distribution of free software. You can use,
18  modify and/ or redistribute the software under the terms of the CeCILL
19  license as circulated by CEA, CNRS and INRIA at the following URL
20  "http://www.cecill.info".
21 
22  As a counterpart to the access to the source code and rights to copy,
23  modify and redistribute granted by the license, users are provided only
24  with a limited warranty and the software's author, the holder of the
25  economic rights, and the successive licensors have only limited
26  liability.
27 
28  In this respect, the user's attention is drawn to the risks associated
29  with loading, using, modifying and/or developing or reproducing the
30  software by the user in light of its specific status of free software,
31  that may mean that it is complicated to manipulate, and that also
32  therefore means that it is reserved for developers and experienced
33  professionals having in-depth computer knowledge. Users are therefore
34  encouraged to load and test the software's suitability as regards their
35  requirements in conditions enabling the security of their systems and/or
36  data to be ensured and, more generally, to use and operate it in the
37  same conditions as regards security.
38 
39  The fact that you are presently reading this means that you have had
40  knowledge of the CeCILL license and that you accept its terms.
41  */
42 
43 #include "SequenceStatistics.h" // class's header file
46 
47 // From the STL:
48 #include <ctype.h>
49 #include <cmath>
50 #include <iostream>
51 #include <vector>
52 
53 using namespace std;
54 
55 // From SeqLib:
56 #include <Bpp/Seq/Site.h>
57 #include <Bpp/Seq/SiteTools.h>
59 #include <Bpp/Seq/CodonSiteTools.h>
60 #include <Bpp/Seq/Alphabet/DNA.h>
62 
65 
66 using namespace bpp;
67 
68 // ******************************************************************************
69 // Basic statistics
70 // ******************************************************************************
71 
72 unsigned int SequenceStatistics::numberOfPolymorphicSites(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
73 {
74  unsigned int s = 0;
75  const Site* site = 0;
76  unique_ptr<ConstSiteIterator> si;
77  if (gapflag)
78  si.reset(new CompleteSiteContainerIterator(psc));
79  else
80  si.reset(new SimpleSiteContainerIterator(psc));
81  while (si->hasMoreSites())
82  {
83  site = si->nextSite();
84  if (!SiteTools::isConstant(*site, ignoreUnknown))
85  {
86  s++;
87  }
88  }
89  return s;
90 }
91 
92 double SequenceStatistics::frequencyOfPolymorphicSites(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
93 {
94  double s = 0;
95  double n = 0;
96  const Site* site = 0;
97  unique_ptr<ConstSiteIterator> si;
98  if (gapflag)
99  si.reset(new CompleteSiteContainerIterator(psc));
100  else
101  si.reset(new SimpleSiteContainerIterator(psc));
102  while (si->hasMoreSites())
103  {
104  site = si->nextSite();
105  n++;
106  if (!SiteTools::isConstant(*site, ignoreUnknown))
107  {
108  s++;
109  }
110  }
111  return s / n;
112 }
113 
114 unsigned int SequenceStatistics::numberOfParsimonyInformativeSites(const PolymorphismSequenceContainer& psc, bool gapflag)
115 {
116  unique_ptr<ConstSiteIterator> si;
117  if (gapflag)
118  si.reset(new CompleteSiteContainerIterator(psc));
119  else
120  si.reset(new SimpleSiteContainerIterator(psc));
121  unsigned int s = 0;
122  const Site* site = 0;
123  while (si->hasMoreSites())
124  {
125  site = si->nextSite();
126  if (SiteTools::isParsimonyInformativeSite(*site))
127  {
128  s++;
129  }
130  }
131  return s;
132 }
133 
134 unsigned int SequenceStatistics::numberOfSingletons(const PolymorphismSequenceContainer& psc, bool gapflag)
135 {
136  unique_ptr<ConstSiteIterator> si;
137  if (gapflag)
138  si.reset(new CompleteSiteContainerIterator(psc));
139  else
140  si.reset(new SimpleSiteContainerIterator(psc));
141  unsigned int nus = 0;
142  const Site* site = 0;
143  while (si->hasMoreSites())
144  {
145  site = si->nextSite();
146  nus += getNumberOfSingletons_(*site);
147  }
148  return nus;
149 }
150 
151 unsigned int SequenceStatistics::numberOfTriplets(const PolymorphismSequenceContainer& psc, bool gapflag)
152 {
153  unique_ptr<ConstSiteIterator> si;
154  if (gapflag)
155  si.reset(new CompleteSiteContainerIterator(psc));
156  else
157  si.reset(new SimpleSiteContainerIterator(psc));
158  unsigned int s = 0;
159  const Site* site = 0;
160  while (si->hasMoreSites())
161  {
162  site = si->nextSite();
163  if (SiteTools::isTriplet(*site))
164  {
165  s++;
166  }
167  }
168  return s;
169 }
170 
171 unsigned int SequenceStatistics::totalNumberOfMutations(const PolymorphismSequenceContainer& psc, bool gapflag)
172 {
173  unique_ptr<ConstSiteIterator> si;
174  if (gapflag)
175  si.reset(new CompleteSiteContainerIterator(psc));
176  else
177  si.reset(new SimpleSiteContainerIterator(psc));
178  unsigned int tnm = 0;
179  const Site* site = 0;
180  while (si->hasMoreSites())
181  {
182  site = si->nextSite();
183  tnm += getNumberOfMutations_(*site);
184  }
185  return tnm;
186 }
187 
188 unsigned int SequenceStatistics::totalNumberOfMutationsOnExternalBranches(
190  const PolymorphismSequenceContainer& outg)
191 {
192  if (ing.getNumberOfSites() != outg.getNumberOfSites())
193  throw Exception("ing and outg must have the same size");
194  unsigned int nmuts = 0;
195  const Site* site_in = 0;
196  const Site* site_out = 0;
197  unique_ptr<ConstSiteIterator> si(new SimpleSiteContainerIterator(ing));
198  unique_ptr<ConstSiteIterator> so(new SimpleSiteContainerIterator(outg));
199  while (si->hasMoreSites())
200  {
201  site_in = si->nextSite();
202  site_out = so->nextSite();
203  // use fully resolved sites
204  if (SiteTools::isComplete(*site_in) && SiteTools::isComplete(*site_out))
205  nmuts += getNumberOfDerivedSingletons_(*site_in, *site_out); // singletons that are not in outgroup
206  }
207  return nmuts;
208 }
209 
210 double SequenceStatistics::heterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
211 {
212  unique_ptr<ConstSiteIterator> si;
213  if (gapflag)
214  si.reset(new CompleteSiteContainerIterator(psc));
215  else
216  si.reset(new SimpleSiteContainerIterator(psc));
217  const Site* site = 0;
218  double s = 0;
219  while (si->hasMoreSites())
220  {
221  site = si->nextSite();
222  s += SiteTools::heterozygosity(*site);
223  }
224  return s;
225 }
226 
227 double SequenceStatistics::squaredHeterozygosity(const PolymorphismSequenceContainer& psc, bool gapflag)
228 {
229  unique_ptr<ConstSiteIterator> si;
230  if (gapflag)
231  si.reset(new CompleteSiteContainerIterator(psc));
232  else
233  si.reset(new SimpleSiteContainerIterator(psc));
234  const Site* site = 0;
235  double s = 0;
236  while (si->hasMoreSites())
237  {
238  site = si->nextSite();
239  double h = SiteTools::heterozygosity(*site);
240  s += h * h;
241  }
242  return s;
243 }
244 
245 // ******************************************************************************
246 // GC statistics
247 // ******************************************************************************
248 
249 double SequenceStatistics::gcContent(const PolymorphismSequenceContainer& psc)
250 {
251  map<int, double> freqs;
252  SequenceContainerTools::getFrequencies(psc, freqs);
253  const Alphabet* alpha = psc.getAlphabet();
254  return (freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")]) / (freqs[alpha->charToInt("A")] + freqs[alpha->charToInt("C")] + freqs[alpha->charToInt("G")] + freqs[alpha->charToInt("T")]);
255 }
256 
257 std::vector<unsigned int> SequenceStatistics::gcPolymorphism(const PolymorphismSequenceContainer& psc, bool gapflag)
258 {
259  unsigned int nbMut = 0;
260  unsigned int nbGC = 0;
261  size_t nbSeq = psc.getNumberOfSequences();
262  vector<unsigned int> vect(2);
263  const Site* site = 0;
264  unique_ptr<ConstSiteIterator> si;
265  if (gapflag)
266  si.reset(new CompleteSiteContainerIterator(psc));
267  else
268  si.reset(new NoGapSiteContainerIterator(psc));
269  while (si->hasMoreSites())
270  {
271  site = si->nextSite();
272  if (!SiteTools::isConstant(*site))
273  {
274  long double freqGC = SymbolListTools::getGCContent(*site);
275  /*
276  * Sylvain Gaillard 15/03/2010: realy unclear ...
277  * freqGC is always in [0,1] then why testing it ?
278  * why casting double into size_t ?
279  * is that method used by someone ?
280  */
281  if (freqGC > 0 && freqGC < 1)
282  {
283  nbMut += static_cast<unsigned int>(nbSeq);
284  long double adGC = freqGC * nbSeq;
285  nbGC += static_cast<unsigned int>(adGC);
286  }
287  }
288  }
289  vect[0] = nbMut;
290  vect[1] = nbGC;
291  return vect;
292 }
293 
294 // ******************************************************************************
295 // Diversity statistics
296 // ******************************************************************************
297 
298 double SequenceStatistics::watterson75(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown, bool scaled)
299 {
300  double ThetaW;
301  size_t n = psc.getNumberOfSequences();
302  map<string, double> values = getUsefulValues_(n);
303  double s = 0;
304  if (scaled)
305  s = frequencyOfPolymorphicSites(psc, gapflag, ignoreUnknown);
306  else
307  s = static_cast<double>(numberOfPolymorphicSites(psc, gapflag, ignoreUnknown));
308  ThetaW = s / values["a1"];
309  return ThetaW;
310 }
311 
312 double SequenceStatistics::tajima83(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown, bool scaled)
313 {
314  size_t alphabet_size = psc.getAlphabet()->getSize();
315  const Site* site = 0;
316  unique_ptr<ConstSiteIterator> si;
317  double value2 = 0.;
318  double l = 0;
319  if (gapflag)
320  si.reset(new CompleteSiteContainerIterator(psc));
321  else
322  si.reset(new SimpleSiteContainerIterator(psc));
323  while (si->hasMoreSites())
324  {
325  site = si->nextSite();
326  l++;
327  if (!SiteTools::isConstant(*site, ignoreUnknown))
328  {
329  double value = 0.;
330  map<int, size_t> count;
331  SymbolListTools::getCounts(*site, count);
332  map<int, size_t> tmp_k;
333  size_t tmp_n = 0;
334  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
335  {
336  if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
337  {
338  tmp_k[it->first] = it->second * (it->second - 1);
339  tmp_n += it->second;
340  }
341  }
342  if (tmp_n == 0 || tmp_n == 1)
343  continue;
344  for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
345  {
346  value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
347  }
348  value2 += 1. - value;
349  }
350  }
351  return (scaled ? value2 / l : value2);
352 }
353 
354 double SequenceStatistics::fayWu2000(const PolymorphismSequenceContainer& psc, const Sequence& ancestralSites)
355 {
356  if (psc.getNumberOfSites() != ancestralSites.size())
357  throw Exception("SequenceStatistics::FayWu2000: ancestralSites and psc don't have the same size!!!'" );
358 
359  const Sequence& tmps = psc.getSequence(0);
360 
361  size_t alphabet_size = (psc.getAlphabet())->getSize();
362  double value = 0.;
363  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
364  {
365  const Site& site = psc.getSite(i);
366  string ancB = ancestralSites.getChar(i);
367  int ancV = ancestralSites.getValue(i);
368 
369  if (!SiteTools::isConstant(site) || tmps.getChar(i) != ancB)
370  {
371  if (ancV < 0)
372  continue;
373 
374  map<int, size_t> count;
375  SymbolListTools::getCounts(site, count);
376  map<int, size_t> tmp_k;
377  size_t tmp_n = 0;
378  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
379  {
380  if (it->first >= 0 && it->first < static_cast<int>(alphabet_size))
381  {
382  /* if derived allele */
383  if (it->first != ancV)
384  {
385  tmp_k[it->first] = 2 * it->second * it->second;
386  }
387  tmp_n += it->second;
388  }
389  }
390  if (tmp_n == 0 || tmp_n == 1)
391  continue;
392  for (map<int, size_t>::iterator it = tmp_k.begin(); it != tmp_k.end(); it++)
393  {
394  value += static_cast<double>(it->second) / static_cast<double>(tmp_n * (tmp_n - 1));
395  }
396  }
397  }
398  return value;
399 }
400 
401 unsigned int SequenceStatistics::dvk(const PolymorphismSequenceContainer& psc, bool gapflag)
402 {
403  /*
404  * Sylvain Gaillard 17/03/2010:
405  * This implementation uses unneeded SequenceContainer recopy and works on
406  * string. It needs to be improved.
407  */
408  unique_ptr<PolymorphismSequenceContainer> sc;
409  if (gapflag)
410  sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
411  else
412  sc.reset(new PolymorphismSequenceContainer(psc));
413  // int K = 0;
414  vector<string> pscvector;
415  pscvector.push_back(sc->toString(0));
416  // K++;
417  for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
418  {
419  bool uniq = true;
420  string query = sc->toString(i);
421  for (vector<string>::iterator it = pscvector.begin(); it != pscvector.end(); it++)
422  {
423  if (query.compare(*it) == 0)
424  {
425  uniq = false;
426  break;
427  }
428  }
429  if (uniq)
430  {
431  // K++;
432  pscvector.push_back(query);
433  }
434  }
435  // return K;
436  return static_cast<unsigned int>(pscvector.size());
437 }
438 
439 double SequenceStatistics::dvh(const PolymorphismSequenceContainer& psc, bool gapflag)
440 {
441  /*
442  * Sylvain Gaillard 17/03/2010:
443  * This implementation uses unneeded SequenceContainer recopy and works on
444  * string. It needs to be improved.
445  */
446  unique_ptr<PolymorphismSequenceContainer> sc;
447  if (gapflag)
448  sc.reset(PolymorphismSequenceContainerTools::getSitesWithoutGaps(psc));
449  else
450  sc.reset(new PolymorphismSequenceContainer(psc));
451  double H = 0.;
452  size_t nbSeq;
453  vector<string> pscvector;
454  vector<size_t> effvector;
455  pscvector.push_back(sc->toString(0));
456  effvector.push_back(sc->getSequenceCount(0));
457  nbSeq = sc->getSequenceCount(0);
458  for (size_t i = 1; i < sc->getNumberOfSequences(); i++)
459  {
460  nbSeq += sc->getSequenceCount(i);
461  bool uniq = true;
462  string query = sc->toString(i);
463  for (size_t j = 0; j < pscvector.size(); j++)
464  {
465  if (query.compare(pscvector[j]) == 0)
466  {
467  effvector[j] += sc->getSequenceCount(i);
468  uniq = false;
469  break;
470  }
471  }
472  if (uniq)
473  {
474  pscvector.push_back(query);
475  effvector.push_back(sc->getSequenceCount(i));
476  }
477  }
478  for (size_t i = 0; i < effvector.size(); i++)
479  {
480  H -= (static_cast<double>(effvector[i]) / static_cast<double>(nbSeq)) * ( static_cast<double>(effvector[i]) / static_cast<double>(nbSeq));
481  }
482  H += 1.;
483  return H;
484 }
485 
486 unsigned int SequenceStatistics::numberOfTransitions(const PolymorphismSequenceContainer& psc)
487 {
488  unsigned int nbT = 0;
489  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
490  const Site* site = 0;
491  while (si->hasMoreSites())
492  {
493  site = si->nextSite();
494  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
495  if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
496  continue;
497  int state1 = (*site)[0];
498  int state2 = (*site)[0];
499  for (size_t i = 1; i < site->size(); i++)
500  {
501  if (state1 != (*site)[i])
502  {
503  state2 = (*site)[i];
504  break;
505  }
506  }
507  if (((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
508  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1)))
509  {
510  nbT++;
511  }
512  }
513  return nbT;
514 }
515 
516 unsigned int SequenceStatistics::numberOfTransversions(const PolymorphismSequenceContainer& psc)
517 {
518  unsigned int nbTv = 0;
519  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
520  const Site* site = 0;
521  while (si->hasMoreSites())
522  {
523  site = si->nextSite();
524  // if (SiteTools::isConstant(*site) || SiteTools::isTriplet(*site)) continue;
525  if (SiteTools::getNumberOfDistinctCharacters(*site) != 2)
526  continue;
527  int state1 = (*site)[0];
528  int state2 = (*site)[0];
529  for (size_t i = 1; i < site->size(); i++)
530  {
531  if (state1 != (*site)[i])
532  {
533  state2 = (*site)[i];
534  break;
535  }
536  }
537  if (!(((state1 == 0 && state2 == 2) || (state1 == 2 && state2 == 0)) ||
538  ((state1 == 1 && state2 == 3) || (state1 == 3 && state2 == 1))))
539  {
540  nbTv++;
541  }
542  }
543  return nbTv;
544 }
545 
546 double SequenceStatistics::ratioOfTransitionsTransversions(const PolymorphismSequenceContainer& psc)
547 {
548  // return (double) getNumberOfTransitions(psc)/getNumberOfTransversions(psc);
549  double nbTs = 0;
550  double nbTv = 0;
551  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
552  const Site* site = 0;
553  vector<int> state(2);
554  while (si->hasMoreSites())
555  {
556  map<int, size_t> count;
557  site = si->nextSite();
558  SymbolListTools::getCounts(*site, count);
559  if (count.size() != 2)
560  continue;
561  size_t i = 0;
562  for (map<int, size_t>::iterator it = count.begin(); it != count.end(); it++)
563  {
564  state[i] = it->first;
565  i++;
566  }
567  if (((state[0] == 0 && state[1] == 2) || (state[0] == 2 && state[1] == 0)) ||
568  ((state[0] == 1 && state[1] == 3) || (state[0] == 3 && state[1] == 1)))
569  {
570  nbTs++; // transitions
571  }
572  else
573  {
574  nbTv++; // transversion
575  }
576  }
577  if (nbTv == 0)
578  throw ZeroDivisionException("SequenceStatistics::getTransitionsTransversionsRatio.");
579  return nbTs / nbTv;
580 }
581 
582 // ******************************************************************************
583 // Synonymous and non-synonymous polymorphism
584 // ******************************************************************************
585 
586 unsigned int SequenceStatistics::numberOfSitesWithStopCodon(const PolymorphismSequenceContainer& psc, const GeneticCode& gCode, bool gapflag)
587 {
588  /*
589  * Sylvain Gaillard 17/03/2010
590  * What if the Alphabet is not a codon alphabet?
591  */
592  if (!AlphabetTools::isCodonAlphabet(psc.getAlphabet()))
593  throw AlphabetMismatchException("SequenceStatistics::stopCodonSiteNumber(). PolymorphismSequenceContainer must be with a codon alphabet.", psc.getAlphabet());
594 
595  unique_ptr<ConstSiteIterator> si;
596  if (gapflag)
597  si.reset(new NoGapSiteContainerIterator(psc));
598  else
599  si.reset(new SimpleSiteContainerIterator(psc));
600  unsigned int s = 0;
601  const Site* site = 0;
602  while (si->hasMoreSites())
603  {
604  site = si->nextSite();
605  if (CodonSiteTools::hasStop(*site, gCode))
606  s++;
607  }
608  return s;
609 }
610 
611 unsigned int SequenceStatistics::numberOfMonoSitePolymorphicCodons(const PolymorphismSequenceContainer& psc, bool stopflag, bool gapflag)
612 {
613  unique_ptr<ConstSiteIterator> si;
614  if (stopflag)
615  si.reset(new CompleteSiteContainerIterator(psc));
616  else
617  {
618  if (gapflag)
619  si.reset(new NoGapSiteContainerIterator(psc));
620  else
621  si.reset(new SimpleSiteContainerIterator(psc));
622  }
623  unsigned int s = 0;
624  const Site* site;
625  while (si->hasMoreSites())
626  {
627  site = si->nextSite();
628  if (CodonSiteTools::isMonoSitePolymorphic(*site))
629  s++;
630  }
631  return s;
632 }
633 
634 unsigned int SequenceStatistics::numberOfSynonymousPolymorphicCodons(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
635 {
636  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
637  unsigned int s = 0;
638  const Site* site;
639  while (si->hasMoreSites())
640  {
641  site = si->nextSite();
642  if (CodonSiteTools::isSynonymousPolymorphic(*site, gc))
643  s++;
644  }
645  return s;
646 }
647 
648 double SequenceStatistics::watterson75Synonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
649 {
650  double ThetaW = 0.;
651  size_t n = psc.getNumberOfSequences();
652  unsigned int s = numberOfSynonymousSubstitutions(psc, gc);
653  map<string, double> values = getUsefulValues_(n);
654  ThetaW = static_cast<double>(s) / values["a1"];
655  return ThetaW;
656 }
657 
658 double SequenceStatistics::watterson75NonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc)
659 {
660  double ThetaW;
661  size_t n = psc.getNumberOfSequences();
662  unsigned int s = numberOfNonSynonymousSubstitutions(psc, gc);
663  map<string, double> values = getUsefulValues_(n);
664  ThetaW = static_cast<double>(s) / values["a1"];
665  return ThetaW;
666 }
667 
668 double SequenceStatistics::piSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
669 {
670  double S = 0.;
672  const Site* site = 0;
673  while (si->hasMoreSites())
674  {
675  site = si->nextSite();
676  S += CodonSiteTools::piSynonymous(*site, gc, minchange);
677  }
678  delete si;
679  return S;
680 }
681 
682 double SequenceStatistics::piNonSynonymous(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, bool minchange)
683 {
684  double S = 0.;
686  const Site* site = 0;
687  while (si->hasMoreSites())
688  {
689  site = si->nextSite();
690  S += CodonSiteTools::piNonSynonymous(*site, gc, minchange);
691  }
692  delete si;
693  return S;
694 }
695 
696 double SequenceStatistics::meanNumberOfSynonymousSites(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
697 {
698  double S = 0.;
700  const Site* site = 0;
701  while (si->hasMoreSites())
702  {
703  site = si->nextSite();
704  S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
705  }
706  delete si;
707  return S;
708 }
709 
710 double SequenceStatistics::meanNumberOfNonSynonymousSites(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double ratio)
711 {
712  double S = 0.;
713  int n = 0;
715  const Site* site = 0;
716  while (si->hasMoreSites())
717  {
718  site = si->nextSite();
719  n = n + 3;
720  S += CodonSiteTools::meanNumberOfSynonymousPositions(*site, gc, ratio);
721  }
722  delete si;
723  return static_cast<double>(n - S);
724 }
725 
726 unsigned int SequenceStatistics::numberOfSynonymousSubstitutions(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
727 {
728  size_t st = 0, sns = 0;
729  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
730  const Site* site = 0;
731  while (si->hasMoreSites())
732  {
733  site = si->nextSite();
734  st += CodonSiteTools::numberOfSubstitutions(*site, gc, freqmin);
735  sns += CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin);
736  }
737  return static_cast<unsigned int>(st - sns);
738 }
739 
740 unsigned int SequenceStatistics::numberOfNonSynonymousSubstitutions(const PolymorphismSequenceContainer& psc, const GeneticCode& gc, double freqmin)
741 {
742  unsigned int sns = 0;
743  unique_ptr<ConstSiteIterator> si(new CompleteSiteContainerIterator(psc));
744  const Site* site = 0;
745  while (si->hasMoreSites())
746  {
747  site = si->nextSite();
748  sns += static_cast<unsigned int>(CodonSiteTools::numberOfNonSynonymousSubstitutions(*site, gc, freqmin));
749  }
750  return sns;
751 }
752 
753 vector<unsigned int> SequenceStatistics::fixedDifferences(const PolymorphismSequenceContainer& pscin, const PolymorphismSequenceContainer& pscout, PolymorphismSequenceContainer& psccons, const GeneticCode& gc)
754 {
755  unique_ptr<ConstSiteIterator> siIn(new CompleteSiteContainerIterator(pscin));
756  unique_ptr<ConstSiteIterator> siOut(new CompleteSiteContainerIterator(pscout));
757  unique_ptr<ConstSiteIterator> siCons(new CompleteSiteContainerIterator(psccons));
758  const Site* siteIn = 0;
759  const Site* siteOut = 0;
760  const Site* siteCons = 0;
761  size_t NfixS = 0;
762  size_t NfixA = 0;
763  while (siIn->hasMoreSites())
764  {
765  siteIn = siIn->nextSite();
766  siteOut = siOut->nextSite();
767  siteCons = siCons->nextSite();
768  vector<size_t> v = CodonSiteTools::fixedDifferences(*siteIn, *siteOut, siteCons->getValue(0), siteCons->getValue(1), gc);
769  NfixS += v[0];
770  NfixA += v[1];
771  }
772  vector<unsigned int> v(2);
773  v[0] = static_cast<unsigned int>(NfixS);
774  v[1] = static_cast<unsigned int>(NfixA);
775  return v;
776 }
777 
778 vector<unsigned int> SequenceStatistics::mkTable(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
779 {
780  PolymorphismSequenceContainer psctot(ingroup);
781  for (size_t i = 0; i < outgroup.getNumberOfSequences(); i++)
782  {
783  psctot.addSequence(outgroup.getSequence(i));
784  psctot.setAsOutgroupMember(i + ingroup.getNumberOfSequences());
785  }
786  unique_ptr<const PolymorphismSequenceContainer> psccomplet(PolymorphismSequenceContainerTools::getCompleteSites(psctot));
787  unique_ptr<const PolymorphismSequenceContainer> pscin (PolymorphismSequenceContainerTools::extractIngroup(*psccomplet));
788  unique_ptr<const PolymorphismSequenceContainer> pscout (PolymorphismSequenceContainerTools::extractOutgroup(*psccomplet));
789  unique_ptr<const Sequence> consensusIn (SiteContainerTools::getConsensus(*pscin, "consensusIn"));
790  unique_ptr<const Sequence> consensusOut(SiteContainerTools::getConsensus(*pscout, "consensusOut"));
791  unique_ptr<PolymorphismSequenceContainer> consensus(new PolymorphismSequenceContainer(ingroup.getAlphabet()));
792  consensus->addSequence(*consensusIn);
793  consensus->addSequence(*consensusOut);
794  vector<unsigned int> u = SequenceStatistics::fixedDifferences(*pscin, *pscout, *consensus, gc);
795  vector<unsigned int> v(4);
796  v[0] = SequenceStatistics::numberOfNonSynonymousSubstitutions(*pscin, gc, freqmin);
797  v[1] = SequenceStatistics::numberOfSynonymousSubstitutions(*pscin, gc, freqmin);
798  v[2] = u[1];
799  v[3] = u[0];
800  return v;
801 }
802 
803 double SequenceStatistics::neutralityIndex(const PolymorphismSequenceContainer& ingroup, const PolymorphismSequenceContainer& outgroup, const GeneticCode& gc, double freqmin)
804 {
805  vector<unsigned int> v = SequenceStatistics::mkTable(ingroup, outgroup, gc, freqmin);
806  if (v[1] != 0 && v[2] != 0)
807  return static_cast<double>(v[0] * v[3]) / static_cast<double>(v[1] * v[2]);
808  else
809  return -1;
810 }
811 
812 // ******************************************************************************
813 // Statistical tests
814 // ******************************************************************************
815 
816 double SequenceStatistics::tajimaDss(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
817 {
818  unsigned int Sp = numberOfPolymorphicSites(psc, gapflag, ignoreUnknown);
819  if (Sp == 0)
820  throw ZeroDivisionException("SequenceStatistics::tajimaDss. S should not be 0.");
821  double S = static_cast<double>(Sp);
822  double tajima = tajima83(psc, gapflag, ignoreUnknown);
823  double watterson = watterson75(psc, gapflag, ignoreUnknown);
824  size_t n = psc.getNumberOfSequences();
825  map<string, double> values = getUsefulValues_(n);
826  return (tajima - watterson) / sqrt((values["e1"] * S) + (values["e2"] * S * (S - 1)));
827 }
828 
829 double SequenceStatistics::tajimaDtnm(const PolymorphismSequenceContainer& psc, bool gapflag, bool ignoreUnknown)
830 {
831  unsigned int etaP = totalNumberOfMutations(psc, gapflag);
832  if (etaP == 0)
833  throw ZeroDivisionException("SequenceStatistics::tajimaDtnm. Eta should not be 0.");
834  double eta = static_cast<double>(etaP);
835  double tajima = tajima83(psc, gapflag, ignoreUnknown);
836  size_t n = psc.getNumberOfSequences();
837  map<string, double> values = getUsefulValues_(n);
838  double eta_a1 = eta / values["a1"];
839  return (tajima - eta_a1) / sqrt((values["e1"] * eta) + (values["e2"] * eta * (eta - 1)));
840 }
841 
842 double SequenceStatistics::fuLiD(
843  const PolymorphismSequenceContainer& ingroup,
844  const PolymorphismSequenceContainer& outgroup,
845  bool useNbSingletons,
846  bool useNbSegregatingSites)
847 {
848  size_t n = ingroup.getNumberOfSequences();
849  map<string, double> values = getUsefulValues_(n);
850  double vD = getVD_(n, values["a1"], values["a2"], values["cn"]);
851  double uD = getUD_(values["a1"], vD);
852  unsigned int etaP = 0;
853  if (useNbSegregatingSites)
854  etaP = numberOfPolymorphicSites(ingroup);
855  else
856  etaP = totalNumberOfMutations(ingroup);
857  if (etaP == 0)
858  throw ZeroDivisionException("SequenceStatistics::fuLiD. Eta should not be 0.");
859  double eta = static_cast<double>(etaP);
860  double etae = 0.;
861  if (useNbSingletons)
862  etae = static_cast<double>(numberOfSingletons(outgroup));
863  else
864  etae = static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup)); // added by Khalid 13/07/2005
865  return (eta - (values["a1"] * etae)) / sqrt((uD * eta) + (vD * eta * eta));
866 }
867 
868 double SequenceStatistics::fuLiDStar(
869  const PolymorphismSequenceContainer& group,
870  bool useNbSegregatingSites)
871 {
872  size_t n = group.getNumberOfSequences();
873  double nn = static_cast<double>(n);
874  double _n = nn / (nn - 1.);
875  map<string, double> values = getUsefulValues_(n);
876  double vDs = getVDstar_(n, values["a1"], values["a2"], values["dn"]);
877  double uDs = getUDstar_(n, values["a1"], vDs);
878  unsigned int etaP = 0;
879  if (useNbSegregatingSites)
880  etaP = numberOfPolymorphicSites(group);
881  else
882  etaP = totalNumberOfMutations(group);
883  if (etaP == 0)
884  throw ZeroDivisionException("eta should not be null");
885  double eta = static_cast<double>(etaP);
886  double etas = static_cast<double>(numberOfSingletons(group));
887 
888  // Fu & Li 1993
889  return ((_n * eta) - (values["a1"] * etas)) / sqrt(uDs * eta + vDs * eta * eta);
890 
891  // Simonsen et al. 1995
892  /*
893  return ((eta / values["a1"]) - (etas * ((n - 1) / n))) / sqrt(uDs * eta + vDs * eta * eta);
894  */
895 }
896 
897 double SequenceStatistics::fuLiF(
898  const PolymorphismSequenceContainer& ingroup,
899  const PolymorphismSequenceContainer& outgroup,
900  bool useNbSingletons,
901  bool useNbSegregatingSites)
902 {
903  size_t n = ingroup.getNumberOfSequences();
904  double nn = static_cast<double>(n);
905  map<string, double> values = getUsefulValues_(n);
906  double pi = tajima83(ingroup, true);
907  double vF = (values["cn"] + values["b2"] - 2. / (nn - 1.)) / (pow(values["a1"], 2) + values["a2"]);
908  double uF = ((1. + values["b1"] - (4. * ((nn + 1.) / ((nn - 1.) * (nn - 1.)))) * (values["a1n"] - (2. * nn) / (nn + 1.))) / values["a1"]) - vF;
909  unsigned int etaP = 0;
910  if (useNbSegregatingSites)
911  etaP = numberOfPolymorphicSites(ingroup);
912  else
913  etaP = totalNumberOfMutations(ingroup);
914  if (etaP == 0)
915  throw ZeroDivisionException("eta should not be null");
916  double eta = static_cast<double>(etaP);
917  double etae = 0.;
918  if (useNbSingletons)
919  etae = static_cast<double>(numberOfSingletons(outgroup));
920  else
921  etae = static_cast<double>(totalNumberOfMutationsOnExternalBranches(ingroup, outgroup)); // added by Khalid 13/07/2005
922  return (pi - etae) / sqrt(uF * eta + vF * eta * eta);
923 }
924 
925 double SequenceStatistics::fuLiFStar(
926  const PolymorphismSequenceContainer& group,
927  bool useNbSegregatingSites)
928 {
929  double n = static_cast<double>(group.getNumberOfSequences());
930  map<string, double> values = getUsefulValues_(group.getNumberOfSequences());
931  double pi = tajima83(group, true);
932 
933  // Fu & Li 1993
934  // double vFs = (values["dn"] + values["b2"] - (2. / (nn - 1.)) * (4. * values["a2"] - 6. + 8. / nn)) / (pow(values["a1"], 2) + values["a2"]);
935  // double uFs = (((nn / (nn - 1.)) + values["b1"] - (4. / (nn * (nn - 1.))) + 2. * ((nn + 1.) / (pow((nn - 1.), 2))) * (values["a1n"] - 2. * nn / (nn + 1.))) / values["a1"]) - vFs;
936 
937  // Simonsen et al. 1995
938  double vFs = (((2 * n * n * n + 110 * n * n - 255 * n + 153) / (9 * n * n * (n - 1))) + ((2 * (n - 1) * values["a1"]) / (n * n)) - 8 * values["a2"] / n) / (pow(values["a1"], 2) + values["a2"]);
939  double uFs = (((4 * n * n + 19 * n + 3 - 12 * (n + 1) * values["a1n"]) / (3 * n * (n - 1))) / values["a1"]) - vFs;
940  unsigned int etaP = 0;
941  if (useNbSegregatingSites)
942  etaP = numberOfPolymorphicSites(group);
943  else
944  etaP = totalNumberOfMutations(group);
945  if (etaP == 0)
946  throw ZeroDivisionException("eta should not be null");
947  double eta = static_cast<double>(etaP);
948  double etas = static_cast<double>(numberOfSingletons(group));
949  // Fu & Li 1993
950  // Simonsen et al. 1995
951  return (pi - ((n - 1.) / n * etas)) / sqrt(uFs * eta + vFs * eta * eta);
952 }
953 
954 double SequenceStatistics::fstHudson92(const PolymorphismSequenceContainer& psc, size_t id1, size_t id2)
955 {
956  vector<double> vdiff;
957  double piIntra1, piIntra2, meanPiIntra, piInter, Fst;
958 
959  PolymorphismSequenceContainer* Pop1 = PolymorphismSequenceContainerTools::extractGroup(psc, id1);
960  PolymorphismSequenceContainer* Pop2 = PolymorphismSequenceContainerTools::extractGroup(psc, id2);
961 
962  piIntra1 = SequenceStatistics::tajima83(*Pop1, true);
963  piIntra2 = SequenceStatistics::tajima83(*Pop2, true);
964 
965  meanPiIntra = (piIntra1 + piIntra2) / 2;
966 
967  double n = 0;
968  for (size_t i = 0; i < Pop1->getNumberOfSequences(); i++)
969  {
970  const Sequence& s1 = Pop1->getSequence(i);
971  for (size_t j = 0; j < Pop2->getNumberOfSequences(); j++)
972  {
973  n++;
974  const Sequence& s2 = Pop2->getSequence(j);
975  vdiff.push_back(SiteContainerTools::computeSimilarity(s1, s2, true, "no gap", true));
976  }
977  }
978  piInter = (VectorTools::sum(vdiff) / n) * static_cast<double>(psc.getNumberOfSites());
979 
980 
981  Fst = 1.0 - meanPiIntra / piInter;
982 
983  delete Pop1;
984  delete Pop2;
985 
986  return Fst;
987 }
988 
989 // ******************************************************************************
990 // Linkage disequilibrium statistics
991 // ******************************************************************************
992 
993 /**********************/
994 /* Preliminary method */
995 /**********************/
996 
997 PolymorphismSequenceContainer* SequenceStatistics::generateLdContainer(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
998 {
999  SiteSelection ss;
1000  // Extract polymorphic site with only two alleles
1001  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
1002  {
1003  if (keepsingleton)
1004  {
1005  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1006  {
1007  ss.push_back(i);
1008  }
1009  }
1010  else
1011  {
1012  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1013  {
1014  ss.push_back(i);
1015  }
1016  }
1017  }
1018 
1019  const SiteContainer* sc = SiteContainerTools::getSelectedSites(psc, ss);
1020  Alphabet* alpha = new DNA(); // Sylvain Gaillard 17/03/2010: What if psc's Alphabet is not DNA
1022  // Assign 1 to the more frequent and 0 to the less frequent alleles
1023  for (size_t i = 0; i < sc->getNumberOfSites(); i++)
1024  {
1025  const Site& site = sc->getSite(i);
1026  Site siteclone(site);
1027  bool deletesite = false;
1028  map<int, double> freqs;
1029  SymbolListTools::getFrequencies(siteclone, freqs);
1030  int first = 0;
1031  for (map<int, double>::iterator it = freqs.begin(); it != freqs.end(); it++)
1032  {
1033  if (it->second >= 0.5)
1034  first = it->first;
1035  }
1036  for (size_t j = 0; j < sc->getNumberOfSequences(); j++)
1037  {
1038  if (freqs[site.getValue(j)] >= 0.5 && site.getValue(j) == first)
1039  {
1040  if (freqs[site.getValue(j)] <= 1 - freqmin)
1041  {
1042  siteclone.setElement(j, 1);
1043  first = site.getValue(j);
1044  }
1045  else
1046  deletesite = true;
1047  }
1048  else
1049  {
1050  if (freqs[site.getValue(j)] >= freqmin)
1051  siteclone.setElement(j, 0);
1052  else
1053  deletesite = true;
1054  }
1055  }
1056  if (!deletesite)
1057  ldpsc->addSite(siteclone);
1058  }
1059  delete alpha;
1060  return ldpsc;
1061 }
1062 
1063 /*************************************/
1064 /* Pairwise LD and distance measures */
1065 /*************************************/
1066 
1067 Vdouble SequenceStatistics::pairwiseDistances1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1068 {
1069  // get Positions with sites of interest
1070  SiteSelection ss;
1071  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
1072  {
1073  if (keepsingleton)
1074  {
1075  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1076  {
1077  const Site& site = psc.getSite(i);
1078  bool deletesite = false;
1079  map<int, double> freqs;
1080  SymbolListTools::getFrequencies(site, freqs);
1081  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1082  {
1083  if (freqs[j] >= 1 - freqmin)
1084  deletesite = true;
1085  }
1086  if (!deletesite)
1087  ss.push_back(i);
1088  }
1089  }
1090  else
1091  {
1092  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1093  {
1094  ss.push_back(i);
1095  const Site& site = psc.getSite(i);
1096  bool deletesite = false;
1097  map<int, double> freqs;
1098  SymbolListTools::getFrequencies(site, freqs);
1099  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1100  {
1101  if (freqs[j] >= 1 - freqmin)
1102  deletesite = true;
1103  }
1104  if (!deletesite)
1105  ss.push_back(i);
1106  }
1107  }
1108  }
1109  // compute pairwise distances
1110  if (ss.size() < 2)
1111  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1112  Vdouble dist;
1113  for (size_t i = 0; i < ss.size() - 1; i++)
1114  {
1115  for (size_t j = i + 1; j < ss.size(); j++)
1116  {
1117  dist.push_back(static_cast<double>(ss[j] - ss[i]));
1118  }
1119  }
1120  return dist;
1121 }
1122 
1123 Vdouble SequenceStatistics::pairwiseDistances2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1124 {
1125  SiteSelection ss;
1126  for (size_t i = 0; i < psc.getNumberOfSites(); i++)
1127  {
1128  if (keepsingleton)
1129  {
1130  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)))
1131  {
1132  const Site& site = psc.getSite(i);
1133  bool deletesite = false;
1134  map<int, double> freqs;
1135  SymbolListTools::getFrequencies(site, freqs);
1136  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1137  {
1138  if (freqs[j] >= 1 - freqmin)
1139  deletesite = true;
1140  }
1141  if (!deletesite)
1142  ss.push_back(i);
1143  }
1144  }
1145  else
1146  {
1147  if (SiteTools::isComplete(psc.getSite(i)) && !SiteTools::isConstant(psc.getSite(i)) && !SiteTools::isTriplet(psc.getSite(i)) && !SiteTools::hasSingleton(psc.getSite(i)))
1148  {
1149  ss.push_back(i);
1150  const Site& site = psc.getSite(i);
1151  bool deletesite = false;
1152  map<int, double> freqs;
1153  SymbolListTools::getFrequencies(site, freqs);
1154  for (int j = 0; j < static_cast<int>(site.getAlphabet()->getSize()); j++)
1155  {
1156  if (freqs[j] >= 1 - freqmin)
1157  deletesite = true;
1158  }
1159  if (!deletesite)
1160  ss.push_back(i);
1161  }
1162  }
1163  }
1164  size_t n = ss.size();
1165  if (n < 2)
1166  throw DimensionException("SequenceStatistics::pairwiseDistances1 : less than 2 sites are available", ss.size(), 2);
1167  Vdouble distance(n * (n - 1) / 2, 0);
1168  size_t nbsite = psc.getNumberOfSites();
1169  for (size_t k = 0; k < psc.getNumberOfSequences(); k++)
1170  {
1171  const Sequence& seq = psc.getSequence(k);
1172  SiteSelection gap, newss = ss;
1173  Vdouble dist;
1174  for (size_t i = 0; i < nbsite; i++)
1175  {
1176  if (seq.getValue(i) == -1)
1177  gap.push_back(i);
1178  }
1179  // Site positions are re-numbered to take gaps into account
1180  for (size_t i = 0; i < gap.size(); i++)
1181  {
1182  for (size_t j = 0; j < ss.size(); j++)
1183  {
1184  if (ss[j] > gap[i])
1185  newss[j]--;
1186  }
1187  }
1188  for (size_t i = 0; i < n - 1; i++)
1189  {
1190  for (size_t j = i + 1; j < n; j++)
1191  {
1192  dist.push_back(static_cast<double>(newss[j] - newss[i]));
1193  }
1194  }
1195  distance += dist;
1196  }
1197  distance = distance / static_cast<double>(psc.getNumberOfSequences());
1198  return distance;
1199 }
1200 
1201 Vdouble SequenceStatistics::pairwiseD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1202 {
1203  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLdContainer(psc, keepsingleton, freqmin);
1204  Vdouble D;
1205  size_t nbsite = newpsc->getNumberOfSites();
1206  size_t nbseq = newpsc->getNumberOfSequences();
1207  if (nbsite < 2)
1208  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1209  if (nbseq < 2)
1210  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1211  for (size_t i = 0; i < nbsite - 1; i++)
1212  {
1213  for (size_t j = i + 1; j < nbsite; j++)
1214  {
1215  double haplo = 0;
1216  const Site& site1 = newpsc->getSite(i);
1217  const Site& site2 = newpsc->getSite(j);
1218  map<int, double> freq1;
1219  map<int, double> freq2;
1220  SymbolListTools::getFrequencies(site1, freq1);
1221  SymbolListTools::getFrequencies(site2, freq2);
1222  for (size_t k = 0; k < nbseq; k++)
1223  {
1224  if (site1.getValue(k) + site2.getValue(k) == 2)
1225  haplo++;
1226  }
1227  haplo = haplo / static_cast<double>(nbseq);
1228  D.push_back(std::abs(haplo - freq1[1] * freq2[1]));
1229  }
1230  }
1231  return D;
1232 }
1233 
1234 Vdouble SequenceStatistics::pairwiseDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1235 {
1236  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLdContainer(psc, keepsingleton, freqmin);
1237  Vdouble Dprime;
1238  size_t nbsite = newpsc->getNumberOfSites();
1239  size_t nbseq = newpsc->getNumberOfSequences();
1240  if (nbsite < 2)
1241  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1242  if (nbseq < 2)
1243  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1244  for (size_t i = 0; i < nbsite - 1; i++)
1245  {
1246  for (size_t j = i + 1; j < nbsite; j++)
1247  {
1248  double haplo = 0;
1249  const Site& site1 = newpsc->getSite(i);
1250  const Site& site2 = newpsc->getSite(j);
1251  map<int, double> freq1;
1252  map<int, double> freq2;
1253  SymbolListTools::getFrequencies(site1, freq1);
1254  SymbolListTools::getFrequencies(site2, freq2);
1255  for (size_t k = 0; k < nbseq; k++)
1256  {
1257  if (site1.getValue(k) + site2.getValue(k) == 2)
1258  haplo++;
1259  }
1260  haplo = haplo / static_cast<double>(nbseq);
1261  double d, D = (haplo - freq1[1] * freq2[1]);
1262  if (D > 0)
1263  {
1264  if (freq1[1] * freq2[0] <= freq1[0] * freq2[1])
1265  {
1266  d = std::abs(D) / (freq1[1] * freq2[0]);
1267  }
1268  else
1269  {
1270  d = std::abs(D) / (freq1[0] * freq2[1]);
1271  }
1272  }
1273  else
1274  {
1275  if (freq1[1] * freq2[1] <= freq1[0] * freq2[0])
1276  {
1277  d = std::abs(D) / (freq1[1] * freq2[1]);
1278  }
1279  else
1280  {
1281  d = std::abs(D) / (freq1[0] * freq2[0]);
1282  }
1283  }
1284  Dprime.push_back(d);
1285  }
1286  }
1287  return Dprime;
1288 }
1289 
1290 Vdouble SequenceStatistics::pairwiseR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1291 {
1292  PolymorphismSequenceContainer* newpsc = SequenceStatistics::generateLdContainer(psc, keepsingleton, freqmin);
1293  Vdouble R2;
1294  size_t nbsite = newpsc->getNumberOfSites();
1295  size_t nbseq = newpsc->getNumberOfSequences();
1296  if (nbsite < 2)
1297  throw DimensionException("SequenceStatistics::pairwiseD: less than two sites are available", nbsite, 2);
1298  if (nbseq < 2)
1299  throw DimensionException("SequenceStatistics::pairwiseD: less than two sequences are available", nbseq, 2);
1300  for (size_t i = 0; i < nbsite - 1; i++)
1301  {
1302  for (size_t j = i + 1; j < nbsite; j++)
1303  {
1304  double haplo = 0;
1305  const Site& site1 = newpsc->getSite(i);
1306  const Site& site2 = newpsc->getSite(j);
1307  map<int, double> freq1;
1308  map<int, double> freq2;
1309  SymbolListTools::getFrequencies(site1, freq1);
1310  SymbolListTools::getFrequencies(site2, freq2);
1311  for (size_t k = 0; k < nbseq; k++)
1312  {
1313  if (site1.getValue(k) + site2.getValue(k) == 2)
1314  haplo++;
1315  }
1316  haplo = haplo / static_cast<double>(nbseq);
1317  double r = ((haplo - freq1[1] * freq2[1]) * (haplo - freq1[1] * freq2[1])) / (freq1[0] * freq1[1] * freq2[0] * freq2[1]);
1318  R2.push_back(r);
1319  }
1320  }
1321  return R2;
1322 }
1323 
1324 /***********************************/
1325 /* Global LD and distance measures */
1326 /***********************************/
1327 
1328 double SequenceStatistics::meanD(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1329 {
1330  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1331  return VectorTools::mean<double, double>(D);
1332 }
1333 
1334 double SequenceStatistics::meanDprime(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1335 {
1336  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1337  return VectorTools::mean<double, double>(Dprime);
1338 }
1339 
1340 double SequenceStatistics::meanR2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1341 {
1342  Vdouble R2 = SequenceStatistics::pairwiseR2(psc, keepsingleton, freqmin);
1343  return VectorTools::mean<double, double>(R2);
1344 }
1345 
1346 double SequenceStatistics::meanDistance1(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1347 {
1348  Vdouble dist = pairwiseDistances1(psc, keepsingleton, freqmin);
1349  return VectorTools::mean<double, double>(dist);
1350 }
1351 
1352 double SequenceStatistics::meanDistance2(const PolymorphismSequenceContainer& psc, bool keepsingleton, double freqmin)
1353 {
1354  Vdouble dist = pairwiseDistances2(psc, keepsingleton, freqmin);
1355  return VectorTools::mean<double, double>(dist);
1356 }
1357 
1358 /**********************/
1359 /* Regression methods */
1360 /**********************/
1361 
1362 double SequenceStatistics::originRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1363 {
1364  Vdouble D = pairwiseD(psc, keepsingleton, freqmin) - 1;
1365  Vdouble dist;
1366  if (distance1)
1367  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1368  else
1369  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1370  return VectorTools::sum(D * dist) / VectorTools::sum(dist * dist);
1371 }
1372 
1373 double SequenceStatistics::originRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1374 {
1375  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin) - 1;
1376  Vdouble dist;
1377  if (distance1)
1378  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1379  else
1380  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1381  return VectorTools::sum(Dprime * dist) / VectorTools::sum(dist * dist);
1382 }
1383 
1384 double SequenceStatistics::originRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1385 {
1386  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin) - 1;
1387  Vdouble dist;
1388  if (distance1)
1389  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1390  else
1391  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1392  return VectorTools::sum(R2 * dist) / VectorTools::sum(dist * dist);
1393 }
1394 
1395 Vdouble SequenceStatistics::linearRegressionD(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1396 {
1397  Vdouble D = pairwiseD(psc, keepsingleton, freqmin);
1398  Vdouble dist;
1399  Vdouble reg(2);
1400  if (distance1)
1401  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1402  else
1403  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1404  reg[0] = VectorTools::cov<double, double>(dist, D) / VectorTools::var<double, double>(dist);
1405  reg[1] = VectorTools::mean<double, double>(D) - reg[0] * VectorTools::mean<double, double>(dist);
1406  return reg;
1407 }
1408 
1409 Vdouble SequenceStatistics::linearRegressionDprime(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1410 {
1411  Vdouble Dprime = pairwiseDprime(psc, keepsingleton, freqmin);
1412  Vdouble dist;
1413  Vdouble reg(2);
1414  if (distance1)
1415  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1416  else
1417  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1418  reg[0] = VectorTools::cov<double, double>(dist, Dprime) / VectorTools::var<double, double>(dist);
1419  reg[1] = VectorTools::mean<double, double>(Dprime) - reg[0] * VectorTools::mean<double, double>(dist);
1420  return reg;
1421 }
1422 
1423 Vdouble SequenceStatistics::linearRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1424 {
1425  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1426  Vdouble dist;
1427  Vdouble reg(2);
1428  if (distance1)
1429  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1430  else
1431  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1432  reg[0] = VectorTools::cov<double, double>(dist, R2) / VectorTools::var<double, double>(dist);
1433  reg[1] = VectorTools::mean<double, double>(R2) - reg[0] * VectorTools::mean<double, double>(dist);
1434  return reg;
1435 }
1436 
1437 double SequenceStatistics::inverseRegressionR2(const PolymorphismSequenceContainer& psc, bool distance1, bool keepsingleton, double freqmin)
1438 {
1439  Vdouble R2 = pairwiseR2(psc, keepsingleton, freqmin);
1440  Vdouble unit(R2.size(), 1);
1441  Vdouble R2transformed = unit / R2 - 1;
1442  Vdouble dist;
1443  if (distance1)
1444  dist = pairwiseDistances1(psc, keepsingleton, freqmin) / 1000;
1445  else
1446  dist = pairwiseDistances2(psc, keepsingleton, freqmin) / 1000;
1447  return VectorTools::sum(R2transformed * dist) / VectorTools::sum(dist * dist);
1448 }
1449 
1450 /**********************/
1451 /* Hudson method */
1452 /**********************/
1453 
1454 double SequenceStatistics::hudson87(const PolymorphismSequenceContainer& psc, double precision, double cinf, double csup)
1455 {
1456  double left = leftHandHudson_(psc);
1457  size_t n = psc.getNumberOfSequences();
1458  double dif = 1;
1459  double c1 = cinf;
1460  double c2 = csup;
1461  if (SequenceStatistics::numberOfPolymorphicSites(psc) < 2)
1462  return -1;
1463  if (rightHandHudson_(c1, n) < left)
1464  return cinf;
1465  if (rightHandHudson_(c2, n) > left)
1466  return csup;
1467  while (dif > precision)
1468  {
1469  if (rightHandHudson_((c1 + c2) / 2, n) > left)
1470  c1 = (c1 + c2) / 2;
1471  else
1472  c2 = (c1 + c2) / 2;
1473  dif = std::abs(2 * (c1 - c2) / (c1 + c2));
1474  }
1475  return (c1 + c2) / 2;
1476 }
1477 
1478 /*****************/
1479 /* Tests methods */
1480 /*****************/
1481 
1482 void SequenceStatistics::testUsefulValues(std::ostream& s, size_t n)
1483 {
1484  map<string, double> v = getUsefulValues_(n);
1485  double vD = getVD_(n, v["a1"], v["a2"], v["cn"]);
1486  double uD = getUD_(v["a1"], vD);
1487  double vDs = getVDstar_(n, v["a1"], v["a2"], v["dn"]);
1488  double uDs = getUDstar_(n, v["a1"], vDs);
1489 
1490  s << n << "\t";
1491  s << v["a1"] << "\t";
1492  s << v["a2"] << "\t";
1493  s << v["a1n"] << "\t";
1494  s << v["b1"] << "\t";
1495  s << v["b2"] << "\t";
1496  s << v["c1"] << "\t";
1497  s << v["c2"] << "\t";
1498  s << v["cn"] << "\t";
1499  s << v["dn"] << "\t";
1500  s << v["e1"] << "\t";
1501  s << v["e2"] << "\t";
1502  s << uD << "\t";
1503  s << vD << "\t";
1504  s << uDs << "\t";
1505  s << vDs << endl;
1506 }
1507 
1508 // ******************************************************************************
1509 // Private methods
1510 // ******************************************************************************
1511 
1512 unsigned int SequenceStatistics::getNumberOfMutations_(const Site& site)
1513 {
1514  //jdutheil 27/06/15: does not work if gaps and unknown!!!
1515  unsigned int tmp_count = 0;
1516  map<int, size_t> states_count;
1517  SymbolListTools::getCounts(site, states_count);
1518 
1519  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1520  {
1521  if (it->first >= 0)
1522  tmp_count++;
1523  }
1524  if (tmp_count > 0)
1525  tmp_count--;
1526  return tmp_count;
1527 }
1528 
1529 unsigned int SequenceStatistics::getNumberOfSingletons_(const Site& site)
1530 {
1531  unsigned int nus = 0;
1532  map<int, size_t> states_count;
1533  SymbolListTools::getCounts(site, states_count);
1534  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1535  {
1536  if (it->second == 1)
1537  nus++;
1538  }
1539  return nus;
1540 }
1541 
1542 unsigned int SequenceStatistics::getNumberOfDerivedSingletons_(const Site& site_in, const Site& site_out)
1543 {
1544  unsigned int nus = 0;
1545  map<int, size_t> states_count;
1546  map<int, size_t> outgroup_states_count;
1547  SymbolListTools::getCounts(site_in, states_count);
1548  SymbolListTools::getCounts(site_out, outgroup_states_count);
1549  // if there is more than one variant in the outgroup we will not be able to recover the ancestral state
1550  if (outgroup_states_count.size() == 1)
1551  {
1552  for (map<int, size_t>::iterator it = states_count.begin(); it != states_count.end(); it++)
1553  {
1554  if (it->second == 1)
1555  {
1556  if (outgroup_states_count.find(it->first) == outgroup_states_count.end())
1557  nus++;
1558  }
1559  }
1560  }
1561  return nus;
1562 }
1563 
1564 std::map<std::string, double> SequenceStatistics::getUsefulValues_(size_t n)
1565 {
1566  double nn = static_cast<double>(n);
1567  map<string, double> values;
1568  values["a1"] = 0.;
1569  values["a2"] = 0.;
1570  values["a1n"] = 0.;
1571  values["b1"] = 0.;
1572  values["b2"] = 0.;
1573  values["c1"] = 0.;
1574  values["c2"] = 0.;
1575  values["cn"] = 0.;
1576  values["dn"] = 0.;
1577  values["e1"] = 0.;
1578  values["e2"] = 0.;
1579  if (n > 1)
1580  {
1581  for (double i = 1; i < nn; i++)
1582  {
1583  values["a1"] += 1. / i;
1584  values["a2"] += 1. / (i * i);
1585  }
1586  values["a1n"] = values["a1"] + (1. / nn);
1587  values["b1"] = (nn + 1.) / (3. * (nn - 1.));
1588  values["b2"] = 2. * ((nn * nn) + nn + 3.) / (9. * nn * (nn - 1.));
1589  values["c1"] = values["b1"] - (1. / values["a1"]);
1590  values["c2"] = values["b2"] - ((nn + 2.) / (values["a1"] * nn)) + (values["a2"] / (values["a1"] * values["a1"]));
1591  if (n == 2)
1592  {
1593  values["cn"] = 1.;
1594  values["dn"] = 2.;
1595  }
1596  else
1597  {
1598  values["cn"] = 2. * ((nn * values["a1"]) - (2. * (nn - 1.))) / ((nn - 1.) * (nn - 2.));
1599  values["dn"] =
1600  values["cn"]
1601  + ((nn - 2.) / ((nn - 1.) * (nn - 1.)))
1602  + (2. / (nn - 1.))
1603  * ((3. / 2.) - (((2. * values["a1n"]) - 3.) / (nn - 2.)) - (1. / nn));
1604  }
1605  values["e1"] = values["c1"] / values["a1"];
1606  values["e2"] = values["c2"] / ((values["a1"] * values["a1"]) + values["a2"]);
1607  }
1608  return values;
1609 }
1610 
1611 double SequenceStatistics::getVD_(size_t n, double a1, double a2, double cn)
1612 {
1613  double nn = static_cast<double>(n);
1614  if (n < 3)
1615  return 0.;
1616  double vD = 1. + ((a1 * a1) / (a2 + (a1 * a1))) * (cn - ((nn + 1.) / (nn - 1.)));
1617  return vD;
1618 }
1619 
1620 double SequenceStatistics::getUD_(double a1, double vD)
1621 {
1622  return a1 - 1. - vD;
1623 }
1624 
1625 double SequenceStatistics::getVDstar_(size_t n, double a1, double a2, double dn)
1626 {
1627  double denom = (a1 * a1) + a2;
1628  if (n < 3 || denom == 0.)
1629  return 0.;
1630  double nn = static_cast<double>(n);
1631  double nnn = nn / (nn - 1.);
1632  // Fu & Li 1993
1633  double vDs = (
1634  (nnn * nnn * a2)
1635  + (a1 * a1 * dn)
1636  - (2. * (nn * a1 * (a1 + 1)) / ((nn - 1.) * (nn - 1.)))
1637  )
1638  /
1639  denom;
1640  // Simonsen et al. 1995
1641  /*
1642  double vDs = (
1643  (values["a2"] / pow(values["a1"], 2))
1644  - (2./nn) * (1. + 1./values["a1"] - values["a1"] + values["a1"]/nn)
1645  - 1./(nn*nn)
1646  )
1647  /
1648  (pow(values["a1"], 2) + values["a2"]);
1649  */
1650  return vDs;
1651 }
1652 
1653 double SequenceStatistics::getUDstar_(size_t n, double a1, double vDs)
1654 {
1655  if (n < 3)
1656  return 0.;
1657  double nn = static_cast<double>(n);
1658  double nnn = nn / (nn - 1.);
1659  // Fu & Li 1993
1660  double uDs = (nnn * (a1 - nnn)) - vDs;
1661  // Simonsen et al. 1995
1662  /*
1663  double uDs = (((nn - 1.)/nn - 1./values["a1"]) / values["a1"]) - vDs;
1664  */
1665  return uDs;
1666 }
1667 
1668 double SequenceStatistics::leftHandHudson_(const PolymorphismSequenceContainer& psc)
1669 {
1670  PolymorphismSequenceContainer* newpsc = PolymorphismSequenceContainerTools::getCompleteSites(psc);
1671  size_t nbseq = newpsc->getNumberOfSequences();
1672  double S1 = 0;
1673  double S2 = 0;
1674  for (size_t i = 0; i < nbseq - 1; i++)
1675  {
1676  for (size_t j = i + 1; j < nbseq; j++)
1677  {
1678  SequenceSelection ss(2);
1679  ss[0] = i;
1680  ss[1] = j;
1681  PolymorphismSequenceContainer* psc2 = PolymorphismSequenceContainerTools::getSelectedSequences(*newpsc, ss);
1682  S1 += SequenceStatistics::watterson75(*psc2, true);
1683  S2 += SequenceStatistics::watterson75(*psc2, true) * SequenceStatistics::watterson75(*psc2, true);
1684  delete psc2;
1685  }
1686  }
1687  double Sk = (2 * S2 - pow(2 * S1 / static_cast<double>(nbseq), 2.)) / pow(nbseq, 2.);
1688  double H = SequenceStatistics::heterozygosity(*newpsc);
1689  double H2 = SequenceStatistics::squaredHeterozygosity(*newpsc);
1690  delete newpsc;
1691  return static_cast<double>(Sk - H + H2) / pow(H * static_cast<double>(nbseq) / static_cast<double>(nbseq - 1), 2.);
1692 }
1693 
1694 double SequenceStatistics::rightHandHudson_(double c, size_t n)
1695 {
1696  double nn = static_cast<double>(n);
1697  return 1. / (97. * pow(c, 2.) * pow(nn, 3.)) * ((nn - 1.) * (97. * (c * (4. + (c - 2. * nn) * nn) + (-2. * (7. + c) + 4. * nn + (c - 1.) * pow(nn, 2.)) * log((18. + c * (13. + c)) / 18.)) + sqrt(97.) * (110. + nn * (49. * nn - 52.) + c * (2. + nn * (15. * nn - 8.))) * log(-1. + (72. + 26. * c) / (36. + 13. * c - c * sqrt(97.)))));
1698 }
1699 
virtual size_t getNumberOfSites() const=0
virtual int charToInt(const std::string &state) const=0
virtual unsigned int getSize() const=0
virtual void setElement(size_t pos, const std::string &c)
virtual bool hasMoreSites() const=0
virtual const Site * nextSite()=0
virtual const T & getValue(size_t pos) const=0
virtual const Alphabet * getAlphabet() const=0
virtual size_t size() const=0
virtual size_t getNumberOfSequences() const=0
The PolymorphismSequenceContainer class.
void setAsOutgroupMember(size_t index)
Set a sequence as outgroup member by index.
void addSequence(const Sequence &sequence, bool checkName=true)
virtual std::string getChar(size_t pos) const=0
virtual const Alphabet * getAlphabet() const=0
virtual const Site & getSite(size_t siteIndex) const=0
virtual const int & getValue(size_t pos) const
const Sequence & getSequence(size_t sequenceIndex) const
void addSite(const Site &site, bool checkPosition=true)
const Site & getSite(size_t siteIndex) const
size_t getNumberOfSites() const
size_t getNumberOfSequences() const
std::size_t count(const std::string &s, const std::string &pattern)
std::vector< size_t > SiteSelection
std::vector< double > Vdouble
std::vector< size_t > SequenceSelection