bpp-popgen  3.0.0
MultilocusGenotypeStatistics.cpp
Go to the documentation of this file.
1 /*
2  * File MultilocusGenotypeStatistics.cpp
3  * Author : Sylvain Gaillard <yragael2001@yahoo.fr>
4  * Last modification : Wednesday August 04 2004
5  *
6  */
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
9 
10 
11  This software is a computer program whose purpose is to provide classes
12  for population genetics analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39  */
40 
41 #include <Bpp/Utils/MapTools.h>
42 
45 
46 using namespace bpp;
47 
48 // From STL
49 
50 #include <iostream>
51 #include <cmath>
52 #include <algorithm>
53 
54 using namespace std;
55 
56 vector<size_t> MultilocusGenotypeStatistics::getAllelesIdsForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
57 {
58  map<size_t, size_t> tmp_alleles;
59  try
60  {
61  tmp_alleles = getAllelesMapForGroups(pmgc, locus_position, groups);
62  }
63  catch (IndexOutOfBoundsException& ioobe)
64  {
65  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::getAllelesIdsForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
66  }
67  return MapTools::getKeys(tmp_alleles);
68 }
69 
70 size_t MultilocusGenotypeStatistics::countGametesForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
71 {
72  map<size_t, size_t> allele_count;
73  size_t nb_tot_allele = 0;
74  try
75  {
76  allele_count = getAllelesMapForGroups(pmgc, locus_position, groups);
77  }
78  catch (IndexOutOfBoundsException& ioobe)
79  {
80  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::countGametesForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
81  }
82  vector<size_t> counter = MapTools::getValues(allele_count);
83  for (size_t i = 0; i < counter.size(); i++)
84  {
85  nb_tot_allele += counter[i];
86  }
87  return nb_tot_allele;
88 }
89 
90 map<size_t, size_t> MultilocusGenotypeStatistics::getAllelesMapForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
91 {
92  map<size_t, size_t> alleles_count;
93  for (size_t i = 0; i < pmgc.size(); i++)
94  {
95  try
96  {
97  if (!pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (groups.find(pmgc.getGroupId(i)) != groups.end()) )
98  {
99  // if (! pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (find(groups.begin(), groups.end(), pmgc.getGroupId(i)) != groups.end()) ) {
100  vector<size_t> tmp_alleles = pmgc.getMultilocusGenotype(i)->getMonolocusGenotype(locus_position).getAlleleIndex();
101  for (size_t j = 0; j < tmp_alleles.size(); j++)
102  {
103  alleles_count[tmp_alleles[j]]++;
104  }
105  }
106  }
107  catch (IndexOutOfBoundsException& ioobe)
108  {
109  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::getAllelesMapForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
110  }
111  }
112  return alleles_count;
113 }
114 
115 map<size_t, double> MultilocusGenotypeStatistics::getAllelesFrqForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
116 {
117  map<size_t, double> alleles_frq;
118  size_t nb_tot_allele = 0;
119  map<size_t, size_t> tmp_alleles;
120  try
121  {
122  tmp_alleles = getAllelesMapForGroups(pmgc, locus_position, groups);
123  }
124  catch (IndexOutOfBoundsException& ioobe)
125  {
126  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::getAllelesFrqForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
127  }
128  vector<size_t> counter = MapTools::getValues(tmp_alleles);
129  for (size_t i = 0; i < counter.size(); i++)
130  {
131  nb_tot_allele += counter[i];
132  }
133  if (nb_tot_allele == 0)
134  throw ZeroDivisionException("MultilocusGenotypeStatistics::getAllelesFrqForGroups.");
135  for (map<size_t, size_t>::iterator it = tmp_alleles.begin(); it != tmp_alleles.end(); it++)
136  {
137  alleles_frq[it->first] = static_cast<double>(it->second) / static_cast<double>(nb_tot_allele);
138  }
139  return alleles_frq;
140 }
141 
142 size_t MultilocusGenotypeStatistics::countNonMissingForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
143 {
144  size_t counter = 0;
145  for (size_t i = 0; i < pmgc.size(); i++)
146  {
147  try
148  {
149  // if (! pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (find(groups.begin(), groups.end(), pmgc.getGroupId(i)) != groups.end()) )
150  if (!pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (groups.find(pmgc.getGroupId(i) ) != groups.end()) )
151  counter++;
152  }
153  catch (IndexOutOfBoundsException& ioobe)
154  {
155  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::countNonMissing: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
156  }
157  }
158  return counter;
159 }
160 
161 size_t MultilocusGenotypeStatistics::countBiAllelicForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
162 {
163  size_t counter = 0;
164  for (size_t i = 0; i < pmgc.size(); i++)
165  {
166  try
167  {
168  // if (! pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (find(groups.begin(), groups.end(), pmgc.getGroupId(i)) != groups.end()) )
169  if (!pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (groups.find(pmgc.getGroupId(i)) != groups.end()) )
170  if ((pmgc.getMultilocusGenotype(i)->getMonolocusGenotype(locus_position).getAlleleIndex()).size() == 2)
171  counter++;
172  }
173  catch (IndexOutOfBoundsException& ioobe)
174  {
175  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::countBiAllelic: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
176  }
177  }
178  return counter;
179 }
180 
181 map<size_t, size_t> MultilocusGenotypeStatistics::countHeterozygousForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
182 {
183  map<size_t, size_t> counter;
184  for (size_t i = 0; i < pmgc.size(); i++)
185  {
186  try
187  {
188  if (!pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (groups.find(pmgc.getGroupId(i)) != groups.end() ))
189  {
190  const MonolocusGenotype& tmp_mg = pmgc.getMultilocusGenotype(i)->getMonolocusGenotype(locus_position);
191  if ((tmp_mg.getAlleleIndex()).size() == 2)
192  {
193  if (!dynamic_cast<const BiAlleleMonolocusGenotype&>(tmp_mg).isHomozygous())
194  {
195  vector<size_t> tmp_alleles = tmp_mg.getAlleleIndex();
196  for (size_t j = 0; j < tmp_alleles.size(); j++)
197  {
198  counter[tmp_alleles[j]]++;
199  }
200  }
201  }
202  }
203  }
204  catch (IndexOutOfBoundsException& ioobe)
205  {
206  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::countHeterozygous: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
207  }
208  }
209  return counter;
210 }
211 
212 map<size_t, double> MultilocusGenotypeStatistics::getHeterozygousFrqForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
213 {
214  map<size_t, double> freq;
215  size_t counter = 0;
216  for (size_t i = 0; i < pmgc.size(); i++)
217  {
218  try
219  {
220  if (!pmgc.getMultilocusGenotype(i)->isMonolocusGenotypeMissing(locus_position) && (groups.find(pmgc.getGroupId(i)) != groups.end()) )
221  {
222  const MonolocusGenotype& tmp_mg = pmgc.getMultilocusGenotype(i)->getMonolocusGenotype(locus_position);
223  if ((tmp_mg.getAlleleIndex()).size() == 2)
224  {
225  counter++;
226  if (!dynamic_cast<const BiAlleleMonolocusGenotype&>(tmp_mg).isHomozygous())
227  {
228  vector<size_t> tmp_alleles = tmp_mg.getAlleleIndex();
229  for (size_t j = 0; j < tmp_alleles.size(); j++)
230  {
231  freq[tmp_alleles[j]]++;
232  }
233  }
234  }
235  }
236  }
237  catch (IndexOutOfBoundsException& ioobe)
238  {
239  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::getHeterozygousFrqForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
240  }
241  }
242  if (counter == 0)
243  throw ZeroDivisionException("MultilocusGenotypeStatistics::getHeterozygousFrqForGroups.");
244  for (map<size_t, double>::iterator i = freq.begin(); i != freq.end(); i++)
245  {
246  i->second = (double) i->second / (double) counter;
247  }
248  return freq;
249 }
250 
251 double MultilocusGenotypeStatistics::getHobsForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
252 {
253  map<size_t, double> heterozygous_frq;
254  double frq = 0.;
255  try
256  {
257  heterozygous_frq = getHeterozygousFrqForGroups(pmgc, locus_position, groups);
258  }
259  catch (IndexOutOfBoundsException& ioobe)
260  {
261  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::getHobsForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
262  }
263  catch (ZeroDivisionException& zde)
264  {
265  throw ZeroDivisionException("MultilocusGenotypeStatistics::getHobsForGroups.");
266  }
267  for (map<size_t, double>::iterator it = heterozygous_frq.begin(); it != heterozygous_frq.end(); it++)
268  {
269  frq += it->second;
270  }
271  return frq / static_cast<double>(heterozygous_frq.size());
272 }
273 
274 double MultilocusGenotypeStatistics::getHexpForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
275 {
276  map<size_t, double> allele_frq;
277  double frqsqr = 0.;
278  try
279  {
280  allele_frq = getAllelesFrqForGroups(pmgc, locus_position, groups);
281  }
282  catch (IndexOutOfBoundsException& ioobe)
283  {
284  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::getHexpForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
285  }
286  catch (ZeroDivisionException& zde)
287  {
288  throw ZeroDivisionException("MultilocusGenotypeStatistics::getHexpForGroups.");
289  }
290  for (map<size_t, double>::iterator it = allele_frq.begin(); it != allele_frq.end(); it++)
291  {
292  frqsqr += it->second * it->second;
293  }
294  return 1 - frqsqr;
295 }
296 
297 double MultilocusGenotypeStatistics::getHnbForGroups(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
298 {
299  size_t nb_alleles;
300  double Hexp;
301  try
302  {
303  nb_alleles = countGametesForGroups(pmgc, locus_position, groups);
304  Hexp = getHexpForGroups(pmgc, locus_position, groups);
305  }
306  catch (IndexOutOfBoundsException& ioobe)
307  {
308  throw IndexOutOfBoundsException("MultilocusGenotypeStatistics::getHnbForGroups: locus_position out of bounds.", ioobe.getBadIndex(), ioobe.getBounds()[0], ioobe.getBounds()[1]);
309  }
310  catch (ZeroDivisionException& zde)
311  {
312  throw ZeroDivisionException("MultilocusGenotypeStatistics::getHnbForGroups.");
313  }
314  return 2 * static_cast<double>(nb_alleles) * Hexp / static_cast<double>((2 * nb_alleles) - 1);
315 }
316 
317 double MultilocusGenotypeStatistics::getDnei72(const PolymorphismMultiGContainer& pmgc, vector<size_t> locus_positions, size_t grp1, size_t grp2)
318 {
319  map<size_t, double> allele_frq1, allele_frq2;
320  vector<size_t> allele_ids;
321  set<size_t> group1_id;
322  set<size_t> group2_id;
323  set<size_t> groups_id;
324  double Jx = 0.;
325  double Jy = 0.;
326  double Jxy = 0.;
327  group1_id.insert(grp1);
328  group2_id.insert(grp2);
329  groups_id.insert(grp1);
330  groups_id.insert(grp2);
331  for (size_t i = 0; i < locus_positions.size(); i++)
332  {
333  allele_ids.clear();
334  allele_frq1.clear();
335  allele_frq2.clear();
336  try
337  {
338  allele_ids = getAllelesIdsForGroups(pmgc, locus_positions[i], groups_id);
339  allele_frq1 = getAllelesFrqForGroups(pmgc, locus_positions[i], group1_id);
340  allele_frq2 = getAllelesFrqForGroups(pmgc, locus_positions[i], group2_id);
341  }
342  catch (Exception& e)
343  {
344  throw e;
345  }
346  for (size_t j = 0; j < allele_ids.size(); j++)
347  {
348  map<size_t, double>::iterator it1 = allele_frq1.find(allele_ids[j]);
349  map<size_t, double>::iterator it2 = allele_frq2.find(allele_ids[j]);
350  double tmp_frq1 = (it1 != allele_frq1.end()) ? it1->second : 0.;
351  double tmp_frq2 = (it2 != allele_frq2.end()) ? it2->second : 0.;
352  Jx += tmp_frq1 * tmp_frq1;
353  Jy += tmp_frq2 * tmp_frq2;
354  Jxy += tmp_frq1 * tmp_frq2;
355  }
356  }
357  if (Jx * Jy == 0.)
358  throw ZeroDivisionException("MultilocusGenotypeStatistics::getDnei72.");
359  return -log(Jxy / sqrt(Jx * Jy));
360 }
361 
362 double MultilocusGenotypeStatistics::getDnei78(const PolymorphismMultiGContainer& pmgc, vector<size_t> locus_positions, size_t grp1, size_t grp2)
363 {
364  map<size_t, double> allele_frq1, allele_frq2;
365  vector<size_t> allele_ids;
366  set<size_t> group1_id;
367  set<size_t> group2_id;
368  set<size_t> groups_id;
369  double Jx = 0.;
370  double Jy = 0.;
371  double Jxy = 0.;
372  size_t nx = 0, ny = 0;
373  group1_id.insert(grp1);
374  group2_id.insert(grp2);
375  groups_id.insert(grp1);
376  groups_id.insert(grp2);
377  for (size_t i = 0; i < locus_positions.size(); i++)
378  {
379  allele_ids.clear();
380  allele_frq1.clear();
381  allele_frq2.clear();
382  try
383  {
384  allele_ids = getAllelesIdsForGroups(pmgc, locus_positions[i], groups_id);
385  allele_frq1 = getAllelesFrqForGroups(pmgc, locus_positions[i], group1_id);
386  allele_frq2 = getAllelesFrqForGroups(pmgc, locus_positions[i], group2_id);
387  nx = countBiAllelicForGroups(pmgc, locus_positions[i], group1_id);
388  ny = countBiAllelicForGroups(pmgc, locus_positions[i], group2_id);
389  }
390  catch (Exception& e)
391  {
392  throw e;
393  }
394  double tmp_Jx = 0.;
395  double tmp_Jy = 0.;
396  for (size_t j = 0; j < allele_ids.size(); j++)
397  {
398  map<size_t, double>::iterator it1 = allele_frq1.find(allele_ids[j]);
399  map<size_t, double>::iterator it2 = allele_frq2.find(allele_ids[j]);
400  double tmp_frq1 = (it1 != allele_frq1.end()) ? it1->second : 0.;
401  double tmp_frq2 = (it2 != allele_frq2.end()) ? it2->second : 0.;
402  tmp_Jx += tmp_frq1 * tmp_frq1;
403  tmp_Jy += tmp_frq2 * tmp_frq2;
404  Jxy += tmp_frq1 * tmp_frq2;
405  }
406  Jx += ((2. * (double) nx * tmp_Jx) - 1.) / ((2. * (double) nx) - 1.);
407  Jy += ((2. * (double) ny * tmp_Jy) - 1.) / ((2. * (double) ny) - 1.);
408  }
409  double denom = Jx * Jy;
410  if (denom == 0.)
411  throw ZeroDivisionException("MultilocusGenotypeStatistics::getDnei78.");
412  return -log(Jxy / sqrt(denom));
413 }
414 
415 map<size_t, MultilocusGenotypeStatistics::Fstats> MultilocusGenotypeStatistics::getAllelesFstats(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
416 {
417  map<size_t, MultilocusGenotypeStatistics::VarComp> vc = getVarianceComponents(pmgc, locus_position, groups);
418  map<size_t, MultilocusGenotypeStatistics::Fstats> f_stats;
419  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = vc.begin(); it != vc.end(); it++)
420  {
421  double abc = it->second.a + it->second.b + it->second.c;
422  double bc = it->second.b + it->second.c;
423 
424  if (abc == 0)
425  {
426  f_stats[it->first].Fit = NAN;
427  f_stats[it->first].Fst = NAN;
428  }
429  {
430  f_stats[it->first].Fit = 1. - it->second.c / abc;
431  f_stats[it->first].Fst = it->second.a / abc;
432  }
433  if (bc == 0)
434  f_stats[it->first].Fis = NAN;
435  else
436  f_stats[it->first].Fis = 1. - it->second.c / bc;
437  }
438  return f_stats;
439 }
440 
441 map<size_t, double> MultilocusGenotypeStatistics::getAllelesFit(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
442 {
443  map<size_t, MultilocusGenotypeStatistics::VarComp> values = getVarianceComponents(pmgc, locus_position, groups);
444  map<size_t, double> Fit;
445  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = values.begin(); it != values.end(); it++)
446  {
447  Fit[it->first] = it->second.a + it->second.b + it->second.c;
448  if (Fit[it->first] == 0.)
449  throw ZeroDivisionException("MultilocusGenotypeStatistics::getAllelesFit.");
450  Fit[it->first] = 1. - it->second.c / Fit[it->first];
451  }
452  return Fit;
453 }
454 
455 map<size_t, double> MultilocusGenotypeStatistics::getAllelesFst(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
456 {
457  if (groups.size() <= 1)
458  throw BadIntegerException("MultilocusGenotypeStatistics::getAllelesFst: groups must be >= 2.", static_cast<int>(groups.size()));
459  map<size_t, MultilocusGenotypeStatistics::VarComp> values = getVarianceComponents(pmgc, locus_position, groups);
460  map<size_t, double> Fst;
461  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = values.begin(); it != values.end(); it++)
462  {
463  Fst[it->first] = it->second.a + it->second.b + it->second.c;
464  if (Fst[it->first] == 0.)
465  throw ZeroDivisionException("MultilocusGenotypeStatistics::getAllelesFst.");
466  Fst[it->first] = it->second.a / Fst[it->first];
467  }
468  return Fst;
469 }
470 
471 map<size_t, double> MultilocusGenotypeStatistics::getAllelesFis(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
472 {
473  map<size_t, MultilocusGenotypeStatistics::VarComp> values = getVarianceComponents(pmgc, locus_position, groups);
474  map<size_t, double> Fis;
475  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = values.begin(); it != values.end(); it++)
476  {
477  Fis[it->first] = it->second.b + it->second.c;
478  if (Fis[it->first] == 0.)
479  throw ZeroDivisionException("MultilocusGenotypeStatistics::getAllelesFis.");
480  Fis[it->first] = 1. - it->second.c / Fis[it->first];
481  }
482  return Fis;
483 }
484 
485 map<size_t, MultilocusGenotypeStatistics::VarComp> MultilocusGenotypeStatistics::getVarianceComponents(const PolymorphismMultiGContainer& pmgc, size_t locus_position, const set<size_t>& groups)
486 {
487  map<size_t, MultilocusGenotypeStatistics::VarComp> values;
488  // Base values computation
489  double nbar = 0.;
490  double nc = 0.;
491  vector<size_t> ids = getAllelesIdsForGroups(pmgc, locus_position, groups);
492  map<size_t, double> pbar;
493  map<size_t, double> s2;
494  map<size_t, double> hbar;
495  for (size_t i = 0; i < ids.size(); i++)
496  {
497  pbar[ids[i]] = 0.;
498  s2[ids[i]] = 0.;
499  hbar[ids[i]] = 0.;
500  }
501  double r = static_cast<double>(groups.size());
502  for (set<size_t>::iterator set_it = groups.begin(); set_it != groups.end(); set_it++)
503  {
504  size_t i = (*set_it);
505  double ni = static_cast<double>(pmgc.getLocusGroupSize(i, locus_position));
506  set<size_t> group_id;
507  group_id.insert( i );
508  map<size_t, double> pi = getAllelesFrqForGroups(pmgc, locus_position, group_id);
509  map<size_t, double> hi = getHeterozygousFrqForGroups(pmgc, locus_position, group_id);
510  nbar += ni;
511  if (r > 1)
512  nc += ni * ni;
513 
514  for (map<size_t, double>::iterator it = pi.begin(); it != pi.end(); it++)
515  {
516  pbar[it->first] += ni * it->second;
517  }
518  for (map<size_t, double>::iterator it = hi.begin(); it != hi.end(); it++)
519  {
520  hbar[it->first] += ni * it->second;
521  }
522 
523  group_id.clear();
524  }
525  nbar = nbar / r;
526  if (nbar <= 1)
527  throw ZeroDivisionException("MultilocusGenotypeStatistics::getVarianceComponents.");
528  if (r > 1)
529  nc = (r * nbar) - (nc / (r * nbar)) / (r - 1.);
530  for (map<size_t, double>::iterator it = pbar.begin(); it != pbar.end(); it++)
531  {
532  it->second = it->second / (r * nbar);
533  }
534  for (map<size_t, double>::iterator it = hbar.begin(); it != hbar.end(); it++)
535  {
536  it->second = it->second / ( r * nbar);
537  }
538 
539  for (set<size_t>::iterator set_it = groups.begin(); set_it != groups.end(); set_it++)
540  {
541  size_t i = (*set_it);
542  double ni = static_cast<double>(pmgc.getLocusGroupSize( i, locus_position));
543  set<size_t> group_id;
544  group_id.insert( i );
545  map<size_t, double> pi = getAllelesFrqForGroups(pmgc, locus_position, group_id);
546  for (size_t j = 0; j < ids.size(); j++)
547  {
548  pi[ids[j]];
549  }
550  for (map<size_t, double>::iterator it = pi.begin(); it != pi.end(); it++)
551  {
552  s2[it->first] += ni * (it->second - pbar[it->first]) * (it->second - pbar[it->first]);
553  }
554  group_id.clear();
555  }
556  for (map<size_t, double>::iterator it = s2.begin(); it != s2.end(); it++)
557  {
558  it->second = it->second / ((r - 1.) * nbar);
559  }
560 
561  // a, b, c computation
562  for (size_t i = 0; i < ids.size(); i++)
563  {
564  values[ids[i]];
565  }
566 
567  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = values.begin(); it != values.end(); it++)
568  {
569  it->second.a = (nbar / nc) * (s2[it->first] - ((1. / (nbar - 1.)) * ((pbar[it->first] * (1. - pbar[it->first])) - (s2[it->first] * ((double) r - 1.) / r) - ((1. / 4.) * hbar[it->first]))));
570  it->second.b = (nbar / (nbar - 1.)) * ((pbar[it->first] * (1. - pbar[it->first])) - (s2[it->first] * ((double) r - 1.) / (double) r) - ((((2. * nbar) - 1.) / (4. * nbar)) * hbar[it->first]));
571  it->second.c = hbar[it->first] / 2.;
572  }
573  return values;
574 }
575 
576 double MultilocusGenotypeStatistics::getWCMultilocusFst(const PolymorphismMultiGContainer& pmgc, vector<size_t> locus_positions, const set<size_t>& groups)
577 {
578  double A, B, C;
579  A = B = C = 0.0;
580  for (size_t i = 0; i < locus_positions.size(); i++)
581  {
582  //count total number of individuals without missing data
583  size_t ni = 0;
584  for (set<size_t>::iterator setIt = groups.begin() ; setIt != groups.end() ; setIt++)
585  {
586  ni += pmgc.getLocusGroupSize( (*setIt), i);
587  }
588 
589  // reduce computation for polymorphic loci for that groups
590  vector<size_t> ids = getAllelesIdsForGroups(pmgc, i, groups);
591  if (ids.size() >= 2 && ni >= 1)
592  {
593  map<size_t, MultilocusGenotypeStatistics::VarComp> values = getVarianceComponents(pmgc, locus_positions[i], groups);
594  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = values.begin(); it != values.end(); it++)
595  {
596  A += it->second.a;
597  B += it->second.b;
598  C += it->second.c;
599  }
600  }
601  }
602  if ((A + B + C) == 0)
603  throw ZeroDivisionException("MultilocusGenotypeStatistics::getWCMultilocusFst.");
604  return A / (A + B + C);
605 }
606 
607 double MultilocusGenotypeStatistics::getWCMultilocusFis(const PolymorphismMultiGContainer& pmgc, vector<size_t> locus_positions, const set<size_t>& groups)
608 {
609  double B, C;
610  B = C = 0.0;
611  for (size_t i = 0; i < locus_positions.size(); i++)
612  {
613  //count total number of individuals without missing data
614  size_t ni = 0;
615  for (set<size_t>::iterator setIt = groups.begin() ; setIt != groups.end() ; setIt++)
616  {
617  ni += pmgc.getLocusGroupSize( (*setIt), i);
618  }
619 
620  // reduce computation for polymorphic loci for that groups
621  vector<size_t> ids = getAllelesIdsForGroups(pmgc, i, groups);
622  if (ids.size() >= 2 && ni >= 1)
623  {
624  map<size_t, MultilocusGenotypeStatistics::VarComp> values = getVarianceComponents(pmgc, locus_positions[i], groups);
625  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = values.begin(); it != values.end(); it++)
626  {
627  B += it->second.b;
628  C += it->second.c;
629  }
630  }
631  }
632  if ((B + C) == 0)
633  throw ZeroDivisionException("MultilocusGenotypeStatistics::getWCMultilocusFis.");
634  return 1.0 - C / (B + C);
635 }
636 
638 {
639  // extract a PolymorphismMultiGContainer with only those groups
641  double nb_sup = 0.0;
642  double nb_inf = 0.0;
643  PermResults results;
644  results.Statistic = getWCMultilocusFst(sub_pmgc, locus_positions, groups);
645  if (nb_perm > 0)
646  {
647  for (int i = 0; i < nb_perm; i++)
648  {
650  double Fst_perm = getWCMultilocusFst(permuted_pmgc, locus_positions, groups);
651  // cout << Fst_perm << endl;
652  if (Fst_perm > results.Statistic)
653  nb_sup++;
654  if (Fst_perm < results.Statistic)
655  nb_inf++;
656  }
657 
658  nb_sup /= (double) nb_perm;
659  nb_inf /= (double) nb_perm;
660  }
661 
662  results.Percent_sup = nb_sup;
663  results.Percent_inf = nb_inf;
664  return results;
665 }
666 
668 {
669  // extract a PolymorphismMultiGContainer with only those groups
671  double nb_sup = 0.0;
672  double nb_inf = 0.0;
673  PermResults results;
674  results.Statistic = getWCMultilocusFis(sub_pmgc, locus_positions, groups);
675  if (nb_perm > 0)
676  {
677  for (int i = 0; i < nb_perm; i++)
678  {
680  double Fis_perm = getWCMultilocusFis(permuted_pmgc, locus_positions, groups);
681 
682  if (Fis_perm > results.Statistic)
683  nb_sup++;
684  if (Fis_perm < results.Statistic)
685  nb_inf++;
686  }
687 
688  nb_sup /= (double) nb_perm;
689  nb_inf /= (double) nb_perm;
690  }
691 
692  results.Percent_sup = nb_sup;
693  results.Percent_inf = nb_inf;
694  return results;
695 }
696 
697 double MultilocusGenotypeStatistics::getRHMultilocusFst(const PolymorphismMultiGContainer& pmgc, vector<size_t> locus_positions, const set<size_t>& groups)
698 {
699  double Au, Bu, Cu;
700  double RH = 0.0;
701  int nb_alleles = 0;
702  int total_alleles = 0;
703 
704  for (size_t i = 0; i < locus_positions.size(); i++)
705  {
706  // reduce computation for polymorphic loci for that groups
707  vector<size_t> ids = getAllelesIdsForGroups(pmgc, i, groups);
708  if (ids.size() >= 2)
709  {
710  nb_alleles = 0;
711  // mean allelic frequencies
712  map< size_t, double > P = MultilocusGenotypeStatistics::getAllelesFrqForGroups (pmgc, locus_positions[i], groups);
713  // variance components from W&C
714  map<size_t, MultilocusGenotypeStatistics::VarComp> values = getVarianceComponents(pmgc, locus_positions[i], groups);
715  for (map<size_t, MultilocusGenotypeStatistics::VarComp>::iterator it = values.begin(); it != values.end(); it++)
716  {
717  Au = it->second.a;
718  Bu = it->second.b;
719  Cu = it->second.c;
720  if ((Au + Bu + Cu) != 0)
721  {
722  double Pu = P[it->first]; // it->first is the allele number
723  RH += (1 - Pu) * Au / (Au + Bu + Cu);
724  nb_alleles++;
725  }
726  }
727  total_alleles += (nb_alleles - 1);
728  }
729  }
730  if (total_alleles == 0)
731  throw ZeroDivisionException("MultilocusGenotypeStatistics::getRHMultilocusFst.");
732  return RH / double(total_alleles);
733 }
734 
735 std::unique_ptr<DistanceMatrix> MultilocusGenotypeStatistics::getDistanceMatrix(const PolymorphismMultiGContainer& pmgc, vector<size_t> locus_positions, const set<size_t>& groups, string distance_methode)
736 {
737  vector<string> names = pmgc.getAllGroupsNames();
738  vector<size_t> grp_ids_vect;
739  for (set<size_t>::iterator i = groups.begin(); i != groups.end(); i++)
740  {
741  grp_ids_vect.push_back(*i);
742  }
743 
744  unique_ptr<DistanceMatrix> _dist(new DistanceMatrix(names));
745  for (size_t i = 0; i < groups.size(); i++)
746  {
747  (*_dist)(i, i) = 0;
748  }
749 
750  set<size_t> pairwise_grp;
751 
752  for (size_t j = 0; j < groups.size () - 1; j++)
753  {
754  for (size_t k = j + 1; k < groups.size (); k++)
755  {
756  double distance = 0;
757  if (distance_methode == "nei72")
758  distance = MultilocusGenotypeStatistics::getDnei72( pmgc, locus_positions, grp_ids_vect[j], grp_ids_vect[k] );
759  else if (distance_methode == "nei78")
760  distance = MultilocusGenotypeStatistics::getDnei78( pmgc, locus_positions, grp_ids_vect[j], grp_ids_vect[k] );
761  else if (distance_methode == "WC") // Fst multilocus selon W&C
762  {
763  pairwise_grp.insert(grp_ids_vect[j] );
764  pairwise_grp.insert(grp_ids_vect[k] );
765  distance = MultilocusGenotypeStatistics::getWCMultilocusFst( pmgc, locus_positions, pairwise_grp);
766  pairwise_grp.clear();
767  }
768  else if (distance_methode == "RH") // Fst multilocus selon ponderation Robertson & Hill
769  {
770  pairwise_grp.insert(grp_ids_vect[j] );
771  pairwise_grp.insert(grp_ids_vect[k] );
772  distance = MultilocusGenotypeStatistics::getRHMultilocusFst( pmgc, locus_positions, pairwise_grp);
773  pairwise_grp.clear();
774  }
775  else if (distance_methode == "Nm") // Nm déduit des Fst multilocus selon W&C modèle en îles Fst = 1/(1+4Nm)
776  {
777  pairwise_grp.insert(grp_ids_vect[j] );
778  pairwise_grp.insert(grp_ids_vect[k] );
779  distance = MultilocusGenotypeStatistics::getWCMultilocusFst( pmgc, locus_positions, pairwise_grp);
780  if (distance != 0)
781  distance = 0.25 * (1 - distance) / distance;
782  else
783  distance = NAN;
784  pairwise_grp.clear();
785  }
786  else if (distance_methode == "D") // D=-ln(1-Fst) of Reynolds, Weir and Cockerham, 1983
787  {
788  pairwise_grp.insert(grp_ids_vect[j] );
789  pairwise_grp.insert(grp_ids_vect[k] );
790  distance = MultilocusGenotypeStatistics::getWCMultilocusFst( pmgc, locus_positions, pairwise_grp);
791  if (distance != 1)
792  distance = -log(1 - distance);
793  else
794  distance = NAN;
795  pairwise_grp.clear();
796  }
797  else if (distance_methode == "Rousset") // Calcul de Fst/(1-Fst). Rousset F. 1997
798  {
799  pairwise_grp.insert(grp_ids_vect[j] );
800  pairwise_grp.insert(grp_ids_vect[k] );
801  distance = MultilocusGenotypeStatistics::getWCMultilocusFst( pmgc, locus_positions, pairwise_grp);
802  if (distance != 1)
803  distance = distance / (1 - distance);
804  else
805  distance = NAN;
806  pairwise_grp.clear();
807  }
808 
809  (*_dist)(k, j) = distance;
810  (*_dist)(j, k) = distance;
811  } // for k
812  } // for j
813 
814  return _dist;
815 }
816 
The BiAlleleMonolocusGenotype class.
std::size_t getBadIndex() const
const std::array< std::size_t, 2 > & getBounds() const
static std::vector< T > getValues(const std::map< Key, T, Cmp > &myMap)
static std::vector< Key > getKeys(const std::map< Key, T, Cmp > &myMap)
The MonolocusGenotype virtual class.
virtual std::vector< size_t > getAlleleIndex() const =0
Get the alleles' index.
static double getDnei78(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, size_t grp1, size_t grp2)
Compute the Nei unbiased distance between two groups at a given number of loci.
static std::map< size_t, double > getAllelesFit(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Compute the Weir and Cockerham Fit on a set of groups for each allele of a given locus.
static std::map< size_t, double > getHeterozygousFrqForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Get the heterozygous frequencies for each allele at a locus in a set of groups.
static std::map< size_t, VarComp > getVarianceComponents(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Get the variance components a, b and c (Weir and Cockerham, 1983).
static double getDnei72(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, size_t grp1, size_t grp2)
Compute the Nei distance between two groups at one locus.
static std::map< size_t, size_t > countHeterozygousForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Count how many times each allele is found in an heterozygous MonolocusGenotype in a set of groups.
static size_t countBiAllelicForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Counr the number of bi-allelic MonolocusGenotype at a given locus for a set of groups.
static double getHnbForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Compute the expected non biased heterozygosity for one locus.
static std::map< size_t, double > getAllelesFst(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Compute the Weir and Cockerham on a set of groups for each allele of a given locus.
static PermResults getWCMultilocusFstAndPerm(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, std::set< size_t > groups, int nb_perm)
Compute the Weir and Cockerham on a set of groups for a given set of loci and make a permutation tes...
static std::vector< size_t > getAllelesIdsForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Get the alleles' id at one locus for a set of groups.
static size_t countGametesForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Count the number of allele (gametes) at a locus for a set of groups.
static std::unique_ptr< DistanceMatrix > getDistanceMatrix(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, const std::set< size_t > &groups, std::string distance_method)
Compute pairwise distances on a set of groups for a given set of loci. distance is either Nei72,...
static std::map< size_t, Fstats > getAllelesFstats(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Compute the three F statistics of Weir and Cockerham for each allele of a given locus.
static double getHexpForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Compute the expected heterozygosity for one locus.
static double getRHMultilocusFst(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, const std::set< size_t > &groups)
Compute the on a set of groups for a given set of loci. The variance componenets for each allele are...
static double getHobsForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Compute the observed heterozygosity for one locus.
static size_t countNonMissingForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Count the number of non-missing data at a given locus for a set of groups.
static std::map< size_t, double > getAllelesFis(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Compute the Weir and Cockerham Fis on a set of groups for each allele of a given locus.
static PermResults getWCMultilocusFisAndPerm(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, std::set< size_t > groups, int nb_perm)
Compute the Weir and Cockerham Fis on a set of groups for a given set of loci and make a permutation ...
static std::map< size_t, size_t > getAllelesMapForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Get a map of allele count for a set of groups.
static double getWCMultilocusFst(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, const std::set< size_t > &groups)
Compute the Weir and Cockerham on a set of groups for a given set of loci. The variance componenets ...
static double getWCMultilocusFis(const PolymorphismMultiGContainer &pmgc, std::vector< size_t > locus_positions, const std::set< size_t > &groups)
Compute the Weir and Cockerham Fis on a set of groups for a given set of loci. The variance componene...
static std::map< size_t, double > getAllelesFrqForGroups(const PolymorphismMultiGContainer &pmgc, size_t locus_position, const std::set< size_t > &groups)
Get the alleles frequencies at one locus for a set of groups.
const MonolocusGenotype & getMonolocusGenotype(size_t locus_position) const
Get a MonolocusGenotype.
bool isMonolocusGenotypeMissing(size_t locus_position) const
Tell if a MonolocusGenotype is a missing data.
static PolymorphismMultiGContainer permutMultiG(const PolymorphismMultiGContainer &pmgc)
Permut the MultilocusGenotype in the whole PolymorphismMultiGContainer.
static PolymorphismMultiGContainer permutIntraGroupAlleles(const PolymorphismMultiGContainer &pmgc, const std::set< size_t > &groups)
Permut the Alleles between individuals in the same group.
static PolymorphismMultiGContainer extractGroups(const PolymorphismMultiGContainer &pmgc, const std::set< size_t > &groups)
The PolymorphismMultiGContainer class.
size_t getGroupId(size_t position) const
Get the Group id of a MultilocusGenotype.
const MultilocusGenotype * getMultilocusGenotype(size_t position) const
Get a MultilocusGenotype at a position.
std::vector< std::string > getAllGroupsNames() const
Get the groups names or ids if not available.
size_t getLocusGroupSize(size_t group, size_t locus_position) const
Get the size of a group for a given locus.
size_t size() const
Get the number of MultilocusGenotype.