bpp-seq3  3.0.0
SymbolListTools.h
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #ifndef BPP_SEQ_SYMBOLLISTTOOLS_H
6 #define BPP_SEQ_SYMBOLLISTTOOLS_H
7 
9 
11 #include "IntSymbolList.h"
13 
14 // From the STL:
15 #include <map>
16 
17 namespace bpp
18 {
23 {
24 public:
26  virtual ~SymbolListTools() {}
27 
28 public:
33  static bool hasGap(const IntSymbolListInterface& site);
34  static bool hasGap(const ProbabilisticSymbolListInterface& site);
35 
36  static bool hasGap(const CruxSymbolListInterface& site)
37  {
38  try
39  {
40  return hasGap(dynamic_cast<const ProbabilisticSymbolListInterface&>(site));
41  }
42  catch (std::bad_cast&) {}
43  try
44  {
45  return hasGap(dynamic_cast<const IntSymbolListInterface&>(site));
46  }
47  catch (std::bad_cast&) {}
48  throw Exception("SymbolListTools::hasGap : unsupported CruxSymbolListInterface implementation.");
49  }
50 
55  static bool hasUnresolved(const IntSymbolListInterface& site);
56 
61  static bool isGapOnly(const IntSymbolListInterface& site);
62  static bool isGapOnly(const ProbabilisticSymbolListInterface& site);
63 
64  static bool isGapOnly(const CruxSymbolListInterface& site)
65  {
66  try
67  {
68  return isGapOnly(dynamic_cast<const ProbabilisticSymbolListInterface&>(site));
69  }
70  catch (std::bad_cast&) {}
71  try
72  {
73  return isGapOnly(dynamic_cast<const IntSymbolListInterface&>(site));
74  }
75  catch (std::bad_cast&) {}
76  throw Exception("SymbolListTools::numberOfGaps : unsupported CruxSymbolListInterface implementation.");
77  }
78 
83  static size_t numberOfGaps(const IntSymbolListInterface& site);
84  static size_t numberOfGaps(const ProbabilisticSymbolListInterface& site);
85 
86  static size_t numberOfGaps(const CruxSymbolListInterface& site)
87  {
88  try
89  {
90  return numberOfGaps(dynamic_cast<const ProbabilisticSymbolListInterface&>(site));
91  }
92  catch (std::bad_cast&) {}
93  try
94  {
95  return numberOfGaps(dynamic_cast<const IntSymbolListInterface&>(site));
96  }
97  catch (std::bad_cast&) {}
98  throw Exception("SymbolListTools::numberOfGaps : unsupported CruxSymbolListInterface implementation.");
99  }
100 
105  static bool isGapOrUnresolvedOnly(const IntSymbolListInterface& site);
107 
109  {
110  try
111  {
112  return isGapOrUnresolvedOnly(dynamic_cast<const ProbabilisticSymbolListInterface&>(site));
113  }
114  catch (std::bad_cast&) {}
115  try
116  {
117  return isGapOrUnresolvedOnly(dynamic_cast<const IntSymbolListInterface&>(site));
118  }
119  catch (std::bad_cast&) {}
120  throw Exception("SymbolListTools::isGapOrUnresolvedOnly : unsupported CruxSymbolListInterface implementation.");
121  }
122 
127  static size_t numberOfUnresolved(const IntSymbolListInterface& site);
128  static size_t numberOfUnresolved(const ProbabilisticSymbolListInterface& site);
129 
130  static size_t numberOfUnresolved(const CruxSymbolListInterface& site)
131  {
132  try
133  {
134  return numberOfUnresolved(dynamic_cast<const ProbabilisticSymbolListInterface&>(site));
135  }
136  catch (std::bad_cast&) {}
137  try
138  {
139  return numberOfUnresolved(dynamic_cast<const IntSymbolListInterface&>(site));
140  }
141  catch (std::bad_cast&) {}
142  throw Exception("SymbolListTools::numberOfUnresolved : unsupported CruxSymbolListInterface implementation.");
143  }
144 
149  static bool hasUnknown(const IntSymbolListInterface& site);
150  static bool hasUnknown(const ProbabilisticSymbolListInterface& site);
151 
152  static bool hasUnknown(const CruxSymbolListInterface& site)
153  {
154  try
155  {
156  return hasUnknown(dynamic_cast<const ProbabilisticSymbolListInterface&>(site));
157  }
158  catch (std::bad_cast&) {}
159  try
160  {
161  return hasUnknown(dynamic_cast<const IntSymbolListInterface&>(site));
162  }
163  catch (std::bad_cast&) {}
164  throw Exception("SymbolListTools::hasUnknown : unsupported CruxSymbolListInterface implementation.");
165  }
166 
171  static bool isComplete(const IntSymbolListInterface& site);
172  static bool isComplete(const ProbabilisticSymbolListInterface& site);
173 
174  static bool isComplete(const CruxSymbolListInterface& site)
175  {
176  try
177  {
178  return isComplete(dynamic_cast<const ProbabilisticSymbolListInterface&>(site));
179  }
180  catch (std::bad_cast&) {}
181  try
182  {
183  return isComplete(dynamic_cast<const IntSymbolListInterface&>(site));
184  }
185  catch (std::bad_cast&) {}
186  throw Exception("SymbolListTools::isComplete : unsupported CruxSymbolListInterface implementation.");
187  }
188 
189 
199  static bool isConstant(
200  const IntSymbolListInterface& site,
201  bool ignoreUnknown = false,
202  bool unresolvedRaisesException = true);
203 
204  static bool isConstant(
206  bool unresolvedRaisesException = true);
207 
208  static bool isConstant(
209  const CruxSymbolListInterface& site,
210  bool ignoreUnknown = false,
211  bool unresolvedRaisesException = true)
212  {
213  try
214  {
215  return isConstant(dynamic_cast<const ProbabilisticSymbolListInterface&>(site), unresolvedRaisesException);
216  }
217  catch (std::bad_cast&) {}
218  try
219  {
220  return isConstant(dynamic_cast<const IntSymbolListInterface&>(site), ignoreUnknown, unresolvedRaisesException);
221  }
222  catch (std::bad_cast&) {}
223  throw Exception("SymbolListTools::isConstant : unsupported CruxSymbolListInterface implementation.");
224  }
225 
231  static bool areSymbolListsIdentical(
232  const IntSymbolListInterface& list1,
233  const IntSymbolListInterface& list2);
234 
235  static bool areSymbolListsIdentical(
237  const ProbabilisticSymbolListInterface& list2);
238 
240  const CruxSymbolListInterface& l1,
241  const CruxSymbolListInterface& l2)
242  {
243  try
244  {
245  return areSymbolListsIdentical(dynamic_cast<const ProbabilisticSymbolListInterface&>(l1), dynamic_cast<const ProbabilisticSymbolListInterface&>(l2));
246  }
247  catch (std::bad_cast&) {}
248  try
249  {
250  return areSymbolListsIdentical(dynamic_cast<const IntSymbolListInterface&>(l1), dynamic_cast<const IntSymbolListInterface&>(l2));
251  }
252  catch (std::bad_cast&) {}
253  throw Exception("SymbolListTools::areSymbolListsIdentical : unsupported CruxSymbolListInterface implementation.");
254  }
255 
256 
264  template<class count_type>
265  static void getCounts(
266  const IntSymbolListInterface& list,
267  std::map<int, count_type>& counts)
268  {
269  for (size_t i = 0; i < list.size(); ++i)
270  {
271  counts[list[i]]++;
272  }
273  }
274 
282  static void getCounts(
284  std::map<int, double_t>& counts)
285  {
286  for (size_t i = 0; i < list.size(); ++i)
287  {
288  const std::vector<double>& c = list[i];
289  for (size_t j = 0; j < c.size(); ++j)
290  {
291  counts[(int)j] += c.at(j);
292  }
293  }
294  }
295 
307  const IntSymbolListInterface& list,
308  std::map<int, double>& counts)
309  {
310  for (size_t i = 0; i < list.size(); ++i)
311  {
312  std::vector<int> alias = list.getAlphabet()->getAlias(list[i]);
313  double n = static_cast<double>(alias.size());
314  for (auto j : alias)
315  {
316  counts[j] += 1. / n;
317  }
318  }
319  }
320 
333  std::map<int, double>& counts)
334  {
335  for (size_t i = 0; i < list.size(); ++i)
336  {
337  const std::vector<double>& c = list[i];
338  double s = VectorTools::sum(c);
339 
340  if (s != 0)
341  for (size_t j = 0; j < c.size(); j++)
342  {
343  counts[(int)j] += c.at(j) / s;
344  }
345  }
346  }
347 
348 
360  static void getCounts(
361  const CruxSymbolListInterface& list,
362  std::map<int, double>& counts,
363  bool resolveUnknowns = false)
364  {
365  try
366  {
367  if (resolveUnknowns)
368  {
369  getCountsResolveUnknowns(dynamic_cast<const ProbabilisticSymbolListInterface&>(list), counts);
370  return;
371  }
372  else
373  {
374  getCounts(dynamic_cast<const ProbabilisticSymbolListInterface&>(list), counts);
375  return;
376  }
377  }
378  catch (std::bad_cast&) {}
379 
380  try
381  {
382  if (resolveUnknowns)
383  {
384  getCountsResolveUnknowns(dynamic_cast<const IntSymbolListInterface&>(list), counts);
385  return;
386  }
387  else
388  {
389  getCounts<double>(dynamic_cast<const IntSymbolListInterface&>(list), counts);
390  return;
391  }
392  }
393  catch (std::bad_cast&) {}
394 
395  throw Exception("SymbolListTools::getCounts : unsupported CruxSymbolListInterface implementation (" + std::string(typeid(list).name()) + ").");
396  }
397 
398 
411  template<class count_type>
412  static void getCounts(
413  const IntSymbolListInterface& list1,
414  const IntSymbolListInterface& list2,
415  std::map<int, std::map<int, count_type>>& counts)
416  {
417  if (list1.size() != list2.size()) throw DimensionException("SymbolListTools::getCounts: the two sites must have the same size.", list1.size(), list2.size());
418  for (size_t i = 0; i < list1.size(); ++i)
419  {
420  counts[list1[i]][list2[i]]++;
421  }
422  }
423 
437  static void getCounts(
440  std::map<int, std::map<int, double>>& counts)
441  {
442  if (list1.size() != list2.size()) throw DimensionException("SymbolListTools::getCounts: the two sites must have the same size.", list1.size(), list2.size());
443  for (size_t i = 0; i < list1.size(); ++i)
444  {
445  const std::vector<double>& c1(list1[i]), &c2(list2[i]);
446  for (size_t j = 0; j < c1.size(); ++j)
447  {
448  for (size_t k = 0; k < c2.size(); ++k)
449  {
450  counts[(int)j][(int)k] += c1.at(j) * c2.at(k);
451  }
452  }
453  }
454  }
455 
456 
472  static void getCountsResolveUnknowns(
473  const IntSymbolListInterface& list1,
474  const IntSymbolListInterface& list2,
475  std::map< int, std::map<int, double>>& counts);
476 
492  static void getCountsResolveUnknowns(
495  std::map< int, std::map<int, double>>& counts);
496 
514  static void getCounts(
515  const CruxSymbolListInterface& list1,
516  const CruxSymbolListInterface& list2,
517  std::map<int, std::map<int, double>>& counts,
518  bool resolveUnknowns)
519  {
520  try
521  {
522  if (resolveUnknowns)
523  {
524  getCountsResolveUnknowns(dynamic_cast<const ProbabilisticSymbolListInterface&>(list1), dynamic_cast<const ProbabilisticSymbolListInterface&>(list2), counts);
525  return;
526  }
527  else
528  {
529  getCounts(dynamic_cast<const ProbabilisticSymbolListInterface&>(list1), dynamic_cast<const ProbabilisticSymbolListInterface&>(list2), counts);
530  return;
531  }
532  }
533  catch (std::bad_cast&) {}
534 
535  try
536  {
537  if (resolveUnknowns)
538  {
539  getCountsResolveUnknowns(dynamic_cast<const IntSymbolListInterface&>(list1), dynamic_cast<const IntSymbolListInterface&>(list2), counts);
540  return;
541  }
542  else
543  {
544  getCounts<double>(dynamic_cast<const IntSymbolListInterface&>(list1), dynamic_cast<const IntSymbolListInterface&>(list2), counts);
545  return;
546  }
547  }
548  catch (std::bad_cast&) {}
549 
550  throw Exception("SymbolListTools::getCounts : unsupported CruxSymbolListInterface implementation (" + std::string(typeid(list1).name()) + ", " + std::string(typeid(list2).name()) + ").");
551  }
552 
553 
563  static void getFrequencies(
564  const CruxSymbolListInterface& list,
565  std::map<int, double>& frequencies,
566  bool resolveUnknowns = false);
567 
581  static void getFrequencies(
582  const CruxSymbolListInterface& list1,
583  const CruxSymbolListInterface& list2,
584  std::map<int, std::map<int, double>>& frequencies,
585  bool resolveUnknowns = false);
586 
599  static double getGCContent(
600  const IntSymbolListInterface& list,
601  bool ignoreUnresolved = true,
602  bool ignoreGap = true);
603 
604  static double getGCContent(
606  bool ignoreUnresolved = true,
607  bool ignoreGap = true);
608 
609  static double getGCContent(
610  const CruxSymbolListInterface& list,
611  bool ignoreUnresolved = true,
612  bool ignoreGap = true)
613  {
614  try
615  {
616  return getGCContent(dynamic_cast<const ProbabilisticSymbolListInterface&>(list));
617  }
618  catch (std::bad_cast&) {}
619  try
620  {
621  return getGCContent(dynamic_cast<const IntSymbolListInterface&>(list));
622  }
623  catch (std::bad_cast&) {}
624  throw Exception("SymbolListTools::getGCContent : unsupported CruxSymbolListInterface implementation.");
625  }
626 
638  static size_t getNumberOfDistinctPositions(
639  const IntSymbolListInterface& l1,
640  const IntSymbolListInterface& l2);
641 
642  static size_t getNumberOfDistinctPositions(
645 
647  const CruxSymbolListInterface& l1,
648  const CruxSymbolListInterface& l2)
649  {
650  try
651  {
652  return getNumberOfDistinctPositions(dynamic_cast<const ProbabilisticSymbolListInterface&>(l1), dynamic_cast<const ProbabilisticSymbolListInterface&>(l2));
653  }
654  catch (std::bad_cast&) {}
655  try
656  {
657  return getNumberOfDistinctPositions(dynamic_cast<const IntSymbolListInterface&>(l1), dynamic_cast<const IntSymbolListInterface&>(l2));
658  }
659  catch (std::bad_cast&) {}
660  throw Exception("SymbolListTools::getNumberOfDistinctPositions : unsupported CruxSymbolListInterface implementation.");
661  }
662 
674  static size_t getNumberOfPositionsWithoutGap(
675  const IntSymbolListInterface& l1,
676  const IntSymbolListInterface& l2);
677 
678  static size_t getNumberOfPositionsWithoutGap(
681 
683  const CruxSymbolListInterface& l1,
684  const CruxSymbolListInterface& l2)
685  {
686  try
687  {
688  return getNumberOfPositionsWithoutGap(dynamic_cast<const ProbabilisticSymbolListInterface&>(l1), dynamic_cast<const ProbabilisticSymbolListInterface&>(l2));
689  }
690  catch (std::bad_cast&) {}
691  try
692  {
693  return getNumberOfPositionsWithoutGap(dynamic_cast<const IntSymbolListInterface&>(l1), dynamic_cast<const IntSymbolListInterface&>(l2));
694  }
695  catch (std::bad_cast&) {}
696  throw Exception("SymbolListTools::getNumberOfPositionsWithoutGap : unsupported CruxSymbolListInterface implementation.");
697  }
698 
706 
708 
710  {
711  try
712  {
714  return;
715  }
716  catch (std::bad_cast&) {}
717  try
718  {
720  return;
721  }
722  catch (std::bad_cast&) {}
723  throw Exception("SymbolListTools::changeGapsToUnknownCharacters : unsupported CruxSymbolListInterface implementation.");
724  }
725 
733 
735 
737  {
738  try
739  {
741  return;
742  }
743  catch (std::bad_cast&) {}
744  try
745  {
747  return;
748  }
749  catch (std::bad_cast&) {}
750  throw Exception("SymbolListTools::changeUnresolvedCharactersToGaps : unsupported CruxSymbolListInterface implementation.");
751  }
752 
766  static double variabilityShannon(
767  const CruxSymbolListInterface& list,
768  bool resolveUnknowns);
769 
782  static double variabilityFactorial(const IntSymbolListInterface& list);
783 
799  static double mutualInformation(
800  const CruxSymbolListInterface& list1,
801  const CruxSymbolListInterface& list2,
802  bool resolveUnknowns);
803 
817  static double entropy(const CruxSymbolListInterface& list, bool resolveUnknowns)
818  {
819  return variabilityShannon(list, resolveUnknowns);
820  }
821 
836  static double jointEntropy(
837  const CruxSymbolListInterface& list1,
838  const CruxSymbolListInterface& list2,
839  bool resolveUnknowns);
840 
852  static double heterozygosity(const CruxSymbolListInterface& list);
853 
860  static size_t getNumberOfDistinctCharacters(const IntSymbolListInterface& list);
861 
868  static size_t getMajorAlleleFrequency(const IntSymbolListInterface& list);
869 
876  static int getMajorAllele(const CruxSymbolListInterface& list);
877 
884  static size_t getMinorAlleleFrequency(const IntSymbolListInterface& list);
885 
892  static int getMinorAllele(const CruxSymbolListInterface& list);
893 
901  static bool hasSingleton(const IntSymbolListInterface& list);
902 
911  static bool isParsimonyInformativeSite(const IntSymbolListInterface& site);
912 
913 
920  static bool isTriplet(const IntSymbolListInterface& list);
921 
928  static bool isDoubleton(const IntSymbolListInterface& list);
929 };
930 } // end of namespace bpp.
931 #endif // BPP_SEQ_SYMBOLLISTTOOLS_H
The CruxSymbolList interface.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get the alphabet associated to the list.
virtual size_t size() const =0
Get the number of elements in the list.
The specific IntSymbolList interface.
Definition: IntSymbolList.h:29
The ProbabilisticSymbolList interface.
Utility functions dealing with both sites and sequences.
static bool isComplete(const CruxSymbolListInterface &site)
static double mutualInformation(const CruxSymbolListInterface &list1, const CruxSymbolListInterface &list2, bool resolveUnknowns)
Compute the mutual information between two lists.
static bool areSymbolListsIdentical(const IntSymbolListInterface &list1, const IntSymbolListInterface &list2)
static bool isParsimonyInformativeSite(const IntSymbolListInterface &site)
Tell if a site is a parsimony informative site.
static int getMinorAllele(const CruxSymbolListInterface &list)
return the state corresponding to the least common allele.
static bool isGapOnly(const IntSymbolListInterface &site)
static size_t numberOfGaps(const IntSymbolListInterface &site)
static void changeUnresolvedCharactersToGaps(IntSymbolListInterface &l)
Change all unknown characters to gap elements (or columns of 0).
static int getMajorAllele(const CruxSymbolListInterface &list)
return the state corresponding to the most common allele.
static size_t getNumberOfPositionsWithoutGap(const CruxSymbolListInterface &l1, const CruxSymbolListInterface &l2)
static size_t numberOfGaps(const CruxSymbolListInterface &site)
static size_t getNumberOfDistinctPositions(const CruxSymbolListInterface &l1, const CruxSymbolListInterface &l2)
static double variabilityFactorial(const IntSymbolListInterface &list)
Compute the factorial diversity index of a site.
static size_t getMajorAlleleFrequency(const IntSymbolListInterface &list)
return the number of occurrences of the most common allele.
static bool hasSingleton(const IntSymbolListInterface &list)
Tell if a list has singletons.
static bool hasUnknown(const CruxSymbolListInterface &site)
static void getCounts(const CruxSymbolListInterface &list1, const CruxSymbolListInterface &list2, std::map< int, std::map< int, double >> &counts, bool resolveUnknowns)
Count all pairs of states for two lists of the same size, optionally resolving unknown characters.
static void changeGapsToUnknownCharacters(IntSymbolListInterface &l)
Change all gap elements to unknown characters (or columns of 1).
static double entropy(const CruxSymbolListInterface &list, bool resolveUnknowns)
Compute the entropy of a site. This is an alias of method variabilityShannon.
static bool hasUnknown(const IntSymbolListInterface &site)
static bool hasGap(const CruxSymbolListInterface &site)
static bool isGapOrUnresolvedOnly(const CruxSymbolListInterface &site)
static bool isConstant(const IntSymbolListInterface &site, bool ignoreUnknown=false, bool unresolvedRaisesException=true)
Tell if a site is constant, that is displaying the same state in all sequences that do not present a ...
static double getGCContent(const IntSymbolListInterface &list, bool ignoreUnresolved=true, bool ignoreGap=true)
Get the GC content of a symbol list.
static void changeUnresolvedCharactersToGaps(CruxSymbolListInterface &l)
static void getCounts(const IntSymbolListInterface &list1, const IntSymbolListInterface &list2, std::map< int, std::map< int, count_type >> &counts)
Count all pairs of states for two lists of the same size.
static double getGCContent(const CruxSymbolListInterface &list, bool ignoreUnresolved=true, bool ignoreGap=true)
static void getCounts(const IntSymbolListInterface &list, std::map< int, count_type > &counts)
Count all states in the list.
static double variabilityShannon(const CruxSymbolListInterface &list, bool resolveUnknowns)
Compute the Shannon entropy index of a SymbolList.
static void getFrequencies(const CruxSymbolListInterface &list, std::map< int, double > &frequencies, bool resolveUnknowns=false)
Get all states frequencies in the list.
static size_t getNumberOfDistinctCharacters(const IntSymbolListInterface &list)
Give the number of distinct characters in a list.
static void getCountsResolveUnknowns(const ProbabilisticSymbolListInterface &list, std::map< int, double > &counts)
Count all states in the list normalizing unknown characters.
static bool isGapOnly(const CruxSymbolListInterface &site)
static bool isDoubleton(const IntSymbolListInterface &list)
Tell if a list has exactly 2 distinct characters.
static void changeGapsToUnknownCharacters(CruxSymbolListInterface &l)
static void getCounts(const ProbabilisticSymbolListInterface &list, std::map< int, double_t > &counts)
Sum all states in the list.
static bool hasGap(const IntSymbolListInterface &site)
static bool isTriplet(const IntSymbolListInterface &list)
Tell if a list has more than 2 distinct characters.
static double heterozygosity(const CruxSymbolListInterface &list)
Compute the heterozygosity index of a list.
static size_t numberOfUnresolved(const IntSymbolListInterface &site)
static void getCounts(const CruxSymbolListInterface &list, std::map< int, double > &counts, bool resolveUnknowns=false)
Count all states in the list, optionally resolving unknown characters.
static size_t getNumberOfPositionsWithoutGap(const IntSymbolListInterface &l1, const IntSymbolListInterface &l2)
Get the number of positions without gap (or without null column).
static size_t getNumberOfDistinctPositions(const IntSymbolListInterface &l1, const IntSymbolListInterface &l2)
Get the number of distinct positions.
static bool areSymbolListsIdentical(const CruxSymbolListInterface &l1, const CruxSymbolListInterface &l2)
static size_t getMinorAlleleFrequency(const IntSymbolListInterface &list)
return the number of occurrences of the least common allele.
static void getCountsResolveUnknowns(const IntSymbolListInterface &list, std::map< int, double > &counts)
Count all states in the list normalizing unknown characters.
static bool isConstant(const CruxSymbolListInterface &site, bool ignoreUnknown=false, bool unresolvedRaisesException=true)
static bool isComplete(const IntSymbolListInterface &site)
static bool hasUnresolved(const IntSymbolListInterface &site)
static size_t numberOfUnresolved(const CruxSymbolListInterface &site)
static double jointEntropy(const CruxSymbolListInterface &list1, const CruxSymbolListInterface &list2, bool resolveUnknowns)
Compute the joint entropy between two lists.
static bool isGapOrUnresolvedOnly(const IntSymbolListInterface &site)
static void getCounts(const ProbabilisticSymbolListInterface &list1, const ProbabilisticSymbolListInterface &list2, std::map< int, std::map< int, double >> &counts)
Sum, along the lists, the joint probabilities for all pairs of states for two lists of the same size.
static T sum(const std::vector< T > &v1)
This alphabet is used to deal NumericAlphabet.