25 for (
size_t i = 0; i < list.
size(); ++i)
36 for (
size_t i = 0; i < list.
size(); ++i)
38 if (VectorTools::sum(list[i]) <= NumConstants::TINY())
49 for (
size_t i = 0; i < list.
size(); ++i)
62 for (
size_t i = 0; i < list.
size(); ++i)
74 for (
size_t i = 0; i < list.
size(); ++i)
76 if (VectorTools::sum(list[i]) > NumConstants::TINY())
87 for (
size_t i = 0; i < list.
size(); ++i)
98 for (
size_t i = 0; i < list.
size(); ++i)
100 double ss = VectorTools::sum(list[i]);
101 if (ss > NumConstants::TINY() && ss < 1.)
112 for (
size_t i = 0; i < list.
size(); ++i)
114 if (list[i] == list.
getAlphabet()->getUnknownCharacterCode())
123 for (
size_t i = 0; i < list.
size(); ++i)
125 double ss = VectorTools::sum(list[i]);
137 for (
size_t i = 0; i < list.
size(); ++i)
148 for (
size_t i = 0; i < list.
size(); ++i)
150 double ss = VectorTools::sum(list[i]);
151 if (ss < NumConstants::TINY())
164 for (
size_t i = 0; i < list.
size(); ++i)
177 for (
size_t i = 0; i < list.
size(); ++i)
179 if (VectorTools::sum(list[i]) < NumConstants::TINY())
192 for (
size_t i = 0; i < list.
size(); ++i)
205 for (
size_t i = 0; i < list.
size(); ++i)
207 if (VectorTools::sum(list[i]) > 1.)
216 bool SymbolListTools::areSymbolListsIdentical(
227 for (
size_t i = 0; i < list1.
size(); ++i)
229 if (list1[i] != list2[i])
236 bool SymbolListTools::areSymbolListsIdentical(
247 for (
size_t i = 0; i < list1.
size(); ++i)
249 if (list1[i] != list2[i])
258 bool SymbolListTools::isConstant(
261 bool unresolvedRaisesException)
264 if (list.
size() == 0)
265 throw Exception(
"SymbolListTools::isConstant: Incorrect specified list, size must be > 0");
268 int gap = list.
getAlphabet()->getGapCharacterCode();
272 int unknown = list.
getAlphabet()->getUnknownCharacterCode();
274 while (i < list.
size() && (s == gap || s == unknown))
279 if (s == unknown || s == gap)
281 if (unresolvedRaisesException)
282 throw Exception(
"SymbolListTools::isConstant: IntCoreSymbolList is only made of gaps or generic characters.");
286 while (i < list.
size())
288 if (list[i] != s && list[i] != gap && list[i] != unknown)
297 while (i < list.
size() && s == gap)
304 if (unresolvedRaisesException)
305 throw Exception(
"SymbolListTools::isConstant: IntSymbolList is only made of gaps.");
309 while (i < list.
size())
311 if (list[i] != s && list[i] != gap)
320 bool SymbolListTools::isConstant(
322 bool unresolvedRaisesException)
325 if (list.
size() == 0)
326 throw Exception(
"SymbolListTools::isConstant: Incorrect specified list, size must be > 0");
330 double ss = VectorTools::sum(s);
332 while (i < list.
size() && (ss < NumConstants::TINY()))
335 ss = VectorTools::sum(s);
339 if (ss < NumConstants::TINY())
341 if (unresolvedRaisesException)
342 throw Exception(
"SymbolListTools::isConstant: ProbabilisticSymbolList is only made of gaps.");
346 while (i < list.
size())
348 if (list[i] != s && VectorTools::sum(list[i]) < NumConstants::TINY())
357 void SymbolListTools::getCountsResolveUnknowns(
360 map<
int, map<int, double>>& counts)
364 for (
size_t i = 0; i < list1.
size(); ++i)
366 vector<int> alias1 = list1.
getAlphabet()->getAlias(list1[i]);
367 vector<int> alias2 = list2.
getAlphabet()->getAlias(list2[i]);
368 double n1 = (double)alias1.size();
369 double n2 = (double)alias2.size();
370 for (
auto j : alias1)
372 for (
auto k : alias2)
374 counts[j][k] += 1. / (n1 * n2);
380 void SymbolListTools::getFrequencies(
382 map<int, double>& frequencies,
383 bool resolveUnknowns)
385 double n = (double)list.
size();
386 map<int, double> counts;
388 getCounts(list, counts, resolveUnknowns);
390 for (
auto it : counts)
392 frequencies[it.first] = it.second / n;
396 void SymbolListTools::getFrequencies(
399 map<
int, map<int, double>>& frequencies,
400 bool resolveUnknowns)
402 double n2 =
static_cast<double>(list1.
size()) *
static_cast<double>(list1.
size());
403 map<int, map<int, double>> counts;
404 getCounts(list1, list2, counts, resolveUnknowns);
406 for (
auto it1 : counts)
408 for (
auto it2 : it1.second)
410 frequencies[it1.first][it2.first] = it2.second / n2;
415 double SymbolListTools::getGCContent(
417 bool ignoreUnresolved,
421 if (!AlphabetTools::isNucleicAlphabet(*alphabet))
422 throw AlphabetException(
"SymbolListTools::getGCContent. Method only works on nucleotides.", alphabet);
425 for (
size_t i = 0; i < list.
size(); i++)
430 if (state == 1 || state == 2)
435 else if (state == 0 || state == 3)
441 if (!ignoreUnresolved)
446 case (7): gc++;
break;
447 case (4): gc += 0.5;
break;
448 case (5): gc += 0.5;
break;
449 case (6): gc += 0.5;
break;
450 case (9): gc += 0.5;
break;
451 case (10): gc += 2. / 3.;
break;
452 case (11): gc += 1. / 3.;
break;
453 case (12): gc += 1. / 3.;
break;
454 case (13): gc += 2. / 3.;
break;
455 case (14): gc += 0.5;
break;
466 return total != 0 ? gc / total : 0;
469 size_t SymbolListTools::getNumberOfDistinctPositions(
475 size_t n = min(l1.
size(), l2.
size());
477 for (
size_t i = 0; i < n; i++)
485 size_t SymbolListTools::getNumberOfPositionsWithoutGap(
491 size_t n = min(l1.
size(), l2.
size());
493 for (
size_t i = 0; i < n; i++)
495 if (l1[i] != -1 && l2[i] != -1)
503 int unknownCode = l.
getAlphabet()->getUnknownCharacterCode();
504 for (
size_t i = 0; i < l.
size(); i++)
513 int gapCode = l.
getAlphabet()->getGapCharacterCode();
514 for (
size_t i = 0; i < l.
size(); i++)
522 void SymbolListTools::getCountsResolveUnknowns(
525 map<
int, map<int, double>>& counts)
529 for (
size_t i = 0; i < list1.
size(); ++i)
531 const std::vector<double>& c1(list1[i]), &c2(list2[i]);
532 double s12 = VectorTools::sum(c1) * VectorTools::sum(c2);
534 for (
size_t j = 0; j < c1.size(); ++j)
536 for (
size_t k = 0; k < c2.size(); ++k)
538 counts[(int)j][(
int)k] += c1.at(j) * c2.at(k) / s12;
544 double SymbolListTools::getGCContent(
546 bool ignoreUnresolved,
550 if (!AlphabetTools::isNucleicAlphabet(*alphabet))
551 throw AlphabetException(
"SymbolListTools::getGCContent. Method only works on nucleotides.", alphabet);
554 for (
size_t i = 0; i < list.
size(); ++i)
557 double ss = VectorTools::sum(state);
563 gc += state.at(1) + state.at(2);
565 else if (!ignoreUnresolved)
568 gc += (state.at(1) + state.at(2)) / ss;
578 return total != 0 ? gc / total : 0;
581 size_t SymbolListTools::getNumberOfDistinctPositions(
588 size_t n = min(l1.
size(), l2.
size());
590 for (
size_t i = 0; i < n; ++i)
598 size_t SymbolListTools::getNumberOfPositionsWithoutGap(
604 size_t n = min(l1.
size(), l2.
size());
606 for (
size_t i = 0; i < n; ++i)
608 if (VectorTools::sum(l1[i]) > NumConstants::TINY() && VectorTools::sum(l2[i]) > NumConstants::TINY())
616 for (
size_t i = 0; i < l.
size(); ++i)
618 if (VectorTools::sum(l[i]) < NumConstants::TINY())
619 VectorTools::fill(l[i], 1.);
625 for (
size_t i = 0; i < l.
size(); ++i)
627 if (VectorTools::sum(l[i]) > 1.)
628 VectorTools::fill(l[i], 0.);
632 double SymbolListTools::variabilityShannon(
637 if (list.
size() == 0)
638 throw Exception(
"SymbolListTools::variabilityShannon: Incorrect specified list, size must be > 0.");
641 getFrequencies(list, p, resolveUnknown);
644 for (
int i = 0; i < static_cast<int>(list.
getAlphabet()->getSize()); i++)
656 double SymbolListTools::mutualInformation(
662 if (list1.
size() == 0)
663 throw Exception(
"SymbolListTools::mutualInformation: Incorrect specified list, size must be > 0");
664 if (list2.
size() == 0)
665 throw Exception(
"SymbolListTools::mutualInformation: Incorrect specified list, size must be > 0");
670 map<int, map<int, double>> p12;
671 getCounts(list1, list2, p12, resolveUnknown);
672 double mi = 0, tot = 0, pxy;
674 for (
size_t i = 0; i < list1.
getAlphabet()->getSize(); i++)
676 for (
size_t j = 0; j < list2.
getAlphabet()->getSize(); j++)
678 pxy = p12[
static_cast<int>(i)][
static_cast<int>(j)];
684 for (
size_t i = 0; i < list1.
getAlphabet()->getSize(); i++)
688 for (
size_t j = 0; j < list2.
getAlphabet()->getSize(); j++)
692 for (
size_t i = 0; i < list1.
getAlphabet()->getSize(); i++)
694 for (
size_t j = 0; j < list2.
getAlphabet()->getSize(); j++)
696 pxy = p12[
static_cast<int>(i)][
static_cast<int>(j)] / tot;
698 mi += pxy * log(pxy / (p1[i] * p2[j]));
706 double SymbolListTools::jointEntropy(
712 if (list1.
size() == 0)
713 throw Exception(
"SymbolListTools::jointEntropy: Incorrect specified list, size must be > 0");
714 if (list2.
size() == 0)
715 throw Exception(
"SymbolListTools::jointEntropy: Incorrect specified list, size must be > 0");
718 map<int, map<int, double>> p12;
719 getCounts(list1, list2, p12, resolveUnknown);
720 double tot = 0, pxy, h = 0;
722 for (
size_t i = 0; i < list1.
getAlphabet()->getSize(); ++i)
724 for (
size_t j = 0; j < list2.
getAlphabet()->getSize(); ++j)
726 pxy = p12[
static_cast<int>(i)][
static_cast<int>(j)];
730 for (
size_t i = 0; i < list1.
getAlphabet()->getSize(); ++i)
732 for (
size_t j = 0; j < list2.
getAlphabet()->getSize(); ++j)
734 pxy = p12[
static_cast<int>(i)][
static_cast<int>(j)] / tot;
744 double SymbolListTools::variabilityFactorial(
748 if (list.
size() == 0)
749 throw Exception(
"SymbolListTools::variabilityFactorial: Incorrect specified list, size must be > 0");
752 vector<size_t> c = MapTools::getValues(p);
753 size_t s = VectorTools::sum(c);
754 long double l =
static_cast<long double>(NumTools::fact(s)) /
static_cast<long double>(VectorTools::sum(VectorTools::fact(c)));
755 return static_cast<double>(std::log(l));
763 if (list.
size() == 0)
764 throw Exception(
"SymbolListTools::heterozygosity: Incorrect specified list, size must be > 0");
766 getFrequencies(list, p);
767 vector<double> c = MapTools::getValues(p);
768 double n = VectorTools::norm<double, double>(MapTools::getValues(p));
777 if (list.
size() == 0)
778 throw Exception(
"SymbolListTools::getNumberOfDistinctCharacters(): Incorrect specified list, size must be > 0");
780 if (SymbolListTools::isConstant(list))
782 map<int, size_t> counts;
783 SymbolListTools::getCounts(list, counts);
785 for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
798 if (list.
size() == 0)
799 throw Exception(
"SymbolListTools::getMajorAlleleFrequency(): Incorrect specified list, size must be > 0");
801 if (SymbolListTools::isConstant(list))
803 map<int, size_t> counts;
804 getCounts(list, counts);
806 for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
820 if (list.
size() == 0)
821 throw Exception(
"SymbolListTools::getMajorAllele(): Incorrect specified list, size must be > 0");
826 map<int, double> counts;
827 SymbolListTools::getCounts(list, counts);
830 for (
auto it : counts)
847 if (list.
size() == 0)
848 throw Exception(
"SymbolListTools::getMinorAlleleFrequency(): Incorrect specified list, size must be > 0.");
850 if (SymbolListTools::isConstant(list))
852 map<int, size_t> counts;
853 SymbolListTools::getCounts(list, counts);
854 size_t s = list.
size();
855 for (
auto it : counts)
869 if (list.
size() == 0)
870 throw Exception(
"SymbolListTools::getMinorAllele(): Incorrect specified list, size must be > 0.");
874 map<int, double> counts;
875 SymbolListTools::getCounts(list, counts);
876 double s = (double)list.
size();
878 for (
auto it : counts)
895 if (list.
size() == 0)
896 throw Exception(
"SymbolListTools::hasSingleton: Incorrect specified list, size must be > 0");
898 if (SymbolListTools::isConstant(list))
900 map<int, size_t> counts;
901 getCounts(list, counts);
902 for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
915 if (list.
size() == 0)
916 throw Exception(
"SymbolListTools::isParsimonyInformativeSite: Incorrect specified list, size must be > 0");
918 if (SymbolListTools::isConstant(list,
false,
false))
920 map<int, size_t> counts;
921 SymbolListTools::getCounts(list, counts);
923 for (map<int, size_t>::iterator it = counts.begin(); it != counts.end(); it++)
938 if (list.
size() == 0)
939 throw Exception(
"SymbolListTools::isTriplet: Incorrect specified list, size must be > 0");
941 return SymbolListTools::getNumberOfDistinctCharacters(list) >= 3;
949 if (list.
size() == 0)
950 throw Exception(
"SymbolListTools::isDoubleton: Incorrect specified list, size must be > 0");
952 return SymbolListTools::getNumberOfDistinctCharacters(list) == 2;
The alphabet exception base class.
Exception thrown when two alphabets do not match.
The CruxSymbolList interface.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get the alphabet associated to the list.
virtual size_t size() const =0
Get the number of elements in the list.
The specific IntSymbolList interface.
The ProbabilisticSymbolList interface.
virtual const T & getValue(size_t pos) const =0
checked access to a character in list.
std::size_t count(const std::string &s, const std::string &pattern)
This alphabet is used to deal NumericAlphabet.
std::vector< double > Vdouble