bpp-seq3  3.0.0
SiteContainerTools.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
6 
7 #include "../Alphabet/AlphabetTools.h"
8 #include "../Site.h"
9 #include "../CodonSiteTools.h"
10 #include "../SequenceTools.h"
11 #include "../SymbolListTools.h"
12 #include "SequenceContainerTools.h"
13 #include "SiteContainerIterator.h"
14 #include "SiteContainerTools.h"
15 #include "VectorSiteContainer.h"
16 
17 using namespace bpp;
18 
19 // From the STL:
20 #include <vector>
21 #include <deque>
22 #include <string>
23 
24 using namespace std;
25 
26 /******************************************************************************/
27 
28 unique_ptr<Sequence> SiteContainerTools::getConsensus(
29  const SiteContainerInterface& sc,
30  const std::string& name,
31  bool ignoreGap,
32  bool resolveUnknown)
33 {
34  Vint consensus;
36  const Site* site;
37  while (ssi.hasMoreSites())
38  {
39  site = &ssi.nextSite();
40  map<int, double> freq;
41  SiteTools::getFrequencies(*site, freq, resolveUnknown);
42  double max = 0;
43  int cons = -1; // default result
44  if (ignoreGap)
45  {
46  for (auto& it : freq)
47  {
48  if (it.second > max && it.first != -1)
49  {
50  max = it.second;
51  cons = it.first;
52  }
53  }
54  }
55  else
56  {
57  for (auto& it : freq)
58  {
59  if (it.second > max)
60  {
61  max = it.second;
62  cons = it.first;
63  }
64  }
65  }
66  consensus.push_back(cons);
67  }
68  auto alphaPtr = sc.getAlphabet();
69  auto seqConsensus = make_unique<Sequence>(name, consensus, alphaPtr);
70  return seqConsensus;
71 }
72 
73 /******************************************************************************/
74 
76 {
77  int unknownCode = sites.getAlphabet()->getUnknownCharacterCode();
78  for (size_t i = 0; i < sites.getNumberOfSites(); ++i)
79  {
80  unique_ptr<Site> site(sites.site(i).clone());
81  if (SiteTools::hasGap(*site))
82  {
83  for (size_t j = 0; j < sites.getNumberOfSequences(); ++j)
84  {
85  if (sites.getAlphabet()->isGap((*site)[j]))
86  {
87  (*site)[j] = unknownCode;
88  }
89  }
90  sites.setSite(i, site, false);
91  }
92  }
93 }
94 
95 /******************************************************************************/
96 
98 {
99  for (size_t i = 0; i < sites.getNumberOfSites(); ++i)
100  {
101  unique_ptr<ProbabilisticSite> site(sites.site(i).clone());
102  if (SiteTools::hasGap(*site))
103  {
104  for (size_t j = 0; j < sites.getNumberOfSequences(); ++j)
105  {
106  vector<double>& element = (*site)[j];
107  if (VectorTools::sum(element) <= NumConstants::TINY())
108  {
109  VectorTools::fill(element, 0.);
110  }
111  }
112  sites.setSite(i, site, false);
113  }
114  }
115 }
116 
117 /******************************************************************************/
118 
120 {
121  // NB: use iterators for a better algorithm?
122  int gapCode = sites.getAlphabet()->getGapCharacterCode();
123  for (size_t i = 0; i < sites.getNumberOfSites(); ++i)
124  {
125  unique_ptr<Site> site(sites.site(i).clone());
126  if (SiteTools::hasUnresolved(*site))
127  {
128  for (size_t j = 0; j < sites.getNumberOfSequences(); ++j)
129  {
130  if (sites.getAlphabet()->isUnresolved((*site)[j]))
131  (*site)[j] = gapCode;
132  }
133  sites.setSite(i, site, false);
134  }
135  }
136 }
137 
138 /******************************************************************************/
139 
140 unique_ptr<SiteContainerInterface> SiteContainerTools::resolveDottedAlignment(
141  const SiteContainerInterface& dottedAln,
142  std::shared_ptr<const Alphabet>& resolvedAlphabet)
143 {
144  if (!AlphabetTools::isDefaultAlphabet(dottedAln.alphabet()))
145  throw AlphabetException("SiteContainerTools::resolveDottedAlignment. Alignment alphabet should of class 'DefaultAlphabet'.", dottedAln.getAlphabet());
146 
147  // First we look for the reference sequence:
148  size_t n = dottedAln.getNumberOfSequences();
149  if (n == 0)
150  throw Exception("SiteContainerTools::resolveDottedAlignment. Input alignment contains no sequence.");
151 
152  const Sequence* refSeq = 0;
153  for (size_t i = 0; i < n; ++i) // Test each sequence
154  {
155  const Sequence& seq = dottedAln.sequence(i);
156  bool isRef = true;
157  for (size_t j = 0; isRef && j < seq.size(); ++j) // For each site in the sequence
158  {
159  if (seq.getChar(j) == ".")
160  isRef = false;
161  }
162  if (isRef) // We found the reference sequence!
163  {
164  refSeq = &seq;
165  }
166  }
167  if (!refSeq)
168  throw Exception("SiteContainerTools::resolveDottedAlignment. No reference sequence was found in the input alignment.");
169 
170  // Now we build a new VectorSiteContainer:
171  auto sites = make_unique<VectorSiteContainer>(dottedAln.getSequenceKeys(), resolvedAlphabet);
172  sites->setComments(dottedAln.getComments());
173 
174  // We add each site one by one:
175  size_t m = dottedAln.getNumberOfSites();
176  string state;
177  for (size_t i = 0; i < m; ++i)
178  {
179  string resolved = refSeq->getChar(i);
180  const Site& site = dottedAln.site(i);
181  auto resolvedSite = make_unique<Site>(resolvedAlphabet, site.getCoordinate());
182  for (size_t j = 0; j < n; ++j)
183  {
184  state = site.getChar(j);
185  if (state == ".")
186  {
187  state = resolved;
188  }
189  resolvedSite->addElement(state);
190  }
191  // Add the new site:
192  sites->addSite(resolvedSite, false);
193  }
194 
195  // Return result:
196  return sites;
197 }
198 
199 /******************************************************************************/
200 
201 std::map<size_t, size_t> SiteContainerTools::getSequencePositions(const Sequence& seq)
202 {
203  int gapCode = seq.getAlphabet()->getGapCharacterCode();
204  map<size_t, size_t> tln;
205  if (seq.size() == 0)
206  return tln;
207  size_t count = 0;
208  for (size_t i = 0; i < seq.size(); ++i)
209  {
210  if (seq[i] != gapCode)
211  {
212  count++;
213  tln[i + 1] = count;
214  }
215  }
216  return tln;
217 }
218 
219 /******************************************************************************/
220 
221 std::map<size_t, size_t> SiteContainerTools::getAlignmentPositions(const Sequence& seq)
222 {
223  int gapCode = seq.getAlphabet()->getGapCharacterCode();
224  map<size_t, size_t> tln;
225  if (seq.size() == 0)
226  return tln;
227  size_t count = 0;
228  for (size_t i = 0; i < seq.size(); ++i)
229  {
230  if (seq[i] != gapCode)
231  {
232  count++;
233  tln[count] = i + 1;
234  }
235  }
236  return tln;
237 }
238 
239 /******************************************************************************/
240 
241 std::map<size_t, size_t> SiteContainerTools::translateAlignment(const Sequence& seq1, const Sequence& seq2)
242 {
243  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
244  throw AlphabetMismatchException("SiteContainerTools::translateAlignment", seq1.getAlphabet(), seq2.getAlphabet());
245  map<size_t, size_t> tln;
246  if (seq1.size() == 0)
247  return tln;
248  unsigned int count1 = 0;
249  unsigned int count2 = 0;
250  if (seq2.size() == 0)
251  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
252  int state1 = seq1[count1];
253  int state2 = seq2[count2];
254  bool end = false;
255  while (!end)
256  {
257  while (state1 == -1)
258  {
259  count1++;
260  if (count1 < seq1.size())
261  state1 = seq1[count1];
262  else
263  break;
264  }
265  while (state2 == -1)
266  {
267  count2++;
268  if (count2 < seq2.size())
269  state2 = seq2[count2];
270  else
271  break;
272  }
273  if (state1 != state2)
274  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
275  tln[count1 + 1] = count2 + 1; // Count start at 1
276  if (count1 == seq1.size() - 1)
277  end = true;
278  else
279  {
280  if (count2 == seq2.size() - 1)
281  {
282  state1 = seq1[++count1];
283  while (state1 == -1)
284  {
285  count1++;
286  if (count1 < seq1.size())
287  state1 = seq1[count1];
288  else
289  break;
290  }
291  if (state1 == -1)
292  end = true;
293  else
294  throw Exception("SiteContainerTools::translateAlignment. Sequences do not match at position " + TextTools::toString(count1 + 1) + " and " + TextTools::toString(count2 + 1) + ".");
295  }
296  else
297  {
298  state1 = seq1[++count1];
299  state2 = seq2[++count2];
300  }
301  }
302  }
303  return tln;
304 }
305 
306 /******************************************************************************/
307 
308 std::map<size_t, size_t> SiteContainerTools::translateSequence(const SiteContainerInterface& sequences, size_t i1, size_t i2)
309 {
310  const Sequence& seq1 = sequences.sequence(i1);
311  const Sequence& seq2 = sequences.sequence(i2);
312  map<size_t, size_t> tln;
313  size_t count1 = 0; // Sequence 1 counter
314  size_t count2 = 0; // Sequence 2 counter
315  int state1;
316  int state2;
317  for (size_t i = 0; i < sequences.getNumberOfSites(); ++i)
318  {
319  state1 = seq1[i];
320  if (state1 != -1)
321  count1++;
322  state2 = seq2[i];
323  if (state2 != -1)
324  count2++;
325  if (state1 != -1)
326  {
327  tln[count1] = (state2 == -1 ? 0 : count2);
328  }
329  }
330  return tln;
331 }
332 
333 /******************************************************************************/
334 
335 std::unique_ptr<AlignedSequenceContainer> SiteContainerTools::alignNW(
336  const Sequence& seq1,
337  const Sequence& seq2,
338  const AlphabetIndex2& s,
339  double gap)
340 {
341  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
342  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
343  if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
344  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
345  // Check that sequences have no gap!
346  unique_ptr<Sequence> s1(seq1.clone());
348  unique_ptr<Sequence> s2(seq2.clone());
350 
351  // 1) Initialize matrix:
352  RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
353  RowMatrix<char> p(s1->size(), s2->size());
354  double choice1, choice2, choice3, mx;
355  char px;
356  for (size_t i = 0; i <= s1->size(); i++)
357  {
358  m(i, 0) = static_cast<double>(i) * gap;
359  }
360  for (size_t j = 0; j <= s2->size(); j++)
361  {
362  m(0, j) = static_cast<double>(j) * gap;
363  }
364  for (size_t i = 1; i <= s1->size(); i++)
365  {
366  for (size_t j = 1; j <= s2->size(); j++)
367  {
368  choice1 = m(i - 1, j - 1) + static_cast<double>(s.getIndex((*s1)[i - 1], (*s2)[j - 1]));
369  choice2 = m(i - 1, j) + gap;
370  choice3 = m(i, j - 1) + gap;
371  mx = choice1; px = 'd'; // Default in case of equality of scores.
372  if (choice2 > mx)
373  {
374  mx = choice2; px = 'u';
375  }
376  if (choice3 > mx)
377  {
378  mx = choice3; px = 'l';
379  }
380  m(i, j) = mx;
381  p(i - 1, j - 1) = px;
382  }
383  }
384 
385  // 2) Get alignment:
386  deque<int> a1, a2;
387  size_t i = s1->size(), j = s2->size();
388  char c;
389  while (i > 0 && j > 0)
390  {
391  c = p(i - 1, j - 1);
392  if (c == 'd')
393  {
394  a1.push_front((*s1)[i - 1]);
395  a2.push_front((*s2)[j - 1]);
396  i--;
397  j--;
398  }
399  else if (c == 'u')
400  {
401  a1.push_front((*s1)[i - 1]);
402  a2.push_front(-1);
403  i--;
404  }
405  else
406  {
407  a1.push_front(-1);
408  a2.push_front((*s2)[j - 1]);
409  j--;
410  }
411  }
412  while (i > 0)
413  {
414  a1.push_front((*s1)[i - 1]);
415  a2.push_front(-1);
416  i--;
417  }
418  while (j > 0)
419  {
420  a1.push_front(-1);
421  a2.push_front((*s2)[j - 1]);
422  j--;
423  }
424  s1->setContent(vector<int>(a1.begin(), a1.end()));
425  s2->setContent(vector<int>(a2.begin(), a2.end()));
426  auto alphaPtr = s1->getAlphabet();
427  auto asc = make_unique<AlignedSequenceContainer>(alphaPtr);
428  asc->addSequence(s1->getName(), s1);
429  asc->addSequence(s2->getName(), s2);
430  return asc;
431 }
432 
433 /******************************************************************************/
434 
435 unique_ptr<AlignedSequenceContainer> SiteContainerTools::alignNW(
436  const Sequence& seq1,
437  const Sequence& seq2,
438  const AlphabetIndex2& s,
439  double opening,
440  double extending)
441 {
442  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
443  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), seq2.getAlphabet());
444  if (seq1.getAlphabet()->getAlphabetType() != s.getAlphabet()->getAlphabetType())
445  throw AlphabetMismatchException("SiteContainerTools::alignNW", seq1.getAlphabet(), s.getAlphabet());
446  // Check that sequences have no gap!
447  unique_ptr<Sequence> s1(seq1.clone());
449  unique_ptr<Sequence> s2(seq2.clone());
451 
452  // 1) Initialize matrix:
453  RowMatrix<double> m(s1->size() + 1, s2->size() + 1);
454  RowMatrix<double> v(s1->size() + 1, s2->size() + 1);
455  RowMatrix<double> h(s1->size() + 1, s2->size() + 1);
456  RowMatrix<char> p(s1->size(), s2->size());
457 
458  double choice1, choice2, choice3, mx;
459  char px;
460  m(0, 0) = 0.;
461  for (size_t i = 0; i <= s1->size(); i++)
462  {
463  v(i, 0) = log(0.);
464  }
465  for (size_t j = 0; j <= s2->size(); j++)
466  {
467  h(0, j) = log(0.);
468  }
469  for (size_t i = 1; i <= s1->size(); i++)
470  {
471  m(i, 0) = h(i, 0) = opening + static_cast<double>(i) * extending;
472  }
473  for (size_t j = 1; j <= s2->size(); j++)
474  {
475  m(0, j) = v(0, j) = opening + static_cast<double>(j) * extending;
476  }
477 
478  for (size_t i = 1; i <= s1->size(); i++)
479  {
480  for (size_t j = 1; j <= s2->size(); j++)
481  {
482  choice1 = m(i - 1, j - 1) + s.getIndex((*s1)[i - 1], (*s2)[j - 1]);
483  choice2 = h(i - 1, j - 1) + opening + extending;
484  choice3 = v(i - 1, j - 1) + opening + extending;
485  mx = choice1; // Default in case of equality of scores.
486  if (choice2 > mx)
487  {
488  mx = choice2;
489  }
490  if (choice3 > mx)
491  {
492  mx = choice3;
493  }
494  m(i, j) = mx;
495 
496  choice1 = m(i, j - 1) + opening + extending;
497  choice2 = h(i, j - 1) + extending;
498  mx = choice1; // Default in case of equality of scores.
499  if (choice2 > mx)
500  {
501  mx = choice2;
502  }
503  h(i, j) = mx;
504 
505  choice1 = m(i - 1, j) + opening + extending;
506  choice2 = v(i - 1, j) + extending;
507  mx = choice1; // Default in case of equality of scores.
508  if (choice2 > mx)
509  {
510  mx = choice2;
511  }
512  v(i, j) = mx;
513 
514  px = 'd';
515  if (v(i, j) > m(i, j))
516  px = 'u';
517  if (h(i, j) > m(i, j))
518  px = 'l';
519  p(i - 1, j - 1) = px;
520  }
521  }
522 
523  // 2) Get alignment:
524  deque<int> a1, a2;
525  size_t i = s1->size(), j = s2->size();
526  char c;
527  while (i > 0 && j > 0)
528  {
529  c = p(i - 1, j - 1);
530  if (c == 'd')
531  {
532  a1.push_front((*s1)[i - 1]);
533  a2.push_front((*s2)[j - 1]);
534  i--;
535  j--;
536  }
537  else if (c == 'u')
538  {
539  a1.push_front((*s1)[i - 1]);
540  a2.push_front(-1);
541  i--;
542  }
543  else
544  {
545  a1.push_front(-1);
546  a2.push_front((*s2)[j - 1]);
547  j--;
548  }
549  }
550  while (i > 0)
551  {
552  a1.push_front((*s1)[i - 1]);
553  a2.push_front(-1);
554  i--;
555  }
556  while (j > 0)
557  {
558  a1.push_front(-1);
559  a2.push_front((*s2)[j - 1]);
560  j--;
561  }
562  s1->setContent(vector<int>(a1.begin(), a1.end()));
563  s2->setContent(vector<int>(a2.begin(), a2.end()));
564  auto alphaPtr = s1->getAlphabet();
565  auto asc = make_unique<AlignedSequenceContainer>(alphaPtr);
566  asc->addSequence(s1->getName(), s1);
567  asc->addSequence(s2->getName(), s2);
568  return asc;
569 }
570 
571 /******************************************************************************/
572 
573 const string SiteContainerTools::SIMILARITY_ALL = "all sites";
574 const string SiteContainerTools::SIMILARITY_NOFULLGAP = "no full gap";
575 const string SiteContainerTools::SIMILARITY_NODOUBLEGAP = "no double gap";
576 const string SiteContainerTools::SIMILARITY_NOGAP = "no gap";
577 
578 /******************************************************************************/
579 
581  const SequenceInterface& seq1,
582  const SequenceInterface& seq2,
583  bool dist,
584  const std::string& gapOption,
585  bool unresolvedAsGap)
586 {
587  if (seq1.size() != seq2.size())
588  throw SequenceNotAlignedException("SiteContainerTools::computeSimilarity.", &seq2);
589  if (seq1.getAlphabet()->getAlphabetType() != seq2.getAlphabet()->getAlphabetType())
590  throw AlphabetMismatchException("SiteContainerTools::computeSimilarity.", seq1.getAlphabet(), seq2.getAlphabet());
591 
592  std::shared_ptr<const Alphabet> alpha = seq1.getAlphabet();
593  unsigned int s = 0;
594  unsigned int t = 0;
595  for (size_t i = 0; i < seq1.size(); i++)
596  {
597  int x = seq1[i];
598  int y = seq2[i];
599  int gapCode = alpha->getGapCharacterCode();
600  if (unresolvedAsGap)
601  {
602  if (alpha->isUnresolved(x))
603  x = gapCode;
604  if (alpha->isUnresolved(y))
605  y = gapCode;
606  }
607  if (gapOption == SIMILARITY_ALL)
608  {
609  t++;
610  if (x == y && !alpha->isGap(x) && !alpha->isGap(y))
611  s++;
612  }
613  else if (gapOption == SIMILARITY_NODOUBLEGAP)
614  {
615  if (!alpha->isGap(x) || !alpha->isGap(y))
616  {
617  t++;
618  if (x == y)
619  s++;
620  }
621  }
622  else if (gapOption == SIMILARITY_NOGAP)
623  {
624  if (!alpha->isGap(x) && !alpha->isGap(y))
625  {
626  t++;
627  if (x == y)
628  s++;
629  }
630  }
631  else
632  throw Exception("SiteContainerTools::computeSimilarity. Invalid gap option: " + gapOption);
633  }
634  double r = (t == 0 ? 0. : static_cast<double>(s) / static_cast<double>(t));
635  return dist ? 1 - r : r;
636 }
637 
638 /******************************************************************************/
639 
640 std::unique_ptr<DistanceMatrix> SiteContainerTools::computeSimilarityMatrix(
641  const SiteContainerInterface& sites,
642  bool dist,
643  const std::string& gapOption,
644  bool unresolvedAsGap)
645 {
646  size_t n = sites.getNumberOfSequences();
647  auto mat = make_unique<DistanceMatrix>(sites.getSequenceNames());
648  string pairwiseGapOption = gapOption;
649  unique_ptr<SiteContainerInterface> sites2;
650  if (gapOption == SIMILARITY_NOFULLGAP)
651  {
652  if (unresolvedAsGap)
653  {
654  auto tmp = removeGapOrUnresolvedOnlySites(sites);
655  sites2 = make_unique<AlignedSequenceContainer>(*tmp);
656  }
657  else
658  {
659  auto tmp = removeGapOnlySites(sites);
660  sites2 = make_unique<AlignedSequenceContainer>(*tmp);
661  }
662  pairwiseGapOption = SIMILARITY_ALL;
663  }
664  else
665  {
666  sites2 = make_unique<AlignedSequenceContainer>(sites);
667  }
668 
669  for (size_t i = 0; i < n; ++i)
670  {
671  (*mat)(i, i) = dist ? 0. : 1.;
672  const Sequence& seq1 = sites2->sequence(i);
673  for (size_t j = i + 1; j < n; ++j)
674  {
675  const Sequence& seq2 = sites2->sequence(j);
676  (*mat)(i, j) = (*mat)(j, i) = computeSimilarity(seq1, seq2, dist, pairwiseGapOption, unresolvedAsGap);
677  }
678  }
679  return mat;
680 }
681 
682 /******************************************************************************/
683 
685  const SiteContainerInterface& sites,
686  Matrix<size_t>& positions)
687 {
688  positions.resize(sites.getNumberOfSequences(), sites.getNumberOfSites());
689  int gapCode = sites.getAlphabet()->getGapCharacterCode();
690  for (size_t i = 0; i < sites.getNumberOfSequences(); ++i)
691  {
692  const Sequence& seq = sites.sequence(i);
693  size_t pos = 0;
694  for (size_t j = 0; j < sites.getNumberOfSites(); ++j)
695  {
696  if (seq[j] != gapCode)
697  {
698  ++pos;
699  positions(i, j) = pos;
700  }
701  else
702  {
703  positions(i, j) = 0;
704  }
705  }
706  }
707 }
708 
709 /******************************************************************************/
710 
711 vector<int> SiteContainerTools::getColumnScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, int na)
712 {
713  if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
714  throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
715  vector<int> scores(positions1.getNumberOfColumns());
716  for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i)
717  {
718  // Find an anchor point:
719  size_t whichSeq = 0;
720  size_t whichPos = 0;
721  for (size_t j = 0; j < positions1.getNumberOfRows(); ++j)
722  {
723  if (positions1(j, i) > 0)
724  {
725  whichSeq = j;
726  whichPos = positions1(j, i);
727  break;
728  }
729  }
730  if (whichPos == 0)
731  {
732  // No anchor found, this alignment column is only made of gaps. We assign a score of 'na' and move to the next column.
733  scores[i] = na;
734  continue;
735  }
736  // We look for the anchor in the reference alignment:
737  size_t i2 = 0;
738  bool found = false;
739  for (size_t j = 0; !found && j < positions2.getNumberOfColumns(); ++j)
740  {
741  if (positions2(whichSeq, j) == whichPos)
742  {
743  i2 = j;
744  found = true;
745  }
746  }
747  if (!found)
748  {
749  throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(whichSeq) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
750  }
751  // Now we compare all pairs of sequences between the two positions:
752  bool test = true;
753  for (size_t j = 0; test && j < positions1.getNumberOfRows(); ++j)
754  {
755  test = (positions1(j, i) == positions2(j, i2));
756  }
757  scores[i] = test ? 1 : 0;
758  }
759  return scores;
760 }
761 
762 /******************************************************************************/
763 
764 vector<double> SiteContainerTools::getSumOfPairsScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, double na)
765 {
766  if (positions1.getNumberOfRows() != positions2.getNumberOfRows())
767  throw Exception("SiteContainerTools::getColumnScores. The two input alignments must have the same number of sequences!");
768  vector<double> scores(positions1.getNumberOfColumns());
769  for (size_t i = 0; i < positions1.getNumberOfColumns(); ++i)
770  {
771  // For all positions in alignment 1...
772  size_t countAlignable = 0;
773  size_t countAligned = 0;
774  for (size_t j = 0; j < positions1.getNumberOfRows(); ++j)
775  {
776  // Get the corresponding column in alignment 2:
777  size_t whichPos = positions1(j, i);
778  if (whichPos == 0)
779  {
780  // No position for this sequence here.
781  continue;
782  }
783  // We look for the position in the second alignment:
784  size_t i2 = 0;
785  bool found = false;
786  for (size_t k = 0; !found && k < positions2.getNumberOfColumns(); ++k)
787  {
788  if (positions2(j, k) == whichPos)
789  {
790  i2 = k;
791  found = true;
792  }
793  }
794  if (!found)
795  {
796  throw Exception("SiteContainerTools::getColumnScores(). Position " + TextTools::toString(whichPos) + " of sequence " + TextTools::toString(j) + " not found in reference alignment. Please make sure the two indexes are built from the same data!");
797  }
798 
799  // Now we check all other positions and see if they are aligned with this one:
800  for (size_t k = j + 1; k < positions1.getNumberOfRows(); ++k)
801  {
802  size_t whichPos2 = positions1(k, i);
803  if (whichPos2 == 0)
804  {
805  // Empty position
806  continue;
807  }
808  countAlignable++;
809  // check position in alignment 2:
810  if (positions2(k, i2) == whichPos2)
811  countAligned++;
812  }
813  }
814  scores[i] = countAlignable == 0 ? na : static_cast<double>(countAligned) / static_cast<double>(countAlignable);
815  }
816  return scores;
817 }
818 
819 /******************************************************************************/
int getCoordinate() const override
Get the coordinate associated to this site.
Definition: CoreSite.h:144
std::shared_ptr< const Alphabet > getAlphabet() const override
Get the alphabet associated to the list.
Definition: SymbolList.h:120
size_t size() const override
Get the number of elements in the list.
Definition: SymbolList.h:124
The alphabet exception base class.
Two dimensionnal alphabet index interface.
virtual double getIndex(int state1, int state2) const =0
Get the index associated to a pair of states.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get the alphabet associated to this index.
Exception thrown when two alphabets do not match.
static bool isDefaultAlphabet(const Alphabet &alphabet)
virtual const Comments & getComments() const =0
Get the comments.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get the alphabet associated to the list.
virtual size_t size() const =0
Get the number of elements in the list.
std::string getChar(size_t pos) const override
Get the element at position 'pos' as a character.
virtual size_t getNumberOfColumns() const=0
virtual void resize(size_t nRows, size_t nCols)=0
virtual size_t getNumberOfRows() const=0
static double TINY()
The sequence interface.
Definition: Sequence.h:34
Exception thrown when a sequence is not align with others.
static void removeGaps(SequenceInterface &seq)
Remove gaps from a sequence.
A basic implementation of the Sequence interface.
Definition: Sequence.h:117
Sequence * clone() const override
Definition: Sequence.h:319
std::string getChar(size_t pos) const override
Get the element at position 'pos' as a character.
Definition: Sequence.h:346
Loop over all sites in a SiteContainer.
static void changeUnresolvedCharactersToGaps(SiteContainerInterface &sites)
Change all unresolved characters to gaps in a SiteContainer, according to its alphabet.
static std::vector< int > getColumnScores(const Matrix< size_t > &positions1, const Matrix< size_t > &positions2, int na=0)
Compare an alignment to a reference alignment, and compute the column scores.
static const std::string SIMILARITY_NOFULLGAP
static std::map< size_t, size_t > translateSequence(const SiteContainerInterface &sequences, size_t i1, size_t i2)
Translate sequence positions from a sequence to another in the same alignment.
static const std::string SIMILARITY_NOGAP
static std::map< size_t, size_t > getAlignmentPositions(const Sequence &seq)
Get the index of each alignment position in an aligned sequence.
static std::vector< double > getSumOfPairsScores(const Matrix< size_t > &positions1, const Matrix< size_t > &positions2, double na=0)
Compare an alignment to a reference alignment, and compute the sum-of-pairs scores.
static std::unique_ptr< Sequence > getConsensus(const SiteContainerInterface &sc, const std::string &name="consensus", bool ignoreGap=true, bool resolveUnknown=false)
create the consensus sequence of the alignment.
static void changeGapsToUnknownCharacters(SiteContainerInterface &sites)
Change all gaps to unknown state in a SiteContainer, according to its alphabet.
static std::unique_ptr< AlignedSequenceContainer > alignNW(const Sequence &seq1, const Sequence &seq2, const AlphabetIndex2 &s, double gap)
Align two sequences using the Needleman-Wunsch dynamic algorithm.
static std::unique_ptr< DistanceMatrix > computeSimilarityMatrix(const SiteContainerInterface &sites, bool dist=false, const std::string &gapOption=SIMILARITY_NOFULLGAP, bool unresolvedAsGap=true)
Compute the similarity matrix of an alignment.
static std::unique_ptr< SiteContainerInterface > resolveDottedAlignment(const SiteContainerInterface &dottedAln, std::shared_ptr< const Alphabet > &resolvedAlphabet)
Resolve a container with "." notations.
static double computeSimilarity(const SequenceInterface &seq1, const SequenceInterface &seq2, bool dist=false, const std::string &gapOption=SIMILARITY_NODOUBLEGAP, bool unresolvedAsGap=true)
Compute the similarity/distance score between two aligned sequences.
static std::map< size_t, size_t > translateAlignment(const Sequence &seq1, const Sequence &seq2)
Translate alignment positions from an aligned sequence to the same sequence in a different alignment.
static const std::string SIMILARITY_NODOUBLEGAP
static std::map< size_t, size_t > getSequencePositions(const Sequence &seq)
Get the index of each sequence position in an aligned sequence.
static const std::string SIMILARITY_ALL
The Site class.
Definition: Site.h:73
Site * clone() const
Definition: Site.h:184
static void getFrequencies(const CruxSymbolListInterface &list, std::map< int, double > &frequencies, bool resolveUnknowns=false)
Get all states frequencies in the list.
static bool hasGap(const IntSymbolListInterface &site)
static bool hasUnresolved(const IntSymbolListInterface &site)
virtual const SequenceType & sequence(const HashType &sequenceKey) const override=0
Retrieve a sequence object from the container.
virtual std::vector< std::string > getSequenceNames() const =0
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
virtual std::vector< HashType > getSequenceKeys() const =0
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get a pointer toward the container's alphabet.
virtual const Alphabet & alphabet() const =0
Get the container's alphabet.
virtual void setSite(size_t sitePosition, std::unique_ptr< SiteType > &site, bool checkCoordinate=true)=0
Set a site in the container.
virtual const SiteType & site(size_t sitePosition) const override=0
Get a site from the container.
virtual size_t getNumberOfSites() const override=0
Get the number of aligned positions in the container.
static T sum(const std::vector< T > &v1)
static void fill(std::vector< T > &v, T value)
std::string toString(T t)
std::size_t count(const std::string &s, const std::string &pattern)
This alphabet is used to deal NumericAlphabet.
std::vector< int > Vint