bpp-seq3  3.0.0
AllelicAlphabet.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include <Bpp/Text/TextTools.h>
6 
7 #include "AllelicAlphabet.h"
8 
9 using namespace bpp;
10 
11 // From the STL:
12 #include <iostream>
13 
14 using namespace std;
15 
16 AllelicAlphabet::AllelicAlphabet(std::shared_ptr<const Alphabet> alph, unsigned int nbAlleles) :
17  alph_(alph),
18  nbAlleles_(nbAlleles),
19  nbUnknown_(0)
20 {
21  if (nbAlleles_ <= 1)
22  throw BadIntException((int)nbAlleles_, "AllelicAlphabet::AllelicAlphabet : wrong number of alleles", this);
23 
24  unsigned int size = alph_->getSize();
25 
26  auto sword = alph_->getStateCodingSize();
27  auto snb = std::to_string(nbAlleles_).size();
28 
29  // Gap is such as "-6-0"
30 
31  string gapchar = alph_->getState(alph_->getGapCharacterCode()).getLetter();
32 
33  auto gapword = gapchar + std::string("0", snb);
34 
35  registerState(new AlphabetState(-1, gapchar + std::to_string(nbAlleles_) + gapword, "gap"));
36 
37 
38  // Monomorphic states are such as "A6-0"
39 
40  for (int i = 0; i < static_cast<int>(size); ++i)
41  {
42  auto desc = alph_->intToChar(i) + std::to_string(nbAlleles_) + gapword;
43  registerState(new AlphabetState(i, desc, desc));
44  }
45 
46  // Polymorphic states are such as "A4C2"
47 
48  for (int i = 0; i < static_cast<int>(size) - 1; ++i)
49  {
50  for ( int j = i + 1; j < static_cast<int>(size); ++j)
51  {
52  auto nbl = (i * static_cast<int>(size) + j) * static_cast<int>(nbAlleles_ - 1) + static_cast<int>(size);
53  for (int nba = 1; nba < static_cast<int>(nbAlleles_); ++nba)
54  {
55  auto sni = std::string("0", snb - std::to_string(static_cast<int>(nbAlleles_) - nba).size()) + std::to_string(static_cast<int>(nbAlleles_) - nba);
56  auto snj = std::string("0", snb - std::to_string(nba).size()) + std::to_string(nba);
57  auto desc = alph_->intToChar(i) + sni + alph_->intToChar(j) + snj;
58  registerState(new AlphabetState(static_cast<int>(nbl) + nba - 1, desc, desc));
59  }
60  }
61  }
62 
63  // Unknown
64 
65  nbUnknown_ = (int)(size * size * (nbAlleles_ - 1));
66 
67  auto desc = std::string("?", sword) + std::to_string(nbAlleles_) + gapword;
68  registerState(new AlphabetState(nbUnknown_, desc, desc));
69 }
70 
71 /******************************************************************************/
72 
74 {
75  return "Allelic(alphabet=" + alph_->getAlphabetType() + ",nbAlleles_=" + std::to_string(nbAlleles_) + ")";
76 }
77 
78 
79 // /******************************************************************************/
80 
81 bool AllelicAlphabet::isResolvedIn(int state1, int state2) const
82 {
83  if (!isIntInAlphabet(state1))
84  throw BadIntException(state1, "AllelicAlphabet::isResolvedIn(int, int): Specified base unknown.", this);
85 
86  if (!isIntInAlphabet(state2))
87  throw BadIntException(state2, "AllelicAlphabet::isResolvedIn(int, int): Specified base unknown.", this);
88 
89  if (isUnresolved(state2))
90  throw BadIntException(state2, "AllelicAlphabet::isResolvedIn(int, int): Unresolved base.", this);
91 
92  auto size = alph_->getSize();
93  return (static_cast<unsigned int>(state1) == size * size * (nbAlleles_ - 1)) ? (state2 >= 0) : (state1 == state2);
94 }
95 
96 /******************************************************************************/
97 
98 std::vector<int> AllelicAlphabet::getAlias(int state) const
99 {
100  if (!isIntInAlphabet(state))
101  throw BadIntException(state, "AllelicAlphabet::getAlias(int): Specified base unknown.", this);
102 
103  if (state == nbUnknown_)
104  return getSupportedInts();
105  else
106  return vector<int>(1, state);
107 }
108 
109 /******************************************************************************/
110 
111 std::vector<std::string> AllelicAlphabet::getAlias(const std::string& state) const
112 {
113  string locstate = TextTools::toUpper(state);
114  if (!isCharInAlphabet(locstate))
115  throw BadCharException(locstate, "AllelicAlphabet::getAlias(string): Specified base unknown.", this);
116 
117  if (state == getState(nbUnknown_).getLetter())
118  return getSupportedChars();
119  else
120  return vector<string>(1, state);
121 }
122 
123 /******************************************************************************/
124 
125 std::unique_ptr<ProbabilisticSequence> AllelicAlphabet::convertFromStateAlphabet(const CoreSequenceInterface& sequence) const
126 {
127  auto seq = dynamic_cast<const SequenceInterface*>(&sequence);
128  auto probseq = dynamic_cast<const ProbabilisticSequenceInterface*>(&sequence);
129 
130  if (!seq && !probseq)
131  throw Exception("AllelicAlphabet::convertFromStateAlphabet: unknown type for sequence: " + sequence.getName());
132 
133  auto alphabet = seq ? seq->getAlphabet() : probseq->getAlphabet();
134 
135  if (alphabet->getAlphabetType() != getStateAlphabet()->getAlphabetType())
136  throw AlphabetMismatchException("AllelicAlphabet::convertFromStateAlphabet", getStateAlphabet().get(), alphabet.get());
137 
138  auto alphaPtr = shared_from_this();
139  auto tSeq = make_unique<ProbabilisticSequence>(alphaPtr);
140  tSeq->setName(sequence.getName());
141  tSeq->setComments(sequence.getComments());
142 
143  auto tt = tSeq->getTable();
144 
145  auto size = seq ? seq->size() : probseq->size();
146 
147  auto alphasize = alphabet->getSize();
148 
149  int gap = alphabet->getGapCharacterCode();
150 
151  Vdouble tstate(alphasize);
152  Vdouble likelihood(getSize());
153 
154  for (unsigned int pos = 0; pos < size; ++pos)
155  {
156  // first observed values
157  if (seq)
158  {
159  int state = seq->getValue(pos);
160  if (state == gap)
161  {
162  for (size_t a = 0; a < alphasize; ++a)
163  {
164  tstate[a] = 1;
165  }
166  tSeq->addElement(tstate);
167  continue; // no binomial calculation
168  }
169  else
170  {
171  for (size_t a = 0; a < alphasize; a++)
172  {
173  tstate[a] = 0;
174  }
175  tstate[(size_t)state] = 1;
176  }
177  }
178  else
179  tstate = probseq->getValue(pos);
180 
181  computeLikelihoods(tstate, likelihood);
182  tSeq->addElement(likelihood);
183  }
184 
185  return tSeq;
186 }
187 
188 
189 void AllelicAlphabet::computeLikelihoods(const Vdouble& counts, Vdouble& likelihoods) const
190 {
191  auto alphasize = getStateAlphabet()->getSize();
192  if (counts.size() != alphasize)
193  throw BadSizeException("AllelicAlphabet::computeLikelihoods: bad vector size for counts.", alphasize, counts.size());
194 
195  likelihoods.resize(getSize(), 0);
196 
197  // vector of bool if strictly positive counts
198  std::vector<bool> presence(alphasize);
199  for (size_t state = 0; state < alphasize; ++state)
200  {
201  presence[state] = (counts[state] > NumConstants::TINY());
202  }
203 
204  // Monomorphic states (likelihood = 1 or 0)
205  for (size_t state = 0; state < alphasize; ++state)
206  {
207  if (presence[state])
208  {
209  likelihoods[state] = 1;
210  for (size_t ns = 0; ns < alphasize; ns++)
211  {
212  if (ns == state)
213  continue;
214 
215  if (presence[ns]) // because polymorphic state
216  {
217  likelihoods[state] = 0;
218  break;
219  }
220  }
221  }
222  else
223  likelihoods[state] = 0;
224  }
225 
226  // Polymorphic states are such as "A4C2"
227 
228  auto numetat = alphasize;
229  for (unsigned int i = 0; i < alphasize - 1; ++i) // A
230  {
231  for (unsigned int j = i + 1; j < alphasize; ++j) // C
232  {
233  bool todo = true;
234  for (unsigned int ns = 0; ns < alphasize; ++ns)
235  {
236  if ((ns == i) || (ns == j))
237  continue;
238  if (presence[ns]) // non null mismatch count
239  {
240  for (unsigned int nba = 1; nba < nbAlleles_; ++nba)
241  {
242  likelihoods[numetat] = 0;
243  numetat++;
244  }
245  todo = false;
246  break;
247  }
248  }
249 
250  if (todo)
251  {
252  auto lnorm = std::lgamma(counts[size_t(i)] + counts[size_t(j)] + 1) - std::lgamma(counts[size_t(i)] + 1) - std::lgamma(counts[size_t(j)] + 1);
253  for (size_t nba = 1; nba < nbAlleles_; nba++)
254  {
255  auto lprob = std::log((double)(nbAlleles_ - nba) / nbAlleles_) * counts[size_t(i)] + std::log((double)nba / nbAlleles_) * counts[size_t(j)];
256  likelihoods[numetat] = std::exp(lprob + lnorm);
257  if (likelihoods[numetat] < NumConstants::TINY())
258  likelihoods[numetat] = 0;
259  numetat++;
260  }
261  }
262  }
263  }
264 }
const std::vector< int > & getSupportedInts() const
const AlphabetState & getState(const std::string &letter) const
Get a state by its letter.
const std::vector< std::string > & getSupportedChars() const
virtual void registerState(AlphabetState *st)
Add a state to the Alphabet.
bool isIntInAlphabet(int state) const
Tell if a state (specified by its int description) is allowed by the the alphabet.
bool isCharInAlphabet(const std::string &state) const
Tell if a state (specified by its string description) is allowed by the the alphabet.
unsigned int nbAlleles_
the number of alleles.
std::unique_ptr< ProbabilisticSequence > convertFromStateAlphabet(const CoreSequenceInterface &sequence) const
Convert a CoreSequence in StateAlphabet to a ProbabilisticSequence of the likelihoods of the di-allel...
std::shared_ptr< const Alphabet > getStateAlphabet() const
Returns Base Alphabet.
void computeLikelihoods(const Vdouble &counts, Vdouble &likelihoods) const
Fills the vector of the likelihoods of a vector of counts in the states alphabet, given the di-allele...
std::string getAlphabetType() const override
Identification method.
int nbUnknown_
the unknown state number
std::shared_ptr< const Alphabet > alph_
std::vector< int > getAlias(int state) const override
Get all resolved states that match a generic state.
AllelicAlphabet(std::shared_ptr< const Alphabet > alph, unsigned int nbAlleles)
Builds a new word alphabet from an Alphabet.
bool isUnresolved(int state) const override
bool isResolvedIn(int state1, int state2) const override
Tells if a given (potentially unresolved) state can be resolved in another resolved state.
unsigned int getSize() const override
Get the number of resolved states in the alphabet (e.g. return 4 for DNA alphabet)....
Exception thrown when two alphabets do not match.
This is the base class to describe states in an Alphabet.
Definition: AlphabetState.h:22
An alphabet exception thrown when trying to specify a bad char to the alphabet.
An alphabet exception thrown when trying to specify a bad int to the alphabet.
virtual const Comments & getComments() const =0
Get the comments.
The core sequence interface.
Definition: CoreSequence.h:28
virtual const std::string & getName() const =0
Get the name of this sequence.
static double TINY()
The probabilistic sequence interface.
The sequence interface.
Definition: Sequence.h:34
std::string toUpper(const std::string &s)
This alphabet is used to deal NumericAlphabet.
std::vector< double > Vdouble