bpp-seq3  3.0.0
WordAlphabet.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include <Bpp/Text/TextTools.h>
6 
7 #include "WordAlphabet.h"
8 
9 using namespace bpp;
10 
11 // From the STL:
12 #include <iostream>
13 
14 using namespace std;
15 
16 WordAlphabet::WordAlphabet(const vector< std::shared_ptr<const Alphabet>>& vAlpha) :
18  vAbsAlph_(vAlpha)
19 {
20  build_();
21 }
22 
23 WordAlphabet::WordAlphabet(std::shared_ptr<const Alphabet> pAlpha, size_t num) :
25  vAbsAlph_(0)
26 {
27  for (size_t i = 0; i < num; i++)
28  {
29  vAbsAlph_.push_back(pAlpha);
30  }
31 
32  build_();
33 }
34 
36 {
37  size_t size = 1;
38 
39  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
40  {
41  size *= vAbsAlph_[i]->getSize();
42  }
43 
44  vector<AlphabetState*> states(size + 2);
45 
46  string s = "";
47  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
48  {
49  s += "-";
50  }
51 
52  states[0] = new AlphabetState(-1, s, "gap");
53 
54  for (size_t i = 0; i < size; ++i)
55  {
56  states[i + 1] = new AlphabetState(static_cast<int>(i), "", "");
57  }
58 
59  size_t lr = size;
60  char c;
61  for (size_t na = 0; na < vAbsAlph_.size(); ++na)
62  {
63  lr /= vAbsAlph_[na]->getSize();
64  size_t j = 1;
65  int i = 0;
66  while (j <= size)
67  {
68  c = vAbsAlph_[na]->intToChar(i)[0];
69 
70  for (size_t k = 0; k < lr; k++)
71  {
72  states[j]->setLetter(states[j]->getLetter() + c);
73  j++;
74  // alphabet[j++].letter += c;
75  }
76 
77  if (++i == static_cast<int>(vAbsAlph_[na]->getSize()))
78  i = 0;
79  }
80  }
81 
82  s = "";
83  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
84  {
85  s += "N";
86  }
87 
88  states[size + 1] = new AlphabetState(static_cast<int>(size), s, "Unresolved");
89 
90  // Now register all states once for all:
91  for (size_t i = 0; i < states.size(); ++i)
92  {
93  registerState(states[i]);
94  }
95 }
96 
97 /******************************************************************************/
98 
99 std::string WordAlphabet::getAlphabetType() const
100 {
101  string s = "Word(";
102  for (unsigned int i = 0; i < vAbsAlph_.size(); i++)
103  {
104  if (i != 0)
105  s += ",";
106 
107  s += "alphabet" + TextTools::toString(i + 1) + "=" + vAbsAlph_[i]->getAlphabetType();
108  }
109 
110  s += ")";
111 
112  return s;
113 }
114 
116 {
117  string s = vAbsAlph_[0]->getAlphabetType();
118  for (unsigned int i = 1; i < vAbsAlph_.size(); i++)
119  {
120  if (vAbsAlph_[i]->getAlphabetType() != s)
121  return false;
122  }
123  return true;
124 }
125 
126 bool WordAlphabet::containsUnresolved(const std::string& state) const
127 {
128  size_t s = vAbsAlph_.size();
129  if (state.length() != s)
130  throw BadCharException(state, "WordAlphabet::containsUnresolved", this);
131 
132  for (size_t i = 0; i < vAbsAlph_.size(); i++)
133  {
134  if (vAbsAlph_[i]->isUnresolved(state.substr(i, 1)))
135  {
136  return true;
137  }
138  }
139  return false;
140 }
141 
142 /******************************************************************************/
143 
144 bool WordAlphabet::containsGap(const std::string& state) const
145 {
146  size_t s = vAbsAlph_.size();
147  if (state.length() != s)
148  throw BadCharException(state, "WordAlphabet::containsGap", this);
149 
150  for (size_t i = 0; i < vAbsAlph_.size(); i++)
151  {
152  if (vAbsAlph_[i]->isGap(state.substr(i, 1)))
153  return true;
154  }
155 
156  return false;
157 }
158 
159 /******************************************************************************/
160 
161 std::string WordAlphabet::getName(const std::string& state) const
162 {
163  if (state.size() != vAbsAlph_.size())
164  throw BadCharException(state, "WordAlphabet::getName", this);
165  if (containsUnresolved(state))
166  return getStateAt(getSize() + 1).getName();
167  if (containsGap(state))
168  return getStateAt(0).getName();
169  else
170  return AbstractAlphabet::getName(state);
171 }
172 
173 /******************************************************************************/
174 
175 bool WordAlphabet::isResolvedIn(int state1, int state2) const
176 {
177  if (!isIntInAlphabet(state1))
178  throw BadIntException(state1, "WordAlphabet::isResolvedIn(int, int): Specified base unknown.", this);
179 
180  if (!isIntInAlphabet(state2))
181  throw BadIntException(state2, "WordAlphabet::isResolvedIn(int, int): Specified base unknown.", this);
182 
183  if (isUnresolved(state2))
184  throw BadIntException(state2, "WordAlphabet::isResolvedIn(int, int): Unresolved base.", this);
185 
186  return (state1 == (int)getSize()) ? (state2 >= 0) : (state1 == state2);
187 }
188 
189 /******************************************************************************/
190 
191 std::vector<int> WordAlphabet::getAlias(int state) const
192 {
193  if (!isIntInAlphabet(state))
194  throw BadIntException(state, "WordAlphabet::getAlias(int): Specified base unknown.", this);
195  vector<int> v;
196  size_t s = getSize();
197 
198  if (static_cast<size_t>(state) == s)
199  {
200  v.resize(s);
201  for (size_t i = 0; i < s; ++i)
202  {
203  v[i] = static_cast<int>(i);
204  }
205  }
206  else
207  {
208  v.resize(1); v[0] = state;
209  }
210  return v;
211 }
212 
213 /******************************************************************************/
214 
215 std::vector<std::string> WordAlphabet::getAlias(const std::string& state) const
216 {
217  string locstate = TextTools::toUpper(state);
218  if (!isCharInAlphabet(locstate))
219  throw BadCharException(locstate, "WordAlphabet::getAlias(string): Specified base unknown.", this);
220  vector<string> v;
221 
222  size_t s = getSize();
223 
224  string st = "";
225  for (size_t i = 0; i < vAbsAlph_.size(); ++i)
226  {
227  st += "N";
228  }
229 
230  if (locstate == st)
231  {
232  v.resize(s);
233  for (size_t i = 0; i < s; ++i)
234  {
235  v[i] = intToChar(static_cast<int>(i));
236  }
237  }
238  else
239  {
240  v.resize(1); v[0] = state;
241  }
242  return v;
243 }
244 
245 /******************************************************************************/
246 
247 int WordAlphabet::getGeneric(const std::vector<int>& states) const
248 {
249  return states[0];
250 }
251 
252 /******************************************************************************/
253 
254 std::string WordAlphabet::getGeneric(const std::vector<std::string>& states) const
255 {
256  return states[0];
257 }
258 
259 /******************************************************************************/
260 
261 int WordAlphabet::getWord(const Sequence& seq, size_t pos) const
262 {
263  if (seq.size() < pos + vAbsAlph_.size())
264  throw IndexOutOfBoundsException("WordAlphabet::getWord", pos, 0, seq.size() - vAbsAlph_.size());
265 
266  vector<string> vs;
267  for (size_t i = 0; i < vAbsAlph_.size(); i++)
268  {
269  vs.push_back(vAbsAlph_[i]->intToChar(seq[i + pos]));
270  }
271 
272  return charToInt(getWord(vs)); // This can't throw a BadCharException!
273 }
274 
275 
276 /******************************************************************************/
277 
278 int WordAlphabet::getWord(const std::vector<int>& vint, size_t pos) const
279 {
280  if (vint.size() < pos + vAbsAlph_.size())
281  throw IndexOutOfBoundsException("WordAlphabet::getWord", pos, 0, vint.size() - vAbsAlph_.size());
282 
283  vector<string> vs;
284  for (size_t i = 0; i < vAbsAlph_.size(); i++)
285  {
286  vs.push_back(vAbsAlph_[i]->intToChar(vint[i + pos]));
287  }
288 
289  return charToInt(getWord(vs)); // This can't throw a BadCharException!
290 }
291 
292 /****************************************************************************************/
293 
294 std::string WordAlphabet::getWord(const std::vector<string>& vpos, size_t pos) const
295 {
296  if (vpos.size() < pos + vAbsAlph_.size())
297  throw IndexOutOfBoundsException("WordAlphabet::getWord", pos, 0, vpos.size() - vAbsAlph_.size());
298 
299  string s = "";
300  for (size_t i = 0; i < vAbsAlph_.size(); i++)
301  {
302  s += vpos[pos + i];
303  }
304  // test
305  charToInt(s);
306  return s;
307 }
308 
309 /****************************************************************************************/
310 
311 unique_ptr<SequenceInterface> WordAlphabet::translate(const SequenceInterface& sequence, size_t pos) const
312 {
313  if ((!hasUniqueAlphabet()) or
314  (sequence.getAlphabet()->getAlphabetType() != vAbsAlph_[0]->getAlphabetType()))
315  throw AlphabetMismatchException("No matching alphabets", sequence.getAlphabet().get(), vAbsAlph_[0].get());
316 
317  vector<int> content;
318 
319  size_t s = sequence.size();
320  unsigned int l = getLength();
321  size_t i = pos;
322 
323  while (i + l <= s)
324  {
325  content.push_back(getWord(sequence, i));
326  i += l;
327  }
328 
329  auto alphaPtr = shared_from_this();
330  return make_unique<Sequence>(sequence.getName(), content, alphaPtr);
331 }
332 
333 /****************************************************************************************/
334 
335 unique_ptr<SequenceInterface> WordAlphabet::reverse(const SequenceInterface& sequence) const
336 {
337  if ((!hasUniqueAlphabet()) or
338  (sequence.getAlphabet()->getAlphabetType() != getAlphabetType()))
339  throw AlphabetMismatchException("No matching alphabets", sequence.getAlphabet().get(), this);
340 
341  auto alphaPtr = getNAlphabet(0);
342  auto seqPtr = make_unique<Sequence>(sequence.getName(), "", alphaPtr);
343 
344  size_t s = sequence.size();
345  for (size_t i = 0; i < s; i++)
346  {
347  seqPtr->append(getPositions(sequence[i]));
348  }
349 
350  return seqPtr;
351 }
352 
353 /****************************************************************************************/
A partial implementation of the Alphabet interface.
std::string getName(const std::string &state) const
Get the complete name of a state given its string description.
virtual AlphabetState & getStateAt(size_t stateIndex)
Get a state at a position in the alphabet_ vector.
std::string intToChar(int state) const
Give the string description of a state given its int description.
virtual void registerState(AlphabetState *st)
Add a state to the Alphabet.
bool isGap(int state) const
bool isIntInAlphabet(int state) const
Tell if a state (specified by its int description) is allowed by the alphabet.
bool isCharInAlphabet(const std::string &state) const
Tell if a state (specified by its string description) is allowed by the alphabet.
size_t size() const override
Get the number of elements in the list.
Definition: SymbolList.h:124
Exception thrown when two alphabets do not match.
This is the base class to describe states in an Alphabet.
Definition: AlphabetState.h:22
const std::string & getName() const
Get the name of the state.
Definition: AlphabetState.h:76
An alphabet exception thrown when trying to specify a bad char to the alphabet.
An alphabet exception thrown when trying to specify a bad int to the alphabet.
virtual const std::string & getName() const =0
Get the name of this sequence.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get the alphabet associated to the list.
virtual size_t size() const =0
Get the number of elements in the list.
The sequence interface.
Definition: Sequence.h:34
A basic implementation of the Sequence interface.
Definition: Sequence.h:117
bool isUnresolved(int state) const override
Definition: WordAlphabet.h:264
unsigned int getLength() const override
Returns the length of the word.
Definition: WordAlphabet.h:242
bool containsGap(const std::string &state) const override
bool containsUnresolved(const std::string &state) const override
virtual int getWord(const Sequence &seq, size_t pos=0) const override
Get the int code for a word given the int code of the underlying positions.
std::vector< std::shared_ptr< const Alphabet > > vAbsAlph_
Definition: WordAlphabet.h:153
std::vector< int > getAlias(int state) const override
Get all resolved states that match a generic state.
bool isResolvedIn(int state1, int state2) const override
Tells if a given (potentially unresolved) state can be resolved in another resolved state.
bool hasUniqueAlphabet() const override
Returns true if the alphabets of all the letters in the word are of the same type.
std::string getAlphabetType() const override
Identification method.
int getGeneric(const std::vector< int > &states) const override
Get the generic state that match a set of states.
std::unique_ptr< SequenceInterface > translate(const SequenceInterface &sequence, size_t=0) const override
Translate a whole sequence from letters alphabet to words alphabet.
std::unique_ptr< SequenceInterface > reverse(const SequenceInterface &sequence) const override
Translate a whole sequence from words alphabet to letters alphabet.
WordAlphabet(const std::vector< std::shared_ptr< const Alphabet >> &vAlpha)
Builds a new word alphabet from a vector of Alphabets.
int charToInt(const std::string &state) const override
Give the int description of a state given its string description.
Definition: WordAlphabet.h:210
unsigned int getSize() const override
Definition: WordAlphabet.h:221
std::vector< int > getPositions(int word) const override
Get the int codes of each position of a word given its int description.
Definition: WordAlphabet.h:367
std::string getName(const std::string &state) const override
Get the complete name of a state given its string description.
std::shared_ptr< const Alphabet > getNAlphabet(size_t n) const override
Get the pointer to the Alphabet at the n-position.
Definition: WordAlphabet.h:299
std::string toUpper(const std::string &s)
std::string toString(T t)
This alphabet is used to deal with NumericAlphabet.