bpp-core3  3.0.0
ProteicAlphabet.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include <Bpp/Text/TextTools.h>
6 #include <Bpp/Utils/MapTools.h>
7 
8 #include "ProteicAlphabet.h"
9 #include "ProteicAlphabetState.h"
10 
11 using namespace bpp;
12 using namespace std;
13 
14 // From STL:
15 #include <map>
16 
17 /******************************************************************************/
18 
20 {
21  // Alphabet content definition
22  registerState(new ProteicAlphabetState(-1, "-", "GAP", "Gap"));
23  registerState(new ProteicAlphabetState( 0, "A", "ALA", "Alanine"));
24  registerState(new ProteicAlphabetState( 1, "R", "ARG", "Arginine"));
25  registerState(new ProteicAlphabetState( 2, "N", "ASN", "Asparagine"));
26  registerState(new ProteicAlphabetState( 3, "D", "ASP", "Asparatic Acid"));
27  registerState(new ProteicAlphabetState( 4, "C", "CYS", "Cysteine"));
28  registerState(new ProteicAlphabetState( 5, "Q", "GLN", "Glutamine"));
29  registerState(new ProteicAlphabetState( 6, "E", "GLU", "Glutamic acid"));
30  registerState(new ProteicAlphabetState( 7, "G", "GLY", "Glycine"));
31  registerState(new ProteicAlphabetState( 8, "H", "HIS", "Histidine"));
32  registerState(new ProteicAlphabetState( 9, "I", "ILE", "Isoleucine"));
33  registerState(new ProteicAlphabetState(10, "L", "LEU", "Leucine"));
34  registerState(new ProteicAlphabetState(11, "K", "LYS", "Lysine"));
35  registerState(new ProteicAlphabetState(12, "M", "MET", "Methionine"));
36  registerState(new ProteicAlphabetState(13, "F", "PHE", "Phenylalanine"));
37  registerState(new ProteicAlphabetState(14, "P", "PRO", "Proline"));
38  registerState(new ProteicAlphabetState(15, "S", "SER", "Serine"));
39  registerState(new ProteicAlphabetState(16, "T", "THR", "Threonine"));
40  registerState(new ProteicAlphabetState(17, "W", "TRP", "Tryptophan"));
41  registerState(new ProteicAlphabetState(18, "Y", "TYR", "Tyrosine"));
42  registerState(new ProteicAlphabetState(19, "V", "VAL", "Valine"));
43  registerState(new ProteicAlphabetState(20, "B", "B", "N or D"));
44  registerState(new ProteicAlphabetState(21, "Z", "Z", "Q or E"));
45  registerState(new ProteicAlphabetState(22, "J", "J", "I or L"));
46  registerState(new ProteicAlphabetState(23, "X", "X", "Unresolved amino acid"));
47  registerState(new ProteicAlphabetState(23, "O", "O", "Unresolved amino acid"));
48  registerState(new ProteicAlphabetState(23, "0", "0", "Unresolved amino acid"));
49  registerState(new ProteicAlphabetState(23, "?", "?", "Unresolved amino acid"));
50  registerState(new ProteicAlphabetState(-2, "*", "STOP", "Stop"));
51 }
52 
53 /******************************************************************************/
54 
55 string ProteicAlphabet::getAbbr(const string& aa) const
56 {
57  string AA = TextTools::toUpper(aa);
58  return getState(aa).getAbbreviation();
59 }
60 
61 /******************************************************************************/
62 
63 string ProteicAlphabet::getAbbr(int aa) const
64 {
65  return getState(aa).getAbbreviation();
66 }
67 
68 /******************************************************************************/
69 
70 bool ProteicAlphabet::isResolvedIn(int state1, int state2) const
71 {
72  if (!isIntInAlphabet(state1))
73  throw BadIntException(state1, "DNA::isResolvedIn(int, int): Specified base unknown.", this);
74 
75  if (!isIntInAlphabet(state2))
76  throw BadIntException(state2, "DNA::isResolvedIn(int, int): Specified base unknown.", this);
77 
78  if (isUnresolved(state2))
79  throw BadIntException(state2, "DNA::isResolvedIn(int, int): Unresolved base.", this);
80 
81  if (state1 == 20)
82  return state2 == 2 || state2 == 3;
83  else if (state1 == 21)
84  return state2 == 5 || state2 == 6;
85  else if (state1 == 22)
86  return state2 == 9 || state2 == 10;
87  else if (state1 == 23)
88  return state2 > 0;
89  else
90  return state1 == state2;
91 }
92 
93 /******************************************************************************/
94 
95 
96 vector<int> ProteicAlphabet::getAlias(int state) const
97 {
98  if (!isIntInAlphabet(state))
99  throw BadIntException(state, "ProteicAlphabet::getAlias(int): Specified base unknown.", this);
100  vector<int> v;
101  if (state == 20) // N or D
102  {
103  v.resize(2); v[0] = 2; v[1] = 3;
104  }
105  else if (state == 21) // Q or E
106  {
107  v.resize(2); v[0] = 5; v[1] = 6;
108  }
109  else if (state == 22) // I or L
110  {
111  v.resize(2); v[0] = 9; v[1] = 10;
112  }
113  else if (state == 23) // all!
114  {
115  v.resize(20);
116  for (size_t i = 0; i < 20; i++)
117  {
118  v[i] = static_cast<int>(i);
119  }
120  }
121  else
122  {
123  v.resize(1); v[0] = state;
124  }
125  return v;
126 }
127 
128 /******************************************************************************/
129 
130 vector<string> ProteicAlphabet::getAlias(const string& state) const
131 {
132  string locstate = TextTools::toUpper(state);
133  if (!isCharInAlphabet(locstate))
134  throw BadCharException(locstate, "ProteicAlphabet::getAlias(int): Specified base unknown.", this);
135  vector<string> v;
136  if (locstate == "B") // N or D
137  {
138  v.resize(2); v[0] = "N"; v[1] = "D";
139  }
140  else if (locstate == "Z") // Q or E
141  {
142  v.resize(2); v[0] = "Q"; v[1] = "E";
143  }
144  else if (locstate == "J") // I or L
145  {
146  v.resize(2); v[0] = "I"; v[1] = "L";
147  }
148  else if (locstate == "X"
149  || locstate == "O"
150  || locstate == "0"
151  || locstate == "?") // all!
152  {
153  v.resize(20);
154  for (int i = 0; i < 20; i++)
155  {
156  v[static_cast<size_t>(i)] = getState(i).getLetter();
157  }
158  }
159  else
160  {
161  v.resize(1); v[0] = locstate;
162  }
163  return v;
164 }
165 
166 /******************************************************************************/
167 
168 int ProteicAlphabet::getGeneric(const vector<int>& states) const
169 {
170  map<int, int> m;
171  for (size_t i = 0; i < states.size(); ++i)
172  {
173  vector<int> tmp_s = this->getAlias(states[i]); // get the states for generic characters
174  for (size_t j = 0; j < tmp_s.size(); ++j)
175  {
176  m[tmp_s[j]]++; // add each state to the list
177  }
178  }
179  vector<int> ve = MapTools::getKeys(m);
180 
181  string key;
182  for (size_t i = 0; i < ve.size(); ++i)
183  {
184  if (!isIntInAlphabet(ve[i]))
185  throw BadIntException(ve[i], "ProteicAlphabet::getGeneric(const vector<int>): Specified base unknown.", this);
186  key += "_" + TextTools::toString(ve[i]);
187  }
188  map<string, int> g;
189  g["_2_3"] = 20;
190  g["_5_6"] = 21;
191  g["_9_10"] = 22;
192  int v;
193  map<string, int>::iterator it = g.find(key);
194  if (ve.size() == 1)
195  {
196  v = ve[0];
197  }
198  else if (it != g.end())
199  {
200  v = it->second;
201  }
202  else
203  {
204  v = 23;
205  }
206  return v;
207 }
208 
209 /******************************************************************************/
210 
211 string ProteicAlphabet::getGeneric(const vector<string>& states) const
212 {
213  map<string, int> m;
214  for (size_t i = 0; i < states.size(); ++i)
215  {
216  vector<string> tmp_s = this->getAlias(states[i]); // get the states for generic characters
217  for (size_t j = 0; j < tmp_s.size(); ++j)
218  {
219  m[tmp_s[j]]++; // add each state to the list
220  }
221  }
222  vector<string> ve = MapTools::getKeys(m);
223 
224  string key;
225  for (size_t i = 0; i < ve.size(); ++i)
226  {
227  if (!isCharInAlphabet(ve[i]))
228  throw BadCharException(ve[i], "ProteicAlphabet::getAlias(const vector<string>): Specified base unknown.", this);
229  key += TextTools::toString(ve[i]);
230  }
231  map<string, string> g;
232  g["DN"] = "B";
233  g["EQ"] = "Z";
234  g["IL"] = "J";
235  string v;
236  map<string, string>::iterator it = g.find(key);
237  if (ve.size() == 1)
238  {
239  v = ve[0];
240  }
241  else if (it != g.end())
242  {
243  v = it->second;
244  }
245  else
246  {
247  v = "?";
248  }
249  return v;
250 }
251 
252 /******************************************************************************/
An alphabet exception thrown when trying to specify a bad char to the alphabet.
std::vector< int > getAlias(int state) const
Get all resolved states that match a generic state.
This alphabet is used to deal with NumericAlphabet.
STL namespace.
int getGeneric(const std::vector< int > &states) const
Get the generic state that matches a set of states.
std::string getAbbr(const std::string &aa) const
Get the abbreviation (3 letter code) for a state coded as char.
This is the base class to describe states in a ProteicAlphabet.
An alphabet exception thrown when trying to specify a bad int to the alphabet.
bool isResolvedIn(int state1, int state2) const
Tells if a given (potentially unresolved) state can be resolved in another resolved state...