bpp-seq3  3.0.0
StringSequenceTools.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
6 #include <Bpp/Text/TextTools.h>
7 
9 #include "Alphabet/DNA.h"
11 #include "Alphabet/RNA.h"
12 #include "StringSequenceTools.h"
13 
14 using namespace bpp;
15 
16 // From the STL:
17 #include <map>
18 #include <ctype.h>
19 #include <algorithm>
20 #include <iostream>
21 
22 using namespace std;
23 
24 /****************************************************************************************/
25 
26 string StringSequenceTools::subseq(const string& sequence, size_t begin, size_t end)
27 {
28  // Checking interval
29  if (end < begin)
30  throw Exception ("StringSequenceTools::subseq: Invalid interval");
31 
32  // Copy sequence
33  string temp(sequence);
34 
35  // Truncate sequence
36  temp.erase(temp.begin() + static_cast<ptrdiff_t>(end + 1), temp.end());
37  temp.erase(temp.begin(), temp.begin() + static_cast<ptrdiff_t>(begin));
38 
39  // Send result
40  return temp;
41 }
42 
43 /****************************************************************************************/
44 
45 string StringSequenceTools::setToSizeR(const string& sequence, size_t size)
46 {
47  return TextTools::resizeRight(sequence, size, '-');
48 }
49 
50 string StringSequenceTools::setToSizeL(const string& sequence, size_t size)
51 {
52  return TextTools::resizeLeft(sequence, size, '-');
53 }
54 
55 /****************************************************************************************/
56 
57 string StringSequenceTools::deleteChar(const string& sequence, char chars)
58 {
59  // Copy sequence
60  string result(sequence);
61 
62  // Search and delete specified char
63  for (unsigned int i = 0; i < result.size(); i++)
64  {
65  if (result[i] == chars)
66  result.erase(result.begin() + i);
67  }
68 
69  return result;
70 }
71 
72 /****************************************************************************************/
73 
74 string StringSequenceTools::deleteChar(const string& sequence, string chars)
75 {
76  // Copy sequence
77  string result(sequence);
78 
79  // For all characters to delete
80  for (unsigned int i = 0; i < chars.size(); i++)
81  {
82  // Search and delete char
83  for (unsigned int j = 0; j < result.size(); j++)
84  {
85  if (result[j] == chars[i])
86  result.erase(result.begin() + j);
87  }
88  }
89 
90  return result;
91 }
92 
93 /****************************************************************************************/
94 
95 string* StringSequenceTools::reverse(const string& sequence)
96 {
97  // Initializing
98  string* result = new string;
99 
100  // Main loop : reverse all characters of sequence
101  size_t size = sequence.size();
102  for (size_t i = 0; i < size; i++)
103  {
104  *result += sequence[size - i - 1];
105  }
106 
107  // Send result
108  return result;
109 }
110 
111 /****************************************************************************************/
112 
113 string* StringSequenceTools::complement(const string& sequence)
114 {
115  // Initializing
116  string* result = new string;
117 
118  // Main loop : completement all characters
119  size_t size = sequence.size();
120  for (unsigned int i = 0; i < size; i++)
121  {
122  switch (sequence[i])
123  {
124  case 'A': *result += 'T';
125  break;
126  case 'C': *result += 'G';
127  break;
128  case 'G': *result += 'C';
129  break;
130  case 'T': *result += 'A';
131  break;
132  case 'M': *result += 'K';
133  break;
134  case 'R': *result += 'Y';
135  break;
136  case 'Y': *result += 'R';
137  break;
138  case 'K': *result += 'M';
139  break;
140  case 'V': *result += 'B';
141  break;
142  case 'H': *result += 'D';
143  break;
144  case 'D': *result += 'H';
145  break;
146  case 'B': *result += 'V';
147  break;
148  default: *result += sequence[i];
149  break;
150  }
151  }
152 
153  // Send new sequence
154  return result;
155 }
156 
157 /****************************************************************************************/
158 
159 double StringSequenceTools::getGCcontent(const string& sequence, size_t pos, size_t window)
160 {
161  // Frequency counts for nucleotids A, C, G, T
162  map<char, double> counts;
163 
164  // Window size checking
165  if (window < sequence.size())
166  throw BadIntegerException("StringSequenceTools::getGCContent : specified window too high", static_cast<int>(window));
167 
168  // For last nucleotides
169  if (pos + window > sequence.size())
170  {
171  pos = sequence.size() - window;
172  }
173 
174  // Main loop
175  for (size_t i = pos; i < pos + window; i++)
176  {
177  switch (toupper(sequence[i]))
178  {
179  case 'A': counts['A'] += 1;
180  break;
181  case 'C': counts['C'] += 1;
182  break;
183  case 'G': counts['G'] += 1;
184  break;
185  case 'T': counts['T'] += 1;
186  break;
187  case 'M': counts['A'] += 0.5;
188  counts['C'] += 0.5;
189  break;
190  case 'R': counts['A'] += 0.5;
191  counts['G'] += 0.5;
192  break;
193  case 'W': counts['A'] += 0.5;
194  counts['T'] += 0.5;
195  break;
196  case 'S': counts['C'] += 0.5;
197  counts['G'] += 0.5;
198  break;
199  case 'Y': counts['C'] += 0.5;
200  counts['T'] += 0.5;
201  break;
202  case 'K': counts['G'] += 0.5;
203  counts['T'] += 0.5;
204  break;
205  case 'V': counts['A'] += 0.34;
206  counts['C'] += 0.34;
207  counts['G'] += 0.34;
208  break;
209  case 'H': counts['A'] += 0.34;
210  counts['C'] += 0.34;
211  counts['T'] += 0.34;
212  break;
213  case 'D': counts['A'] += 0.34;
214  counts['G'] += 0.34;
215  counts['T'] += 0.34;
216  break;
217  case 'B': counts['C'] += 0.34;
218  counts['G'] += 0.34;
219  counts['T'] += 0.34;
220  break;
221  case '-': throw Exception("StringSequenceTools::getGCContent : Gap found in sequence");
222  break;
223  // Unresolved bases
224  default: counts['A'] += 0.25;
225  counts['C'] += 0.25;
226  counts['G'] += 0.25;
227  counts['T'] += 0.25;
228  }
229  }
230 
231  // Calculate and send GC rate
232  return (counts['G'] + counts['C']) / static_cast<double>(window);
233 }
234 
235 /****************************************************************************************/
236 
237 vector<int> StringSequenceTools::codeSequence(const string& sequence, std::shared_ptr<const Alphabet>& alphabet)
238 {
239  unsigned int size = AlphabetTools::getAlphabetCodingSize(*alphabet); // Warning,
240  // an
241  // exception
242  // may
243  // be
244  // casted
245  // here!
246  vector<int> code(static_cast<size_t>(floor(static_cast<double>(sequence.size()) / static_cast<double>(size))));
247  size_t pos = 0;
248  size_t count = 0;
249  while (pos + size <= sequence.size())
250  {
251  code[count] = alphabet->charToInt(sequence.substr(pos, size));
252  count++;
253  pos += size;
254  }
255  return code;
256 }
257 
258 /****************************************************************************************/
259 
260 string StringSequenceTools::decodeSequence(const vector<int>& sequence, std::shared_ptr<const Alphabet>& alphabet)
261 {
262  string result = "";
263  for (auto i : sequence)
264  {
265  result += alphabet->intToChar(i);
266  }
267  return result;
268 }
269 
270 /****************************************************************************************/
271 
272 std::shared_ptr<const Alphabet> StringSequenceTools::getAlphabetFromSequence(const std::string& sequence)
273 {
274  // empty sequence test
275  if (sequence.size() == 0)
276  {
277  throw Exception("Sequence::getAlphabetFromSequence : Empty sequence string");
278  }
279 
280  // initialisation
281  bool p = false; // indicates that a protein specific character is found
282  bool r = false; // indicates that a RNA specific character is found
283  bool u = false; // indicates that an unknown character is found
284  bool pd = false; // Protein or DNA (T)
285 
286  // Main loop : for all character in sequence
287  for (auto i : sequence)
288  {
289  // Character analyse
290  switch (AlphabetTools::getType(i))
291  {
292  case 0: u = true; break;
293  case 3: p = true; break;
294  case 2: r = true; break;
295  case 5: pd = true; break;
296  }
297  }
298 
299  if (u)
300  throw Exception("Sequence::getAlphabetFromSequence : Unknown character detected in specified sequence");
301  if (r && pd)
302  throw Exception("Sequence::getAlphabetFromSequence : Both 'T' and 'U' in the same sequence!");
303  if (r && p)
304  throw Exception("Sequence::getAlphabetFromSequence : Protein character and 'U' in the same sequence!");
305  if (p)
307  if (r)
310 }
311 
312 /****************************************************************************************/
static int getType(char state)
Character identification method for sequence's alphabet identification.
static std::shared_ptr< const ProteicAlphabet > PROTEIN_ALPHABET
Definition: AlphabetTools.h:38
static std::shared_ptr< const DNA > DNA_ALPHABET
Definition: AlphabetTools.h:34
static unsigned int getAlphabetCodingSize(const Alphabet &alphabet)
In case that all states in the given alphabet have a string description of same length,...
static std::shared_ptr< const RNA > RNA_ALPHABET
Definition: AlphabetTools.h:35
static double getGCcontent(const std::string &sequence, size_t pos, size_t window)
Calculate the local GC content of a sequence.
static std::shared_ptr< const Alphabet > getAlphabetFromSequence(const std::string &sequence)
Parse a sequence and try to guess the correct alphabet to use.
static std::string deleteChar(const std::string &sequence, char chars)
Delete all occurrences of a character in the sequence.
static std::string * complement(const std::string &sequence)
Get the complement of a sequence.
static std::string * reverse(const std::string &sequence)
Reverse the sequence.
static std::string subseq(const std::string &sequence, size_t begin, size_t end)
Get a subsequence.
static std::string setToSizeR(const std::string &sequence, size_t size)
Set up the size of a sequence from the right side.
static std::string setToSizeL(const std::string &sequence, size_t size)
Set up the size of a sequence from the left side.
static std::string decodeSequence(const std::vector< int > &sequence, std::shared_ptr< const Alphabet > &alphabet)
Convert a sequence to its string representation.
static std::vector< int > codeSequence(const std::string &sequence, std::shared_ptr< const Alphabet > &alphabet)
Convert a string sequence to a vector of int.
std::string resizeLeft(const std::string &s, std::size_t newSize, char fill)
std::string resizeRight(const std::string &s, std::size_t newSize, char fill)
std::size_t count(const std::string &s, const std::string &pattern)
This alphabet is used to deal NumericAlphabet.