bpp-core3  3.0.0
TextTools.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
#include <algorithm>
#include <cctype>
#include <iterator>
#include <stdexcept>
#include <vector>

#include "../Exceptions.h"
#include "../Numeric/IntegerTools.h"
#include "TextTools.h"
13 
14 namespace bpp
15 {
16 namespace TextTools
17 {
18 /******************************************************************************/
19 
// Tell whether a string is "empty", i.e. contains nothing but white space
// characters (a zero-length string qualifies too).
//
// @param s The string to inspect.
// @return true if every character of s is a white space character.
bool isEmpty(const std::string& s)
{
  return std::all_of(s.begin(), s.end(), [](char c) {
    // <cctype> functions require a value representable as unsigned char:
    // passing a negative char (non-ASCII byte) directly is undefined behaviour.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  });
}
26 
27 /******************************************************************************/
28 
// Build an upper-case copy of a string.
//
// @param s The string to convert.
// @return A new string in which every character was passed through std::toupper.
std::string toUpper(const std::string& s)
{
  std::string result;
  result.reserve(s.size());
  std::transform(s.begin(), s.end(), std::back_inserter(result), [](char c) {
    // Go through unsigned char: std::toupper is undefined for negative values,
    // and convert the int result back to char explicitly.
    return static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
  });
  return result;
}
38 
39 /******************************************************************************/
40 
// Build a lower-case copy of a string.
//
// @param s The string to convert.
// @return A new string in which every character was passed through std::tolower.
std::string toLower(const std::string& s)
{
  std::string result;
  result.reserve(s.size());
  std::transform(s.begin(), s.end(), std::back_inserter(result), [](char c) {
    // Go through unsigned char: std::tolower is undefined for negative values,
    // and convert the int result back to char explicitly.
    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  });
  return result;
}
50 
51 /******************************************************************************/
52 
// Tell whether a character is a white space character according to std::isspace.
//
// @param c The character to test.
// @return true if c is a white space character.
bool isWhiteSpaceCharacter(char c)
{
  // std::isspace is undefined for negative values, hence the unsigned char cast.
  return std::isspace(static_cast<unsigned char>(c)) != 0;
}
54 
55 /******************************************************************************/
56 
// Remove every white space character from a string.
//
// @param s The string to filter.
// @return A copy of s without any white space character.
std::string removeWhiteSpaces(const std::string& s)
{
  // Only copy non whitespace chars in new string
  std::string result;
  std::remove_copy_if(s.begin(), s.end(), std::back_inserter(result), [](char c) {
    // Cast first: std::isspace is undefined for negative char values.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  });
  return result;
}
66 
67 /******************************************************************************/
68 
// Remove the white space characters located at the beginning of a string.
//
// @param s The string to trim.
// @return A copy of s starting at its first non white space character
//         (empty if s only contains white spaces).
std::string removeFirstWhiteSpaces(const std::string& s)
{
  // Copy s from first non whitespace to end.
  return std::string(std::find_if(s.begin(), s.end(), [](char c) {
    // Cast first: std::isspace is undefined for negative char values.
    return std::isspace(static_cast<unsigned char>(c)) == 0;
  }), s.end());
}
76 
77 /******************************************************************************/
78 
// Remove the white space characters located at the end of a string.
//
// @param s The string to trim.
// @return A copy of s ending at its last non white space character
//         (empty if s only contains white spaces).
std::string removeLastWhiteSpaces(const std::string& s)
{
  // Copy s from start to last non whitespace char
  auto lastNonWhitespace = std::find_if(s.rbegin(), s.rend(), [](char c) {
    // Cast first: std::isspace is undefined for negative char values.
    return std::isspace(static_cast<unsigned char>(c)) == 0;
  });
  // base() points one past the character found by the reverse search.
  return std::string(s.begin(), lastNonWhitespace.base());
}
87 
88 /******************************************************************************/
89 
// Remove the white space characters located at both ends of a string.
//
// @param s The string to trim.
// @return A copy of s going from its first to its last non white space
//         character (empty if s only contains white spaces).
std::string removeSurroundingWhiteSpaces(const std::string& s)
{
  // Copy s from first non-whitespace to last non-whitespace
  auto isNotWhitespace = [](char c) {
    // Cast first: std::isspace is undefined for negative char values.
    return std::isspace(static_cast<unsigned char>(c)) == 0;
  };
  auto firstNonWhitespace = std::find_if(s.begin(), s.end(), isNotWhitespace);
  // The backward search stops at the first kept character, so both iterators
  // always delimit a valid (possibly empty) range.
  auto lastNonWhitespace = std::find_if(
    s.rbegin(), std::string::const_reverse_iterator(firstNonWhitespace), isNotWhitespace);
  return std::string(firstNonWhitespace, lastNonWhitespace.base());
}
101 
102 /******************************************************************************/
103 
// Tell whether a character is a line break: line feed ('\n') or
// carriage return ('\r').
//
// @param c The character to test.
// @return true if c is one of the two line break characters.
bool isNewLineCharacter(char c)
{
  switch (c)
  {
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
105 
106 /******************************************************************************/
107 
108 std::string removeNewLines(const std::string& s)
109 {
110  // Only copy non newline chars in new string
111  std::string result;
112  std::remove_copy_if(s.begin(), s.end(), std::back_inserter(result), [](char c) {
113  return isNewLineCharacter(c);
114  });
115  return result;
116 }
117 
118 /******************************************************************************/
119 
120 std::string removeLastNewLines(const std::string& s)
121 {
122  // Copy s from start to last non newline char
123  auto lastNonNewline = std::find_if(s.rbegin(), s.rend(), [](char c) {
124  return !isNewLineCharacter(c);
125  });
126  return std::string(s.begin(), lastNonNewline.base());
127 }
128 
129 /******************************************************************************/
130 
// Tell whether a character is a decimal digit according to std::isdigit.
//
// @param c The character to test.
// @return true if c is a decimal digit.
bool isDecimalNumber(char c)
{
  // std::isdigit is undefined for negative values, hence the unsigned char cast.
  return std::isdigit(static_cast<unsigned char>(c)) != 0;
}
132 
133 /******************************************************************************/
134 
// Tell whether a string describes a decimal number, optionally written in
// scientific notation (e.g. "-1.5e-3").
//
// Accepted form: an optional leading '-', digits with at most one decimal
// separator, then optionally the scientific notation character followed by an
// optional sign and at least one digit. No decimal separator is allowed in the
// exponent. At least one digit is required before the exponent, so strings
// such as "", "   ", "-", "." or "e5" are rejected.
//
// @param s                  The string to parse.
// @param dec                The decimal separator.
// @param scientificNotation The character introducing the exponent.
// @return true if s matches the form described above.
bool isDecimalNumber(const std::string& s, char dec, char scientificNotation)
{
  std::size_t sepCount = 0;
  std::size_t sciCount = 0;
  std::size_t digitCount = 0;
  // Skip the optional leading minus sign.
  std::size_t i = (!s.empty() && s[0] == '-') ? 1 : 0;
  for ( ; i < s.size(); ++i)
  {
    char c = s[i];
    if (c == dec)
      sepCount++;
    else if (c == scientificNotation)
    {
      sciCount++;
      if (digitCount == 0)
        return false; // An exponent needs a mantissa.
      if (i == s.size() - 1)
        return false; // Must be sthg after scientific notation.
      c = s[i + 1];
      if (c == '-' || c == '+')
        i++;
      if (i == s.size() - 1)
        return false; // Must be sthg after scientific notation.
      if (sepCount == 0)
        sepCount = 1; // We do not want any dec in the exponent.
    }
    // Cast first: std::isdigit is undefined for negative char values.
    else if (std::isdigit(static_cast<unsigned char>(c)) == 0)
      return false;
    else
      digitCount++;
    if (sepCount > 1 || sciCount > 1)
      return false;
  }
  // Empty strings and strings made of a sign and/or separator only are not numbers.
  return digitCount > 0;
}
170 
171 /******************************************************************************/
172 
// Tell whether a string describes a decimal integer, optionally written in
// scientific notation with a non-negative exponent (e.g. "-12e+3").
//
// Accepted form: an optional leading '-', at least one digit, then optionally
// the scientific notation character followed by an optional '+' and at least
// one digit. A negative exponent is rejected since the value would not be an
// integer. Strings without any digit ("", "   ", "-") and a bare exponent
// ("e5") are rejected.
//
// @param s                  The string to parse.
// @param scientificNotation The character introducing the exponent.
// @return true if s matches the form described above.
bool isDecimalInteger(const std::string& s, char scientificNotation)
{
  std::size_t sciCount = 0;
  std::size_t digitCount = 0;
  // Skip the optional leading minus sign.
  std::size_t i = (!s.empty() && s[0] == '-') ? 1 : 0;
  for ( ; i < s.size(); ++i)
  {
    char c = s[i];
    if (c == scientificNotation)
    {
      sciCount++;
      if (digitCount == 0)
        return false; // An exponent needs a mantissa.
      if (i == s.size() - 1)
        return false; // Must be sthg after scientific notation.
      c = s[i + 1];
      if (c == '-')
        return false; // Not an integer then!
      if (c == '+')
        i++;
      if (i == s.size() - 1)
        return false; // Must be sthg after scientific notation.
    }
    // Cast first: std::isdigit is undefined for negative char values.
    else if (std::isdigit(static_cast<unsigned char>(c)) == 0)
      return false;
    else
      digitCount++;
    if (sciCount > 1)
      return false;
  }
  // Empty strings and a lone sign are not integers.
  return digitCount > 0;
}
205 
206 /******************************************************************************/
207 
208 int toInt(const std::string& s, char scientificNotation)
209 {
210  if (!isDecimalInteger(s, scientificNotation))
211  throw Exception("TextTools::toInt(). Invalid number specification: " + s);
212  return fromString<int>(s);
213 }
214 
215 /******************************************************************************/
216 
217 double toDouble(const std::string& s, char dec, char scientificNotation)
218 {
219  if (!isDecimalNumber(s, dec, scientificNotation))
220  throw Exception("TextTools::toDouble(). Invalid number specification: " + s);
221  return fromString<double>(s);
222 }
223 
224 /******************************************************************************/
225 
// Give a string an exact size, working on its right end: a string that is too
// long loses its last characters, one that is too short is padded on the right.
//
// @param s       The string to resize.
// @param newSize The size of the returned string.
// @param fill    The padding character.
// @return A string of exactly newSize characters.
std::string resizeRight(const std::string& s, std::size_t newSize, char fill)
{
  // substr() clamps the length to what is available, so this keeps at most
  // the first newSize characters of s.
  std::string resized = s.substr(0, newSize);
  // Pad on the right when s was shorter than requested (no-op otherwise).
  resized.resize(newSize, fill);
  return resized;
}
241 
242 /******************************************************************************/
243 
// Give a string an exact size, working on its left end: a string that is too
// long loses its first characters, one that is too short is padded on the left.
//
// @param s       The string to resize.
// @param newSize The size of the returned string.
// @param fill    The padding character.
// @return A string of exactly newSize characters.
std::string resizeLeft(const std::string& s, std::size_t newSize, char fill)
{
  if (newSize <= s.size())
  {
    // Keep only the last newSize characters.
    return s.substr(s.size() - newSize);
  }
  // Left padding followed by the whole input.
  std::string padded(newSize - s.size(), fill);
  padded += s;
  return padded;
}
260 
261 /******************************************************************************/
262 
// Cut a string into consecutive chunks of n characters. The last chunk is
// shorter when s.size() is not a multiple of n; an empty string gives an
// empty vector.
//
// @param s The string to cut.
// @param n The chunk size, must be strictly positive.
// @return The chunks, in their order of appearance in s.
// @throw std::invalid_argument if n is 0 (no chunking is possible, and the
//        chunk count computation would otherwise divide by zero).
std::vector<std::string> split(const std::string& s, std::size_t n)
{
  if (n == 0)
    throw std::invalid_argument("TextTools::split(). Chunk size must be strictly positive.");

  std::vector<std::string> v;
  // ceil(s.size() / n), written so that it cannot overflow.
  v.reserve(s.size() / n + (s.size() % n != 0 ? 1 : 0));
  for (std::size_t pos = 0; pos < s.size(); pos += n)
  {
    // substr() clamps the length, which takes care of the last incomplete chunk.
    v.push_back(s.substr(pos, n));
  }
  return v;
}
279 
280 /******************************************************************************/
281 
282 std::string removeSubstrings(const std::string& s, char blockBeginning, char blockEnding)
283 {
284  std::string result;
285  std::size_t blockDepth = 0;
286  for (std::size_t i = 0; i < s.size(); ++i)
287  {
288  auto c = s[i];
289  if (c == blockBeginning)
290  {
291  blockDepth++;
292  }
293  else if (c == blockEnding)
294  {
295  if (blockDepth == 0)
296  throw Exception(
297  std::string("TextTools::removeSubstrings(): unmatched block closing character at position ") +
298  std::to_string(i));
299  blockDepth--;
300  }
301  else if (blockDepth == 0)
302  {
303  result += c;
304  }
305  }
306  return result;
307 }
308 
309 /******************************************************************************/
310 
// Remove from a string the blocks delimited by blockBeginning and blockEnding
// (delimiters included), except the ones whose opening character belongs to
// one of the "exception" motifs.
//
// Behaviour:
//  - An opening character matched by an exception motif is left untouched and
//    does not open a block.
//  - Blocks may be nested; everything between the outermost delimiters is
//    removed.
//  - A closing character met while no block is open is kept as is.
//  - The text of a block that is never closed is kept verbatim.
//
// @param s                   The string to filter.
// @param blockBeginning      The character opening a block.
// @param blockEnding         The character closing a block.
// @param exceptionsBeginning Motifs containing blockBeginning that must not be
//                            considered as a block opening.
// @param exceptionsEnding    Motifs containing blockEnding. This argument has
//                            never had any influence on the result and is
//                            only kept for interface compatibility.
// @return The filtered string.
std::string removeSubstrings(const std::string& s,
    char blockBeginning,
    char blockEnding,
    std::vector<std::string>& exceptionsBeginning,
    std::vector<std::string>& exceptionsEnding)
{
  // TODO didn't upgrade... move to a parser like system ? it is very specific...
  // NOTE(review): ending exceptions were evaluated but their outcome was never
  // used; decide whether a matching ending should skip the block closing.
  static_cast<void>(exceptionsEnding);

  // Tell whether the opening character found at index i is part of one of the
  // exception motifs.
  auto isException = [&s, blockBeginning, &exceptionsBeginning](std::size_t i) {
    for (const auto& motif : exceptionsBeginning)
    {
      const std::size_t pos = motif.find(blockBeginning);
      // A motif needing more characters before i than available cannot match
      // (and computing i - pos would wrap around).
      if (pos == std::string::npos || pos > i)
        continue;
      const std::size_t left = i - pos;
      const std::size_t right = i + motif.length() - pos;
      // NOTE(review): 'right' is an end index but is used as the window length,
      // so the motif is searched in a window wider than the motif itself, and
      // the motif must end at least two characters before the end of s. Kept
      // as is to preserve matching results - confirm the intent.
      if ((right < s.length() - 1) && (s.substr(left, right).find(motif) != std::string::npos))
        return true;
    }
    return false;
  };

  std::string t;
  std::size_t blockCount = 0;
  std::size_t begPos = 0; // Start of the text not copied to t yet.
  for (std::size_t i = 0; i < s.size(); i++)
  {
    const char current = s[i];
    if (current == blockBeginning)
    {
      if (!isException(i))
      {
        if (blockCount == 0)
        {
          // Entering an outermost block: flush the text preceding it, once.
          t.append(s, begPos, i - begPos);
          begPos = i;
        }
        blockCount++;
      }
    }
    else if ((current == blockEnding) && (blockCount > 0))
    {
      blockCount--;
      if (blockCount == 0)
      {
        // Outermost block closed: resume copying right after it.
        begPos = i + 1;
      }
    }
  }
  // Remaining text; starts at the opening character of an unterminated block.
  t.append(s, begPos, std::string::npos);
  return t;
}
376 
377 /******************************************************************************/
378 
// Remove every occurrence of a character from a string.
//
// @param s The string to filter.
// @param c The character to remove.
// @return A copy of s without any occurrence of c.
std::string removeChar(const std::string& s, char c)
{
  std::string filtered(s);
  // Erase-remove idiom applied on the copy.
  filtered.erase(std::remove(filtered.begin(), filtered.end(), c), filtered.end());
  return filtered;
}
385 
386 /******************************************************************************/
387 
// Count the occurrences of a pattern in a string. Overlapping occurrences are
// all counted ("aa" is found twice in "aaa"). An empty pattern is reported
// once per character of s.
//
// @param s       The string to search in.
// @param pattern The motif to look for.
// @return The number of positions of s at which pattern starts.
std::size_t count(const std::string& s, const std::string& pattern)
{
  std::size_t occurrences = 0;
  // The 'hit < s.size()' guard only matters for an empty pattern, which
  // find() also reports at position s.size(); that position is not counted.
  for (std::size_t hit = s.find(pattern);
       hit != std::string::npos && hit < s.size();
       hit = s.find(pattern, hit + 1))
  {
    ++occurrences;
  }
  return occurrences;
}
399 
400 /******************************************************************************/
401 
// Tell whether a string begins with a given motif. Every string begins with
// the empty motif.
//
// @param s       The string to inspect.
// @param pattern The expected prefix.
// @return true if the first pattern.size() characters of s equal pattern.
bool startsWith(const std::string& s, const std::string& pattern)
{
  // A prefix cannot be longer than the string itself.
  return pattern.size() <= s.size() && s.compare(0, pattern.size(), pattern) == 0;
}
408 
409 /******************************************************************************/
410 
// Tell whether a string ends with a given motif. Every string ends with the
// empty motif.
//
// @param s       The string to inspect.
// @param pattern The expected suffix.
// @return true if the last pattern.size() characters of s equal pattern.
bool endsWith(const std::string& s, const std::string& pattern)
{
  // A suffix cannot be longer than the string itself.
  if (pattern.size() > s.size())
  {
    return false;
  }
  const std::size_t suffixStart = s.size() - pattern.size();
  return s.compare(suffixStart, pattern.size(), pattern) == 0;
}
417 
418 /******************************************************************************/
419 
// Tell whether a string contains a given motif. The empty motif is contained
// in every string, including the empty one (consistently with startsWith()
// and endsWith()).
//
// @param s       The string to search in.
// @param pattern The motif to look for.
// @return true if pattern occurs somewhere in s.
bool hasSubstring(const std::string& s, const std::string& pattern)
{
  // find() reports "not found" with npos, which cannot be confused with a
  // match located at the beginning of an empty string.
  return s.find(pattern) != std::string::npos;
}
424 
425 /******************************************************************************/
426 
// Replace, in place, every non-overlapping occurrence of a motif by another
// string. Occurrences are searched from left to right and the replacement text
// is never searched again. Nothing is done when query is empty.
//
// @param target      The string to modify.
// @param query       The motif to look for.
// @param replacement The text substituted to each occurrence of query.
void replaceAll(std::string& target, const std::string& query, const std::string& replacement)
{
  if (query.empty())
  {
    return;
  }
  std::string rebuilt;
  std::size_t cursor = 0; // First character of target not handled yet.
  for (std::size_t hit = target.find(query, cursor);
       hit != std::string::npos;
       hit = target.find(query, cursor))
  {
    // Copy what precedes the occurrence, then substitute the occurrence.
    rebuilt.append(target, cursor, hit - cursor);
    rebuilt += replacement;
    cursor = hit + query.size();
  }
  // Tail located after the last occurrence.
  rebuilt.append(target, cursor, std::string::npos);
  target = std::move(rebuilt);
}
451 
452 /******************************************************************************/
453 } // namespace TextTools
454 } // namespace bpp
std::string resizeRight(const std::string &s, std::size_t newSize, char fill)
Definition: TextTools.cpp:226
void replaceAll(std::string &target, const std::string &query, const std::string &replacement)
Replacement of all non-overlapping occurrences of a certain motif in a string.
Definition: TextTools.cpp:427
bool isDecimalNumber(char c)
Tell if a given character describes a decimal number.
Definition: TextTools.cpp:131
double toDouble(const std::string &s, char dec, char scientificNotation)
Convert from string to double.
Definition: TextTools.cpp:217
T divideUp(T n, T divisor) noexcept
Returns ceil (n/divisor).
Definition: IntegerTools.h:26
std::size_t count(const std::string &s, const std::string &pattern)
Count the occurrences of a given pattern in a string.
Definition: TextTools.cpp:388
std::string removeLastNewLines(const std::string &s)
Remove all new line characters at the end of a string.
Definition: TextTools.cpp:120
std::string removeChar(const std::string &s, char c)
Remove all occurrences of a character in a string.
Definition: TextTools.cpp:379
std::string removeSurroundingWhiteSpaces(const std::string &s)
Remove all white spaces characters at the beginning and the end of a string.
Definition: TextTools.cpp:90
std::string toLower(const std::string &s)
Make the string lowercase.
Definition: TextTools.cpp:41
std::string removeWhiteSpaces(const std::string &s)
Remove all white spaces characters in a string.
Definition: TextTools.cpp:57
std::string removeFirstWhiteSpaces(const std::string &s)
Remove all white spaces characters at the beginning of a string.
Definition: TextTools.cpp:69
bool isNewLineCharacter(char c)
Tell if a character is a new line character or not.
Definition: TextTools.cpp:104
std::string resizeLeft(const std::string &s, std::size_t newSize, char fill)
Definition: TextTools.cpp:244
std::string removeNewLines(const std::string &s)
Remove all new line characters in a string.
Definition: TextTools.cpp:108
std::string removeLastWhiteSpaces(const std::string &s)
Remove all white spaces characters at the end of a string.
Definition: TextTools.cpp:79
Exception base class. Overload exception constructor (to control the exceptions mechanism). Destructor is already virtual (from std::exception)
Definition: Exceptions.h:20
int toInt(const std::string &s, char scientificNotation)
Convert from string to int.
Definition: TextTools.cpp:208
bool hasSubstring(const std::string &s, const std::string &pattern)
Tell if a string contains a certain motif.
Definition: TextTools.cpp:420
bool isWhiteSpaceCharacter(char c)
Tell if a character is a white space or not.
Definition: TextTools.cpp:53
std::string toUpper(const std::string &s)
Make the string uppercase.
Definition: TextTools.cpp:29
bool isEmpty(const std::string &s)
Tell if a string is empty. A string is considered to be 'empty' if it is only made of white spaces...
Definition: TextTools.cpp:20
std::string toString(T t)
General template method to convert to a string.
Definition: TextTools.h:115
bool isDecimalInteger(const std::string &s, char scientificNotation)
Tell if a given character string describes a decimal integer. FIXME: for now, this parser will not re...
Definition: TextTools.cpp:173
bool endsWith(const std::string &s, const std::string &pattern)
Tell if a string ends with a certain motif.
Definition: TextTools.cpp:411
bool startsWith(const std::string &s, const std::string &pattern)
Tell if a string begins with a certain motif.
Definition: TextTools.cpp:402
T divideDown(T n, T divisor) noexcept
Returns floor(n/divisor).
Definition: IntegerTools.h:18
std::string removeSubstrings(const std::string &s, char blockBeginning, char blockEnding)
Remove substrings from a string. All substrings beginning with blockBeginning and ending with blockEn...
Definition: TextTools.cpp:282
std::vector< std::string > split(const std::string &s, std::size_t n)
Definition: TextTools.cpp:263