bpp-core3  3.0.0
TextTools.cpp
Go to the documentation of this file.
1 //
2 // File: TextTools.cpp
3 // Authors:
4 // Julien Dutheil
5 // Francois Gindraud (2017)
6 // Created: 2003-08-08 12:57:50
7 // Last modified: 2017-06-27 00:00:00
8 //
9 
10 /*
11  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
12 
13  This software is a computer program whose purpose is to provide utilitary
14  classes. This file belongs to the Bio++ Project.
15 
16  This software is governed by the CeCILL license under French law and
17  abiding by the rules of distribution of free software. You can use,
18  modify and/ or redistribute the software under the terms of the CeCILL
19  license as circulated by CEA, CNRS and INRIA at the following URL
20  "http://www.cecill.info".
21 
22  As a counterpart to the access to the source code and rights to copy,
23  modify and redistribute granted by the license, users are provided only
24  with a limited warranty and the software's author, the holder of the
25  economic rights, and the successive licensors have only limited
26  liability.
27 
28  In this respect, the user's attention is drawn to the risks associated
29  with loading, using, modifying and/or developing or reproducing the
30  software by the user in light of its specific status of free software,
31  that may mean that it is complicated to manipulate, and that also
32  therefore means that it is reserved for developers and experienced
33  professionals having in-depth computer knowledge. Users are therefore
34  encouraged to load and test the software's suitability as regards their
35  requirements in conditions enabling the security of their systems and/or
36  data to be ensured and, more generally, to use and operate it in the
37  same conditions as regards security.
38 
39  The fact that you are presently reading this means that you have had
40  knowledge of the CeCILL license and that you accept its terms.
41 */
42 
43 #include <algorithm>
44 #include <cctype>
45 #include <iterator>
46 #include <vector>
47 
48 #include "../Exceptions.h"
49 #include "../Numeric/IntegerTools.h"
50 #include "TextTools.h"
51 
52 namespace bpp
53 {
54 namespace TextTools
55 {
56 /******************************************************************************/
57 
/**
 * @brief Tell whether a string holds nothing but whitespace.
 *
 * A zero-length string is reported as empty too, since there is no
 * non-whitespace character in it.
 *
 * @param s The string to inspect.
 * @return true if every character of s is whitespace according to std::isspace.
 */
bool isEmpty(const std::string& s)
{
  return std::all_of(s.begin(), s.end(), [](char c) {
    // std::isspace is only defined for EOF and values representable as
    // unsigned char; a negative plain char must be converted first.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  });
}
64 
65 /******************************************************************************/
66 
/**
 * @brief Build an uppercase copy of a string.
 *
 * @param s The string to convert.
 * @return A new string where each character went through std::toupper.
 */
std::string toUpper(const std::string& s)
{
  std::string result;
  result.reserve(s.size());
  std::transform(s.begin(), s.end(), std::back_inserter(result), [](char c) {
    // Go through unsigned char: std::toupper is undefined for negative values
    // other than EOF. The int result is narrowed back explicitly.
    return static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
  });
  return result;
}
76 
77 /******************************************************************************/
78 
/**
 * @brief Build a lowercase copy of a string.
 *
 * @param s The string to convert.
 * @return A new string where each character went through std::tolower.
 */
std::string toLower(const std::string& s)
{
  std::string result;
  result.reserve(s.size());
  std::transform(s.begin(), s.end(), std::back_inserter(result), [](char c) {
    // Go through unsigned char: std::tolower is undefined for negative values
    // other than EOF. The int result is narrowed back explicitly.
    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  });
  return result;
}
88 
89 /******************************************************************************/
90 
/**
 * @brief Tell whether a character is whitespace according to std::isspace.
 *
 * @param c The character to test.
 * @return true if c is a whitespace character.
 */
bool isWhiteSpaceCharacter(char c)
{
  // The cast keeps the call defined for negative plain-char values.
  return std::isspace(static_cast<unsigned char>(c)) != 0;
}
92 
93 /******************************************************************************/
94 
/**
 * @brief Copy a string while dropping every whitespace character.
 *
 * @param s The string to filter.
 * @return The characters of s that are not whitespace, in their original order.
 */
std::string removeWhiteSpaces(const std::string& s)
{
  std::string result;
  std::remove_copy_if(s.begin(), s.end(), std::back_inserter(result), [](char c) {
    // Cast first: std::isspace is undefined for negative plain-char values.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  });
  return result;
}
104 
105 /******************************************************************************/
106 
/**
 * @brief Copy a string without its leading whitespace.
 *
 * @param s The string to trim.
 * @return The part of s starting at its first non-whitespace character
 *         (empty if there is none).
 */
std::string removeFirstWhiteSpaces(const std::string& s)
{
  const auto firstNonWhitespace = std::find_if(s.begin(), s.end(), [](char c) {
    // Cast first: std::isspace is undefined for negative plain-char values.
    return std::isspace(static_cast<unsigned char>(c)) == 0;
  });
  return std::string(firstNonWhitespace, s.end());
}
114 
115 /******************************************************************************/
116 
/**
 * @brief Copy a string without its trailing whitespace.
 *
 * @param s The string to trim.
 * @return The part of s ending at its last non-whitespace character
 *         (empty if there is none).
 */
std::string removeLastWhiteSpaces(const std::string& s)
{
  // Search backwards; base() of the hit points one past the last kept character.
  const auto lastNonWhitespace = std::find_if(s.rbegin(), s.rend(), [](char c) {
    // Cast first: std::isspace is undefined for negative plain-char values.
    return std::isspace(static_cast<unsigned char>(c)) == 0;
  });
  return std::string(s.begin(), lastNonWhitespace.base());
}
125 
126 /******************************************************************************/
127 
/**
 * @brief Copy a string without its leading and trailing whitespace.
 *
 * @param s The string to trim.
 * @return The span of s from its first to its last non-whitespace character
 *         (empty if s holds only whitespace).
 */
std::string removeSurroundingWhiteSpaces(const std::string& s)
{
  auto isNotWhitespace = [](char c) {
    // Cast first: std::isspace is undefined for negative plain-char values.
    return std::isspace(static_cast<unsigned char>(c)) == 0;
  };
  const auto firstNonWhitespace = std::find_if(s.begin(), s.end(), isNotWhitespace);
  // The backward search stops at the first kept character, so the resulting
  // range can never be inverted, even for an all-whitespace string.
  const auto lastNonWhitespace = std::find_if(
    s.rbegin(), std::reverse_iterator<std::string::const_iterator>(firstNonWhitespace), isNotWhitespace);
  return std::string(firstNonWhitespace, lastNonWhitespace.base());
}
139 
140 /******************************************************************************/
141 
/**
 * @brief Tell whether a character is a line feed or a carriage return.
 *
 * @param c The character to test.
 * @return true for '\n' and '\r', false for anything else.
 */
bool isNewLineCharacter(char c)
{
  switch (c)
  {
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
143 
144 /******************************************************************************/
145 
146 std::string removeNewLines(const std::string& s)
147 {
148  // Only copy non newline chars in new string
149  std::string result;
150  std::remove_copy_if(s.begin(), s.end(), std::back_inserter(result), [](char c) {
151  return isNewLineCharacter(c);
152  });
153  return result;
154 }
155 
156 /******************************************************************************/
157 
158 std::string removeLastNewLines(const std::string& s)
159 {
160  // Copy s from start to last non newline char
161  auto lastNonNewline = std::find_if(s.rbegin(), s.rend(), [](char c) {
162  return !isNewLineCharacter(c);
163  });
164  return std::string(s.begin(), lastNonNewline.base());
165 }
166 
167 /******************************************************************************/
168 
/**
 * @brief Tell whether a character is a decimal digit according to std::isdigit.
 *
 * @param c The character to test.
 * @return true if c is a decimal digit.
 */
bool isDecimalNumber(char c)
{
  // The cast keeps the call defined for negative plain-char values.
  return std::isdigit(static_cast<unsigned char>(c)) != 0;
}
170 
171 /******************************************************************************/
172 
/**
 * @brief Tell whether a string describes a decimal number.
 *
 * Accepted shape: an optional leading '-', digits with at most one decimal
 * separator, then optionally the scientific notation character followed by an
 * optional sign and digits. No decimal separator is allowed in the exponent.
 * The mantissa must contain at least one digit, so "", "-", "." and "e5" are
 * all rejected, as is any string made of whitespace only.
 *
 * @param s                  The string to check.
 * @param dec                The decimal separator character.
 * @param scientificNotation The character introducing the exponent.
 * @return true if s matches the shape described above.
 */
bool isDecimalNumber(const std::string& s, char dec, char scientificNotation)
{
  bool mantissaHasDigit = false;
  std::size_t sepCount = 0;
  std::size_t sciCount = 0;
  // Skip a leading minus sign, if any.
  std::size_t i = (!s.empty() && s[0] == '-') ? 1 : 0;
  for ( ; i < s.size(); ++i)
  {
    char c = s[i];
    if (c == dec)
    {
      sepCount++;
    }
    else if (c == scientificNotation)
    {
      sciCount++;
      if (!mantissaHasDigit)
        return false; // An exponent needs a mantissa in front of it.
      if (i == s.size() - 1)
        return false; // Must be something after the scientific notation.
      c = s[i + 1];
      if (c == '-' || c == '+')
        i++; // Consume the exponent sign.
      if (i == s.size() - 1)
        return false; // Must be something after the exponent sign.
      if (sepCount == 0)
        sepCount = 1; // Any separator from now on would be in the exponent: forbid it.
    }
    else if (std::isdigit(static_cast<unsigned char>(c)) != 0)
    {
      if (sciCount == 0)
        mantissaHasDigit = true;
    }
    else
    {
      return false;
    }
    if (sepCount > 1 || sciCount > 1)
      return false;
  }
  // Also covers empty and digit-less input such as "-" or ".".
  return mantissaHasDigit;
}
208 
209 /******************************************************************************/
210 
/**
 * @brief Tell whether a string describes a decimal integer.
 *
 * Accepted shape: an optional leading '-', digits, then optionally the
 * scientific notation character followed by an optional '+' and digits.
 * A negative exponent is rejected since the value would not be an integer.
 * The mantissa must contain at least one digit, so "", "-" and "e3" are all
 * rejected, as is any string made of whitespace only.
 *
 * @param s                  The string to check.
 * @param scientificNotation The character introducing the exponent.
 * @return true if s matches the shape described above.
 */
bool isDecimalInteger(const std::string& s, char scientificNotation)
{
  bool mantissaHasDigit = false;
  std::size_t sciCount = 0;
  // Skip a leading minus sign, if any.
  std::size_t i = (!s.empty() && s[0] == '-') ? 1 : 0;
  for ( ; i < s.size(); ++i)
  {
    char c = s[i];
    if (c == scientificNotation)
    {
      sciCount++;
      if (!mantissaHasDigit)
        return false; // An exponent needs a mantissa in front of it.
      if (i == s.size() - 1)
        return false; // Must be something after the scientific notation.
      c = s[i + 1];
      if (c == '-')
        return false; // Not an integer then!
      if (c == '+')
        i++; // Consume the exponent sign.
      if (i == s.size() - 1)
        return false; // Must be something after the exponent sign.
    }
    else if (std::isdigit(static_cast<unsigned char>(c)) != 0)
    {
      if (sciCount == 0)
        mantissaHasDigit = true;
    }
    else
    {
      return false;
    }
    if (sciCount > 1)
      return false;
  }
  // Also covers empty and digit-less input such as "-".
  return mantissaHasDigit;
}
243 
244 /******************************************************************************/
245 
246 int toInt(const std::string& s, char scientificNotation)
247 {
248  if (!isDecimalInteger(s, scientificNotation))
249  throw Exception("TextTools::toInt(). Invalid number specification: " + s);
250  return fromString<int>(s);
251 }
252 
253 /******************************************************************************/
254 
255 double toDouble(const std::string& s, char dec, char scientificNotation)
256 {
257  if (!isDecimalNumber(s, dec, scientificNotation))
258  throw Exception("TextTools::toDouble(). Invalid number specification: " + s);
259  return fromString<double>(s);
260 }
261 
262 /******************************************************************************/
263 
/**
 * @brief Bring a string to a given length by acting on its right end.
 *
 * A string that is too long loses its rightmost characters; a string that is
 * too short is padded on the right with the fill character.
 *
 * @param s       The string to resize.
 * @param newSize The length of the returned string.
 * @param fill    The padding character.
 * @return A string of exactly newSize characters.
 */
std::string resizeRight(const std::string& s, std::size_t newSize, char fill)
{
  // substr clamps to the end of s, so this keeps at most newSize characters.
  std::string resized = s.substr(0, newSize);
  // Pads with fill when s was shorter than requested; no-op otherwise.
  resized.resize(newSize, fill);
  return resized;
}
279 
280 /******************************************************************************/
281 
/**
 * @brief Bring a string to a given length by acting on its left end.
 *
 * A string that is too long loses its leftmost characters; a string that is
 * too short is padded on the left with the fill character.
 *
 * @param s       The string to resize.
 * @param newSize The length of the returned string.
 * @param fill    The padding character.
 * @return A string of exactly newSize characters.
 */
std::string resizeLeft(const std::string& s, std::size_t newSize, char fill)
{
  if (newSize > s.size())
  {
    // Too short: put the padding first, then the whole input.
    std::string padded(newSize - s.size(), fill);
    padded += s;
    return padded;
  }
  // Long enough: keep only the last newSize characters.
  return s.substr(s.size() - newSize);
}
298 
299 /******************************************************************************/
300 
301 std::vector<std::string> split(const std::string& s, std::size_t n)
302 {
303  using diff_type = typename std::iterator_traits<decltype(s.begin())>::difference_type;
304  std::vector<std::string> v;
305  auto nbChunks = IntegerTools::divideUp(s.size(), n);
306  v.reserve(nbChunks);
307  // Copy chunks by chunks, and add the last incomplete one if s.size () % n != 0
308  auto nbCopiedChunks = IntegerTools::divideDown(s.size(), n);
309  for (std::size_t i = 0; i < nbCopiedChunks; ++i)
310  {
311  v.emplace_back(s.begin() + static_cast<diff_type>(i * n), s.begin() + static_cast<diff_type>((i + 1) * n));
312  }
313  if (v.size() < nbChunks)
314  v.emplace_back(s.begin() + static_cast<diff_type>(v.size() * n), s.end());
315  return v;
316 }
317 
318 /******************************************************************************/
319 
320 std::string removeSubstrings(const std::string& s, char blockBeginning, char blockEnding)
321 {
322  std::string result;
323  std::size_t blockDepth = 0;
324  for (std::size_t i = 0; i < s.size(); ++i)
325  {
326  auto c = s[i];
327  if (c == blockBeginning)
328  {
329  blockDepth++;
330  }
331  else if (c == blockEnding)
332  {
333  if (blockDepth == 0)
334  throw Exception(
335  std::string("TextTools::removeSubstrings(): unmatched block closing character at position ") +
336  std::to_string(i));
337  blockDepth--;
338  }
339  else if (blockDepth == 0)
340  {
341  result += c;
342  }
343  }
344  return result;
345 }
346 
347 /******************************************************************************/
348 
/**
 * @brief Copy a string while dropping delimited blocks, except protected ones.
 *
 * A block starts at blockBeginning and stops at the matching blockEnding;
 * blocks may be nested, in which case the outermost block is dropped as a
 * whole. An opening character is protected, i.e. it does not open a block,
 * when one of the strings of exceptionsBeginning occurs in s exactly aligned
 * on that opening character. Characters of a block that is still open when
 * the string ends are dropped.
 *
 * @param s                   The string to filter.
 * @param blockBeginning      The character opening a block.
 * @param blockEnding         The character closing a block.
 * @param exceptionsBeginning Patterns containing blockBeginning that protect an
 *                            opening character from being treated as a block start.
 * @param exceptionsEnding    Currently not consulted, see the note in the body.
 * @return The characters of s located outside of any removed block.
 */
std::string removeSubstrings(const std::string& s,
                             char blockBeginning,
                             char blockEnding,
                             std::vector<std::string>& exceptionsBeginning,
                             std::vector<std::string>& exceptionsEnding)
{
  // TODO move to a parser like system ? it is very specific...
  // NOTE(review): the ending exceptions have never influenced the result of
  // this function (the lookup only broke out of its own loop), so they are
  // left unused here rather than guessing at the intended semantics. A closing
  // character met while no block is open is simply kept in the output.
  (void)exceptionsEnding;

  std::string t;
  std::size_t blockCount = 0; // Number of blocks currently open.
  std::size_t begPos = 0;     // Start of the pending text located outside of blocks.
  for (std::size_t i = 0; i < s.size(); ++i)
  {
    const char current = s[i];
    if (current == blockBeginning)
    {
      bool except = false;
      for (const auto& exception : exceptionsBeginning)
      {
        const std::size_t pos = exception.find(blockBeginning);
        // The pattern must contain the opening character and must fit to the
        // left of position i, otherwise i - pos would wrap around.
        if (pos == std::string::npos || pos > i)
          continue;
        const std::size_t left = i - pos;
        const std::size_t right = left + exception.length();
        // NOTE(review): the bound below requires at least two characters after
        // the pattern; kept as is since the intent is unclear - confirm.
        // The comparison is anchored at 'left' so that an occurrence of the
        // pattern further in the string cannot protect this opening character.
        if ((right < s.length() - 1) && (s.compare(left, exception.length(), exception) == 0))
        {
          except = true;
          break;
        }
      }
      if (!except)
      {
        // Flush the pending text only when entering an outermost block:
        // doing it for nested blocks would copy the same text again.
        if (blockCount == 0)
          t.append(s, begPos, i - begPos);
        ++blockCount;
      }
    }
    else if ((current == blockEnding) && (blockCount > 0))
    {
      --blockCount;
      if (blockCount == 0)
      {
        begPos = i + 1;
      }
    }
  }
  // Append the tail only if it is outside of any block; with a block still
  // open, the text before it has already been flushed.
  if (blockCount == 0)
    t.append(s, begPos, std::string::npos);
  return t;
}
414 
415 /******************************************************************************/
416 
/**
 * @brief Copy a string while dropping every occurrence of a character.
 *
 * @param s The string to filter.
 * @param c The character to drop.
 * @return The characters of s different from c, in their original order.
 */
std::string removeChar(const std::string& s, char c)
{
  std::string filtered;
  for (const char current : s)
  {
    if (current != c)
      filtered.push_back(current);
  }
  return filtered;
}
423 
424 /******************************************************************************/
425 
/**
 * @brief Count the occurrences of a pattern in a string.
 *
 * The search restarts one character after each hit, so overlapping
 * occurrences are all counted ("aa" is found three times in "aaaa").
 * An empty pattern matches at every character position, i.e. s.size() times.
 *
 * @param s       The string to search in.
 * @param pattern The pattern to look for.
 * @return The number of positions of s where pattern starts.
 */
std::size_t count(const std::string& s, const std::string& pattern)
{
  std::size_t occurrences = 0;
  // find() yields npos (the largest size_t) when nothing is found, and may
  // yield s.size() for an empty pattern: both must stop the loop.
  for (std::size_t hit = s.find(pattern); hit < s.size(); hit = s.find(pattern, hit + 1))
  {
    ++occurrences;
  }
  return occurrences;
}
437 
438 /******************************************************************************/
439 
/**
 * @brief Tell whether a string begins with a given pattern.
 *
 * @param s       The string to inspect.
 * @param pattern The expected prefix.
 * @return true if the first pattern.size() characters of s equal pattern;
 *         false if s is shorter than pattern.
 */
bool startsWith(const std::string& s, const std::string& pattern)
{
  const std::size_t prefixLength = pattern.size();
  return s.size() >= prefixLength && s.compare(0, prefixLength, pattern) == 0;
}
446 
447 /******************************************************************************/
448 
/**
 * @brief Tell whether a string ends with a given pattern.
 *
 * @param s       The string to inspect.
 * @param pattern The expected suffix.
 * @return true if the last pattern.size() characters of s equal pattern;
 *         false if s is shorter than pattern.
 */
bool endsWith(const std::string& s, const std::string& pattern)
{
  const std::size_t suffixLength = pattern.size();
  if (s.size() < suffixLength)
    return false;
  return s.compare(s.size() - suffixLength, suffixLength, pattern) == 0;
}
455 
456 /******************************************************************************/
457 
/**
 * @brief Tell whether a string contains a given pattern.
 *
 * An empty pattern is found in any non-empty string; nothing, not even an
 * empty pattern, is found in an empty string.
 *
 * @param s       The string to search in.
 * @param pattern The pattern to look for.
 * @return true if pattern occurs somewhere in s.
 */
bool hasSubstring(const std::string& s, const std::string& pattern)
{
  if (s.empty())
    return false;
  return s.find(pattern) != std::string::npos;
}
462 
463 /******************************************************************************/
464 
/**
 * @brief Replace every non-overlapping occurrence of a pattern, in place.
 *
 * Occurrences are located from left to right; after a hit the search resumes
 * right after the matched text, so inserted replacement text is never
 * rescanned. An empty query leaves the target untouched.
 *
 * @param target      The string to modify.
 * @param query       The pattern to look for.
 * @param replacement The text put in place of each occurrence.
 */
void replaceAll(std::string& target, const std::string& query, const std::string& replacement)
{
  if (query.empty())
    return;

  std::string rebuilt;
  std::size_t cursor = 0; // Start of the part of target not yet copied.
  for (;;)
  {
    const std::size_t hit = target.find(query, cursor);
    if (hit == std::string::npos)
    {
      // No further occurrence: copy the remaining tail and stop.
      rebuilt.append(target, cursor, std::string::npos);
      break;
    }
    // Copy the text preceding the occurrence, then substitute it.
    rebuilt.append(target, cursor, hit - cursor);
    rebuilt += replacement;
    cursor = hit + query.size();
  }
  target = std::move(rebuilt);
}
489 
490 /******************************************************************************/
491 } // namespace TextTools
492 } // namespace bpp
Exception base class. Overload exception constructor (to control the exceptions mechanism)....
Definition: Exceptions.h:59
T divideDown(T n, T divisor) noexcept
Returns floor(n/divisor).
Definition: IntegerTools.h:55
T divideUp(T n, T divisor) noexcept
Returns ceil (n/divisor).
Definition: IntegerTools.h:63
int toInt(const std::string &s, char scientificNotation)
Convert from string to int.
Definition: TextTools.cpp:246
std::string removeWhiteSpaces(const std::string &s)
Remove all white spaces characters in a string.
Definition: TextTools.cpp:95
double toDouble(const std::string &s, char dec, char scientificNotation)
Convert from string to double.
Definition: TextTools.cpp:255
std::string resizeLeft(const std::string &s, std::size_t newSize, char fill)
Definition: TextTools.cpp:282
std::string removeSurroundingWhiteSpaces(const std::string &s)
Remove all white spaces characters at the beginning and the end of a string.
Definition: TextTools.cpp:128
std::string toUpper(const std::string &s)
Make the string uppercase.
Definition: TextTools.cpp:67
bool isWhiteSpaceCharacter(char c)
Tell if a character is a white space or not.
Definition: TextTools.cpp:91
bool hasSubstring(const std::string &s, const std::string &pattern)
Tell if a string contains a certain motif.
Definition: TextTools.cpp:458
std::string removeSubstrings(const std::string &s, char blockBeginning, char blockEnding)
Remove substrings from a string. All substrings beginning with blockBeginning and ending with blockEn...
Definition: TextTools.cpp:320
std::string removeLastNewLines(const std::string &s)
Remove all new line characters at the end of a string.
Definition: TextTools.cpp:158
bool isEmpty(const std::string &s)
Tell if a string is empty. A string is considered to be 'empty' if it is only made of white spaces.
Definition: TextTools.cpp:58
std::vector< std::string > split(const std::string &s, std::size_t n)
Definition: TextTools.cpp:301
void replaceAll(std::string &target, const std::string &query, const std::string &replacement)
Replacement of all non-overlapping occurrences of a certain motif in a string.
Definition: TextTools.cpp:465
bool isDecimalInteger(const std::string &s, char scientificNotation)
Tell if a given character string describes a decimal integer. FIXME: for now, this parser will not re...
Definition: TextTools.cpp:211
bool startsWith(const std::string &s, const std::string &pattern)
Tell if a string begins with a certain motif.
Definition: TextTools.cpp:440
std::string removeChar(const std::string &s, char c)
Remove all occurrences of a character in a string.
Definition: TextTools.cpp:417
bool endsWith(const std::string &s, const std::string &pattern)
Tell if a string ends with a certain motif.
Definition: TextTools.cpp:449
std::string removeNewLines(const std::string &s)
Remove all new line characters in a string.
Definition: TextTools.cpp:146
std::string toLower(const std::string &s)
Make the string lowercase.
Definition: TextTools.cpp:79
bool isDecimalNumber(char c)
Tell if a given character describes a decimal number.
Definition: TextTools.cpp:169
std::string removeFirstWhiteSpaces(const std::string &s)
Remove all white spaces characters at the beginning of a string.
Definition: TextTools.cpp:107
std::string toString(T t)
General template method to convert to a string.
Definition: TextTools.h:153
std::string resizeRight(const std::string &s, std::size_t newSize, char fill)
Definition: TextTools.cpp:264
std::string removeLastWhiteSpaces(const std::string &s)
Remove all white spaces characters at the end of a string.
Definition: TextTools.cpp:117
bool isNewLineCharacter(char c)
Tell if a character is a new line character or not.
Definition: TextTools.cpp:142
std::size_t count(const std::string &s, const std::string &pattern)
Count the occurrences of a given pattern in a string.
Definition: TextTools.cpp:426