bpp-seq3  3.0.0
SequenceApplicationTools.h
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #ifndef BPP_SEQ_APP_SEQUENCEAPPLICATIONTOOLS_H
6 #define BPP_SEQ_APP_SEQUENCEAPPLICATIONTOOLS_H
7 
8 #include <map>
9 #include <string>
10 #include <memory>
11 
12 #include "../Alphabet/Alphabet.h"
13 #include "../AlphabetIndex/AlphabetIndex1.h"
14 #include "../AlphabetIndex/AlphabetIndex2.h"
15 #include "../GeneticCode/GeneticCode.h"
16 #include "../Container/SequenceContainer.h"
17 #include "../Container/VectorSiteContainer.h"
18 #include "../Container/SiteContainerTools.h"
19 #include "../SiteTools.h"
20 
21 namespace bpp
22 {
38 {
39 public:
42 
43 public:
/// @brief Build an Alphabet object according to options.
///
/// @param params           Map of option names to option values.
/// @param suffix           Defaults to "". NOTE(review): presumably appended to option names, as in
///                         ApplicationTools::getStringParameter -- confirm in the implementation.
/// @param suffixIsOptional Defaults to true. NOTE(review): presumably allows falling back to the
///                         un-suffixed option name -- confirm in the implementation.
/// @param verbose          Defaults to true. NOTE(review): presumably enables console output -- confirm.
/// @param allowGeneric     Defaults to false. NOTE(review): presumably permits generic alphabets -- confirm.
/// @param warn             Warning level, defaults to 1.
/// @return The alphabet, held by a std::unique_ptr (ownership goes to the caller).
68  static std::unique_ptr<Alphabet> getAlphabet(
69  const std::map<std::string, std::string>& params,
70  const std::string& suffix = "",
71  bool suffixIsOptional = true,
72  bool verbose = true,
73  bool allowGeneric = false,
74  int warn = 1);
75 
/// @brief Build a GeneticCode object according to options.
///
/// @param alphabet    Nucleic alphabet handed over as a shared pointer.
/// @param description Textual description of the genetic code. getSitesToAnalyse() passes the
///                    value of the "genetic_code" option, whose default is "Standard".
/// @return The genetic code, held by a std::unique_ptr (ownership goes to the caller).
90  static std::unique_ptr<GeneticCode> getGeneticCode(
91  std::shared_ptr<const NucleicAlphabet> alphabet,
92  const std::string& description);
93 
/// @brief Build an AlphabetIndex1 object for a given alphabet.
///
/// @param alphabet    Alphabet the index is built for.
/// @param description Textual description of the index to build.
/// @param message     Defaults to "Alphabet measure:". NOTE(review): presumably the label
///                    displayed when verbose is set -- confirm in the implementation.
/// @param verbose     Defaults to true.
/// @return The index, held by a std::unique_ptr (ownership goes to the caller).
104  static std::unique_ptr<AlphabetIndex1> getAlphabetIndex1(
105  std::shared_ptr<const Alphabet> alphabet,
106  const std::string& description,
107  const std::string& message = "Alphabet measure:",
108  bool verbose = true);
109 
/// @brief Overload of getAlphabetIndex1 taking a codon alphabet together with a genetic code.
///
/// @param alphabet    Codon alphabet the index is built for.
/// @param gencode     Genetic code associated with the codon alphabet.
/// @param description Textual description of the index to build.
/// @param message     Defaults to "Alphabet measure:". NOTE(review): presumably the label
///                    displayed when verbose is set -- confirm in the implementation.
/// @param verbose     Defaults to true.
/// @return The index, held by a std::unique_ptr (ownership goes to the caller).
110  static std::unique_ptr<AlphabetIndex1> getAlphabetIndex1(
111  std::shared_ptr<const CodonAlphabet> alphabet,
112  std::shared_ptr<const GeneticCode> gencode,
113  const std::string& description,
114  const std::string& message = "Alphabet measure:",
115  bool verbose = true);
116 
117 
/// @brief Build an AlphabetIndex2 object for a given alphabet.
///
/// @param alphabet    Alphabet the index is built for.
/// @param description Textual description of the index to build.
/// @param message     Defaults to "Alphabet distance:". NOTE(review): presumably the label
///                    displayed when verbose is set -- confirm in the implementation.
/// @param verbose     Defaults to true.
/// @return The index, held by a std::unique_ptr (ownership goes to the caller).
129  static std::unique_ptr<AlphabetIndex2> getAlphabetIndex2(
130  std::shared_ptr<const Alphabet> alphabet,
131  const std::string& description,
132  const std::string& message = "Alphabet distance:",
133  bool verbose = true);
134 
/// @brief Overload of getAlphabetIndex2 taking a codon alphabet together with a genetic code.
///
/// @param alphabet    Codon alphabet the index is built for.
/// @param gencode     Genetic code associated with the codon alphabet.
/// @param description Textual description of the index to build.
/// @param message     Defaults to "Alphabet distance:". NOTE(review): presumably the label
///                    displayed when verbose is set -- confirm in the implementation.
/// @param verbose     Defaults to true.
/// @return The index, held by a std::unique_ptr (ownership goes to the caller).
135  static std::unique_ptr<AlphabetIndex2> getAlphabetIndex2(
136  std::shared_ptr<const CodonAlphabet> alphabet,
137  std::shared_ptr<const GeneticCode> gencode,
138  const std::string& description,
139  const std::string& message = "Alphabet distance:",
140  bool verbose = true);
141 
/// @brief Build a SequenceContainer object according to options.
///
/// @param alpha            Alphabet of the sequences.
/// @param params           Map of option names to option values.
/// @param suffix           Defaults to "". NOTE(review): presumably appended to option names -- confirm.
/// @param suffixIsOptional Defaults to true.
/// @param verbose          Defaults to true.
/// @param warn             Warning level, defaults to 1.
/// @return The container, held by a std::unique_ptr (ownership goes to the caller).
160  static std::unique_ptr<SequenceContainerInterface> getSequenceContainer(
161  std::shared_ptr<const Alphabet> alpha,
162  const std::map<std::string, std::string>& params,
163  const std::string& suffix = "",
164  bool suffixIsOptional = true,
165  bool verbose = true,
166  int warn = 1);
167 
/// @brief Build a SiteContainer object according to the BppO syntax.
///
/// @param alpha            Alphabet of the sites.
/// @param params           Map of option names to option values.
/// @param suffix           Defaults to "". NOTE(review): presumably appended to option names -- confirm.
/// @param suffixIsOptional Defaults to true.
/// @param verbose          Defaults to true.
/// @param warn             Warning level, defaults to 1.
/// @return A VectorSiteContainer held by a std::unique_ptr (ownership goes to the caller).
184  static std::unique_ptr<VectorSiteContainer> getSiteContainer(
185  std::shared_ptr<const Alphabet> alpha,
186  const std::map<std::string, std::string>& params,
187  const std::string& suffix = "",
188  bool suffixIsOptional = true,
189  bool verbose = true,
190  int warn = 1);
191 
/// @brief Build a ProbabilisticSiteContainer object according to the BppO syntax.
///
/// @param alpha            Alphabet of the sites.
/// @param params           Map of option names to option values.
/// @param suffix           Defaults to "". NOTE(review): presumably appended to option names -- confirm.
/// @param suffixIsOptional Defaults to true.
/// @param verbose          Defaults to true.
/// @param warn             Warning level, defaults to 1.
/// @return A ProbabilisticVectorSiteContainer held by a std::unique_ptr (ownership goes to the caller).
208  static std::unique_ptr<ProbabilisticVectorSiteContainer> getProbabilisticSiteContainer(
209  std::shared_ptr<const Alphabet> alpha,
210  const std::map<std::string, std::string>& params,
211  const std::string& suffix = "",
212  bool suffixIsOptional = true,
213  bool verbose = true,
214  int warn = 1);
215 
233  static std::map<size_t, std::unique_ptr<VectorSiteContainer>>
235  std::shared_ptr<const Alphabet> alpha,
236  const std::map<std::string, std::string>& params,
237  const std::string& prefix = "input.",
238  const std::string& suffix = "",
239  bool suffixIsOptional = true,
240  bool verbose = true,
241  int warn = 1);
242 
260  static std::map<size_t, std::unique_ptr<ProbabilisticVectorSiteContainer>>
262  std::shared_ptr<const Alphabet> alpha,
263  const std::map<std::string, std::string>& params,
264  const std::string& prefix = "input.",
265  const std::string& suffix = "",
266  bool suffixIsOptional = true,
267  bool verbose = true,
268  int warn = 1);
269 
270 
288  SequenceContainerInterface& allSequences,
289  const std::map<std::string, std::string>& params,
290  std::string suffix = "",
291  bool suffixIsOptional = true,
292  bool verbose = true,
293  int warn = 1);
294 
295 
324  template<class SiteType, class SequenceType>
325  static std::unique_ptr< TemplateVectorSiteContainer<SiteType, SequenceType>>
328  const std::map<std::string, std::string>& params,
329  std::string suffix = "",
330  bool suffixIsOptional = true,
331  bool gapAsUnknown = true,
332  bool verbose = true,
333  int warn = 1)
334  {
335  // Fully resolved sites, i.e. without jokers and gaps:
336  std::unique_ptr< TemplateVectorSiteContainer<SiteType, SequenceType>> sitesToAnalyse;
337 
338  size_t numSeq = allSites.getNumberOfSequences();
339 
340  std::string option = ApplicationTools::getStringParameter("input.sequence.sites_to_use", params, "complete", suffix, suffixIsOptional, warn);
341  if (verbose)
342  ApplicationTools::displayResult("Sites to use", option);
343 
344  if (option == "all")
345  {
346  sitesToAnalyse = std::make_unique< TemplateVectorSiteContainer<SiteType, SequenceType>>(allSites);
347  size_t nbSites = sitesToAnalyse->getNumberOfSites();
348 
349  std::string maxGapOption = ApplicationTools::getStringParameter("input.sequence.max_gap_allowed", params, "100%", suffix, suffixIsOptional, warn);
350 
351  double gapCount = 0;
352 
353  if (maxGapOption[maxGapOption.size() - 1] == '%')
354  {
355  double gapFreq = TextTools::toDouble(maxGapOption.substr(0, maxGapOption.size() - 1)) / 100;
356  gapCount = gapFreq * (int)numSeq;
357  }
358  else
359  gapCount = TextTools::to<int>(maxGapOption) - NumConstants::TINY();
360 
361  if (gapCount < static_cast<double>(numSeq) - NumConstants::TINY())
362  {
363  if (verbose)
364  ApplicationTools::displayTask("Remove sites with gaps", true);
365  for (size_t i = nbSites; i > 0; i--)
366  {
367  if (verbose)
368  ApplicationTools::displayGauge(nbSites - i, nbSites - 1, '=');
369 
370  if (static_cast<double>(SiteTools::numberOfGaps(sitesToAnalyse->site(i - 1))) > gapCount)
371  sitesToAnalyse->deleteSites(i - 1, 1);
372  }
373  if (verbose)
375  }
376 
377  std::string maxUnresolvedOption = ApplicationTools::getStringParameter("input.sequence.max_unresolved_allowed", params, "100%", suffix, suffixIsOptional, warn);
378 
379  double unresCount = 0;
380 
381  if (maxUnresolvedOption[maxUnresolvedOption.size() - 1] == '%')
382  {
383  double unresFreq = TextTools::toDouble(maxUnresolvedOption.substr(0, maxUnresolvedOption.size() - 1)) / 100;
384  unresCount = unresFreq * (int)numSeq;
385  }
386  else
387  unresCount = TextTools::to<double>(maxUnresolvedOption) - NumConstants::TINY();
388 
389  nbSites = sitesToAnalyse->getNumberOfSites();
390 
391  if (unresCount < static_cast<double>(numSeq) - NumConstants::TINY())
392  {
393  if (verbose)
394  ApplicationTools::displayTask("Remove unresolved sites", true);
395  for (size_t i = nbSites; i > 0; i--)
396  {
397  if (verbose)
398  ApplicationTools::displayGauge(nbSites - i, nbSites - 1, '=');
399 
400  if (static_cast<double>(SiteTools::numberOfUnresolved(sitesToAnalyse->site(i - 1))) > unresCount)
401  sitesToAnalyse->deleteSites(i - 1, 1);
402  }
403  if (verbose)
405  }
406  }
407  else if (option == "complete")
408  {
409  sitesToAnalyse = SiteContainerTools::getCompleteSites(allSites);
410  size_t nbSites = sitesToAnalyse->getNumberOfSites();
411  if (verbose)
412  ApplicationTools::displayResult("Complete sites", TextTools::toString(nbSites));
413  }
414  else if (option == "nogap")
415  {
416  sitesToAnalyse = SiteContainerTools::getSitesWithoutGaps(allSites);
417  size_t nbSites = sitesToAnalyse->getNumberOfSites();
418  if (verbose)
419  ApplicationTools::displayResult("Sites without gap", TextTools::toString(nbSites));
420  }
421  else
422  {
423  throw Exception("Option '" + option + "' unknown in parameter 'sequence.sites_to_use'.");
424  }
425 
426  auto ca = std::dynamic_pointer_cast<const CodonAlphabet>(sitesToAnalyse->getAlphabet());
427  if (ca)
428  {
429  option = ApplicationTools::getStringParameter("input.sequence.remove_stop_codons", params, "no", suffix, true, warn);
430  if ((option != "") && verbose)
431  ApplicationTools::displayResult("Remove Stop Codons", option);
432 
433  if (option == "yes")
434  {
435  std::string codeDesc = ApplicationTools::getStringParameter("genetic_code", params, "Standard", "", true, warn);
436  auto nucAlph = ca->getNucleicAlphabet();
437  auto gCode = getGeneticCode(nucAlph, codeDesc);
438  SiteContainerTools::removeSitesWithStopCodon(*sitesToAnalyse, *gCode);
439  }
440  }
441 
442  if (verbose)
443  ApplicationTools::displayResult("Number of sites", sitesToAnalyse->getNumberOfSites());
444 
445  return sitesToAnalyse;
446  }
447 
448 
/// @brief Write a sequence file according to options.
///
/// @param sequences Sequences to write; received by const reference.
/// @param params    Map of option names to option values.
/// @param suffix    Defaults to "". NOTE(review): presumably appended to option names -- confirm.
/// @param verbose   Defaults to true.
/// @param warn      Warning level, defaults to 1.
464  static void writeSequenceFile(
465  const SequenceContainerInterface& sequences,
466  const std::map<std::string, std::string>& params,
467  const std::string& suffix = "",
468  bool verbose = true,
469  int warn = 1);
470 
/// @brief Write a sequence alignment file according to options.
///
/// @param sequences Aligned sequences to write; received by const reference.
/// @param params    Map of option names to option values.
/// @param suffix    Defaults to "". NOTE(review): presumably appended to option names -- confirm.
/// @param verbose   Defaults to true.
/// @param warn      Warning level, defaults to 1.
484  static void writeAlignmentFile(
485  const SiteContainerInterface& sequences,
486  const std::map<std::string, std::string>& params,
487  const std::string& suffix = "",
488  bool verbose = true,
489  int warn = 1);
490 };
491 } // end of namespace bpp.
492 #endif // BPP_SEQ_APP_SEQUENCEAPPLICATIONTOOLS_H
static void displayTask(const std::string &text, bool eof=false)
static void displayTaskDone()
static std::string getStringParameter(const std::string &parameterName, const std::map< std::string, std::string > &params, const std::string &defaultValue, const std::string &suffix="", bool suffixIsOptional=true, int warn=0)
static void displayResult(const std::string &text, const T &result)
static void displayGauge(size_t iter, size_t total, char symbol='>', const std::string &mes="")
static double TINY()
This class provides some common tools for applications.
static std::unique_ptr< ProbabilisticVectorSiteContainer > getProbabilisticSiteContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a ProbabilisticSiteContainer object according to the BppO syntax.
static void restrictSelectedSequencesByName(SequenceContainerInterface &allSequences, const std::map< std::string, std::string > &params, std::string suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Retrieves selected sequences (by name).
static std::map< size_t, std::unique_ptr< ProbabilisticVectorSiteContainer > > getProbabilisticSiteContainers(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &prefix="input.", const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build multiple ProbabilisticSiteContainer objects according to the BppO syntax.
static std::unique_ptr< TemplateVectorSiteContainer< SiteType, SequenceType > > getSitesToAnalyse(const TemplateSiteContainerInterface< SiteType, SequenceType, std::string > &allSites, const std::map< std::string, std::string > &params, std::string suffix="", bool suffixIsOptional=true, bool gapAsUnknown=true, bool verbose=true, int warn=1)
Retrieves sites suitable for the analysis.
static std::unique_ptr< GeneticCode > getGeneticCode(std::shared_ptr< const NucleicAlphabet > alphabet, const std::string &description)
Build a GeneticCode object according to options.
static std::map< size_t, std::unique_ptr< VectorSiteContainer > > getSiteContainers(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &prefix="input.", const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build multiple SiteContainer objects according to the BppO syntax.
static void writeAlignmentFile(const SiteContainerInterface &sequences, const std::map< std::string, std::string > &params, const std::string &suffix="", bool verbose=true, int warn=1)
Write a sequence alignment file according to options.
static void writeSequenceFile(const SequenceContainerInterface &sequences, const std::map< std::string, std::string > &params, const std::string &suffix="", bool verbose=true, int warn=1)
Write a sequence file according to options.
static std::unique_ptr< VectorSiteContainer > getSiteContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a SiteContainer object according to the BppO syntax.
static std::unique_ptr< SequenceContainerInterface > getSequenceContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a SequenceContainer object according to options.
static std::unique_ptr< AlphabetIndex2 > getAlphabetIndex2(std::shared_ptr< const Alphabet > alphabet, const std::string &description, const std::string &message="Alphabet distance:", bool verbose=true)
Build an AlphabetIndex2 object for a given alphabet.
static std::unique_ptr< Alphabet > getAlphabet(const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, bool allowGeneric=false, int warn=1)
Build an Alphabet object according to options.
static std::unique_ptr< AlphabetIndex1 > getAlphabetIndex1(std::shared_ptr< const Alphabet > alphabet, const std::string &description, const std::string &message="Alphabet measure:", bool verbose=true)
Build an AlphabetIndex1 object for a given alphabet.
static std::unique_ptr< TemplateVectorSiteContainer< SiteType, SequenceType > > getCompleteSites(const TemplateSiteContainerInterface< SiteType, SequenceType, std::string > &sites)
Retrieves complete sites.
static void removeSitesWithStopCodon(SiteContainerInterface &sites, const GeneticCode &gCode)
Remove sites with stop codons, if the alphabet is a CodonAlphabet, otherwise throws an Exception.
static std::unique_ptr< TemplateVectorSiteContainer< SiteType, SequenceType > > getSitesWithoutGaps(const TemplateSiteContainerInterface< SiteType, SequenceType, std::string > &sites)
Retrieves sites without gaps.
static size_t numberOfGaps(const IntSymbolListInterface &site)
static size_t numberOfUnresolved(const IntSymbolListInterface &site)
The SequenceContainer interface.
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
std::string toString(T t)
This alphabet is used to deal with NumericAlphabet.