bpp-seq3  3.0.0
SequenceTools.h
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #ifndef BPP_SEQ_SEQUENCETOOLS_H
6 #define BPP_SEQ_SEQUENCETOOLS_H
7 
8 #include <Bpp/Exceptions.h>
11 
12 #include "Alphabet/Alphabet.h"
13 #include "Alphabet/DNA.h"
14 #include "Alphabet/RNA.h"
15 #include "Alphabet/RNY.h"
18 #include "Sequence.h"
19 #include "SymbolListTools.h"
20 
21 // From the STL:
22 #include <string>
23 #include <map>
24 #include <vector>
25 #include <algorithm>
26 #include <memory>
27 
28 namespace bpp
29 {
33 class BowkerTest :
34  public StatTest
35 {
36 private:
37  double pvalue_;
38  double stat_;
39 
40 public:
42  stat_(0.) {}
43 
44  virtual ~BowkerTest() {}
45 
46  BowkerTest* clone() const { return new BowkerTest(*this); }
47 
48 public:
49  std::string getName() const { return "Bowker's test for homogeneity."; }
50  double getStatistic() const { return stat_; }
51  double getPValue() const { return pvalue_; }
52 
53  void setStatistic(double stat) { stat_ = stat; }
54  void setPValue(double pvalue) { pvalue_ = pvalue; }
55 };
56 
63  public SymbolListTools
64 {
65 private:
66  static std::shared_ptr<RNY> RNY_;
70 
71 public:
73  virtual ~SequenceTools() {}
74 
75 public:
81  static bool areSequencesIdentical(const SequenceInterface& seq1, const SequenceInterface& seq2);
82 
91  static void subseq(const SequenceInterface& sequence, size_t begin, size_t end, SequenceInterface& output)
92  {
93  if (end < begin || end >= sequence.size())
94  throw Exception("SequenceTools::subseq. Invalid coordinates begin=" + TextTools::toString(begin) + ", end=" + TextTools::toString(end) + " for a sequence of size " + TextTools::toString(sequence.size()) + ".");
95  std::vector<int> content(end - begin + 1);
96  for (size_t i = 0; i <= end - begin; ++i)
97  {
98  content[i] = sequence[begin + i];
99  }
100  output.append(content);
101  }
102 
111  template<class SequenceTypeOut>
112  static std::unique_ptr<SequenceTypeOut> subseq(const SequenceInterface& sequence, size_t begin, size_t end)
113  {
114  auto alphaPtr = sequence.getAlphabet();
115  auto seq = std::make_unique<SequenceTypeOut>(alphaPtr);
116  seq->setName(sequence.getName());
117  seq->setComments(sequence.getComments());
118  subseq(sequence, begin, end, *seq);
119  return seq;
120  }
121 
122 
136  template<class SequenceTypeOut>
137  static std::unique_ptr<SequenceTypeOut> concatenate(const SequenceInterface& seq1, const SequenceInterface& seq2)
138  {
139  // Sequence's alphabets matching verification
140  if ((seq1.alphabet().getAlphabetType()) != (seq2.alphabet().getAlphabetType()))
141  throw AlphabetMismatchException("SequenceTools::concatenate : Sequence's alphabets don't match ", seq1.getAlphabet(), seq2.getAlphabet());
142 
143  // Sequence's names matching verification
144  if (seq1.getName() != seq2.getName())
145  throw Exception ("SequenceTools::concatenate : Sequence's names don't match");
146 
147  // Concatenate sequences and send result
148  auto concat = std::make_unique<SequenceTypeOut>(seq1);
149  concat->setToSizeR(seq1.size() + seq2.size());
150  for (size_t i = 0; i < seq2.size(); ++i)
151  {
152  (*concat)[seq1.size() + i] = seq2[i];
153  }
154  return concat;
155  }
156 
164  static void complement(SequenceInterface& seq);
165 
174  static std::unique_ptr<Sequence> getComplement(const SequenceInterface& sequence);
175 
186  static std::unique_ptr<Sequence> transcript(const Sequence& sequence);
187 
198  static std::unique_ptr<Sequence> reverseTranscript(const Sequence& sequence);
199 
209  static void invert(SequenceInterface& seq);
210 
221  static std::unique_ptr<SequenceInterface> getInvert(const SequenceInterface& sequence);
222 
232  static void invertComplement(SequenceInterface& seq);
233 
243  static double getPercentIdentity(const SequenceInterface& seq1, const SequenceInterface& seq2, bool ignoreGaps = false);
244 
250  static size_t getNumberOfSites(const SequenceInterface& seq);
251 
257  static size_t getNumberOfCompleteSites(const SequenceInterface& seq);
258 
265  static std::unique_ptr<SequenceInterface> getSequenceWithCompleteSites(const SequenceInterface& seq);
266 
274  static size_t getNumberOfUnresolvedSites(const SequenceInterface& seq);
275 
276 
283  static void removeGaps(SequenceInterface& seq);
284 
294  static std::unique_ptr<SequenceInterface> getSequenceWithoutGaps(const SequenceInterface& seq);
295 
304  static void removeStops(SequenceInterface& seq, const GeneticCode& gCode);
305 
317  static std::unique_ptr<SequenceInterface> getSequenceWithoutStops(const SequenceInterface& seq, const GeneticCode& gCode);
318 
327  static void replaceStopsWithGaps(SequenceInterface& seq, const GeneticCode& gCode);
328 
344  static std::unique_ptr<BowkerTest> bowkerTest(const SequenceInterface& seq1, const SequenceInterface& seq2);
345 
358  static void getPutativeHaplotypes(
359  const SequenceInterface& seq,
360  std::vector<std::unique_ptr<SequenceInterface>>& hap,
361  unsigned int level = 2);
362 
368  static std::unique_ptr<Sequence> combineSequences(
369  const SequenceInterface& s1,
370  const SequenceInterface& s2);
371 
397  static std::unique_ptr<Sequence> subtractHaplotype(
398  const SequenceInterface& s,
399  const SequenceInterface& h,
400  std::string name = "",
401  unsigned int level = 1);
402 
416  static std::unique_ptr<Sequence> RNYslice(const SequenceInterface& sequence, int ph);
417 
430  static std::unique_ptr<Sequence> RNYslice(const SequenceInterface& sequence);
431 
442  static void getCDS(SequenceInterface& sequence, const GeneticCode& gCode, bool checkInit, bool checkStop, bool includeInit = true, bool includeStop = true);
443 
454  static size_t findFirstOf(const SequenceInterface& seq, const SequenceInterface& motif, bool strict = true);
455 
463  static std::unique_ptr<Sequence> getRandomSequence(std::shared_ptr<const Alphabet>& alphabet, size_t length);
464 };
465 } // end of namespace bpp.
466 #endif // BPP_SEQ_SEQUENCETOOLS_H
Exception thrown when two alphabets do not match.
virtual std::string getAlphabetType() const =0
Identification method.
Bowker's homogeneity test results class.
Definition: SequenceTools.h:35
void setStatistic(double stat)
Definition: SequenceTools.h:53
virtual ~BowkerTest()
Definition: SequenceTools.h:44
double getPValue() const
Definition: SequenceTools.h:51
std::string getName() const
Definition: SequenceTools.h:49
double getStatistic() const
Definition: SequenceTools.h:50
void setPValue(double pvalue)
Definition: SequenceTools.h:54
BowkerTest * clone() const
Definition: SequenceTools.h:46
virtual const Comments & getComments() const =0
Get the comments.
virtual const std::string & getName() const =0
Get the name of this sequence.
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get the alphabet associated to the list.
virtual size_t size() const =0
Get the number of elements in the list.
virtual const Alphabet & alphabet() const =0
Get the alphabet associated to the list.
Partial implementation of the Transliterator interface for genetic code object.
Definition: GeneticCode.h:50
Replication between two nucleic acids.
The sequence interface.
Definition: Sequence.h:34
virtual void append(const SequenceInterface &seq)=0
Append the content of the sequence.
SequenceTools static class.
Definition: SequenceTools.h:64
static bool areSequencesIdentical(const SequenceInterface &seq1, const SequenceInterface &seq2)
static void subseq(const SequenceInterface &sequence, size_t begin, size_t end, SequenceInterface &output)
Get a sub-sequence.
Definition: SequenceTools.h:91
static void getCDS(SequenceInterface &sequence, const GeneticCode &gCode, bool checkInit, bool checkStop, bool includeInit=true, bool includeStop=true)
Extract the CDS part from a codon sequence. Optionally check for initiator and stop codons,...
static std::unique_ptr< Sequence > getRandomSequence(std::shared_ptr< const Alphabet > &alphabet, size_t length)
Get a random sequence of given size and alphabet, with all states having equal probability.
static size_t findFirstOf(const SequenceInterface &seq, const SequenceInterface &motif, bool strict=true)
Find the position of a motif in a sequence.
static std::unique_ptr< Sequence > transcript(const Sequence &sequence)
Get the transcription sequence of a DNA sequence.
static void invert(SequenceInterface &seq)
Invert a sequence from 5'->3' to 3'->5' and vice-versa.
virtual ~SequenceTools()
Definition: SequenceTools.h:73
static NucleicAcidsReplication DNARep_
Definition: SequenceTools.h:67
static size_t getNumberOfSites(const SequenceInterface &seq)
static size_t getNumberOfCompleteSites(const SequenceInterface &seq)
static std::unique_ptr< BowkerTest > bowkerTest(const SequenceInterface &seq1, const SequenceInterface &seq2)
Bowker's test for homogeneity.
static std::unique_ptr< Sequence > combineSequences(const SequenceInterface &s1, const SequenceInterface &s2)
Combine two sequences.
static void removeGaps(SequenceInterface &seq)
Remove gaps from a sequence.
static void getPutativeHaplotypes(const SequenceInterface &seq, std::vector< std::unique_ptr< SequenceInterface >> &hap, unsigned int level=2)
Get all putative haplotypes from a heterozygous sequence.
static std::unique_ptr< SequenceInterface > getSequenceWithoutStops(const SequenceInterface &seq, const GeneticCode &gCode)
Get a copy of the codon sequence without stops.
static std::unique_ptr< Sequence > RNYslice(const SequenceInterface &sequence, int ph)
Get the RNY decomposition of a DNA sequence.
static NucleicAcidsReplication RNARep_
Definition: SequenceTools.h:68
static std::unique_ptr< Sequence > reverseTranscript(const Sequence &sequence)
Get the reverse-transcription sequence of a RNA sequence.
static void removeStops(SequenceInterface &seq, const GeneticCode &gCode)
Remove stops from a codon sequence.
static double getPercentIdentity(const SequenceInterface &seq1, const SequenceInterface &seq2, bool ignoreGaps=false)
static void invertComplement(SequenceInterface &seq)
Invert and complement a sequence.
static void complement(SequenceInterface &seq)
Complement the nucleotide sequence itself.
static void replaceStopsWithGaps(SequenceInterface &seq, const GeneticCode &gCode)
Replace stop codons by gaps.
static std::unique_ptr< SequenceInterface > getSequenceWithCompleteSites(const SequenceInterface &seq)
Keep only complete sites in a sequence.
static std::unique_ptr< SequenceTypeOut > concatenate(const SequenceInterface &seq1, const SequenceInterface &seq2)
Concatenate two sequences.
static std::unique_ptr< SequenceInterface > getInvert(const SequenceInterface &sequence)
Invert a sequence from 5'->3' to 3'->5' and vice-versa.
static size_t getNumberOfUnresolvedSites(const SequenceInterface &seq)
static std::unique_ptr< SequenceInterface > getSequenceWithoutGaps(const SequenceInterface &seq)
Get a copy of the sequence without gaps.
static std::shared_ptr< RNY > RNY_
Definition: SequenceTools.h:66
static NucleicAcidsReplication transc_
Definition: SequenceTools.h:69
static std::unique_ptr< Sequence > subtractHaplotype(const SequenceInterface &s, const SequenceInterface &h, std::string name="", unsigned int level=1)
Subtract a haplotype from a heterozygous sequence.
static std::unique_ptr< SequenceTypeOut > subseq(const SequenceInterface &sequence, size_t begin, size_t end)
Get a sub-sequence.
static std::unique_ptr< Sequence > getComplement(const SequenceInterface &sequence)
Get the complementary sequence of a nucleotide sequence.
A basic implementation of the Sequence interface.
Definition: Sequence.h:117
Utility functions dealing with both sites and sequences.
std::string toString(T t)
This alphabet is used to deal NumericAlphabet.