bpp-seq3  3.0.0
SequenceContainerTools.h
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #ifndef BPP_SEQ_CONTAINER_SEQUENCECONTAINERTOOLS_H
6 #define BPP_SEQ_CONTAINER_SEQUENCECONTAINERTOOLS_H
7 
8 
9 // From the STL:
10 #include <string>
11 #include <vector>
12 #include <map>
13 #include <memory>
14 
16 #include "../SymbolListTools.h"
17 #include "SequenceContainer.h"
19 #include "../Alphabet/CodonAlphabet.h"
20 
21 namespace bpp
22 {
// List of sequence positions; getSelectedSequences() passes each entry to
// the container's sequence(position) accessor.
23 using SequenceSelection = std::vector<size_t>;
// List of size_t indices; presumably site (column) positions - not used in
// this header, TODO confirm against callers.
24 using SiteSelection = std::vector<size_t>;
25 
30 {
31 public:
34 
35 public:
50  template<class SequenceType, class HashType>
52  {
53  size_t nbSeq = sc.getNumberOfSequences();
54  for (size_t i = 0; i < nbSeq; ++i)
55  {
56  if (sc.sequence(i).getName() == name)
57  {
58  return true;
59  }
60  }
61  return false;
62  }
63 
77  template<class SequenceType, class HashType>
78  static std::unique_ptr< TemplateSequenceContainerInterface<SequenceType, HashType>> createContainerOfSpecifiedSize(std::shared_ptr<const Alphabet>& alphabet, size_t size)
79  {
80  auto vsc = std::make_unique< TemplateVectorSequenceContainer<SequenceType>>(alphabet);
81  for (size_t i = 0; i < size; ++i)
82  {
83  vsc->addSequence(SequenceType(TextTools::toString(i), "", alphabet), false);
84  }
85  return vsc;
86  }
87 
88 
101  template<class SequenceType, class HashType>
102  static std::unique_ptr< TemplateSequenceContainerInterface<SequenceType, HashType>> createContainerWithSequenceNames(
103  std::shared_ptr<const Alphabet>& alphabet,
104  const std::vector<std::string>& seqNames)
105  {
106  auto sc = createContainerOfSpecifiedSize<SequenceType, HashType>(alphabet, seqNames.size());
107  sc->setSequenceNames(seqNames, true);
108  return sc;
109  }
110 
111 
/**
 * @brief Copy every sequence of one container into another, converting the sequence class.
 *
 * Each input sequence is used to construct a new SequenceType object, which
 * is then added to the output container under its own name as key.
 *
 * @tparam ContFrom     Type of the input container.
 * @tparam ContTo       Type of the output container.
 * @tparam SequenceType Class of the sequences stored in the output.
 * @param input  The container to read from.
 * @param output The container receiving the converted sequences.
 */
template<class ContFrom, class ContTo, class SequenceType>
static void convertContainer(const ContFrom& input, ContTo& output)
{
  size_t index = 0;
  while (index < input.getNumberOfSequences())
  {
    std::unique_ptr<SequenceType> converted = std::make_unique<SequenceType>(input.sequence(index));
    output.addSequence(converted->getName(), converted);
    ++index;
  }
}
133 
149  template<class SequenceType, class HashType>
150  static void getSelectedSequences(
152  const SequenceSelection& selection,
154  {
155  for (size_t position : selection)
156  {
157  auto seq = std::make_unique<SequenceType>(sequences.sequence(position));
158  outputCont.addSequence(seq->getName(), seq);
159  }
160  }
161 
162 
180  template<class SequenceType, class HashType>
181  static void getSelectedSequences(
183  const std::vector<std::string>& selection,
185  bool strict = true)
186  {
187  for (const std::string& key : selection)
188  {
189  if (strict)
190  {
191  auto seq = std::make_unique<SequenceType>(sequences.sequence(key));
192  outputCont.addSequence(seq->getName(), seq);
193  }
194  else
195  {
196  if (sequences.hasSequence(key))
197  {
198  auto seq = std::make_unique<SequenceType>(sequences.sequence(key));
199  outputCont.addSequence(seq->getName(), seq);
200  }
201  }
202  }
203  }
204 
205 
218  template<class SequenceType, class HashType>
221  const SequenceSelection& selection)
222  {
223  std::vector<std::string> keys = sequences.getSequenceKeys();
224  std::vector<std::string> selectedKeys = VectorTools::extract<std::string>(keys, selection);
225  std::vector<std::string> keysToRemove;
226  VectorTools::diff(keys, selectedKeys, keysToRemove);
227  for (const std::string& key : keysToRemove)
228  {
229  // We need to do this because after removal the indices will not be the same!
230  // another solution would be to sort decreasingly the indices...
231  sequences.removeSequence(key);
232  }
233  }
234 
235 
242  template<class SequenceType, class HashType>
244  const TemplateSequenceContainerInterface<SequenceType,
245  HashType>& sc)
246  {
247  size_t ns = sc.getNumberOfSequences();
248  if (ns <= 1)
249  return true;
250  size_t length = sc.sequence(0).size();
251  for (size_t i = 1; i < ns; ++i)
252  {
253  if (sc.sequence(i).size() != length)
254  return false;
255  }
256  return true;
257  }
258 
259 
272  static void getCounts(const SequenceContainerInterface& sc, std::map<int, unsigned int>& f)
273  {
274  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i)
275  {
276  const Sequence& seq = sc.sequence(i);
277  for (size_t j = 0; j < seq.size(); ++j)
278  {
279  f[seq[j]]++;
280  }
281  }
282  }
283 
284 
297  static void getFrequencies(
298  const SequenceContainerInterface& sc,
299  std::map<int, double>& f,
300  double pseudoCount = 0)
301  {
302  double n = 0;
303  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i)
304  {
305  const Sequence& seq = sc.sequence(i);
306  SymbolListTools::getCounts(seq, f, true);
307  n += static_cast<double>(seq.size());
308  }
309 
310  if (pseudoCount != 0)
311  {
312  std::shared_ptr<const Alphabet> pA = sc.getAlphabet();
313  for (int i = 0; i < static_cast<int>(pA->getSize()); ++i)
314  {
315  f[i] += pseudoCount;
316  }
317  n += pseudoCount * static_cast<double>(pA->getSize());
318  }
319 
320  for (auto& i : f)
321  {
322  i.second = i.second / n;
323  }
324  }
325 
326 
339  static void getFrequencies(
341  std::map<int, double>& f,
342  double pseudoCount = 0)
343  {
344  double n = 0;
345  for (size_t i = 0; i < sc.getNumberOfSequences(); ++i)
346  {
347  const ProbabilisticSequence& seq = sc.sequence(i);
348  SymbolListTools::getCounts(seq, f, true);
349  n += static_cast<double>(seq.size());
350  }
351 
352  if (pseudoCount != 0)
353  {
354  std::shared_ptr<const Alphabet> pA = sc.getAlphabet();
355  for (int i = 0; i < static_cast<int>(pA->getSize()); ++i)
356  {
357  f[i] += pseudoCount;
358  }
359  n += pseudoCount * static_cast<double>(pA->getSize());
360  }
361 
362  for (auto& i : f)
363  {
364  i.second = i.second / n;
365  }
366  }
367 
368 
375  static void getFrequencies(
376  const SequenceDataInterface& sc,
377  std::map<int, double>& f,
378  double pseudoCount = 0)
379  {
380  try
381  {
382  getFrequencies(dynamic_cast<const SequenceContainerInterface&>(sc), f, pseudoCount);
383  return;
384  }
385  catch (std::bad_cast&) {}
386  try
387  {
388  getFrequencies(dynamic_cast<const ProbabilisticSequenceContainerInterface&>(sc), f, pseudoCount);
389  }
390  catch (std::bad_cast&)
391  {
392  throw Exception("SequenceContainerTools::getFrequencies : unsupported SequenceDataInterface implementation.");
393  }
394  }
395 
402  template<class SequenceType, class HashType>
403  static void append(
406  {
407  const auto& keys = seqCont2.getSequenceKeys();
408  for (size_t i = 0; i < seqCont2.getNumberOfSequences(); ++i)
409  {
410  auto tm = std::unique_ptr<SequenceType>(seqCont2.sequence(i).clone());
411  seqCont1.addSequence(keys[i], tm);
412  }
413  }
414 
415 
428  template<class SequenceType, class HashType>
429  static void mergeByKey(
433  {
434  if (seqCont1.getAlphabet()->getAlphabetType() != seqCont2.getAlphabet()->getAlphabetType())
435  throw AlphabetMismatchException("SequenceContainerTools::mergeByKey.", seqCont1.getAlphabet(), seqCont2.getAlphabet());
436 
437  for (const auto& key: seqCont1.getSequenceKeys())
438  {
439  auto tmp = std::unique_ptr<SequenceType>(seqCont1.sequence(key).clone());
440  tmp->append(seqCont2.sequence(key));
441  outputCont.addSequence(key, tmp);
442  }
443  }
444 
451  template<class SequenceType, class HashType>
452  static void
456  {
457  std::vector<std::string> sequenceKeys = seqCont.getSequenceKeys();
458  for (size_t i = 0; i < seqCont.getNumberOfSequences(); ++i)
459  {
460  std::string seqName = seqCont.sequence(i).getName();
461  std::string seqKey = sequenceKeys[i];
462  auto alpha = outputCont.getAlphabet();
463  auto seq = std::unique_ptr<SequenceType>(new SequenceType(seqName, seqCont.sequence(i).toString(), alpha));
464  outputCont.addSequence(seqKey, seq);
465  }
466  }
467 
476  template<class SequenceType>
477  static std::unique_ptr< TemplateSequenceContainerInterface<SequenceType>>
480  size_t pos)
481  {
482  auto calpha = std::dynamic_pointer_cast<const CodonAlphabet>(sequences.getAlphabet());
483  if (!calpha)
484  throw AlphabetException("SequenceContainerTools::getCodonPosition. Input sequences should be of type codon.", sequences.getAlphabet());
485  auto newcont = std::make_unique< TemplateVectorSequenceContainer<SequenceType>>(calpha->getNucleicAlphabet());
486  for (size_t i = 0; i < sequences.getNumberOfSequences(); ++i)
487  {
488  const SequenceType& seq = sequences.sequence(i);
489  std::vector<int> newseq(seq.size());
490  for (size_t j = 0; j < seq.size(); ++j)
491  {
492  newseq[i] = calpha->getNPosition(seq[i], pos);
493  }
494  std::shared_ptr<const bpp::Alphabet> na = calpha->getNucleicAlphabet();
495  auto s = std::make_unique<SequenceType>(seq.getName(), newseq, seq.getComments(), na);
496  newcont->addSequence(sequences.getSequenceKeys()[i], s);
497  }
498  return newcont;
499  }
500 };
501 } // end of namespace bpp.
502 #endif // BPP_SEQ_CONTAINER_SEQUENCECONTAINERTOOLS_H
size_t size() const override
Get the number of elements in the list.
Definition: SymbolList.h:124
The alphabet exception base class.
Exception thrown when two alphabets do not match.
A basic implementation of the ProbabilisticSequence interface.
size_t size() const override
Get the number of elements in the list.
Utility methods for working with sequence containers.
static void getFrequencies(const SequenceContainerInterface &sc, std::map< int, double > &f, double pseudoCount=0)
Compute base frequencies of a BasicSequenceContainer.
static void mergeByKey(const TemplateSequenceContainerInterface< SequenceType, HashType > &seqCont1, const TemplateSequenceContainerInterface< SequenceType, HashType > &seqCont2, TemplateSequenceContainerInterface< SequenceType, HashType > &outputCont)
Concatenate the sequences from two containers.
static void append(TemplateSequenceContainerInterface< SequenceType, HashType > &seqCont1, const TemplateSequenceContainerInterface< SequenceType, HashType > &seqCont2)
Append all the sequences of a SequenceContainer to the end of another.
static void getFrequencies(const ProbabilisticSequenceContainerInterface &sc, std::map< int, double > &f, double pseudoCount=0)
Compute base frequencies of a ProbabilisticSequenceContainer.
static void convertContainer(const ContFrom &input, ContTo &output)
Generic function which creates a new container from another one, by specifying the class of sequence ...
static std::unique_ptr< TemplateSequenceContainerInterface< SequenceType > > getCodonPosition(const TemplateSequenceContainerInterface< SequenceType, std::string > &sequences, size_t pos)
Extract a certain position (1, 2 or 3) from a container of codon sequences and returns the resulting ...
static void getCounts(const SequenceContainerInterface &sc, std::map< int, unsigned int > &f)
Compute base counts.
static void getFrequencies(const SequenceDataInterface &sc, std::map< int, double > &f, double pseudoCount=0)
Compute base frequencies of an object implementing the SequenceDataInterface.
static std::unique_ptr< TemplateSequenceContainerInterface< SequenceType, HashType > > createContainerWithSequenceNames(std::shared_ptr< const Alphabet > &alphabet, const std::vector< std::string > &seqNames)
Create a container with specified names.
static bool hasSequenceWithName(const TemplateSequenceContainerInterface< SequenceType, HashType > &sc, const std::string &name)
Tells whether a sequence with the given name is present in the container.
static void keepOnlySelectedSequences(TemplateSequenceContainerInterface< SequenceType, HashType > &sequences, const SequenceSelection &selection)
Remove all sequences that are not in a given selection from a given container.
static void convertAlphabet(const TemplateSequenceContainerInterface< SequenceType, HashType > &seqCont, TemplateSequenceContainerInterface< SequenceType, HashType > &outputCont)
Convert a SequenceContainer to a new alphabet.
static void getSelectedSequences(const TemplateSequenceContainerInterface< SequenceType, HashType > &sequences, const SequenceSelection &selection, TemplateSequenceContainerInterface< SequenceType, HashType > &outputCont)
Add a specified set of sequences from a container to another.
static bool sequencesHaveTheSameLength(const TemplateSequenceContainerInterface< SequenceType, HashType > &sc)
Check if all sequences in a SequenceContainer have the same length.
static std::unique_ptr< TemplateSequenceContainerInterface< SequenceType, HashType > > createContainerOfSpecifiedSize(std::shared_ptr< const Alphabet > &alphabet, size_t size)
Create a container with void sequences.
static void getSelectedSequences(const TemplateSequenceContainerInterface< SequenceType, HashType > &sequences, const std::vector< std::string > &selection, TemplateSequenceContainerInterface< SequenceType, HashType > &outputCont, bool strict=true)
Add a specified set of sequences from a container to another.
A basic implementation of the Sequence interface.
Definition: Sequence.h:117
static void getCounts(const IntSymbolListInterface &list, std::map< int, count_type > &counts)
Count all states in the list.
virtual void addSequence(const HashType &sequenceKey, std::unique_ptr< SequenceType > &sequencePtr)=0
Add a sequence to the container.
virtual std::unique_ptr< SequenceType > removeSequence(const HashType &sequenceKey)=0
Remove a sequence from the container.
virtual const SequenceType & sequence(const HashType &sequenceKey) const override=0
Retrieve a sequence object from the container.
virtual bool hasSequence(const HashType &sequenceKey) const =0
Check if a certain key is associated to a sequence in the container.
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
virtual std::vector< HashType > getSequenceKeys() const =0
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
Get a pointer toward the container's alphabet.
static void diff(std::vector< T > &v1, std::vector< T > &v2, std::vector< T > &v3)
std::string toString(T t)
This alphabet is used to deal with NumericAlphabet.
std::vector< size_t > SiteSelection
std::vector< size_t > SequenceSelection