bpp-seq3  3.0.0
SequenceContainerTools.cpp
Go to the documentation of this file.
1 //
2 // File: SequenceContainerTools.cpp
3 // Authors:
4 // Julien Dutheil
5 // Created: 2003-10-04 09:18:34
6 //
7 
8 /*
9  Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)
10 
11  This software is a computer program whose purpose is to provide classes
12  for sequences analysis.
13 
14  This software is governed by the CeCILL license under French law and
15  abiding by the rules of distribution of free software. You can use,
16  modify and/ or redistribute the software under the terms of the CeCILL
17  license as circulated by CEA, CNRS and INRIA at the following URL
18  "http://www.cecill.info".
19 
20  As a counterpart to the access to the source code and rights to copy,
21  modify and redistribute granted by the license, users are provided only
22  with a limited warranty and the software's author, the holder of the
23  economic rights, and the successive licensors have only limited
24  liability.
25 
26  In this respect, the user's attention is drawn to the risks associated
27  with loading, using, modifying and/or developing or reproducing the
28  software by the user in light of its specific status of free software,
29  that may mean that it is complicated to manipulate, and that also
30  therefore means that it is reserved for developers and experienced
31  professionals having in-depth computer knowledge. Users are therefore
32  encouraged to load and test the software's suitability as regards their
33  requirements in conditions enabling the security of their systems and/or
34  data to be ensured and, more generally, to use and operate it in the
35  same conditions as regards security.
36 
37  The fact that you are presently reading this means that you have had
38  knowledge of the CeCILL license and that you accept its terms.
39 */
40 
41 
42 #include "../Alphabet/CodonAlphabet.h"
43 #include "../SymbolListTools.h"
45 #include "SequenceContainerTools.h"
47 
48 // From bpp-core:
49 #include <Bpp/Text/TextTools.h>
50 
51 using namespace bpp;
52 
53 // From the STL:
54 #include <iostream>
55 
56 using namespace std;
57 
58 /******************************************************************************/
59 
61 {
    // NOTE(review): the signature line and the declaration of `vsc` are not
    // visible in this listing; only the visible statements are described here.
    // Adds `size` placeholder sequences to `vsc`, one per index in [0, size).
63  for (size_t i = 0; i < size; ++i)
64  {
    // Each placeholder is named after its index (TextTools::toString(i)) and has
    // empty content; `false` is passed as addSequence's checkName argument.
65  vsc->addSequence(BasicSequence(TextTools::toString(i), "", alphabet), false);
66  }
    // The filled container pointer is handed back to the caller.
    // NOTE(review): ownership presumably passes to the caller -- confirm.
67  return vsc;
68 }
69 
70 /******************************************************************************/
71 
73  const Alphabet* alphabet,
74  const vector<string>& seqNames)
75 {
    // NOTE(review): the first line of this signature is missing from the listing.
    // Build a container holding one placeholder sequence per requested name...
76  SequenceContainer* sc = createContainerOfSpecifiedSize(alphabet, seqNames.size());
    // ...then replace the placeholder names with `seqNames`, passing `true` as
    // the checkNames argument of setSequenceNames.
77  sc->setSequenceNames(seqNames, true);
78  return sc;
79 }
80 
81 /******************************************************************************/
82 
84  const OrderedSequenceContainer& sequences,
85  const SequenceSelection& selection,
86  SequenceContainer& outputCont)
87 {
    // NOTE(review): the first line of this signature is missing from the listing.
    // Copies the sequences of `sequences` found at the positions listed in
    // `selection` into `outputCont`, in selection order.
    // Name checking is requested only when the output container already holds
    // sequences at entry; the flag is computed once and not refreshed in the loop.
88  bool checkNames = outputCont.getNumberOfSequences() > 0;
89  for (size_t i = 0; i < selection.size(); i++)
90  {
91  outputCont.addSequence(sequences.getSequence(selection[i]), checkNames);
92  }
93 }
94 
95 /******************************************************************************/
96 
98  const SequenceContainer& sequences,
99  const std::vector<std::string>& selection,
100  SequenceContainer& outputCont, bool strict)
101 {
    // NOTE(review): the first line of this signature is missing from the listing.
    // Copies the sequences of `sequences` whose names are listed in `selection`
    // into `outputCont`, in selection order.
    // Name checking is requested only when the output container already holds
    // sequences at entry.
102  bool checkNames = outputCont.getNumberOfSequences() > 0;
103  for (size_t i = 0; i < selection.size(); i++)
104  {
105  if (strict)
106  {
    // Strict mode: every name is fetched unconditionally.
    // NOTE(review): presumably getSequence throws for an unknown name -- confirm.
107  outputCont.addSequence(sequences.getSequence(selection[i]), checkNames);
108  }
109  else
110  {
    // Lenient mode: names absent from `sequences` are silently skipped.
111  if (sequences.hasSequence(selection[i]))
112  outputCont.addSequence(sequences.getSequence(selection[i]), checkNames);
113  }
114  }
115 }
116 
117 /******************************************************************************/
118 
120  OrderedSequenceContainer& sequences,
121  const SequenceSelection& selection)
122 {
    // NOTE(review): the first line of this signature is missing from the listing.
    // Removes from `sequences` every sequence whose original position is not
    // listed in `selection`.
    // Snapshot of the names taken before any removal: position i below always
    // refers to the original ordering.
123  vector<string> names = sequences.getSequenceNames();
124  for (size_t i = 0; i < names.size(); i++)
125  {
126  // Removal is done by name because positions shift after each removal.
127  // An alternative would be to sort the indices in decreasing order.
128  bool test = false;
    // Linear membership test: stops at the first selection entry equal to i.
129  for (size_t j = 0; j < selection.size() && !test; j++)
130  {
131  test = (selection[j] == i);
132  }
133  if (!test)
134  sequences.removeSequence(names[i]);
135  // WARNING: what if the selection contains the same index several times? ...
    // As written, a repeated index only satisfies the membership test above
    // earlier; it does not trigger any extra removal.
136  }
137 }
138 
139 /******************************************************************************/
140 
142 {
    // NOTE(review): the signature line is missing from this listing.
    // Returns true when every sequence of `sequences` has the same size.
143  vector<string> seqNames = sequences.getSequenceNames();
    // Zero or one sequence: trivially the same length.
144  if (seqNames.size() <= 1)
145  return true;
    // The first sequence provides the reference length.
146  size_t length = sequences.getSequence(seqNames[0]).size();
147  for (size_t i = 1; i < seqNames.size(); i++)
148  {
    // Stop at the first mismatch.
149  if (sequences.getSequence(seqNames[i]).size() != length)
150  return false;
151  }
152  return true;
153 }
154 
155 /******************************************************************************/
156 
157 void SequenceContainerTools::getFrequencies(const SequencedValuesContainer& sequences, std::map<int, double>& f, double pseudoCount)
158 {
159  double n = 0;
160 
161  const SequenceContainer* sc = dynamic_cast<const SequenceContainer*>(&sequences);
162  const ProbabilisticSequenceContainer* psc = dynamic_cast<const ProbabilisticSequenceContainer*>(&sequences);
163 
164  for (const auto& name: sequences.getSequenceNames())
165  {
166  if (sc)
167  {
168  const Sequence& seq = sc->getSequence(name);
169  SymbolListTools::getCounts(seq, f, true);
170  n += static_cast<double>(seq.size());
171  }
172  else if (psc)
173  {
174  const ProbabilisticSequence& seq = psc->getSequence(name);
175  SymbolListTools::getCounts(seq, f, true);
176  n += static_cast<double>(seq.size());
177  }
178  else
179  throw Exception("SequenceContainerTools::getFrequencies : unknown SequenceContainer type.");
180  }
181 
182  if (pseudoCount != 0)
183  {
184  const Alphabet* pA = sequences.getAlphabet();
185  for (int i = 0; i < static_cast<int>(pA->getSize()); i++)
186  {
187  f[i] += pseudoCount;
188  }
189 
190  n += pseudoCount * static_cast<double>(pA->getSize());
191  }
192 
193  for (auto& i : f)
194  {
195  i.second = i.second / n;
196  }
197 }
198 
199 /******************************************************************************/
200 
201 void SequenceContainerTools::getCounts(const SequenceContainer& sequences, std::map<int, int>& f)
202 {
203  for (const auto& name: sequences.getSequenceNames())
204  {
205  const Sequence& seq = sequences.getSequence(name);
206  for (size_t i = 0; i < seq.size(); i++)
207  {
208  f[seq[i]]++;
209  }
210  }
211 }
212 
213 /******************************************************************************/
214 
216 {
    // NOTE(review): the signature line and the declaration of `newcont` are
    // missing from this listing; only the visible statements are described here.
    // The input container must use a codon alphabet; otherwise the cast yields
    // a null pointer and an AlphabetException is thrown.
217  const CodonAlphabet* calpha = dynamic_cast<const CodonAlphabet*>(sequences.getAlphabet());
218  if (!calpha)
219  throw AlphabetException("SequenceContainerTools::getCodonPosition. Input sequences should be of type codon.");
221  for (const auto& name: sequences.getSequenceNames())
222  {
223  const Sequence& seq = sequences.getSequence(name);
    // One output state per codon of the input sequence.
224  vector<int> newseq(seq.size());
225  for (size_t i = 0; i < seq.size(); i++)
226  {
    // Code of the state found at position `pos` inside the codon seq[i].
    // NOTE(review): confirm whether `pos` is 0-based or 1-based for getNPosition.
227  newseq[i] = calpha->getNPosition(seq[i], pos);
228  }
    // The new sequence keeps the original name and comments and uses the
    // nucleic alphabet underlying the codon alphabet.
229  BasicSequence s(name, newseq, sequences.getComments(name), calpha->getNucleicAlphabet());
230  newcont->addSequence(s);
231  }
    // NOTE(review): ownership of `newcont` presumably passes to the caller -- confirm.
232  return newcont;
233 }
234 
235 /******************************************************************************/
The alphabet exception base class.
The Alphabet interface.
Definition: Alphabet.h:133
virtual unsigned int getSize() const =0
Get the number of resolved states in the alphabet (e.g. return 4 for DNA alphabet)....
A basic implementation of the Sequence interface.
Definition: Sequence.h:155
Codon alphabet class.
Definition: CodonAlphabet.h:67
const NucleicAlphabet *const getNucleicAlphabet() const
int getNPosition(int codon, size_t pos) const
Get the int code of the n-position of a word given its int description.
virtual size_t size() const =0
Get the number of elements in the list.
The OrderedSequenceContainer interface.
virtual const Sequence & getSequence(size_t sequenceIndex) const =0
Retrieve a sequence object from the container.
virtual std::shared_ptr< Sequence > removeSequence(size_t sequenceIndex)=0
Extract (and remove) a sequence from the container.
The ProbabilisticSequenceContainer interface.
virtual const ProbabilisticSequence & getSequence(const std::string &name) const =0
Retrieve a probabilistic sequence object from the container.
The probabilistic sequence interface.
static void getCounts(const SequenceContainer &sequences, std::map< int, int > &)
Compute base counts.
static void getSelectedSequences(const OrderedSequenceContainer &sequences, const SequenceSelection &selection, SequenceContainer &outputCont)
Add a specified set of sequences from a container to another.
static SequenceContainer * createContainerWithSequenceNames(const Alphabet *alphabet, const std::vector< std::string > &seqNames)
Create a container with specified names.
static void getFrequencies(const SequencedValuesContainer &sequences, std::map< int, double > &f, double pseudoCount=0)
Compute base frequencies.
static void keepOnlySelectedSequences(OrderedSequenceContainer &sequences, const SequenceSelection &selection)
Remove all sequences that are not in a given selection from a given container.
static SequenceContainer * createContainerOfSpecifiedSize(const Alphabet *alphabet, size_t size)
Create a container with void sequences.
static bool sequencesHaveTheSameLength(const SequenceContainer &sequences)
Check if all sequences in a SequenceContainer have the same length.
static SequenceContainer * getCodonPosition(const SequenceContainer &sequences, size_t pos)
Extract a certain position (1, 2 or 3) from a container of codon sequences and returns the resulting ...
The SequenceContainer interface.
virtual void addSequence(const Sequence &sequence, bool checkName)=0
Add a sequence to the container.
virtual const Sequence & getSequence(const std::string &name) const =0
Retrieve a sequence object from the container.
The sequence interface.
Definition: Sequence.h:71
virtual void setSequenceNames(const std::vector< std::string > &names, bool checkNames)=0
Set all sequence names.
virtual const Alphabet * getAlphabet() const =0
Get container's alphabet.
virtual const Comments & getComments(const std::string &name) const =0
Get comments of a particular sequence.
virtual size_t getNumberOfSequences() const =0
Get the number of sequences in the container.
virtual std::vector< std::string > getSequenceNames() const =0
Get all the names of the sequences in the container.
virtual bool hasSequence(const std::string &name) const =0
Check if a sequence with a given name is present in the container.
static void getCounts(const IntCoreSymbolList &list, std::map< int, size_t > &counts)
Count all states in the list.
The VectorSequenceContainer class.
virtual void addSequence(const Sequence &sequence, bool checkName=true)
Add a sequence at the end of the container.
std::string toString(T t)
This alphabet is used to deal with NumericAlphabet.
std::vector< size_t > SequenceSelection