bpp-phyl3  3.0.0
CoalaCore.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
11 #include <Bpp/Seq/Sequence.h>
12 #include <Bpp/Seq/SequenceTools.h>
14 #include <Bpp/Text/TextTools.h>
15 
16 #include "CoalaCore.h"
17 
18 using namespace bpp;
19 
20 // From the STL:
21 #include <iostream>
22 #include <fstream>
23 #include <string>
24 
25 using namespace std;
26 
27 /******************************************************************************/
28 
29 CoalaCore::CoalaCore(size_t nbAxes) :
30  init_(true),
31  nbrOfAxes_(nbAxes),
32  P_(),
33  R_(),
34  colWeights_(),
35  paramValues_()
36 {}
37 
38 /******************************************************************************/
39 
40 ParameterList CoalaCore::computeCOA(const SequenceDataInterface& data, double pseudoCount, bool param)
41 {
42  ParameterList pList;
43  // Now we perform the Correspondence Analysis on from the matrix of observed frequencies computed on the alignment, to obtain the matrix of principal axes.
44  // First, the matrix of amino acid frequencies is calculated from the alignment:
45  vector<string> seqKeys = data.getSequenceKeys();
46  vector< map<int, double>> freqs(seqKeys.size()); // One map per sequence
47  // Each map is filled with the corresponding frequencies, which are then normalized.
48  for (size_t i = 0; i < seqKeys.size(); ++i)
49  {
50  const SequenceContainerInterface* sc = dynamic_cast<const SequenceContainerInterface*>(&data);
52 
53  shared_ptr<CruxSymbolListInterface> seq(sc ?
54  dynamic_cast<CruxSymbolListInterface*>(sc->sequence(seqKeys[i]).clone()) :
55  dynamic_cast<CruxSymbolListInterface*>(psc->sequence(seqKeys[i]).clone()));
56 
58  std::map<int, double> counts;
59  SequenceTools::getCounts(*seq, counts);
60 
61  // add pseudoCounts
62  for (int k = 0; k < 20; ++k)
63  {
64  counts[k] += pseudoCount;
65  }
66 
67  // Unknown characters are now ignored:
68  double t = 0;
69  for (int k = 0; k < 20; ++k)
70  {
71  t += counts[k];
72  }
73 
74  for (int k = 0; k < 20; ++k)
75  {
76  freqs.at(i)[k] = counts[k]/t;
77  }
78  }
79 
80  // The matrix of observed frequencies is filled. If an amino acid is
81  // completely absent from the alignment, its frequency is set to
82  // 10^-6.
83  RowMatrix<double> freqMatrix(seqKeys.size(), 20);
84  for (size_t i = 0; i < freqs.size(); ++i)
85  {
86  bool normalize = false;
87  for (size_t j = 0; j < 20; ++j)
88  {
89  map<int, double>::iterator it = freqs[i].find(static_cast<int>(j));
90  if (it != freqs[i].end())
91  {
92  freqMatrix(i, j) = (*it).second;
93  }
94  else
95  {
96  freqMatrix(i, j) = 0.000001;
97  normalize = true;
98  }
99  }
100  if (normalize)
101  {
102  double sum = 0;
103  for (size_t k = 0; k < 20; ++k)
104  {
105  sum += freqMatrix(i, k);
106  }
107  for (size_t l = 0; l < 20; ++l)
108  {
109  freqMatrix(i, l) = freqMatrix(i, l) / sum;
110  }
111  }
112  }
113 
114  // The COA analysis:
115  CorrespondenceAnalysis coa(freqMatrix, 19);
116  // Matrix of principal axes:
117  RowMatrix<double> ppalAxes = coa.getPrincipalAxes();
118 
119  // The transpose of the matrix of principal axes is computed:
120  MatrixTools::transpose(ppalAxes, P_);
121  // The matrix of row coordinates is stored:
122  R_ = coa.getRowCoordinates();
123  // The column weights are retrieved:
125 
126  if (param)
127  {
128  // Parameters are defined:
129  size_t nbAxesConserved = coa.getNbOfKeptAxes();
130  if (nbrOfAxes_ > nbAxesConserved)
131  {
132  ApplicationTools::displayWarning("The specified number of parameters per branch (" + TextTools::toString(nbrOfAxes_) +
133  ") is higher than the number of axes (" + TextTools::toString(nbAxesConserved) +
134  ")... The number of parameters per branch is now equal to the number of axes kept by the COA analysis (" + TextTools::toString(nbAxesConserved) + ")");
135  nbrOfAxes_ = nbAxesConserved;
136  }
137  for (unsigned int i = 0; i < nbrOfAxes_; ++i)
138  {
139  const vector<double> rCoords = R_.col(i);
140  double maxCoord = VectorTools::max(rCoords);
141  double minCoord = VectorTools::min(rCoords);
142  double sd = VectorTools::sd<double, double>(rCoords);
143  auto constraint = make_shared<IntervalConstraint>(minCoord - sd, maxCoord + sd, true, true);
145  pList.addParameter(new Parameter("Coala.AxPos" + TextTools::toString(i), paramValues_.getParameterValue("AxPos" + TextTools::toString(i).substr(0, 8)), constraint));
146  else
147  pList.addParameter(new Parameter("Coala.AxPos" + TextTools::toString(i), (minCoord+maxCoord)/2, constraint));
148  }
149  }
150  return pList;
151 }
152 
153 /******************************************************************************/
154 /* Function that computes the product of a matrix P of size nbrOfAxes_x20 with a vector V of size nbrOfAxes_, and returns a vector of size 20.*/
155 
156 vector<double> CoalaCore::prodMatrixVector(RowMatrix<double>& P, vector<double>& V)
157 {
158  vector<double> E(20, 0.0);
159 
160  for (unsigned int i = 0; i < 20; i++)
161  {
162  for (unsigned int j = 0; j < V.size(); j++)
163  {
164  E[i] = E[i] + P(j, i) * V[j];
165  }
166  }
167  return E;
168 }
169 
170 /******************************************************************************/
static void displayWarning(const std::string &text)
size_t nbrOfAxes_
Definition: CoalaCore.h:35
std::vector< double > colWeights_
Definition: CoalaCore.h:38
ParameterList paramValues_
Definition: CoalaCore.h:39
RowMatrix< double > P_
Definition: CoalaCore.h:36
CoalaCore(size_t nbAxes=0)
Definition: CoalaCore.cpp:29
std::vector< double > prodMatrixVector(RowMatrix< double > &P, std::vector< double > &V)
Definition: CoalaCore.cpp:156
RowMatrix< double > R_
Definition: CoalaCore.h:37
ParameterList computeCOA(const SequenceDataInterface &data, double pseudoCount=0, bool param=true)
Comoute COA from data and return the axis position parameters.
Definition: CoalaCore.cpp:40
const RowMatrix< double > & getPrincipalAxes() const
const RowMatrix< double > & getRowCoordinates() const
const std::vector< double > getColumnWeights() const
size_t getNbOfKeptAxes() const
static void transpose(const MatrixA &A, MatrixO &O)
virtual bool hasParameter(const std::string &name) const
virtual double getParameterValue(const std::string &name) const
virtual void addParameter(const Parameter &param)
std::vector< double > col(size_t j) const
static void changeGapsToUnknownCharacters(IntSymbolListInterface &l)
static void getCounts(const IntSymbolListInterface &list, std::map< int, count_type > &counts)
virtual const SequenceType & sequence(const HashType &sequenceKey) const override=0
virtual std::vector< std::string > getSequenceKeys() const =0
static T min(const std::vector< T > &v)
static T max(const std::vector< T > &v)
std::string toString(T t)
Defines the basic types of data flow nodes.