bpp-phyl3 3.0.0
CoalaCore.cpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: The Bio++ Development Group
2//
3// SPDX-License-Identifier: CECILL-2.1
4
11#include <Bpp/Seq/Sequence.h>
14#include <Bpp/Text/TextTools.h>
15
16#include "CoalaCore.h"
17
18using namespace bpp;
19
20// From the STL:
21#include <iostream>
22#include <fstream>
23#include <string>
24
25using namespace std;
26
27/******************************************************************************/
28
29CoalaCore::CoalaCore(size_t nbAxes) :
30 init_(true),
31 nbrOfAxes_(nbAxes),
32 P_(),
33 R_(),
34 colWeights_(),
35 paramValues_()
36{}
37
38/******************************************************************************/
39
40ParameterList CoalaCore::computeCOA(const SequenceDataInterface& data, double pseudoCount, bool param)
41{
42 ParameterList pList;
43 // Now we perform the Correspondence Analysis on from the matrix of observed frequencies computed on the alignment, to obtain the matrix of principal axes.
44 // First, the matrix of amino acid frequencies is calculated from the alignment:
45 vector<string> seqKeys = data.getSequenceKeys();
46 vector< map<int, double>> freqs(seqKeys.size()); // One map per sequence
47 // Each map is filled with the corresponding frequencies, which are then normalized.
48 for (size_t i = 0; i < seqKeys.size(); ++i)
49 {
50 const SequenceContainerInterface* sc = dynamic_cast<const SequenceContainerInterface*>(&data);
52
53 shared_ptr<CruxSymbolListInterface> seq(sc ?
54 dynamic_cast<CruxSymbolListInterface*>(sc->sequence(seqKeys[i]).clone()) :
55 dynamic_cast<CruxSymbolListInterface*>(psc->sequence(seqKeys[i]).clone()));
56
58 std::map<int, double> counts;
59 SequenceTools::getCounts(*seq, counts);
60
61 // add pseudoCounts
62 for (int k = 0; k < 20; ++k)
63 {
64 counts[k] += pseudoCount;
65 }
66
67 // Unknown characters are now ignored:
68 double t = 0;
69 for (int k = 0; k < 20; ++k)
70 {
71 t += counts[k];
72 }
73
74 for (int k = 0; k < 20; ++k)
75 {
76 freqs.at(i)[k] = counts[k] / t;
77 }
78 }
79
80 // The matrix of observed frequencies is filled. If an amino acid is
81 // completely absent from the alignment, its frequency is set to
82 // 10^-6.
83 RowMatrix<double> freqMatrix(seqKeys.size(), 20);
84 for (size_t i = 0; i < freqs.size(); ++i)
85 {
86 bool normalize = false;
87 for (size_t j = 0; j < 20; ++j)
88 {
89 map<int, double>::iterator it = freqs[i].find(static_cast<int>(j));
90 if (it != freqs[i].end())
91 {
92 freqMatrix(i, j) = (*it).second;
93 }
94 else
95 {
96 freqMatrix(i, j) = 0.000001;
97 normalize = true;
98 }
99 }
100 if (normalize)
101 {
102 double sum = 0;
103 for (size_t k = 0; k < 20; ++k)
104 {
105 sum += freqMatrix(i, k);
106 }
107 for (size_t l = 0; l < 20; ++l)
108 {
109 freqMatrix(i, l) = freqMatrix(i, l) / sum;
110 }
111 }
112 }
113
114 // The COA analysis:
115 CorrespondenceAnalysis coa(freqMatrix, 19);
116 // Matrix of principal axes:
117 RowMatrix<double> ppalAxes = coa.getPrincipalAxes();
118
119 // The transpose of the matrix of principal axes is computed:
120 MatrixTools::transpose(ppalAxes, P_);
121 // The matrix of row coordinates is stored:
122 R_ = coa.getRowCoordinates();
123 // The column weights are retrieved:
125
126 if (param)
127 {
128 // Parameters are defined:
129 size_t nbAxesConserved = coa.getNbOfKeptAxes();
130 if (nbrOfAxes_ > nbAxesConserved)
131 {
132 ApplicationTools::displayWarning("The specified number of parameters per branch (" + TextTools::toString(nbrOfAxes_) +
133 ") is higher than the number of axes (" + TextTools::toString(nbAxesConserved) +
134 ")... The number of parameters per branch is now equal to the number of axes kept by the COA analysis (" + TextTools::toString(nbAxesConserved) + ")");
135 nbrOfAxes_ = nbAxesConserved;
136 }
137 for (unsigned int i = 0; i < nbrOfAxes_; ++i)
138 {
139 const vector<double> rCoords = R_.col(i);
140 double maxCoord = VectorTools::max(rCoords);
141 double minCoord = VectorTools::min(rCoords);
142 double sd = VectorTools::sd<double, double>(rCoords);
143 auto constraint = make_shared<IntervalConstraint>(minCoord - sd, maxCoord + sd, true, true);
145 pList.addParameter(new Parameter("Coala.AxPos" + TextTools::toString(i), paramValues_.getParameterValue("AxPos" + TextTools::toString(i).substr(0, 8)), constraint));
146 else
147 pList.addParameter(new Parameter("Coala.AxPos" + TextTools::toString(i), (minCoord + maxCoord) / 2, constraint));
148 }
149 }
150 return pList;
151}
152
153/******************************************************************************/
154/* Function that computes the product of a matrix P of size nbrOfAxes_x20 with a vector V of size nbrOfAxes_, and returns a vector of size 20.*/
155
156vector<double> CoalaCore::prodMatrixVector(RowMatrix<double>& P, vector<double>& V)
157{
158 vector<double> E(20, 0.0);
159
160 for (unsigned int i = 0; i < 20; i++)
161 {
162 for (unsigned int j = 0; j < V.size(); j++)
163 {
164 E[i] = E[i] + P(j, i) * V[j];
165 }
166 }
167 return E;
168}
169
170/******************************************************************************/
static void displayWarning(const std::string &text)
size_t nbrOfAxes_
Definition: CoalaCore.h:35
std::vector< double > colWeights_
Definition: CoalaCore.h:38
ParameterList paramValues_
Definition: CoalaCore.h:39
RowMatrix< double > P_
Definition: CoalaCore.h:36
std::vector< double > prodMatrixVector(RowMatrix< double > &P, std::vector< double > &V)
Definition: CoalaCore.cpp:156
RowMatrix< double > R_
Definition: CoalaCore.h:37
ParameterList computeCOA(const SequenceDataInterface &data, double pseudoCount=0, bool param=true)
Comoute COA from data and return the axis position parameters.
Definition: CoalaCore.cpp:40
const RowMatrix< double > & getPrincipalAxes() const
const RowMatrix< double > & getRowCoordinates() const
const std::vector< double > getColumnWeights() const
size_t getNbOfKeptAxes() const
static void transpose(const MatrixA &A, MatrixO &O)
virtual bool hasParameter(const std::string &name) const
virtual double getParameterValue(const std::string &name) const
virtual void addParameter(const Parameter &param)
std::vector< double > col(size_t j) const
static void changeGapsToUnknownCharacters(IntSymbolListInterface &l)
static void getCounts(const IntSymbolListInterface &list, std::map< int, count_type > &counts)
virtual const SequenceType & sequence(const HashType &sequenceKey) const override=0
virtual std::vector< std::string > getSequenceKeys() const =0
static T min(const std::vector< T > &v)
static T max(const std::vector< T > &v)
std::string toString(T t)
Defines the basic types of data flow nodes.