bpp-phyl3 3.0.0
UserProteinSubstitutionModel.cpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: The Bio++ Development Group
2//
3// SPDX-License-Identifier: CECILL-2.1
4
6#include <Bpp/Io/FileTools.h>
7#include <Bpp/Io/FileTools.h>
10#include <Bpp/Text/TextTools.h>
11
13
14// From SeqLib:
16
17using namespace bpp;
18
19// From the STL:
20#include <iostream>
21#include <fstream>
22#include <string>
23
24using namespace std;
25
26/******************************************************************************/
27
28UserProteinSubstitutionModel::UserProteinSubstitutionModel(
29 std::shared_ptr<const ProteicAlphabet> alpha,
30 const string& path,
31 const string& prefix) :
33 AbstractReversibleProteinSubstitutionModel(alpha, make_shared<CanonicalStateMap>(alpha, false), prefix),
34 path_(path),
35 freqSet_(nullptr)
36{
38 freqSet_.reset(new FixedProteinFrequencySet(alpha, freq_));
40}
41
43 std::shared_ptr<const ProteicAlphabet> alpha,
44 const string& path,
45 unique_ptr<ProteinFrequencySetInterface> freqSet,
46 const string& prefix,
47 bool initFreqs) :
49 AbstractReversibleProteinSubstitutionModel(alpha, make_shared<CanonicalStateMap>(alpha, false), prefix),
50 path_(path),
51 freqSet_(std::move(freqSet))
52{
54 freqSet_->setNamespace(prefix + freqSet_->getName() + ".");
55 if (initFreqs)
56 freqSet->setFrequencies(freq_);
57 else
58 freq_ = freqSet_->getFrequencies();
59 addParameters_(freqSet_->getParameters());
61}
62
63/******************************************************************************/
64
66{
67 if (TextTools::hasSubstring(freqSet_->getNamespace(), "+F.") )
68 return "Empirical+F";
69 else
70 return "Empirical";
71}
72
73/******************************************************************************/
74
76{
77 if (!FileTools::fileExists(path_.c_str()))
78 throw Exception("UserProteinSubstitutionModel::readFromFile. Frequencies file not found : " + path_);
79
80 ifstream in(path_.c_str(), ios::in);
81 // Read exchangeability matrix:
82 for (unsigned int i = 1; i < 20; i++)
83 {
84 string line = FileTools::getNextLine(in);
85 StringTokenizer st(line);
86 for (unsigned int j = 0; j < i; j++)
87 {
88 double s = TextTools::toDouble(st.nextToken());
89 exchangeability_(i, j) = exchangeability_(j, i) = s;
90 }
91 }
92 // Read frequencies:
93 unsigned int fCount = 0;
94 while (in && fCount < 20)
95 {
96 string line = FileTools::getNextLine(in);
97 StringTokenizer st(line);
98 while (st.hasMoreToken() && fCount < 20)
99 {
100 freq_[fCount] = TextTools::toDouble(st.nextToken());
101 fCount++;
102 }
103 }
104 double sf = VectorTools::sum(freq_);
105 if (sf - 1 > 0.000001)
106 {
107 ApplicationTools::displayMessage("WARNING!!! Frequencies sum to " + TextTools::toString(sf) + ", frequencies have been scaled.");
108 sf *= 1. / sf;
109 }
110
111 // Now build diagonal of the exchangeability matrix:
112 for (unsigned int i = 0; i < 20; i++)
113 {
114 double sum = 0;
115 for (unsigned int j = 0; j < 20; j++)
116 {
117 if (j != i)
118 sum += exchangeability_(i, j);
119 }
120 exchangeability_(i, i) = -sum;
121 }
122
123 // Closing stream:
124 in.close();
125}
126
127/******************************************************************************/
128
130{
131 map<int, double> counts;
132 SequenceContainerTools::getFrequencies(data, counts, pseudoCount);
133 for (auto i : counts)
134 {
135 freq_[(size_t)i.first] = i.second;
136 }
137
138 freqSet_->setFrequencies(freq_);
139 // Update parameters and re-compute generator and eigen values:
140 matchParametersValues(freqSet_->getParameters());
141}
142
143/******************************************************************************/
void addParameters_(const ParameterList &parameters)
bool matchParametersValues(const ParameterList &parameters) override
Specialisation abstract class for reversible protein substitution model.
virtual void updateMatrices_() override
Compute and diagonalize the matrix, and fill the eigenValues_, leftEigenVectors_ and rightEigenVecto...
RowMatrix< double > exchangeability_
The exchangeability matrix of the model, defined as . When the model is reversible,...
Vdouble freq_
The vector of equilibrium frequencies.
static void displayMessage(const std::string &text)
This class implements a state map where all resolved states are modeled.
Definition: StateMap.h:168
static std::string getNextLine(std::istream &in)
static bool fileExists(const std::string &filename)
FrequencySet useful for homogeneous and stationary models, protein implementation.
static void getFrequencies(const SequenceContainerInterface &sc, std::map< int, double > &f, double pseudoCount=0)
const std::string & nextToken()
bool hasMoreToken() const
std::unique_ptr< ProteinFrequencySetInterface > freqSet_
void setFreqFromData(const SequenceDataInterface &data, double pseudoCount=0) override
Set equilibrium frequencies equal to the frequencies estimated from the data.
UserProteinSubstitutionModel(std::shared_ptr< const ProteicAlphabet > alpha, const std::string &path, const std::string &prefix)
Build a protein model from a PAML file, with original equilibrium frequencies.
std::string getName() const override
Get the name of the model.
static T sum(const std::vector< T > &v1)
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
bool hasSubstring(const std::string &s, const std::string &pattern)
std::string toString(T t)
Defines the basic types of data flow nodes.