bpp-phyl3  3.0.0
UserProteinSubstitutionModel.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
6 #include <Bpp/Io/FileTools.h>
7 #include <Bpp/Io/FileTools.h>
10 #include <Bpp/Text/TextTools.h>
11 
13 
14 // From SeqLib:
16 
17 using namespace bpp;
18 
19 // From the STL:
20 #include <iostream>
21 #include <fstream>
22 #include <string>
23 
24 using namespace std;
25 
26 /******************************************************************************/
27 
29  std::shared_ptr<const ProteicAlphabet> alpha,
30  const string& path,
31  const string& prefix) :
33  AbstractReversibleProteinSubstitutionModel(alpha, make_shared<CanonicalStateMap>(alpha, false), prefix),
34  path_(path),
35  freqSet_(nullptr)
36 {
37  readFromFile();
38  freqSet_.reset(new FixedProteinFrequencySet(alpha, freq_));
40 }
41 
43  std::shared_ptr<const ProteicAlphabet> alpha,
44  const string& path,
45  unique_ptr<ProteinFrequencySetInterface> freqSet,
46  const string& prefix,
47  bool initFreqs) :
49  AbstractReversibleProteinSubstitutionModel(alpha, make_shared<CanonicalStateMap>(alpha, false), prefix),
50  path_(path),
51  freqSet_(std::move(freqSet))
52 {
53  readFromFile();
54  freqSet_->setNamespace(prefix + freqSet_->getName() + ".");
55  if (initFreqs)
56  freqSet->setFrequencies(freq_);
57  else
58  freq_ = freqSet_->getFrequencies();
59  addParameters_(freqSet_->getParameters());
61 }
62 
63 /******************************************************************************/
64 
66 {
67  if (TextTools::hasSubstring(freqSet_->getNamespace(), "+F.") )
68  return "Empirical+F";
69  else
70  return "Empirical";
71 }
72 
73 /******************************************************************************/
74 
76 {
77  if (!FileTools::fileExists(path_.c_str()))
78  throw Exception("UserProteinSubstitutionModel::readFromFile. Frequencies file not found : " + path_);
79 
80  ifstream in(path_.c_str(), ios::in);
81  // Read exchangeability matrix:
82  for (unsigned int i = 1; i < 20; i++)
83  {
84  string line = FileTools::getNextLine(in);
85  StringTokenizer st(line);
86  for (unsigned int j = 0; j < i; j++)
87  {
88  double s = TextTools::toDouble(st.nextToken());
89  exchangeability_(i, j) = exchangeability_(j, i) = s;
90  }
91  }
92  // Read frequencies:
93  unsigned int fCount = 0;
94  while (in && fCount < 20)
95  {
96  string line = FileTools::getNextLine(in);
97  StringTokenizer st(line);
98  while (st.hasMoreToken() && fCount < 20)
99  {
100  freq_[fCount] = TextTools::toDouble(st.nextToken());
101  fCount++;
102  }
103  }
104  double sf = VectorTools::sum(freq_);
105  if (sf - 1 > 0.000001)
106  {
107  ApplicationTools::displayMessage("WARNING!!! Frequencies sum to " + TextTools::toString(sf) + ", frequencies have been scaled.");
108  sf *= 1. / sf;
109  }
110 
111  // Now build diagonal of the exchangeability matrix:
112  for (unsigned int i = 0; i < 20; i++)
113  {
114  double sum = 0;
115  for (unsigned int j = 0; j < 20; j++)
116  {
117  if (j != i)
118  sum += exchangeability_(i, j);
119  }
120  exchangeability_(i, i) = -sum;
121  }
122 
123  // Closing stream:
124  in.close();
125 }
126 
127 /******************************************************************************/
128 
130 {
131  map<int, double> counts;
132  SequenceContainerTools::getFrequencies(data, counts, pseudoCount);
133  for (auto i : counts)
134  {
135  freq_[(size_t)i.first] = i.second;
136  }
137 
138  freqSet_->setFrequencies(freq_);
139  // Update parameters and re-compute generator and eigen values:
140  matchParametersValues(freqSet_->getParameters());
141 }
142 
143 /******************************************************************************/
void addParameters_(const ParameterList &parameters)
bool matchParametersValues(const ParameterList &parameters) override
Specialisation abstract class for reversible protein substitution model.
virtual void updateMatrices_() override
Compute and diagonalize the matrix, and fill the eigenValues_, leftEigenVectors_ and rightEigenVecto...
RowMatrix< double > exchangeability_
The exchangeability matrix of the model, defined as . When the model is reversible,...
Vdouble freq_
The vector of equilibrium frequencies.
static void displayMessage(const std::string &text)
This class implements a state map where all resolved states are modeled.
Definition: StateMap.h:168
static std::string getNextLine(std::istream &in)
static bool fileExists(const std::string &filename)
FrequencySet useful for homogeneous and stationary models, protein implementation.
static void getFrequencies(const SequenceContainerInterface &sc, std::map< int, double > &f, double pseudoCount=0)
const std::string & nextToken()
bool hasMoreToken() const
std::unique_ptr< ProteinFrequencySetInterface > freqSet_
void setFreqFromData(const SequenceDataInterface &data, double pseudoCount=0) override
Set equilibrium frequencies equal to the frequencies estimated from the data.
UserProteinSubstitutionModel(std::shared_ptr< const ProteicAlphabet > alpha, const std::string &path, const std::string &prefix)
Build a protein model from a PAML file, with original equilibrium frequencies.
std::string getName() const override
Get the name of the model.
static T sum(const std::vector< T > &v1)
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
bool hasSubstring(const std::string &s, const std::string &pattern)
std::string toString(T t)
Defines the basic types of data flow nodes.