bpp-phyl3  3.0.0
BppOTransitionModelFormat.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
7 #include <Bpp/Io/FileTools.h>
8 #include <Bpp/Io/OutputStream.h>
11 #include <Bpp/Text/KeyvalTools.h>
13 #include <Bpp/Text/TextTools.h>
14 
15 #include "../App/PhylogeneticsApplicationTools.h"
16 #include "../Model/Codon/DFP07.h"
17 #include "../Model/Codon/RELAX.h"
18 #include "../Model/Codon/YNGP_M1.h"
19 #include "../Model/Codon/YNGP_M10.h"
20 #include "../Model/Codon/YNGP_M2.h"
21 #include "../Model/Codon/YNGP_M3.h"
22 #include "../Model/Codon/YNGP_M7.h"
23 #include "../Model/Codon/YNGP_M8.h"
24 #include "../Model/Codon/YNGP_M9.h"
25 #include "../Model/MixedTransitionModel.h"
26 #include "../Model/MixtureOfATransitionModel.h"
27 #include "../Model/MixtureOfTransitionModels.h"
28 #include "../Model/OneChangeRegisterTransitionModel.h"
29 #include "../Model/OneChangeTransitionModel.h"
30 #include "../Model/Protein/LG10_EX_EHO.h"
31 #include "../Model/Protein/LGL08_CAT.h"
32 #include "../Model/Protein/LLG08_EHO.h"
33 #include "../Model/Protein/LLG08_EX2.h"
34 #include "../Model/Protein/LLG08_EX3.h"
35 #include "../Model/Protein/LLG08_UL2.h"
36 #include "../Model/Protein/LLG08_UL3.h"
37 #include "BppOFrequencySetFormat.h"
40 
41 // From Numeric
42 
45 
46 
48 
49 using namespace bpp;
50 
51 // From the STL:
52 #include <iomanip>
53 #include <set>
54 
55 using namespace std;
56 
57 unique_ptr<TransitionModelInterface> BppOTransitionModelFormat::readTransitionModel(
58  std::shared_ptr<const Alphabet> alphabet,
59  const std::string& modelDescription,
60  const std::map<size_t, std::shared_ptr<const AlignmentDataInterface>>& mData,
61  size_t nData,
62  bool parseArguments)
63 {
64  unparsedArguments_.clear();
65  unique_ptr<TransitionModelInterface> model;
66  string modelName = "";
67  map<string, string> args;
68  KeyvalTools::parseProcedure(modelDescription, modelName, args);
69 
70  // get data number
71 
72  if (args.find("data") != args.end())
73  nData = TextTools::to<size_t>(args["data"]);
74 
75  // //////////////////////////////////
76  // / MIXED MODELS
77  // ////////////////////////////////
78 
79  if ((modelName == "MixedModel" || (modelName == "Mixture")) && allowMixed_)
80  model = readMixed_(alphabet, modelDescription, mData, nData);
81  else if (modelName == "OneChange")
82  {
83  // We have to parse the nested model first:
84  if (args.find("model") == args.end())
85  throw Exception("BppOTransitionModelFormat::read. Missing argument 'model' for model 'OneChange'.");
86  string nestedModelDescription = args["model"];
87  BppOSubstitutionModelFormat nestedReader(ALL, false, allowMixed_, allowGaps_, verbose_, warningLevel_);
88  if (geneticCode_)
89  nestedReader.setGeneticCode(geneticCode_);
90 
91  auto nestedModel = nestedReader.readSubstitutionModel(alphabet, nestedModelDescription, mData, nData, false);
92  map<string, string> unparsedParameterValuesNested(nestedReader.getUnparsedArguments());
93 
94  // We look for the register:
95  if (args.find("register") == args.end())
96  model = make_unique<OneChangeTransitionModel>(std::move(nestedModel));
97  else
98  {
99  shared_ptr<AlphabetIndex2> weights;
100  shared_ptr<AlphabetIndex2> distances;
101  string registerDescription = args["register"];
102  auto reg = PhylogeneticsApplicationTools::getSubstitutionRegister(registerDescription, nestedModel->getStateMap(), geneticCode_, weights, distances);
103 
104  if (args.find("numReg") == args.end())
105  throw Exception("Missing argument 'numReg' (number of event for register in model " + modelName);
106 
107  vector<size_t> vNumRegs;
108 
109  StringTokenizer nst(args["numReg"], "+");
110 
111  bool out = true;
112 
113  while (nst.hasMoreToken())
114  {
115  size_t n = TextTools::to<size_t>(nst.nextToken());
116  vNumRegs.push_back(n);
117  if (verbose_)
118  {
119  ApplicationTools::displayResult(out ? "Register types" : "", reg->getTypeName(n));
120  out = false;
121  }
122  }
123 
124  model = make_unique<OneChangeRegisterTransitionModel>(std::move(nestedModel), *reg, vNumRegs);
125  }
126 
127  // Then we update the parameter set:
128  for (auto& it : unparsedParameterValuesNested)
129  {
130  unparsedArguments_["OneChange." + it.first] = it.second;
131  }
132  }
133  // //////////////////
134  // PREDEFINED CODON MODELS
135  else if (((modelName.substr(0, 4) == "YNGP") || (modelName == "DFP07") || (modelName == "RELAX")) && (alphabetCode_ & CODON))
136  {
137  if (!(alphabetCode_ & CODON))
138  throw Exception("BppOTransitionModelFormat::read. Codon alphabet not supported.");
139  if (!geneticCode_)
140  throw Exception("BppOTransitionModelFormat::readTransitionModel(). No genetic code specified! Consider using 'setGeneticCode'.");
141 
142  if (!AlphabetTools::isCodonAlphabet(*alphabet))
143  throw Exception("Alphabet should be Codon Alphabet.");
144  auto pCA = dynamic_pointer_cast<const CodonAlphabet>(alphabet);
145 
146  if (args.find("genetic_code") != args.end())
147  {
148  ApplicationTools::displayWarning("'genetic_code' argument is no longer supported inside model description, and has been supersided by a global 'genetic_code' option.");
149  throw Exception("BppOSubstitutionModelFormat::read. Deprecated 'genetic_code' argument.");
150  }
151 
152  if (geneticCode_->getSourceAlphabet()->getAlphabetType() != pCA->getAlphabetType())
153  throw Exception("Mismatch between genetic code and codon alphabet");
154 
155 
156  unique_ptr<CodonFrequencySetInterface> codonFreqs;
157 
158  if (args.find("frequencies") != args.end())
159  {
160  string freqOpt = ApplicationTools::getStringParameter("frequencies", args, "F0", "", true, warningLevel_);
161  BppOFrequencySetFormat freqReader(BppOFrequencySetFormat::ALL, verbose_, warningLevel_);
162  freqReader.setGeneticCode(geneticCode_); // This uses the same instance as the one that will be used by the model.
163 
164  auto tmpP = freqReader.readFrequencySet(pCA, freqOpt, mData, nData, false);
165  codonFreqs = unique_ptr<CodonFrequencySetInterface>(dynamic_cast<CodonFrequencySetInterface*>(tmpP.release()));
166  for (const auto& it : freqReader.getUnparsedArguments())
167  {
168  unparsedArguments_[modelName + "." + it.first] = it.second;
169  }
170  }
171  else
172  throw Exception("Missing 'frequencies' for model " + modelName);
173 
174  if (modelName == "YNGP_M1")
175  model = make_unique<YNGP_M1>(geneticCode_, std::move(codonFreqs));
176  else if (modelName == "YNGP_M2")
177  model = make_unique<YNGP_M2>(geneticCode_, std::move(codonFreqs));
178  else if (modelName == "RELAX")
179  model = make_unique<RELAX>(geneticCode_, std::move(codonFreqs));
180  else if (modelName == "YNGP_M3")
181  if (args.find("n") == args.end())
182  model = make_unique<YNGP_M3>(geneticCode_, std::move(codonFreqs));
183  else
184  model = make_unique<YNGP_M3>(geneticCode_, std::move(codonFreqs), TextTools::to<unsigned int>(args["n"]));
185  else if ((modelName == "YNGP_M7") || modelName == "YNGP_M8" || modelName == "YNGP_M8a")
186  {
187  if (args.find("n") == args.end())
188  throw Exception("Missing argument 'n' (number of classes) in " + modelName + " distribution");
189  unsigned int nbClasses = TextTools::to<unsigned int>(args["n"]);
190  if (verbose_)
191  ApplicationTools::displayResult("Number of classes in model", nbClasses);
192 
193  if (modelName == "YNGP_M7")
194  model = make_unique<YNGP_M7>(geneticCode_, std::move(codonFreqs), nbClasses);
195  else if (modelName == "YNGP_M8")
196  model = make_unique<YNGP_M8>(geneticCode_, std::move(codonFreqs), nbClasses);
197  else if (modelName == "YNGP_M8a")
198  model = make_unique<YNGP_M8>(geneticCode_, std::move(codonFreqs), nbClasses, true);
199  }
200  else if (modelName == "YNGP_M9" || modelName == "YNGP_M10")
201  {
202  if (args.find("nbeta") == args.end())
203  throw Exception("Missing argument 'nbeta' (number of classes of beta distribution) in " + modelName + " distribution");
204  unsigned int nbBeta = TextTools::to<unsigned int>(args["nbeta"]);
205  if (args.find("ngamma") == args.end())
206  throw Exception("Missing argument 'ngamma' (number of classes of gamma distribution) in " + modelName + " distribution");
207  unsigned int nbGamma = TextTools::to<unsigned int>(args["ngamma"]);
208  if (verbose_)
209  ApplicationTools::displayResult("Number of classes in model", nbBeta + nbGamma);
210 
211  if (modelName == "YNGP_M9")
212  model = make_unique<YNGP_M9>(geneticCode_, std::move(codonFreqs), nbBeta, nbGamma);
213  else
214  model = make_unique<YNGP_M10>(geneticCode_, std::move(codonFreqs), nbBeta, nbGamma);
215  }
216  else if (modelName == "DFP07")
217  {
218  if (args.find("protmodel") == args.end())
219  throw Exception("Missing 'protmodel in model " + modelName + ".");
220 
221  BppOSubstitutionModelFormat nestedProtReader(PROTEIN, false, allowMixed_, allowGaps_, verbose_, warningLevel_);
222  auto tmpP = nestedProtReader.readSubstitutionModel(
223  geneticCode_->getTargetAlphabet(),
224  args["protmodel"], mData, nData, false);
225  auto nestedProtModel = unique_ptr<ProteinSubstitutionModelInterface>(
226  dynamic_cast<ProteinSubstitutionModelInterface*>(tmpP.release())
227  );
228 
229  auto unparsedParameterValuesNested = nestedProtReader.getUnparsedArguments();
230  unparsedArguments_.insert(unparsedParameterValuesNested.begin(), unparsedParameterValuesNested.end());
231 
232  model = make_unique<DFP07>(geneticCode_, std::move(nestedProtModel), std::move(codonFreqs));
233  }
234  }
235  else if (AlphabetTools::isProteicAlphabet(*alphabet))
236  {
237  if (!(alphabetCode_ & PROTEIN))
238  throw Exception("BppOTransitionModelFormat::read. Protein alphabet not supported.");
239  auto alpha = dynamic_pointer_cast<const ProteicAlphabet>(alphabet);
240 
241  if (modelName == "LLG08_EHO")
242  model = make_unique<LLG08_EHO>(alpha);
243  else if (modelName == "LLG08_EX2")
244  model = make_unique<LLG08_EX2>(alpha);
245  else if (modelName == "LLG08_EX3")
246  model = make_unique<LLG08_EX3>(alpha);
247  else if (modelName == "LLG08_UL2")
248  model = make_unique<LLG08_UL2>(alpha);
249  else if (modelName == "LLG08_UL3")
250  model = make_unique<LLG08_UL3>(alpha);
251  else if (modelName == "LG10_EX_EHO")
252  model = make_unique<LG10_EX_EHO>(alpha);
253  else if (modelName == "LGL08_CAT")
254  {
255  if (args.find("nbCat") == args.end())
256  throw Exception("'nbCat' argument is compulsory for model 'LGL08_CAT'");
257 
258  unsigned int nbCat = TextTools::to<unsigned int>(args["nbCat"]);
259  model = make_unique<LGL08_CAT>(alpha, nbCat);
260  }
261  }
262 
263  if (!model)
264  model = readSubstitutionModel(alphabet, modelDescription, mData, nData, parseArguments);
265  else
266  {
267  if (verbose_)
268  ApplicationTools::displayResult("Transition model", modelName);
269 
270  updateParameters_(*model, args);
271 
272  if (parseArguments)
273  {
274  if (nData)
275  initialize_(*model, mData.at(nData));
276  else
277  initialize_(*model, 0);
278  }
279  }
280 
281  return model;
282 }
283 
284 unique_ptr<MixedTransitionModelInterface> BppOTransitionModelFormat::readMixed_(
285  std::shared_ptr<const Alphabet> alphabet,
286  const std::string& modelDescription,
287  const std::map<size_t, std::shared_ptr<const AlignmentDataInterface>>& mData,
288  size_t nData)
289 {
290  unique_ptr<MixedTransitionModelInterface> model;
291 
292  string modelName = "";
293  map<string, string> args;
294  KeyvalTools::parseProcedure(modelDescription, modelName, args);
295  unique_ptr<TransitionModelInterface> pSM;
296 
297  if (modelName == "MixedModel")
298  {
299  if (args.find("model") == args.end())
300  throw Exception("The argument 'model' is missing from MixedModel description");
301  string nestedModelDescription = args["model"];
302  BppOTransitionModelFormat nestedReader(alphabetCode_, allowCovarions_, true, allowGaps_, false, warningLevel_);
303  if (geneticCode_)
304  nestedReader.setGeneticCode(geneticCode_); // This uses the same
305  // instance as the one
306  // that will be used
307  // by the model.
308  pSM = nestedReader.readTransitionModel(alphabet, nestedModelDescription, mData, nData, false);
309 
310  map<string, string> unparsedParameterValuesNested(nestedReader.getUnparsedArguments());
311 
312  map<string, unique_ptr<DiscreteDistributionInterface>> mdist;
313  map<string, string> unparsedParameterValuesNested2;
314 
315  for (auto& it : unparsedParameterValuesNested)
316  {
317  if (it.second.find("(") != string::npos)
318  {
320  mdist[pSM->getParameterNameWithoutNamespace(it.first)] = bIO.readDiscreteDistribution(it.second, false);
321  map<string, string> unparsedParameterValuesNested3(bIO.getUnparsedArguments());
322  for (auto& it2 : unparsedParameterValuesNested3)
323  {
324  unparsedParameterValuesNested2[it.first + "_" + it2.first] = it2.second;
325  }
326  }
327  else
328  unparsedParameterValuesNested2[it.first] = it.second;
329  }
330 
331  for (auto& it : unparsedParameterValuesNested2)
332  {
333  unparsedArguments_[it.first] = it.second;
334  }
335 
336 
337  int fi(-1), ti(-1);
338 
339  if (args.find("from") != args.end())
340  fi = alphabet->charToInt(args["from"]);
341  if (args.find("to") != args.end())
342  ti = alphabet->charToInt(args["to"]);
343 
344  string sModN = pSM->getName();
345  model = make_unique<MixtureOfATransitionModel>(alphabet, std::move(pSM), mdist, fi, ti);
346 
347  vector<string> v = model->getParameters().getParameterNames();
348 
349  if (verbose_)
350  {
351  ApplicationTools::displayResult("Mixture Of A TransitionModel Model", sModN);
352  ApplicationTools::displayResult("Number of classes", model->getNumberOfModels());
353  }
354  }
355  else if (modelName == "Mixture")
356  {
357  vector<string> v_nestedModelDescription;
358  vector< std::unique_ptr<TransitionModelInterface>> v_pSM;
359 
360  if (args.find("model1") == args.end())
361  {
362  throw Exception("Missing argument 'model1' for model " + modelName + ".");
363  }
364  unsigned int nbmodels = 0;
365 
366  while (args.find("model" + TextTools::toString(nbmodels + 1)) != args.end())
367  {
368  v_nestedModelDescription.push_back(args["model" + TextTools::toString(++nbmodels)]);
369  }
370 
371  if (nbmodels < 2)
372  throw Exception("Missing nested models for model " + modelName + ".");
373 
374  for (unsigned i = 0; i < v_nestedModelDescription.size(); ++i)
375  {
376  BppOTransitionModelFormat nestedReader(alphabetCode_, false, true, false, false, warningLevel_);
377  if (geneticCode_)
378  nestedReader.setGeneticCode(geneticCode_); // This uses the same instance as the one that will be used by the model.
379 
380  pSM = nestedReader.readTransitionModel(alphabet, v_nestedModelDescription[i], mData, nData, false);
381 
382  map<string, string> unparsedParameterValuesNested(nestedReader.getUnparsedArguments());
383  for (auto& it : unparsedParameterValuesNested)
384  {
385  unparsedArguments_[modelName + "." + TextTools::toString(i + 1) + "_" + it.first] = it.second;
386  }
387 
388  v_pSM.push_back(std::move(pSM));
389  }
390 
391  model = make_unique<MixtureOfTransitionModels>(alphabet, v_pSM);
392  if (verbose_)
393  ApplicationTools::displayResult("Mixture Of TransitionModel Models", modelName );
394  }
395  else
396  throw Exception("Unknown model name for mixture " + modelName);
397 
398  return model;
399 }
static bool isProteicAlphabet(const Alphabet &alphabet)
static bool isCodonAlphabet(const Alphabet &alphabet)
static std::string getStringParameter(const std::string &parameterName, const std::map< std::string, std::string > &params, const std::string &defaultValue, const std::string &suffix="", bool suffixIsOptional=true, int warn=0)
static void displayWarning(const std::string &text)
static void displayResult(const std::string &text, const T &result)
const std::map< std::string, std::string > & getUnparsedArguments() const
std::unique_ptr< DiscreteDistributionInterface > readDiscreteDistribution(const std::string &distDescription, bool parseArguments=true)
Frequencies set I/O in BppO format.
std::unique_ptr< FrequencySetInterface > readFrequencySet(std::shared_ptr< const Alphabet > alphabet, const std::string &freqDescription, const std::map< size_t, std::shared_ptr< const AlignmentDataInterface >> &mData, size_t nData, bool parseArguments=true) override
Read a frequencies set from a string.
void setGeneticCode(std::shared_ptr< const GeneticCode > gCode)
Set the genetic code to use in case a codon frequencies set should be built.
const std::map< std::string, std::string > & getUnparsedArguments() const override
Substitution model I/O in BppO format.
std::unique_ptr< SubstitutionModelInterface > readSubstitutionModel(std::shared_ptr< const Alphabet > alphabet, const std::string &modelDescription, const std::map< size_t, std::shared_ptr< const AlignmentDataInterface >> &mData, size_t nData, bool parseArguments=true) override
Read a substitution model from a string.
const std::map< std::string, std::string > & getUnparsedArguments() const override
void setGeneticCode(std::shared_ptr< const GeneticCode > gCode)
Set the genetic code to use in case a codon frequencies set should be built.
Transition model I/O in BppO format.
std::unique_ptr< TransitionModelInterface > readTransitionModel(std::shared_ptr< const Alphabet > alphabet, const std::string &modelDescription, const std::map< size_t, std::shared_ptr< const AlignmentDataInterface >> &mData, size_t nData, bool parseArguments=true)
std::unique_ptr< MixedTransitionModelInterface > readMixed_(std::shared_ptr< const Alphabet > alphabet, const std::string &modelDescription, const std::map< size_t, std::shared_ptr< const AlignmentDataInterface >> &mData, size_t nData)
Parametrize a set of state frequencies for codons.
static void parseProcedure(const std::string &desc, std::string &name, std::map< std::string, std::string > &args)
static std::unique_ptr< SubstitutionRegisterInterface > getSubstitutionRegister(const std::string &regTypeDesc, std::shared_ptr< const StateMapInterface > stateMap, std::shared_ptr< const GeneticCode > genCode, std::shared_ptr< AlphabetIndex2 > &weights, std::shared_ptr< AlphabetIndex2 > &distances, bool verbose=true)
Get a Register instance.
Specialized interface for protein substitution model.
const std::string & nextToken()
bool hasMoreToken() const
std::string toString(T t)
Defines the basic types of data flow nodes.