5 #include "../../App/PhylogeneticsApplicationTools.h"
6 #include "../../Io/BppOBranchModelFormat.h"
7 #include "../../Io/BppOFrequencySetFormat.h"
8 #include "../../Io/BppOMultiTreeReaderFormat.h"
9 #include "../../Io/BppOSubstitutionModelFormat.h"
10 #include "../../Io/BppOTransitionModelFormat.h"
11 #include "../../Io/BppOTreeReaderFormat.h"
12 #include "../../Io/Newick.h"
13 #include "../../Io/NexusIoTree.h"
14 #include "../../Io/Nhx.h"
15 #include "../../Model/FrequencySet/MvaFrequencySet.h"
16 #include "../../Model/MixedTransitionModel.h"
17 #include "../../Model/Protein/Coala.h"
18 #include "../../Model/SubstitutionModel.h"
19 #include "../../Model/WrappedModel.h"
20 #include "../../Tree/Tree.h"
21 #include "../../Tree/TreeTools.h"
22 #include "../OptimizationTools.h"
67 const map<string, string>& params,
68 const map<
size_t, std::shared_ptr<AlignmentDataInterface>>& mSeq,
69 map<string, string>& unparsedParams,
72 bool suffixIsOptional,
78 map<size_t, shared_ptr<Tree>> mTree;
80 for (
size_t nT = 0; nT < vTreesName.size(); nT++)
82 size_t poseq = vTreesName[nT].find(
"=");
84 size_t len = (prefix +
"tree").size();
85 string suff = vTreesName[nT].substr(len, poseq - len);
107 map<string, string> args;
111 if (treeName ==
"user")
115 if (args.find(
"format") != args.end())
116 format = args[
"format"];
125 unique_ptr<IMultiTree> treeReader;
126 if (format ==
"Newick")
127 treeReader.reset(
new Newick(
true));
128 else if (format ==
"Nexus")
130 else if (format ==
"NHX")
131 treeReader.reset(
new Nhx());
133 throw Exception(
"Unknown format for tree reading: " + format);
135 vector<unique_ptr<Tree>> trees;
136 treeReader->readTrees(treeFilePath, trees);
151 nbTree = trees.size();
153 for (
size_t i2 = 0; i2 < trees.size(); i2++)
155 if (mTree.find(i2 + 1) != mTree.end())
160 mTree[i2 + 1] = std::move(trees[i2]);
166 if (trees.size() > 1)
167 throw Exception(
"Error : Several trees for description of " + vTreesName[nT] +
".");
169 if (trees.size() == 1)
171 if (mTree.find(num) != mTree.end())
175 mTree[num] = std::move(trees[0]);
180 else if (treeName ==
"random")
184 if (args.find(
"data") == args.end())
193 if (mSeq.find(seqNum) == mSeq.end())
196 vector<string> names = mSeq.find(seqNum)->second->getSequenceNames();
198 tree->setBranchLengths(1.);
200 if (mTree.find(num) != mTree.end())
204 mTree[num] = std::move(tree);
212 map<string, string> cmdArgs;
215 if (cmdName ==
"Input")
219 if (midPointRootBrLengths ==
"yes")
223 for (
size_t i = 0; i < nbTree; i++)
232 else if (cmdName ==
"Equal")
236 throw Exception(
"Value for branch length must be superior to 0");
240 for (
size_t i = 0; i < nbTree; i++)
242 mTree[i + 1]->setBranchLengths(value);
246 mTree[num]->setBranchLengths(value);
248 else if (cmdName ==
"Clock")
252 for (
size_t i = 0; i < nbTree; i++)
260 else if (cmdName ==
"Grafen")
266 for (
size_t i = 0; i < nbTree; i++)
268 auto& tree = *mTree[i + 1];
269 if (grafenHeight ==
"input")
277 throw Exception(
"Height must be positive in Grafen's method.");
285 tree.scaleTree(h / nh);
290 auto& tree = *mTree[num];
291 if (grafenHeight ==
"input")
299 throw Exception(
"Height must be positive in Grafen's method.");
307 tree.scaleTree(h / nh);
311 throw Exception(
"Method '" + initBrLenMethod +
"' unknown for computing branch lengths.");
317 for (
size_t ib = 0; ib < vBrNb.size(); ib++)
319 string apeq = args[vBrNb[ib]];
320 string aveq = vBrNb[ib];
326 size_t posun = apeq.find(
"_");
327 size_t posd = aveq.find(
"_");
347 map<string, string>& unparsedParameterValues,
349 std::shared_ptr<const AlignmentDataInterface> data,
350 map<string, string>& sharedParams,
363 if (initFreqs ==
"observed")
366 throw Exception(
"Missing data for observed frequencies");
367 unsigned int psi = ApplicationTools::getParameter<unsigned int>(model.
getNamespace() +
"initFreqs.observedPseudoCount", unparsedParameterValues, 0);
368 tmodel.setFreqFromData(*data, psi);
370 else if (initFreqs.substr(0, 6) ==
"values")
373 map<int, double> frequencies;
375 string rf = initFreqs.substr(6);
380 tmodel.setFreq(frequencies);
383 throw Exception(
"Unknown initFreqs argument");
392 for (
size_t i = 0; i < pl.
size(); ++i)
398 for (
size_t i = 0; i < pl.
size(); ++i)
400 const string pName = pl[i].getName();
403 bool test1 = (initFreqs ==
"");
405 bool test3 = (unparsedParameterValues.find(pName) != unparsedParameterValues.end());
407 if (test1 || test2 || test3)
409 if (!test1 && !test2 && test3)
433 shared_ptr<const Alphabet> alphabet,
434 shared_ptr<const GeneticCode> gCode,
435 shared_ptr<const AlignmentDataInterface> data,
436 const map<string, string>& params,
437 const string& suffix,
438 bool suffixIsOptional,
443 throw Exception(
"A value is needed for this parameter: nonhomogeneous.number_of_models .");
444 size_t nbModels = ApplicationTools::getParameter<size_t>(
"nonhomogeneous.number_of_models", params, 1, suffix, suffixIsOptional, warn);
446 throw Exception(
"The number of models can't be 0 !");
449 for (
size_t i = 0; nomix& (i < nbModels); i++)
455 if (modelDesc.find(
"Mixed") != string::npos)
459 unique_ptr<SubstitutionModelSet> modelSet =
nullptr;
460 auto modelSet1 = make_unique<SubstitutionModelSet>(alphabet);
461 setSubstitutionModelSet(*modelSet1, alphabet, gCode, data, params, suffix, suffixIsOptional, verbose, warn);
463 if (modelSet1->hasMixedTransitionModel())
465 modelSet = make_unique<MixedSubstitutionModelSet>(*modelSet1);
466 completeMixedSubstitutionModelSet(
dynamic_cast<MixedSubstitutionModelSet&
>(*modelSet), alphabet, data, params, suffix, suffixIsOptional, verbose);
469 modelSet = std::move(modelSet1);
478 shared_ptr<const Alphabet> alphabet,
479 shared_ptr<const GeneticCode> gCode,
480 shared_ptr<const AlignmentDataInterface> data,
481 const map<string, string>& params,
482 const string& suffix,
483 bool suffixIsOptional,
489 throw Exception(
"You must specify this parameter: 'nonhomogeneous.number_of_models'.");
490 size_t nbModels = ApplicationTools::getParameter<size_t>(
"nonhomogeneous.number_of_models", params, 1, suffix, suffixIsOptional, warn);
492 throw Exception(
"The number of models can't be 0 !");
502 vector<double> rateFreqs;
507 throw Exception(
"PhylogeneticsApplicationTools::setSubstitutionModelSet(): a GeneticCode instance is required for instantiating a codon model.");
516 std::map<size_t, std::shared_ptr<const AlignmentDataInterface>> mData;
519 shared_ptr<TransitionModelInterface> tmp = bIO.
readTransitionModel(alphabet, tmpDesc, mData, 1,
false);
521 if (tmp->getNumberOfStates() != alphabet->getSize())
525 size_t n =
static_cast<size_t>(tmp->getNumberOfStates() / alphabet->getSize());
526 rateFreqs = vector<double>(n, 1. /
static_cast<double>(n));
533 map<string, string> unparsedParameters;
536 std::shared_ptr<FrequencySetInterface> rootFrequencies(0);
540 stationarity = !rootFrequencies;
542 if (freqDescription.substr(0, 10) ==
"MVAprotein")
544 auto tmp2 = dynamic_pointer_cast<Coala>(tmp);
546 dynamic_pointer_cast<MvaFrequencySet>(rootFrequencies)->initSet(*tmp2);
548 throw Exception(
"The MVAprotein frequencies set at the root can only be used if a Coala model is used on branches.");
563 for (
size_t i = 0; i < nbModels; ++i)
577 map<string, string> sharedParameters;
580 setSubstitutionModelParametersInitialValuesWithAliases(
582 unparsedModelParameters, i + 1, data,
586 unparsedParameters.insert(sharedParameters.begin(), sharedParameters.end());
588 vector<int> nodesId = ApplicationTools::getVectorParameter<int>(prefix +
".nodes_id", params,
',',
':',
"", suffix, suffixIsOptional, warn);
593 modelSet.
addModel(std::move(model), nodesId);
603 string::size_type index = alias.find(
"->");
604 if (index == string::npos)
605 throw Exception(
"PhylogeneticsApplicationTools::setSubstitutionModelSet. Bad alias syntax, should contain `->' symbol: " + alias);
606 unparsedParameters[alias.substr(0, index)] = alias.substr(index + 2);
618 shared_ptr<const Alphabet> alphabet,
619 shared_ptr<const AlignmentDataInterface> data,
620 const map<string, string>& params,
621 const string& suffix,
622 bool suffixIsOptional,
633 numd = ApplicationTools::getParameter<size_t>(
"site.number_of_paths", params, 1, suffix, suffixIsOptional, warn);
638 vector<string> vdesc;
642 if (desc.size() == 0)
645 vdesc.push_back(desc);
649 if (vdesc.size() == 0)
656 for (vector<string>::iterator it(vdesc.begin()); it != vdesc.end(); it++)
663 string::size_type indexo = submodel.find(
"[");
664 string::size_type indexf = submodel.find(
"]");
665 if ((indexo == string::npos) | (indexf == string::npos))
666 throw Exception(
"PhylogeneticsApplicationTools::setMixedSubstitutionModelSet. Bad path syntax, should contain `[]' symbols: " + submodel);
668 string p2 = submodel.substr(indexo + 1, indexf - indexo - 1);
670 auto pSM = dynamic_pointer_cast<const MixedTransitionModelInterface>(mixedModelSet.
getModel(
static_cast<size_t>(num - 1)));
672 throw BadIntegerException(
"LegacyPhylogeneticsApplicationTools::setMixedSubstitutionModelSet: Wrong model for number", num - 1);
673 Vuint submodnb = pSM->getSubmodelNumbers(p2);
675 mixedModelSet.
addToHyperNode(
static_cast<size_t>(num - 1), submodnb);
679 throw Exception(
"A path should own at least a submodel of each mixed model: " + *it);
687 throw Exception(
"All paths must be disjoint.");
691 st = (mixedModelSet.
complete()) ?
"Yes" :
"No";
699 throw Exception(
"The remaining submodels can not create a complete path.");
710 shared_ptr<TreeLikelihoodInterface> tl,
712 const map<string, string>& params,
713 const string& suffix,
714 bool suffixIsOptional,
719 if (optimization ==
"None")
722 map<string, string> optArgs;
725 unsigned int optVerbose = ApplicationTools::getParameter<unsigned int>(
"optimization.verbose", params, 2, suffix, suffixIsOptional, warn);
728 shared_ptr<OutputStream> messageHandler =
729 (mhPath ==
"none") ?
nullptr :
731 make_shared<StlOutputStream>(make_unique<ofstream>(mhPath.c_str(), ios::out));
736 shared_ptr<OutputStream> profiler =
737 (prPath ==
"none") ?
nullptr :
739 make_shared<StlOutputStream>(make_unique<ofstream>(prPath.c_str(), ios::out));
741 profiler->setPrecision(20);
754 unsigned int nbEvalMax = ApplicationTools::getParameter<unsigned int>(
"optimization.scale_first.max_number_f_eval", params, 1000000, suffix, suffixIsOptional, warn + 1);
772 if (paramListDesc.length() == 0)
780 if (param ==
"BrLen")
782 vector<string> vs = tl->getBranchLengthsParameters().getParameterNames();
787 else if (param ==
"Ancient")
789 auto nhtl = dynamic_pointer_cast<NonHomogeneousTreeLikelihood>(tl);
794 vector<string> vs = nhtl->getRootFrequenciesParameters().getParameterNames();
800 else if (param ==
"Model")
803 vector<string> vs1 = tl->getSubstitutionModelParameters().getParameterNames();
804 auto nhtl = dynamic_pointer_cast<NonHomogeneousTreeLikelihood>(tl);
807 vector<string> vs2 = nhtl->getRootFrequenciesParameters().getParameterNames();
817 else if (param ==
"*")
819 parametersToEstimate.
reset();
823 else if (param.find(
"*") != string::npos)
827 bool verbhere = verbose;
860 if (paramListDesc.length() == 0)
863 string constraint =
"";
864 string pc, param =
"";
872 string::size_type index = pc.find(
"=");
873 if (index == string::npos)
874 throw Exception(
"PhylogeneticsApplicationTools::optimizeParameters. Bad constrain syntax, should contain `=' symbol: " + pc);
875 param = pc.substr(0, index);
876 constraint = pc.substr(index + 1);
879 vector<string> parNames2;
881 if (param ==
"BrLen")
882 parNames2 = tl->getBranchLengthsParameters().getParameterNames();
883 else if (param ==
"Ancient")
885 auto nhtl = dynamic_pointer_cast<NonHomogeneousTreeLikelihood>(tl);
889 parNames2 = nhtl->getRootFrequenciesParameters().getParameterNames();
891 else if (param ==
"Model")
893 vector<string> vs1 = tl->getSubstitutionModelParameters().getParameterNames();
894 auto nhtl = dynamic_pointer_cast<NonHomogeneousTreeLikelihood>(tl);
897 vector<string> vs2 = nhtl->getRootFrequenciesParameters().getParameterNames();
903 else if (param ==
"*")
904 parNames2 = parToEstNames;
905 else if (param.find(
"*") != string::npos)
908 parNames2.push_back(param);
910 for (
size_t i = 0; i < parNames2.size(); ++i)
932 throw Exception(
"Parameter '" + param +
"' does not fit the constraint " + constraint);
940 unsigned int nbEvalMax = ApplicationTools::getParameter<unsigned int>(
"optimization.max_number_f_eval", params, 1000000, suffix, suffixIsOptional, warn + 1);
949 shared_ptr<BackupListener> backupListener;
951 if (backupFile !=
"none")
958 ifstream bck(backupFile.c_str(), ios::in);
962 for (
size_t l = 1; l < lines.size(); ++l)
969 cerr <<
"Corrupted backup file!!!" << endl;
970 cerr <<
"at line " << l <<
": " << lines[l] << endl;
985 tl->setParameters(pl);
986 if (
convert(
abs(tl->getValue() - fval)) > 0.000001)
998 if (nniMethod ==
"fast")
1002 else if (nniMethod ==
"better")
1006 else if (nniMethod ==
"phyml")
1011 throw Exception(
"Unknown NNI algorithm: '" + nniMethod +
"'.");
1015 string optMethodDeriv;
1016 if (order ==
"Gradient")
1020 else if (order ==
"Newton")
1024 else if (order ==
"BFGS")
1029 throw Exception(
"Unknown derivatives algorithm: '" + order +
"'.");
1042 if (clock !=
"None" && clock !=
"Global")
1043 throw Exception(
"Molecular clock option not recognized, should be one of 'Global' or 'None'.");
1044 bool useClock = (clock ==
"Global");
1045 if (useClock && optimizeTopo)
1046 throw Exception(
"PhylogeneticsApplicationTools::optimizeParameters. Cannot optimize topology with a molecular clock.");
1051 if ((optName ==
"D-Brent") || (optName ==
"D-BFGS"))
1054 string optMethodModel;
1055 if (optName ==
"D-Brent")
1060 unsigned int nstep = ApplicationTools::getParameter<unsigned int>(
"nstep", optArgs, 1,
"",
true, warn + 1);
1065 unsigned int topoNbStep = ApplicationTools::getParameter<unsigned int>(
"optimization.topology.nstep", params, 1,
"",
true, warn + 1);
1069 dynamic_pointer_cast<NNIHomogeneousTreeLikelihood>(tl), parametersToEstimate,
1070 optNumFirst, tolBefore, tolDuring, nbEvalMax, topoNbStep, messageHandler, profiler,
1071 reparam, optVerbose, optMethodDeriv, nstep, nniAlgo);
1074 if (verbose && nstep > 1)
1078 dynamic_pointer_cast<DiscreteRatesAcrossSitesTreeLikelihoodInterface>(tl), parametersToEstimate,
1079 backupListener, nstep, tolerance, nbEvalMax, messageHandler, profiler, reparam, optVerbose, optMethodDeriv, optMethodModel);
1081 else if (optName ==
"FullD")
1088 unsigned int topoNbStep = ApplicationTools::getParameter<unsigned int>(
"optimization.topology.nstep", params, 1,
"",
true, warn + 1);
1092 dynamic_pointer_cast<NNIHomogeneousTreeLikelihood>(tl), parametersToEstimate,
1093 optNumFirst, tolBefore, tolDuring, nbEvalMax, topoNbStep, messageHandler, profiler,
1094 reparam, optVerbose, optMethodDeriv, nniAlgo);
1099 dynamic_pointer_cast<DiscreteRatesAcrossSitesTreeLikelihoodInterface>(tl), parametersToEstimate,
1100 backupListener, tolerance, nbEvalMax, messageHandler, profiler, reparam, useClock, optVerbose, optMethodDeriv);
1103 throw Exception(
"Unknown optimization method: " + optName);
1106 shared_ptr<OptimizerInterface> finalOptimizer =
nullptr;
1107 if (finalMethod ==
"none")
1109 else if (finalMethod ==
"simplex")
1111 finalOptimizer = make_shared<DownhillSimplexMethod>(tl);
1113 else if (finalMethod ==
"powell")
1115 finalOptimizer = make_shared<PowellMultiDimensions>(tl);
1118 throw Exception(
"Unknown final optimization method: " + finalMethod);
1125 finalOptimizer->setProfiler(profiler);
1126 finalOptimizer->setMessageHandler(messageHandler);
1127 finalOptimizer->setMaximumNumberOfEvaluations(nbEvalMax);
1128 finalOptimizer->getStopCondition()->setTolerance(tolerance);
1129 finalOptimizer->setVerbose(verbose);
1131 finalOptimizer->init(parametersToEstimate);
1132 finalOptimizer->optimize();
1133 n += finalOptimizer->getNumberOfEvaluations();
1138 if (backupFile !=
"none")
1140 string bf = backupFile +
".def";
1141 rename(backupFile.c_str(), bf.c_str());
1152 const vector<const Tree*>& trees,
1153 const map<string, string>& params,
1154 const string& prefix,
1155 const string& suffix,
1156 bool suffixIsOptional,
1164 if (format ==
"Newick")
1165 treeWriter =
new Newick();
1166 else if (format ==
"Nexus")
1168 else if (format ==
"NHX")
1169 treeWriter =
new Nhx();
1171 throw Exception(
"Unknown format for tree writing: " + format);
1190 (out <<
"nonhomogeneous=general").endLine();
1191 (out <<
"nonhomogeneous.number_of_models=" << modelSet.
getNumberOfModels()).endLine();
1194 (out <<
"nonhomogeneous.stationarity = yes");
1197 map< size_t, vector<string>> modelLinks;
1198 map< string, set<size_t>> parameterLinks;
1199 vector<string> writtenNames;
1204 shared_ptr<const BranchModelInterface> model = modelSet.
getModel(i);
1208 map<string, string> aliases;
1213 for (
size_t np = 0; np < pl.
size(); np++)
1217 aliases[pl[np].getName()] = nfrom;
1222 writtenNames.clear();
1223 out.
endLine() <<
"model" << (i + 1) <<
"=";
1225 bIOsm.
write(*model, out, aliases, writtenNames);
1229 out <<
"model" << (i + 1) <<
".nodes_id=" << ids[0];
1230 for (
size_t j = 1; j < ids.size(); ++j)
1232 out <<
"," << ids[j];
1244 map<string, string> aliases;
1248 for (
size_t np = 0; np < plf.
size(); np++)
1250 string nfrom = modelSet.
getFrom(plf[np].getName());
1252 aliases[plf[np].getName()] = nfrom;
1258 (out <<
"# Root frequencies:").endLine();
1259 out <<
"nonhomogeneous.root_freq=";
std::string getFrom(const std::string &name) const
void aliasParameters(const std::string &p1, const std::string &p2)
static std::string CONSTRAINTS_AUTO
virtual void setMessageHandler(std::shared_ptr< OutputStream > mh)
Interface for all Branch models.
virtual std::string getName() const =0
Get the name of the model.
virtual std::string getDescription() const=0
virtual bool isEmpty() const=0
bool isComplete() const
checks if this HyperNode includes at least a submodel of each mixed model
Substitution models manager for Mixed Substitution Models. This class inherits from SubstitutionModel...
void computeHyperNodesProbabilities()
void addToHyperNode(size_t nM, const Vuint &vnS, int nH=-1)
size_t getNumberOfHyperNodes() const
bool hasExclusivePaths() const
HyperNode & getHyperNode(size_t i)
static const std::string PHYML
static const std::string FAST
static const std::string BETTER
The so-called 'newick' parenthetic format.
a simple parser for reading trees from a Nexus file.
The so-called 'Nhx - New Hampshire eXtended' parenthetic format.
General interface for tree writers.
virtual void writeTrees(const std::vector< const Tree * > &trees, const std::string &path, bool overwrite) const =0
Write trees to a file.
virtual OutputStream & endLine()=0
virtual const ParameterList & getIndependentParameters() const=0
virtual bool hasParameter(const std::string &name) const
virtual std::vector< std::string > getParameterNames() const
virtual void deleteParameter(const std::string &name)
virtual const Parameter & parameter(const std::string &name) const
virtual bool matchParametersValues(const ParameterList ¶ms, std::vector< size_t > *updatedParameters=0)
virtual void deleteParameters(const std::vector< std::string > &names, bool mustExist=true)
virtual void setParameter(size_t index, const Parameter ¶m)
virtual size_t whichParameterHasName(const std::string &name) const
virtual std::string parameter() const
virtual const ConstraintInterface & constraint() const
virtual std::shared_ptr< const ConstraintInterface > getConstraint() const
virtual void setConstraint(std::shared_ptr< ConstraintInterface > constraint)
virtual const std::string & getName() const
virtual bool hasConstraint() const
virtual std::string getParameterNameWithoutNamespace(const std::string &name) const=0
virtual bool matchParametersValues(const ParameterList ¶meters)=0
virtual std::string getNamespace() const=0
size_t numberOfRemainingTokens() const
const std::string & nextToken()
bool hasMoreToken() const
Substitution models manager for non-homogeneous / non-reversible models of evolution.
size_t getNumberOfModels() const
const std::vector< int > & getNodesWithModel(size_t i) const
Get a list of nodes id for which the given model is associated.
std::shared_ptr< const TransitionModelInterface > getModel(size_t i) const
Get one model from the set knowing its index.
void setRootFrequencies(std::shared_ptr< FrequencySetInterface > rootFreqs)
Sets a given FrequencySet for root frequencies.
void addModel(std::shared_ptr< TransitionModelInterface > model, const std::vector< int > &nodesId)
Add a new model to the set, and set relationships with nodes and params.
bool isStationary() const
const FrequencySetInterface & rootFrequencySet() const
void clear()
Resets all the information contained in this object.
Interface for all transition models.
int toInt(const std::string &s, char scientificNotation='e')
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
bool isEmpty(const std::string &s)
bool isDecimalInteger(const std::string &s, char scientificNotation='e')
std::string toString(T t)
Defines the basic types of data flow nodes.
double convert(const bpp::ExtendedFloat &ef)
ExtendedFloat abs(const ExtendedFloat &ef)
std::vector< unsigned int > Vuint