5 #include "../Model/FrequencySet/CodonFrequencySet.h"
6 #include "../Model/FrequencySet/MvaFrequencySet.h"
7 #include "../Model/FrequencySet/NucleotideFrequencySet.h"
8 #include "../Model/FrequencySet/ProteinFrequencySet.h"
45 std::shared_ptr<const Alphabet> alphabet,
46 const std::string& freqDescription,
47 const std::map<
size_t, std::shared_ptr<const AlignmentDataInterface>>& mData,
51 unparsedArguments_.clear();
53 map<string, string> args;
55 unique_ptr<FrequencySetInterface> pFS;
57 if (freqName ==
"Fixed")
61 if (alphabetCode_ & NUCLEOTIDE)
62 pFS = make_unique<FixedNucleotideFrequencySet>(dynamic_pointer_cast<const NucleicAlphabet>(alphabet));
64 throw Exception(
"Nucleotide alphabet not supported.");
68 if (alphabetCode_ & PROTEIN)
69 pFS = make_unique<FixedProteinFrequencySet>(dynamic_pointer_cast<const ProteicAlphabet>(alphabet));
71 throw Exception(
"Protein alphabet not supported.");
75 if (alphabetCode_ & CODON)
78 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(). No genetic code specified! Consider using 'setGeneticCode'.");
83 throw Exception(
"Codon alphabet not supported.");
88 pFS = make_unique<FixedFrequencySet>(make_shared<const CanonicalStateMap>(alphabet,
false));
91 else if (freqName ==
"Full")
93 unsigned short method = 1;
94 if (args.find(
"method") != args.end())
96 if (args[
"method"] ==
"local")
98 else if (args[
"method"] ==
"binary")
104 if (alphabetCode_ & NUCLEOTIDE)
105 pFS = make_unique<FullNucleotideFrequencySet>(dynamic_pointer_cast<const NucleicAlphabet>(alphabet));
107 throw Exception(
"Nucleotide alphabet not supported.");
111 if (alphabetCode_ & PROTEIN)
112 pFS = make_unique<FullProteinFrequencySet>(dynamic_pointer_cast<const ProteicAlphabet>(alphabet),
false, method);
114 throw Exception(
"Protein alphabet not supported.");
118 if (alphabetCode_ & CODON)
121 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(). No genetic code specified! Consider using 'setGeneticCode'.");
122 pFS = make_unique<FullCodonFrequencySet>(geneticCode_);
126 throw Exception(
"Codon alphabet not supported.");
132 pFS = make_unique<FullFrequencySet>(make_shared<CanonicalStateMap>(alphabet,
false),
false, method);
135 else if (freqName ==
"Empirical")
137 string fname = args[
"file"];
139 throw Exception(
"'file' argument missing for user-defined frequencies.");
142 if (args.find(
"col") != args.end())
147 if (alphabetCode_ & NUCLEOTIDE)
148 pFS = make_unique<UserNucleotideFrequencySet>(dynamic_pointer_cast<const NucleicAlphabet>(alphabet), fname, nCol);
150 throw Exception(
"Nucleotide alphabet not supported.");
154 if (alphabetCode_ & PROTEIN)
155 pFS = make_unique<UserProteinFrequencySet>(dynamic_pointer_cast<const ProteicAlphabet>(alphabet), fname, nCol);
157 throw Exception(
"Protein alphabet not supported.");
161 if (alphabetCode_ & CODON)
164 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(). No genetic code specified! Consider using 'setGeneticCode'.");
165 pFS = make_unique<UserCodonFrequencySet>(geneticCode_, fname, nCol);
169 throw Exception(
"Codon alphabet not supported.");
174 pFS = make_unique<UserFrequencySet>(make_shared<const CanonicalStateMap>(alphabet,
false), fname, nCol);
177 else if (freqName ==
"GC")
180 throw Exception(
"Error, invalid frequencies " + freqName +
" with non-nucleic alphabet.");
181 if (alphabetCode_ & NUCLEOTIDE)
182 pFS = make_unique<GCFrequencySet>(dynamic_pointer_cast<const NucleicAlphabet>(alphabet));
184 throw Exception(
"Nucleotide alphabet not supported.");
188 else if (freqName ==
"Word")
190 if (!(alphabetCode_ & WORD))
191 throw Exception(
"Word alphabet not supported.");
193 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(...).\n\t Bad alphabet type "
194 + alphabet->getAlphabetType() +
" for frequencies set " + freqName +
".");
196 auto pWA = dynamic_pointer_cast<const WordAlphabet>(alphabet);
199 throw Exception(
"BppOFrequencySetFormat::read : Word freq name is from WordAlphabet.");
201 if (args.find(
"frequency") != args.end())
203 string sAFS = args[
"frequency"];
205 unsigned int nbfreq = pWA->getLength();
207 for (
unsigned i = 0; i < nbfreq; ++i)
213 auto pFS2 = nestedReader.
readFrequencySet(pWA->getNAlphabet(0), sAFS, mData, nData,
false);
215 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
217 unparsedArguments_[st +
"_" + it->first] = it->second;
219 pFS = make_unique<WordFromUniqueFrequencySet>(pWA, std::move(pFS2));
223 if (args.find(
"frequency1") == args.end())
224 throw Exception(
"BppOFrequencySetFormat::read. Missing argument 'frequency' or 'frequency1' for frequencies set 'Word'.");
225 vector<string> v_sAFS;
226 vector<unique_ptr<FrequencySetInterface>> v_AFS;
227 unsigned int nbfreq = 1;
234 if (v_sAFS.size() != pWA->getLength())
237 for (
size_t i = 0; i < v_sAFS.size(); ++i)
240 pFS = nestedReader.
readFrequencySet(pWA->getNAlphabet(i), v_sAFS[i], mData, nData,
false);
242 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
246 v_AFS.push_back(std::move(pFS));
249 pFS = make_unique<WordFromIndependentFrequencySet>(pWA, v_AFS);
253 else if (freqName ==
"FromModel")
255 if (args.find(
"model") == args.end())
256 throw Exception(
"Missing argument 'model' for frequencies " + freqName +
".");
262 auto model = nestedReader.
readTransitionModel(alphabet, args[
"model"], mData, nData,
false);
263 pFS = make_unique<FromModelFrequencySet>(std::move(model));
265 for (
auto& it : unparsedParameterValuesNested)
267 unparsedArguments_[
"FromModel." + it.first] = it.second;
273 else if (freqName ==
"Codon")
275 if (!(alphabetCode_ & CODON))
276 throw Exception(
"Codon alphabet not supported.");
278 throw Exception(
"BppOFrequencySetFormat::read.\n\t Bad alphabet type "
279 + alphabet->getAlphabetType() +
" for frequenciesset " + freqName +
".");
281 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(). No genetic code specified! Consider using 'setGeneticCode'.");
283 auto pWA = dynamic_pointer_cast<const CodonAlphabet>(alphabet);
285 if (args.find(
"frequency") != args.end())
287 string sAFS = args[
"frequency"];
290 auto pFS2 = nestedReader.
readFrequencySet(pWA->getNucleicAlphabet(), sAFS, mData, nData,
false);
293 for (map<string, string>::iterator it = unparsedParameterValuesNested.begin(); it != unparsedParameterValuesNested.end(); it++)
295 unparsedArguments_[
"123_" + it->first] = it->second;
298 pFS = make_unique<CodonFromUniqueFrequencySet>(geneticCode_, std::move(pFS2),
"Codon");
302 if (args.find(
"frequency1") == args.end())
303 throw Exception(
"BppOFrequencySetFormat::read. Missing argument 'frequency' or 'frequency1' for frequencies set.");
304 vector<string> v_sAFS;
305 vector<unique_ptr<FrequencySetInterface>> v_AFS;
306 unsigned int nbfreq = 1;
313 if (v_sAFS.size() != 3)
316 for (
size_t i = 0; i < v_sAFS.size(); ++i)
319 pFS = nestedReader.
readFrequencySet(pWA->getNucleicAlphabet(), v_sAFS[i], mData, nData,
false);
321 for (
auto& it : unparsedParameterValuesNested)
325 v_AFS.push_back(std::move(pFS));
329 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(). No genetic code specified! Consider using 'setGeneticCode'.");
330 pFS = make_unique<CodonFromIndependentFrequencySet>(geneticCode_, v_AFS,
"Codon");
335 else if (freqName ==
"FullPerAA")
337 if (!(alphabetCode_ & CODON))
338 throw Exception(
"Codon alphabet not supported.");
340 throw Exception(
"BppOFrequencySetFormat::read.\n\t Bad alphabet type "
341 + alphabet->getAlphabetType() +
" for frequenciesset " + freqName +
".");
344 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(). No genetic code specified! Consider using 'setGeneticCode'.");
346 auto pPA = dynamic_pointer_cast<const ProteicAlphabet>(geneticCode_->getTargetAlphabet());
348 unsigned short method = 1;
349 if (args.find(
"method") != args.end())
351 if (args[
"method"] ==
"local")
353 else if (args[
"method"] ==
"binary")
357 if (args.find(
"protein_frequencies") != args.end())
359 string sPFS = args[
"protein_frequencies"];
361 auto tmpPtr = nestedReader.
readFrequencySet(pPA, sPFS, mData, nData,
false);
365 for (
auto& it : unparsedParameterValuesNested)
367 unparsedArguments_[
"FullPerAA." + it.first] = it.second;
369 pFS = make_unique<FullPerAACodonFrequencySet>(geneticCode_, std::move(pPFS), method);
372 pFS = make_unique<FullPerAACodonFrequencySet>(geneticCode_, method);
379 if (!(alphabetCode_ & CODON))
380 throw Exception(
"Codon alphabet not supported.");
382 throw Exception(
"BppOFrequencySetFormat::readFrequencySet(). No genetic code specified! Consider using 'setGeneticCode'.");
384 auto pWA = dynamic_pointer_cast<const CodonAlphabet>(alphabet);
386 if (args.find(
"genetic_code") != args.end())
388 ApplicationTools::displayWarning(
"'genetic_code' argument is no longer supported inside model description, and has been supersided by a global 'genetic_code' option.");
389 throw Exception(
"BppOFrequencySetFormat::read. Deprecated 'genetic_code' argument.");
392 string mgmtStopCodon =
"quadratic";
394 if (freqName ==
"F0")
398 else if (freqName ==
"F1X4")
402 if (args.find(
"mgmtStopCodons") != args.end())
403 mgmtStopCodon = args[
"mgmtStopCodons"];
404 else if (args.find(
"mgmtStopCodon") != args.end())
405 mgmtStopCodon = args[
"mgmtStopCodon"];
410 if (args.find(
"frequency") != args.end())
412 string sAFS = args[
"frequency"];
415 auto pFS2 = nestedReader.
readFrequencySet(pWA->getNucleicAlphabet(), sAFS, mData, nData,
false);
416 if (pFS2->getName() !=
"Full")
417 throw Exception(
"BppOFrequencySetFormat::read. The frequency option in F1X4 can only be Full");
421 for (
auto& it : unparsedParameterValuesNested)
423 unparsedArguments_[
"123_" + it.first] = it.second;
427 if (args.find(
"123_theta") != args.end())
428 unparsedArguments_[
"123_Full.theta"] = args[
"123_theta"];
429 if (args.find(
"123_theta1") != args.end())
430 unparsedArguments_[
"123_Full.theta1"] = args[
"123_theta1"];
431 if (args.find(
"123_theta2") != args.end())
432 unparsedArguments_[
"123_Full.theta2"] = args[
"123_theta2"];
433 if (args.find(
"theta") != args.end())
434 unparsedArguments_[
"123_Full.theta"] = args[
"123_theta"];
435 if (args.find(
"theta1") != args.end())
436 unparsedArguments_[
"123_Full.theta1"] = args[
"123_theta1"];
437 if (args.find(
"theta2") != args.end())
438 unparsedArguments_[
"123_Full.theta2"] = args[
"123_theta2"];
440 else if (freqName ==
"F3X4")
444 if (args.find(
"mgmtStopCodons") != args.end())
445 mgmtStopCodon = args[
"mgmtStopCodons"];
446 else if (args.find(
"mgmtStopCodon") != args.end())
447 mgmtStopCodon = args[
"mgmtStopCodon"];
452 if (args.find(
"frequency1") != args.end() ||
453 args.find(
"frequency2") != args.end() ||
454 args.find(
"frequency3") != args.end())
456 vector<string> v_sAFS;
458 for (
unsigned int nbfreq = 1; nbfreq <= 3; ++nbfreq)
463 v_sAFS.push_back(
"");
466 for (
size_t i = 0; i < v_sAFS.size(); ++i)
470 auto sAFS = v_sAFS[i];
473 pFS = nestedReader.
readFrequencySet(pWA->getNucleicAlphabet(), sAFS, mData, nData,
false);
474 if (pFS->getName() !=
"Full")
475 throw Exception(
"BppOFrequencySetFormat::read. The frequency options in F3X4 can only be Full");
478 for (
auto& it : unparsedParameterValuesNested)
485 for (
unsigned int i = 1; i <= 3; ++i)
495 else if (freqName ==
"F61")
501 unsigned short method = 1;
502 if (args.find(
"method") != args.end())
504 if (args[
"method"] ==
"local")
506 else if (args[
"method"] ==
"binary")
512 throw Exception(
"Unknown frequency option: " + freqName);
516 else if (freqName ==
"MVAprotein")
518 if (args.find(
"model") == args.end())
519 throw Exception(
"Missing argument 'model' for frequencies " + freqName +
".");
523 auto model = nestedReader.
readTransitionModel(alphabet, args[
"model"], mData, nData,
false);
524 if (
dynamic_cast<Coala*
>(model.get())==0)
525 throw Exception(
"MVAprotein frequency set needs a Coala model");
527 auto coala = unique_ptr<Coala>(
dynamic_cast<Coala*
>(model.release()));
530 auto mvaFS = make_unique<MvaFrequencySet>(dynamic_pointer_cast<const ProteicAlphabet>(alphabet));
532 mvaFS->initSet(*coala);
534 pFS = std::move(mvaFS);
537 throw Exception(
"Unknown frequency option: " + freqName);
540 vector<string> pnames = pFS->getParameters().getParameterNames();
542 string pref = pFS->getNamespace();
543 for (
size_t i = 0; i < pnames.size(); i++)
545 string name = pFS->getParameterNameWithoutNamespace(pnames[i]);
546 if (args.find(name) != args.end())
547 unparsedArguments_[pref + name] = args[name];
552 string pname, pval, pname2;
553 for (
size_t i = 0; i < pl.
size(); i++)
555 pname = pFS->getParameterNameWithoutNamespace(pl[i].getName());
557 if (args.find(pname) == args.end())
562 (pval.find(
"(") != string::npos))
565 for (
size_t j = 0; j < pl.
size() && !found; j++)
567 pname2 = pFS->getParameterNameWithoutNamespace(pl[j].getName());
575 pFS->aliasParameters(pname2, pname);
586 if (args.find(
"init") != args.end())
587 unparsedArguments_[
"init"] = args[
"init"];
589 if (args.find(
"init.observedPseudoCount") != args.end())
590 unparsedArguments_[
"init.observedPseudoCount"] = args[
"init.observedPseudoCount"];
592 std::shared_ptr<const AlignmentDataInterface> data(0);
594 data = mData.at(nData);
596 if (args.find(
"values") != args.end())
598 unparsedArguments_[
"init"] =
"values" + args[
"values"];
599 initialize_(*pFS, data);
601 else if (parseArguments)
602 initialize_(*pFS, data);
610 std::map<std::string, std::string>& globalAliases,
611 std::vector<std::string>& writtenNames)
const
618 if (find(writtenNames.begin(), writtenNames.end(), pn) != writtenNames.end())
628 string name = freqset.
getName();
631 bool oValues =
false;
633 if ((name ==
"Fixed") || (name ==
"F0") || (name ==
"Full"))
637 for (
size_t i = 0; i < vf.size(); ++i)
646 writtenNames.insert(writtenNames.end(), npl.begin(), npl.end());
656 out <<
"file=" << ufs.getPath();
657 size_t nCol = ufs.getColumnNumber();
660 out <<
", col=" << nCol;
673 for (
size_t i = 0; i < pWFI.getLength(); ++i)
677 out <<
"frequency" << i + 1 <<
"=";
678 writeFrequencySet(pWFI.frequencySetForLetter(i), out, globalAliases, writtenNames);
691 for (
size_t i = 0; i < pWFU.getLength(); ++i)
696 writeFrequencySet(pWFU.frequencySetForLetter(i), out, globalAliases, writtenNames);
715 bIO.
write(*(pFMFS.getModel()), out, globalAliases, writtenNames);
726 out <<
"protein_frequencies=";
728 if (pFPA.hasProteinFrequencySet())
730 writeFrequencySet(pFPA.proteinFrequencySet(), out, globalAliases, writtenNames);
738 unsigned short meth = pFPA.getMethod();
743 out <<
"method=" << ((meth == 2) ?
"local" :
"binary");
759 out <<
"method=" << ((meth == 2) ?
"local" :
"binary");
774 out <<
"method=" << ((meth == 2) ?
"local" :
"binary");
785 if (pCFU.getMgmtStopCodon() !=
"quadratic")
789 out <<
"mgmtStopCodons=" << pCFU.getMgmtStopCodon();
799 if (pCFI.getMgmtStopCodon() !=
"quadratic")
803 out <<
"mgmtStopCodons=" << pCFI.getMgmtStopCodon();
823 std::shared_ptr<const AlignmentDataInterface> data)
825 if (unparsedArguments_.find(
"init") != unparsedArguments_.end())
828 string init = unparsedArguments_[
"init"];
830 if (init ==
"observed" && data)
832 unsigned int psc = 0;
833 if (unparsedArguments_.find(
"init.observedPseudoCount") != unparsedArguments_.end())
834 psc = TextTools::to<unsigned int>(unparsedArguments_[
"init.observedPseudoCount"]);
836 map<int, double> freqs;
841 else if (init.substr(0, 6) ==
"values")
844 vector<double> frequencies;
845 string rf = init.substr(6);
852 else if (init ==
"balanced")
857 throw Exception(
"Unknown init argument");
859 unparsedArguments_.erase(unparsedArguments_.find(
"init"));
860 if (unparsedArguments_.find(
"init.observedPseudoCount") != unparsedArguments_.end())
861 unparsedArguments_.erase(unparsedArguments_.find(
"init.observedPseudoCount"));
867 for (
size_t i = 0; i < pl.
size(); ++i)
875 for (
size_t i = 0; i < pl.
size(); ++i)
877 const string pName = pl[i].getName();
883 pl[i].setValue(value);
885 if (unparsedArguments_.find(pName) != unparsedArguments_.end())
886 unparsedArguments_.erase(unparsedArguments_.find(pName));
virtual void setMessageHandler(std::shared_ptr< OutputStream > mh)
The Coala branch-heterogeneous amino-acid substitution model.
static std::unique_ptr< CodonFrequencySetInterface > getFrequencySetForCodons(short option, std::shared_ptr< const GeneticCode > gCode, const std::string &mgmtStopCodon="quadratic", unsigned short method=1)
A helper function that provide frequencies set for codon models according to PAML option.
the Frequencies in codons are the product of Independent Frequencies in letters with the frequencies ...
the Frequencies in codons are the product of the frequencies for a unique FrequencySet in letters,...
FrequencySet useful for homogeneous and stationary models, codon implementation.
Parametrize a set of state frequencies.
virtual const Vdouble & getFrequencies() const =0
virtual void setFrequenciesFromAlphabetStatesFrequencies(const std::map< int, double > &frequencies)=0
Set the Frequencies from the one of the map which keys match with a letter of the Alphabet....
virtual std::string getName() const =0
virtual void setFrequencies(const std::vector< double > &frequencies)=0
Set the parameters in order to match a given set of frequencies.
FrequencySet defined from the equilibrium distribution of a given model.
A generic FrequencySet for Full Codon alphabets.
unsigned short getMethod() const
FrequencySet integrating ProteinFrequencySet inside CodonFrequencySet. In this case,...
Protein FrequencySet using 19 independent parameters to model the 20 frequencies.
virtual int getPrecision() const=0
virtual OutputStream & setPrecision(int digit)=0
virtual const ParameterList & getIndependentParameters() const=0
virtual std::vector< std::string > getParameterNames() const
virtual void deleteParameter(const std::string &name)
virtual void setParameter(size_t index, const Parameter ¶m)
virtual bool matchParametersValues(const ParameterList ¶meters)=0
virtual const ParameterList & getParameters() const=0
Parametrize a set of state frequencies for proteins.
const std::string & nextToken()
bool hasMoreToken() const
FrequencySet to be read in a file. More specifically, a frequency set is read in a column of a given ...
the Frequencies in words are the product of Independent Frequencies in letters
int toInt(const std::string &s, char scientificNotation='e')
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
bool isEmpty(const std::string &s)
bool isDecimalInteger(const std::string &s, char scientificNotation='e')
std::string toString(T t)
Defines the basic types of data flow nodes.