12 #include "../Alphabet/AlphabetTools.h"
13 #include "../Alphabet/AllelicAlphabet.h"
14 #include "../Alphabet/BinaryAlphabet.h"
15 #include "../Alphabet/CodonAlphabet.h"
16 #include "../Alphabet/DefaultAlphabet.h"
17 #include "../Alphabet/LexicalAlphabet.h"
18 #include "../Container/VectorSiteContainer.h"
19 #include "../GeneticCode/AscidianMitochondrialGeneticCode.h"
20 #include "../GeneticCode/CiliateNuclearGeneticCode.h"
21 #include "../GeneticCode/EchinodermMitochondrialGeneticCode.h"
22 #include "../GeneticCode/InvertebrateMitochondrialGeneticCode.h"
23 #include "../GeneticCode/MoldMitochondrialGeneticCode.h"
24 #include "../GeneticCode/StandardGeneticCode.h"
25 #include "../GeneticCode/VertebrateMitochondrialGeneticCode.h"
26 #include "../GeneticCode/YeastMitochondrialGeneticCode.h"
27 #include "../Io/BppOAlignmentReaderFormat.h"
28 #include "../Io/BppOAlignmentWriterFormat.h"
29 #include "../Io/BppOAlphabetIndex1Format.h"
30 #include "../Io/BppOAlphabetIndex2Format.h"
31 #include "../Io/BppOSequenceReaderFormat.h"
32 #include "../Io/BppOSequenceWriterFormat.h"
33 #include "../Io/MaseTools.h"
34 #include "../SequenceTools.h"
35 #include "../SymbolListTools.h"
44 const map<string, string>& params,
46 bool suffixIsOptional,
51 unique_ptr<Alphabet> chars =
nullptr;
55 map<string, string> args;
60 if (alphabet ==
"Word")
62 if (args.find(
"letter") != args.end())
64 args[
"alphabet"] = args[
"letter"];
68 if (args.find(
"length") == args.end())
69 throw Exception(
"Missing length parameter for Word alphabet");
71 size_t lg = TextTools::to<size_t>(args[
"length"]);
73 chars = make_unique<WordAlphabet>(std::move(inAlphabet), lg);
78 vector< shared_ptr<const Alphabet>> vAlph;
82 map<string, string> args2;
89 if (vAlph.size() == 0)
90 throw Exception(
"SequenceApplicationTools::getAlphabet. At least one alphabet is compulsory for Word alphabet");
92 chars = make_unique<WordAlphabet>(vAlph);
95 else if (alphabet ==
"RNY")
97 if (args.find(
"letter") == args.end())
98 throw Exception(
"Missing letter alphabet for RNY alphabet");
100 args[
"alphabet"] = args[
"letter"];
106 shared_ptr<Alphabet> charsTmp = std::move(chars);
107 chars = make_unique<RNY>(dynamic_pointer_cast<NucleicAlphabet>(charsTmp));
108 alphabet =
"RNY(" + alphabet +
")";
111 throw Exception(
"RNY needs a Nucleic Alphabet, instead of " + args[
"letter"]);
113 else if (alphabet ==
"Binary")
114 chars = make_unique<BinaryAlphabet>();
115 else if (alphabet ==
"Integer")
117 if (args.find(
"N") == args.end())
118 throw Exception(
"Missing 'N' argument in Integer for size:" + alphabet);
120 uint N = TextTools::to<unsigned int>(args[
"N"]);
121 chars = make_unique<IntegerAlphabet>(N);
123 else if (alphabet ==
"Lexicon")
125 vector<string> vWord = ApplicationTools::getVectorParameter<string>(
"words", args,
',',
"()");
127 if (vWord.size() == 0)
128 throw Exception(
"SequenceApplicationTools::getAlphabet. 'words' list is missing or empty for Lexicon alphabet");
130 chars = make_unique<LexicalAlphabet>(vWord);
132 else if (alphabet ==
"DNA")
135 chars = make_unique<DNA>(mark);
137 else if (alphabet ==
"RNA")
140 chars = make_unique<RNA>(mark);
142 else if (alphabet ==
"Protein")
143 chars = make_unique<ProteicAlphabet>();
144 else if (allowGeneric && alphabet ==
"Generic")
145 chars = make_unique<DefaultAlphabet>();
146 else if (alphabet ==
"Codon")
148 if (args.find(
"letter") == args.end())
149 throw Exception(
"Missing 'letter' argument in Codon :" + alphabet);
150 if (args.find(
"type") != args.end())
151 throw Exception(
"'type' argument in Codon is deprecated and has been superseded by the 'genetic_code' option.");
155 map<string, string> alphnArgs;
158 shared_ptr<const NucleicAlphabet> pnalph;
162 pnalph = make_shared<RNA>(mark);
164 else if (alphn ==
"DNA")
167 pnalph = make_shared<DNA>(mark);
170 throw Exception(
"Alphabet not known in Codon : " + alphn);
172 chars = make_unique<CodonAlphabet>(pnalph);
174 else if (alphabet ==
"Allelic")
176 if (args.find(
"base") == args.end())
177 throw Exception(
"Missing 'base' argument in Allelic for sequence alphabet :" + alphabet);
179 if (args.find(
"N") == args.end())
180 throw Exception(
"Missing 'N' argument in Allelic for number of alleles:" + alphabet);
182 uint N = TextTools::to<unsigned int>(args[
"N"]);
184 args[
"alphabet"] = args[
"base"];
188 chars = make_unique<AllelicAlphabet>(inAlphabet, N);
191 throw Exception(
"Alphabet not known: " + alphabet);
201 std::shared_ptr<const NucleicAlphabet> alphabet,
202 const string& description)
205 if (description.find(
"Standard") != string::npos || description.find(
"1") != string::npos)
207 else if (description.find(
"VertebrateMitochondrial") != string::npos || description.find(
"2") != string::npos)
209 else if (description.find(
"YeastMitochondrial") != string::npos || description.find(
"3") != string::npos)
211 else if (description.find(
"MoldMitochondrial") != string::npos || description.find(
"4") != string::npos)
213 else if (description.find(
"InvertebrateMitochondrial") != string::npos || description.find(
"5") != string::npos)
215 else if (description.find(
"CiliateNuclear") != string::npos || description.find(
"6") != string::npos)
217 else if (description.find(
"EchinodermMitochondrial") != string::npos || description.find(
"9") != string::npos)
219 else if (description.find(
"AscidianMitochondrial") != string::npos || description.find(
"13") != string::npos)
222 throw Exception(
"Unknown GeneticCode: " + description);
223 return unique_ptr<GeneticCode>(geneCode);
229 shared_ptr<const Alphabet> alphabet,
230 const string& description,
231 const string& message,
235 return reader.
read(description);
239 shared_ptr<const Alphabet> alphabet,
240 const string& description,
241 const string& message,
245 return reader.
read(description);
249 shared_ptr<const CodonAlphabet> alphabet,
250 shared_ptr<const GeneticCode> gencode,
251 const string& description,
252 const string& message,
256 return reader.
read(description);
260 shared_ptr<const CodonAlphabet> alphabet,
261 shared_ptr<const GeneticCode> gencode,
262 const string& description,
263 const string& message,
267 return reader.
read(description);
273 std::shared_ptr<const Alphabet> alpha,
274 const map<string, string>& params,
275 const string& suffix,
276 bool suffixIsOptional,
283 unique_ptr<ISequence> iSeq(bppoReader.
read(sequenceFormat));
289 std::unique_ptr<SequenceContainerInterface> sequences = iSeq->readSequences(sequenceFilePath, alpha);
292 restrictSelectedSequencesByName(*sequences, params, suffix, suffixIsOptional, verbose, warn);
300 map<size_t, unique_ptr<VectorSiteContainer>>
302 std::shared_ptr<const Alphabet> alpha,
303 const map<string, string>& params,
304 const string& prefix,
305 const string& suffix,
306 bool suffixIsOptional,
312 map<size_t, unique_ptr<VectorSiteContainer>> mCont;
314 for (
size_t nT = 0; nT < vContName.size(); nT++)
316 size_t poseq = vContName[nT].find(
"=");
318 size_t len = (prefix +
"data").size();
320 string suff = vContName[nT].substr(len, poseq - len);
323 num = TextTools::to<size_t>(suff);
331 map<string, string> args;
335 map<string, string> args2;
337 if (contName ==
"alignment")
341 if (args.find(
"file") != args.end())
342 args2[
"input.sequence.file"] = args[
"file"];
344 args2[
"input.sequence.file"] =
"";
346 if (args.find(
"format") != args.end())
347 args2[
"input.sequence.format"] = args[
"format"];
349 if (args.find(
"selection") != args.end())
350 args2[
"input.site.selection"] = args[
"selection"];
352 if (args.find(
"sites_to_use") != args.end())
353 args2[
"input.sequence.sites_to_use"] = args[
"sites_to_use"];
355 if (args.find(
"max_gap_allowed") != args.end())
356 args2[
"input.sequence.max_gap_allowed"] = args[
"max_gap_allowed"];
358 if (args.find(
"max_unresolved_allowed") != args.end())
359 args2[
"input.sequence.max_unresolved_allowed"] = args[
"max_unresolved_allowed"];
361 if (args.find(
"remove_stop_codons") != args.end())
362 args2[
"input.sequence.remove_stop_codons"] = args[
"remove_stop_codons"];
366 auto vsC = getSiteContainer(alpha, args2,
"",
true, verbose, warn);
371 vsC = getSitesToAnalyse(*vsC, args2,
"",
true,
false);
373 if (mCont.find(num) != mCont.end())
377 mCont.emplace(num, std::move(vsC));
386 map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>>
388 std::shared_ptr<const Alphabet> alpha,
389 const map<string, string>& params,
390 const string& prefix,
391 const string& suffix,
392 bool suffixIsOptional,
398 map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>> mCont;
400 for (
size_t nT = 0; nT < vContName.size(); nT++)
402 size_t poseq = vContName[nT].find(
"=");
404 size_t len = (prefix +
"data").size();
406 string suff = vContName[nT].substr(len, poseq - len);
409 num = TextTools::to<size_t>(suff);
417 map<string, string> args;
421 map<string, string> args2;
423 if (contName ==
"alignment")
427 if (args.find(
"file") != args.end())
428 args2[
"input.sequence.file"] = args[
"file"];
430 args2[
"input.sequence.file"] =
"";
432 if (args.find(
"format") != args.end())
433 args2[
"input.sequence.format"] = args[
"format"];
435 if (args.find(
"selection") != args.end())
436 args2[
"input.site.selection"] = args[
"selection"];
438 if (args.find(
"sites_to_use") != args.end())
439 args2[
"input.sequence.sites_to_use"] = args[
"sites_to_use"];
441 if (args.find(
"max_gap_allowed") != args.end())
442 args2[
"input.sequence.max_gap_allowed"] = args[
"max_gap_allowed"];
444 if (args.find(
"max_unresolved_allowed") != args.end())
445 args2[
"input.sequence.max_unresolved_allowed"] = args[
"max_unresolved_allowed"];
447 if (args.find(
"remove_stop_codons") != args.end())
448 args2[
"input.sequence.remove_stop_codons"] = args[
"remove_stop_codons"];
452 auto vsC = getProbabilisticSiteContainer(alpha, args2,
"",
true, verbose, warn);
457 vsC = getSitesToAnalyse(*vsC, args2,
"",
true,
false);
459 if (mCont.find(num) != mCont.end())
463 mCont.emplace(num, std::move(vsC));
473 std::shared_ptr<const Alphabet> alpha,
474 const map<string, string>& params,
475 const string& suffix,
476 bool suffixIsOptional,
484 unique_ptr<IAlignment> iAln(bppoReader.
read(sequenceFormat));
493 shared_ptr<const Alphabet> alpha2;
495 alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
499 auto sites2 = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2)));
501 auto sites = unique_ptr<VectorSiteContainer>();
506 sites = make_unique<VectorSiteContainer>(alpha);
507 for (
size_t i = 0; i < sites2->getNumberOfSequences(); ++i)
510 sites->addSequence(sites2->sequenceKey(i), seqP);
514 sites = std::move(sites2);
517 if (iAln->getFormatName() ==
"MASE file")
521 if (siteSet !=
"none")
523 unique_ptr<VectorSiteContainer> selectedSites;
527 selectedSites = std::move(sel);
535 if (selectedSites->getNumberOfSites() == 0)
537 throw Exception(
"Site set '" + siteSet +
"' is empty.");
539 sites = std::move(selectedSites);
544 size_t nbSites = sites->getNumberOfSites();
548 unique_ptr<VectorSiteContainer> selectedSites;
549 if (siteSet !=
"none")
551 if (siteSet[0] ==
'(')
552 siteSet = siteSet.substr(1, siteSet.size() - 2);
554 vector<size_t> vSite;
559 for (
size_t i = 0; i < vSite1.size(); ++i)
561 int x = (vSite1[i] >= 0 ? vSite1[i] :
static_cast<int>(nbSites) + vSite1[i] + 1);
562 if (x <= (
int)nbSites)
565 vSite.push_back(
static_cast<size_t>(x - 1));
573 vSite.push_back(
static_cast<size_t>(nbSites));
577 selectedSites = std::move(sel);
578 selectedSites->reindexSites();
583 map<string, string> selArgs;
585 if (seln ==
"Sample")
587 size_t n = ApplicationTools::getParameter<size_t>(
"n", selArgs, nbSites,
"",
true, warn + 1);
592 for (
size_t p = 0; p < nbSites; ++p)
600 selectedSites = std::move(sel);
602 selectedSites->reindexSites();
605 throw Exception(
"Unknown site selection description: " + siteSet);
611 if (selectedSites && (selectedSites->getNumberOfSites() == 0))
613 throw Exception(
"Site set '" + siteSet +
"' is empty.");
615 sites = std::move(selectedSites);
619 restrictSelectedSequencesByName(*sites, params, suffix, suffixIsOptional, verbose, warn);
627 shared_ptr<const Alphabet> alpha,
628 const map<string, string>& params,
629 const string& suffix,
630 bool suffixIsOptional,
638 unique_ptr<IAlignment> iAln;
639 unique_ptr<IProbabilisticAlignment> iProbAln;
643 iAln.reset(bppoReader.
read(sequenceFormat).release());
662 shared_ptr<const Alphabet> alpha2;
664 alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
668 alpha2 = dynamic_pointer_cast<const AllelicAlphabet>(alpha)->getStateAlphabet();
673 unique_ptr<VectorSiteContainer> sites;
674 unique_ptr<ProbabilisticVectorSiteContainer> psites;
677 sites = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2)));
679 psites = make_unique<ProbabilisticVectorSiteContainer>(*iProbAln->readAlignment(sequenceFilePath, alpha2));
689 for (
const auto& name : sites->getSequenceNames())
691 auto sl = ST.
RNYslice(sites->sequence(name));
692 tmpsites->addSequence(name, sl);
694 sites = std::move(tmpsites);
699 if (iAln->getFormatName() ==
"MASE file")
703 if (siteSet !=
"none")
705 unique_ptr<VectorSiteContainer> selectedSites;
716 if (selectedSites->getNumberOfSites() == 0)
718 throw Exception(
"Site set '" + siteSet +
"' is empty.");
720 sites = std::move(selectedSites);
728 auto names = psites->getSequenceNames();
730 auto chars = alpha2->getResolvedChars();
732 Table<double> dtable(chars.size(), sites->getNumberOfSites());
735 for (
const auto& name : names)
737 const auto& sequence = psites->sequence(name);
738 vector<double> vval(chars.size());
740 for (
size_t pos = 0; pos < sequence.size(); pos++)
745 vval[i] = sequence.getStateValueAt(pos, alpha2->charToInt(c));
752 unique_ptr<ProbabilisticSequence> seq(
new ProbabilisticSequence(name, dtable, sequence.getComments(), alpha2));
754 psites->addSequence(name, seq);
763 auto names = psites->getSequenceNames();
765 for (
const auto& name : names)
767 unique_ptr<ProbabilisticSequence> seq(dynamic_pointer_cast<const AllelicAlphabet>(alpha)->convertFromStateAlphabet(psites->sequence(name)));
769 pallsites->addSequence(name, seq);
772 psites = std::move(pallsites);
777 size_t nbSites = sites ? sites->getNumberOfSites() : psites->getNumberOfSites();
782 if (siteSet !=
"none")
784 if (siteSet[0] ==
'(')
785 siteSet = siteSet.substr(1, siteSet.size() - 2);
787 vector<size_t> vSite;
792 for (
auto ps : vSite1)
794 int x = (ps >= 0 ? ps :
static_cast<int>(nbSites) + ps + 1);
795 if (x <= (
int)nbSites)
798 vSite.push_back(
static_cast<size_t>(x - 1));
801 throw Exception(
"SequenceApplicationTools::getSiteContainer(). Incorrect null index");
808 vSite.push_back(
static_cast<size_t>(nbSites - 1));
816 map<string, string> selArgs;
818 if (seln ==
"Sample")
820 size_t n = ApplicationTools::getParameter<size_t>(
"n", selArgs, nbSites,
"",
true, warn + 1);
825 for (
size_t p = 0; p < nbSites; ++p)
833 throw Exception(
"Unknown site selection description: " + siteSet);
840 auto selectedSites = make_unique<ProbabilisticVectorSiteContainer>(alpha);
844 if (selectedSites && (selectedSites->getNumberOfSites() == 0))
846 throw Exception(
"Site set '" + siteSet +
"' is empty.");
851 psites = std::move(selectedSites);
864 const map<std::string, std::string>& params,
866 bool suffixIsOptional,
871 if (optionKeep !=
"all")
873 vector<string> selection = ApplicationTools::getVectorParameter<string>(
"input.sequence.keep_names", params,
',', optionKeep, suffix, suffixIsOptional, warn);
874 sort(selection.begin(), selection.end());
877 if (!binary_search(selection.begin(), selection.end(), name))
888 if (optionRemove !=
"none")
890 vector<string> selection = ApplicationTools::getVectorParameter<string>(
"input.sequence.remove_names", params,
',', optionRemove, suffix, suffixIsOptional, warn);
892 sort(seqNames.begin(), seqNames.end());
893 for (
const auto& name: selection)
895 if (binary_search(seqNames.begin(), seqNames.end(), name))
915 const map<string, string>& params,
916 const string& suffix,
923 unique_ptr<OSequence> oSeq(bppoWriter.
read(sequenceFormat));
931 oSeq->writeSequences(sequenceFilePath, sequences,
true);
938 const map<string, string>& params,
939 const string& suffix,
946 unique_ptr<OAlignment> oAln(bppoWriter.
read(sequenceFormat));
954 oAln->writeAlignment(sequenceFilePath, sites,
true);
This class implements the ascidian mitochondrial genetic code as describe on the NCBI web site: http:...
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
This class implements the Echinoderm and Faltworms Mitochondrial genetic code as describe on the NCBI...
Partial implementation of the Transliterator interface for genetic code object.
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
A basic implementation of the ProbabilisticSequence interface.
Exception thrown when a sequence is not found The sequence not found exception base class.
This class implements the standard genetic code as describe on the NCBI web site: http://www....
void setRowNames(const std::vector< std::string > &rowNames)
void setColumn(const std::vector< T > &newColumn, size_t pos)
The SequenceContainer interface.
virtual std::unique_ptr< SequenceType > removeSequence(const HashType &sequenceKey)=0
Remove a sequence from the container.
virtual std::vector< std::string > getSequenceNames() const =0
The VectorSiteContainer class.
This class implements the vertebrate mitochondrial genetic code as describe on the NCBI web site: htt...
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
bool isDecimalInteger(const std::string &s, char scientificNotation='e')
std::string toString(T t)
This alphabet is used to deal NumericAlphabet.
TemplateVectorSiteContainer< ProbabilisticSite, ProbabilisticSequence > ProbabilisticVectorSiteContainer
TemplateVectorSiteContainer< Site, Sequence > VectorSiteContainer