12 #include "../Alphabet/AlphabetTools.h"
13 #include "../Alphabet/AllelicAlphabet.h"
14 #include "../Alphabet/BinaryAlphabet.h"
15 #include "../Alphabet/CodonAlphabet.h"
16 #include "../Alphabet/DefaultAlphabet.h"
17 #include "../Alphabet/LexicalAlphabet.h"
18 #include "../Container/VectorSiteContainer.h"
19 #include "../GeneticCode/AscidianMitochondrialGeneticCode.h"
20 #include "../GeneticCode/CiliateNuclearGeneticCode.h"
21 #include "../GeneticCode/EchinodermMitochondrialGeneticCode.h"
22 #include "../GeneticCode/InvertebrateMitochondrialGeneticCode.h"
23 #include "../GeneticCode/MoldMitochondrialGeneticCode.h"
24 #include "../GeneticCode/StandardGeneticCode.h"
25 #include "../GeneticCode/VertebrateMitochondrialGeneticCode.h"
26 #include "../GeneticCode/YeastMitochondrialGeneticCode.h"
27 #include "../Io/BppOAlignmentReaderFormat.h"
28 #include "../Io/BppOAlignmentWriterFormat.h"
29 #include "../Io/BppOAlphabetIndex1Format.h"
30 #include "../Io/BppOAlphabetIndex2Format.h"
31 #include "../Io/BppOSequenceReaderFormat.h"
32 #include "../Io/BppOSequenceWriterFormat.h"
33 #include "../Io/MaseTools.h"
34 #include "../SequenceTools.h"
35 #include "../SymbolListTools.h"
44 const map<string, string>& params,
46 bool suffixIsOptional,
51 unique_ptr<Alphabet> chars =
nullptr;
55 map<string, string> args;
60 if (alphabet ==
"Word")
62 if (args.find(
"letter") != args.end())
64 args[
"alphabet"] = args[
"letter"];
68 if (args.find(
"length") == args.end())
69 throw Exception(
"Missing length parameter for Word alphabet");
71 size_t lg = TextTools::to<size_t>(args[
"length"]);
73 chars = make_unique<WordAlphabet>(std::move(inAlphabet), lg);
78 vector< shared_ptr<const Alphabet>> vAlph;
82 map<string, string> args2;
89 if (vAlph.size() == 0)
90 throw Exception(
"SequenceApplicationTools::getAlphabet. At least one alphabet is compulsory for Word alphabet");
92 chars = make_unique<WordAlphabet>(vAlph);
95 else if (alphabet ==
"RNY")
97 if (args.find(
"letter") == args.end())
98 throw Exception(
"Missing letter alphabet for RNY alphabet");
100 args[
"alphabet"] = args[
"letter"];
106 auto inNuc = dynamic_pointer_cast<const NucleicAlphabet>(inAlphabet);
107 chars = make_unique<RNY>(inNuc);
108 alphabet =
"RNY(" + alphabet +
")";
111 throw Exception(
"RNY needs a Nucleic Alphabet, instead of " + args[
"letter"]);
113 else if (alphabet ==
"Binary")
114 chars = make_unique<BinaryAlphabet>();
115 else if (alphabet ==
"Integer")
117 if (args.find(
"N") == args.end())
118 throw Exception(
"Missing 'N' argument in Integer for size:" + alphabet);
120 uint N = TextTools::to<unsigned int>(args[
"N"]);
121 chars = make_unique<IntegerAlphabet>(N);
123 else if (alphabet ==
"Lexicon")
125 vector<string> vWord = ApplicationTools::getVectorParameter<string>(
"words", args,
',',
"()");
127 if (vWord.size() == 0)
128 throw Exception(
"SequenceApplicationTools::getAlphabet. 'words' list is missing or empty for Lexicon alphabet");
130 chars = make_unique<LexicalAlphabet>(vWord);
132 else if (alphabet ==
"DNA")
135 chars = make_unique<DNA>(mark);
137 else if (alphabet ==
"RNA")
140 chars = make_unique<RNA>(mark);
142 else if (alphabet ==
"Protein")
143 chars = make_unique<ProteicAlphabet>();
144 else if (allowGeneric && alphabet ==
"Generic")
145 chars = make_unique<DefaultAlphabet>();
146 else if (alphabet ==
"Codon")
148 if (args.find(
"letter") == args.end())
149 throw Exception(
"Missing 'letter' argument in Codon :" + alphabet);
150 if (args.find(
"type") != args.end())
151 throw Exception(
"'type' argument in Codon is deprecated and has been superseded by the 'genetic_code' option.");
155 map<string, string> alphnArgs;
158 shared_ptr<const NucleicAlphabet> pnalph;
162 pnalph = make_shared<RNA>(mark);
164 else if (alphn ==
"DNA")
167 pnalph = make_shared<DNA>(mark);
170 throw Exception(
"Alphabet not known in Codon : " + alphn);
172 chars = make_unique<CodonAlphabet>(pnalph);
174 else if (alphabet ==
"Allelic")
176 if (args.find(
"base") == args.end())
177 throw Exception(
"Missing 'base' argument in Allelic for sequence alphabet :" + alphabet);
179 if (args.find(
"N") == args.end())
180 throw Exception(
"Missing 'N' argument in Allelic for number of alleles:" + alphabet);
182 uint N = TextTools::to<unsigned int>(args[
"N"]);
184 args[
"alphabet"] = args[
"base"];
188 chars = make_unique<AllelicAlphabet>(inAlphabet, N);
191 throw Exception(
"Alphabet not known: " + alphabet);
201 std::shared_ptr<const NucleicAlphabet> alphabet,
202 const string& description)
205 if (description.find(
"Standard") != string::npos || description.find(
"1") != string::npos)
207 else if (description.find(
"VertebrateMitochondrial") != string::npos || description.find(
"2") != string::npos)
209 else if (description.find(
"YeastMitochondrial") != string::npos || description.find(
"3") != string::npos)
211 else if (description.find(
"MoldMitochondrial") != string::npos || description.find(
"4") != string::npos)
213 else if (description.find(
"InvertebrateMitochondrial") != string::npos || description.find(
"5") != string::npos)
215 else if (description.find(
"CiliateNuclear") != string::npos || description.find(
"6") != string::npos)
217 else if (description.find(
"EchinodermMitochondrial") != string::npos || description.find(
"9") != string::npos)
219 else if (description.find(
"AscidianMitochondrial") != string::npos || description.find(
"13") != string::npos)
222 throw Exception(
"Unknown GeneticCode: " + description);
223 return unique_ptr<GeneticCode>(geneCode);
229 std::shared_ptr<const Alphabet> alphabet,
230 const string& description,
231 const string& message,
235 return reader.
read(description);
239 std::shared_ptr<const Alphabet> alphabet,
240 const string& description,
241 const string& message,
245 return reader.
read(description);
249 std::shared_ptr<const CodonAlphabet> alphabet,
250 std::shared_ptr<const GeneticCode> gencode,
251 const string& description,
252 const string& message,
256 return reader.
read(description);
260 std::shared_ptr<const CodonAlphabet> alphabet,
261 std::shared_ptr<const GeneticCode> gencode,
262 const string& description,
263 const string& message,
267 return reader.
read(description);
273 std::shared_ptr<const Alphabet> alpha,
274 const map<string, string>& params,
275 const string& suffix,
276 bool suffixIsOptional,
283 unique_ptr<ISequence> iSeq(bppoReader.
read(sequenceFormat));
289 std::unique_ptr<SequenceContainerInterface> sequences = iSeq->readSequences(sequenceFilePath, alpha);
292 restrictSelectedSequencesByName(*sequences, params, suffix, suffixIsOptional, verbose, warn);
300 map<size_t, unique_ptr<VectorSiteContainer>>
302 std::shared_ptr<const Alphabet> alpha,
303 const map<string, string>& params,
304 const string& prefix,
305 const string& suffix,
306 bool suffixIsOptional,
312 map<size_t, unique_ptr<VectorSiteContainer>> mCont;
314 for (
size_t nT = 0; nT < vContName.size(); nT++)
316 size_t poseq = vContName[nT].find(
"=");
318 size_t len = (prefix +
"data").size();
320 string suff = vContName[nT].substr(len, poseq - len);
323 num = TextTools::to<size_t>(suff);
331 map<string, string> args;
335 map<string, string> args2;
337 if (contName ==
"alignment")
341 if (args.find(
"file") != args.end())
342 args2[
"input.sequence.file"] = args[
"file"];
344 args2[
"input.sequence.file"] =
"";
346 if (args.find(
"format") != args.end())
347 args2[
"input.sequence.format"] = args[
"format"];
349 if (args.find(
"selection") != args.end())
350 args2[
"input.site.selection"] = args[
"selection"];
352 if (args.find(
"sites_to_use") != args.end())
353 args2[
"input.sequence.sites_to_use"] = args[
"sites_to_use"];
355 if (args.find(
"max_gap_allowed") != args.end())
356 args2[
"input.sequence.max_gap_allowed"] = args[
"max_gap_allowed"];
358 if (args.find(
"max_unresolved_allowed") != args.end())
359 args2[
"input.sequence.max_unresolved_allowed"] = args[
"max_unresolved_allowed"];
361 if (args.find(
"remove_stop_codons") != args.end())
362 args2[
"input.sequence.remove_stop_codons"] = args[
"remove_stop_codons"];
366 auto vsC = getSiteContainer(alpha, args2,
"",
true, verbose, warn);
371 vsC = getSitesToAnalyse(*vsC, args2,
"",
true,
false);
373 if (mCont.find(num) != mCont.end())
377 mCont.emplace(num, std::move(vsC));
380 throw Exception(
"Unknown sequence container name " + contName);
388 map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>>
390 std::shared_ptr<const Alphabet> alpha,
391 const map<string, string>& params,
392 const string& prefix,
393 const string& suffix,
394 bool suffixIsOptional,
400 map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>> mCont;
402 for (
size_t nT = 0; nT < vContName.size(); nT++)
404 size_t poseq = vContName[nT].find(
"=");
406 size_t len = (prefix +
"data").size();
408 string suff = vContName[nT].substr(len, poseq - len);
411 num = TextTools::to<size_t>(suff);
419 map<string, string> args;
423 map<string, string> args2;
425 if (contName ==
"alignment")
429 if (args.find(
"file") != args.end())
430 args2[
"input.sequence.file"] = args[
"file"];
432 args2[
"input.sequence.file"] =
"";
434 if (args.find(
"format") != args.end())
435 args2[
"input.sequence.format"] = args[
"format"];
437 if (args.find(
"selection") != args.end())
438 args2[
"input.site.selection"] = args[
"selection"];
440 if (args.find(
"sites_to_use") != args.end())
441 args2[
"input.sequence.sites_to_use"] = args[
"sites_to_use"];
443 if (args.find(
"max_gap_allowed") != args.end())
444 args2[
"input.sequence.max_gap_allowed"] = args[
"max_gap_allowed"];
446 if (args.find(
"max_unresolved_allowed") != args.end())
447 args2[
"input.sequence.max_unresolved_allowed"] = args[
"max_unresolved_allowed"];
449 if (args.find(
"remove_stop_codons") != args.end())
450 args2[
"input.sequence.remove_stop_codons"] = args[
"remove_stop_codons"];
454 auto vsC = getProbabilisticSiteContainer(alpha, args2,
"",
true, verbose, warn);
459 vsC = getSitesToAnalyse(*vsC, args2,
"",
true,
false);
461 if (mCont.find(num) != mCont.end())
465 mCont.emplace(num, std::move(vsC));
468 throw Exception(
"Unknown sequence container name " + contName);
478 std::shared_ptr<const Alphabet> alpha,
479 const map<string, string>& params,
480 const string& suffix,
481 bool suffixIsOptional,
489 unique_ptr<IAlignment> iAln(bppoReader.
read(sequenceFormat));
498 shared_ptr<const Alphabet> alpha2;
500 alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
504 auto sites2 = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2)));
506 auto sites = unique_ptr<VectorSiteContainer>();
511 sites = make_unique<VectorSiteContainer>(alpha);
512 for (
size_t i = 0; i < sites2->getNumberOfSequences(); ++i)
515 sites->addSequence(sites2->sequenceKey(i), seqP);
519 sites = std::move(sites2);
522 if (iAln->getFormatName() ==
"MASE file")
526 if (siteSet !=
"none")
528 unique_ptr<VectorSiteContainer> selectedSites;
532 selectedSites = std::move(sel);
540 if (selectedSites->getNumberOfSites() == 0)
542 throw Exception(
"Site set '" + siteSet +
"' is empty.");
544 sites = std::move(selectedSites);
549 size_t nbSites = sites->getNumberOfSites();
553 unique_ptr<VectorSiteContainer> selectedSites;
554 if (siteSet !=
"none")
556 if (siteSet[0] ==
'(')
557 siteSet = siteSet.substr(1, siteSet.size() - 2);
559 vector<size_t> vSite;
564 for (
size_t i = 0; i < vSite1.size(); ++i)
566 int x = (vSite1[i] >= 0 ? vSite1[i] :
static_cast<int>(nbSites) + vSite1[i] + 1);
567 if (x <= (
int)nbSites)
570 vSite.push_back(
static_cast<size_t>(x - 1));
578 vSite.push_back(
static_cast<size_t>(nbSites));
582 selectedSites = std::move(sel);
583 selectedSites->reindexSites();
588 map<string, string> selArgs;
590 if (seln ==
"Sample")
592 size_t n = ApplicationTools::getParameter<size_t>(
"n", selArgs, nbSites,
"",
true, warn + 1);
597 for (
size_t p = 0; p < nbSites; ++p)
605 selectedSites = std::move(sel);
607 selectedSites->reindexSites();
610 throw Exception(
"Unknown site selection description: " + siteSet);
616 if (selectedSites && (selectedSites->getNumberOfSites() == 0))
618 throw Exception(
"Site set '" + siteSet +
"' is empty.");
620 sites = std::move(selectedSites);
624 restrictSelectedSequencesByName(*sites, params, suffix, suffixIsOptional, verbose, warn);
632 std::shared_ptr<const Alphabet> alpha,
633 const map<string, string>& params,
634 const string& suffix,
635 bool suffixIsOptional,
643 unique_ptr<IAlignment> iAln;
644 unique_ptr<IProbabilisticAlignment> iProbAln;
648 iAln.reset(bppoReader.
read(sequenceFormat).release());
667 shared_ptr<const Alphabet> alpha2;
669 alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
673 alpha2 = dynamic_pointer_cast<const AllelicAlphabet>(alpha)->getStateAlphabet();
678 unique_ptr<VectorSiteContainer> sites;
679 unique_ptr<ProbabilisticVectorSiteContainer> psites;
682 sites = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2)));
684 psites = make_unique<ProbabilisticVectorSiteContainer>(*iProbAln->readAlignment(sequenceFilePath, alpha2));
694 for (
const auto& name : sites->getSequenceNames())
696 auto sl = ST.
RNYslice(sites->sequence(name));
697 tmpsites->addSequence(name, sl);
699 sites = std::move(tmpsites);
704 if (iAln->getFormatName() ==
"MASE file")
708 if (siteSet !=
"none")
710 unique_ptr<VectorSiteContainer> selectedSites;
721 if (selectedSites->getNumberOfSites() == 0)
723 throw Exception(
"Site set '" + siteSet +
"' is empty.");
725 sites = std::move(selectedSites);
733 auto names = psites->getSequenceNames();
735 auto chars = alpha2->getResolvedChars();
737 Table<double> dtable(chars.size(), sites->getNumberOfSites());
740 for (
const auto& name : names)
742 const auto& sequence = psites->sequence(name);
743 vector<double> vval(chars.size());
745 for (
size_t pos = 0; pos < sequence.size(); pos++)
750 vval[i] = sequence.getStateValueAt(pos, alpha2->charToInt(c));
757 unique_ptr<ProbabilisticSequence> seq(
new ProbabilisticSequence(name, dtable, sequence.getComments(), alpha2));
759 psites->addSequence(name, seq);
768 auto names = psites->getSequenceNames();
770 for (
const auto& name : names)
772 unique_ptr<ProbabilisticSequence> seq(dynamic_pointer_cast<const AllelicAlphabet>(alpha)->convertFromStateAlphabet(psites->sequence(name)));
774 pallsites->addSequence(name, seq);
777 psites = std::move(pallsites);
782 size_t nbSites = sites ? sites->getNumberOfSites() : psites->getNumberOfSites();
787 if (siteSet !=
"none")
789 if (siteSet[0] ==
'(')
790 siteSet = siteSet.substr(1, siteSet.size() - 2);
792 vector<size_t> vSite;
797 for (
auto ps : vSite1)
799 int x = (ps >= 0 ? ps :
static_cast<int>(nbSites) + ps + 1);
800 if (x <= (
int)nbSites)
803 vSite.push_back(
static_cast<size_t>(x - 1));
806 throw Exception(
"SequenceApplicationTools::getSiteContainer(). Incorrect null index");
813 vSite.push_back(
static_cast<size_t>(nbSites - 1));
821 map<string, string> selArgs;
823 if (seln ==
"Sample")
825 size_t n = ApplicationTools::getParameter<size_t>(
"n", selArgs, nbSites,
"",
true, warn + 1);
830 for (
size_t p = 0; p < nbSites; ++p)
838 throw Exception(
"Unknown site selection description: " + siteSet);
845 auto selectedSites = make_unique<ProbabilisticVectorSiteContainer>(alpha);
849 if (selectedSites && (selectedSites->getNumberOfSites() == 0))
851 throw Exception(
"Site set '" + siteSet +
"' is empty.");
856 psites = std::move(selectedSites);
869 const map<std::string, std::string>& params,
871 bool suffixIsOptional,
876 if (optionKeep !=
"all")
878 vector<string> selection = ApplicationTools::getVectorParameter<string>(
"input.sequence.keep_names", params,
',', optionKeep, suffix, suffixIsOptional, warn);
879 sort(selection.begin(), selection.end());
882 if (!binary_search(selection.begin(), selection.end(), name))
893 if (optionRemove !=
"none")
895 vector<string> selection = ApplicationTools::getVectorParameter<string>(
"input.sequence.remove_names", params,
',', optionRemove, suffix, suffixIsOptional, warn);
897 sort(seqNames.begin(), seqNames.end());
898 for (
const auto& name: selection)
900 if (binary_search(seqNames.begin(), seqNames.end(), name))
920 const map<string, string>& params,
921 const string& suffix,
928 unique_ptr<OSequence> oSeq(bppoWriter.
read(sequenceFormat));
936 oSeq->writeSequences(sequenceFilePath, sequences,
true);
943 const map<string, string>& params,
944 const string& suffix,
951 unique_ptr<OAlignment> oAln(bppoWriter.
read(sequenceFormat));
959 oAln->writeAlignment(sequenceFilePath, sites,
true);
This class implements the ascidian mitochondrial genetic code as describe on the NCBI web site: http:...
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
This class implements the Echinoderm and Faltworms Mitochondrial genetic code as describe on the NCBI...
Partial implementation of the Transliterator interface for genetic code object.
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
A basic implementation of the ProbabilisticSequence interface.
Exception thrown when a sequence is not found The sequence not found exception base class.
This class implements the standard genetic code as describe on the NCBI web site: http://www....
void setRowNames(const std::vector< std::string > &rowNames)
void setColumn(const std::vector< T > &newColumn, size_t pos)
The SequenceContainer interface.
virtual std::unique_ptr< SequenceType > removeSequence(const HashType &sequenceKey)=0
Remove a sequence from the container.
virtual std::vector< std::string > getSequenceNames() const =0
The VectorSiteContainer class.
This class implements the vertebrate mitochondrial genetic code as describe on the NCBI web site: htt...
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
bool isDecimalInteger(const std::string &s, char scientificNotation='e')
std::string toString(T t)
This alphabet is used to deal NumericAlphabet.
TemplateVectorSiteContainer< ProbabilisticSite, ProbabilisticSequence > ProbabilisticVectorSiteContainer
TemplateVectorSiteContainer< Site, Sequence > VectorSiteContainer