bpp-seq3  3.0.0
SequenceApplicationTools.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
8 #include <Bpp/Text/KeyvalTools.h>
9 #include <Bpp/Text/TextTools.h>
10 #include <algorithm>
11 
12 #include "../Alphabet/AlphabetTools.h"
13 #include "../Alphabet/AllelicAlphabet.h"
14 #include "../Alphabet/BinaryAlphabet.h"
15 #include "../Alphabet/CodonAlphabet.h"
16 #include "../Alphabet/DefaultAlphabet.h"
17 #include "../Alphabet/LexicalAlphabet.h"
18 #include "../Container/VectorSiteContainer.h"
19 #include "../GeneticCode/AscidianMitochondrialGeneticCode.h"
20 #include "../GeneticCode/CiliateNuclearGeneticCode.h"
21 #include "../GeneticCode/EchinodermMitochondrialGeneticCode.h"
22 #include "../GeneticCode/InvertebrateMitochondrialGeneticCode.h"
23 #include "../GeneticCode/MoldMitochondrialGeneticCode.h"
24 #include "../GeneticCode/StandardGeneticCode.h"
25 #include "../GeneticCode/VertebrateMitochondrialGeneticCode.h"
26 #include "../GeneticCode/YeastMitochondrialGeneticCode.h"
27 #include "../Io/BppOAlignmentReaderFormat.h"
28 #include "../Io/BppOAlignmentWriterFormat.h"
29 #include "../Io/BppOAlphabetIndex1Format.h"
30 #include "../Io/BppOAlphabetIndex2Format.h"
31 #include "../Io/BppOSequenceReaderFormat.h"
32 #include "../Io/BppOSequenceWriterFormat.h"
33 #include "../Io/MaseTools.h"
34 #include "../SequenceTools.h"
35 #include "../SymbolListTools.h"
37 
38 using namespace bpp;
39 using namespace std;
40 
41 /******************************************************************************/
42 
44  const map<string, string>& params,
45  const string& suffix,
46  bool suffixIsOptional,
47  bool verbose,
48  bool allowGeneric,
49  int warn)
50 {
51  unique_ptr<Alphabet> chars = nullptr;
52 
53  string alphtt = ApplicationTools::getStringParameter("alphabet", params, "DNA", suffix, suffixIsOptional, warn);
54 
55  map<string, string> args;
56 
57  string alphabet;
58  KeyvalTools::parseProcedure(alphtt, alphabet, args);
59 
60  if (alphabet == "Word")
61  {
62  if (args.find("letter") != args.end())
63  {
64  args["alphabet"] = args["letter"];
65 
66  unique_ptr<const Alphabet> inAlphabet = SequenceApplicationTools::getAlphabet(args, suffix, suffixIsOptional, false, allowGeneric, warn + 1);
67 
68  if (args.find("length") == args.end())
69  throw Exception("Missing length parameter for Word alphabet");
70 
71  size_t lg = TextTools::to<size_t>(args["length"]);
72 
73  chars = make_unique<WordAlphabet>(std::move(inAlphabet), lg);
74  }
75  else
76  {
77  size_t indAlph = 1;
78  vector< shared_ptr<const Alphabet>> vAlph;
79 
80  while (args.find("alphabet" + TextTools::toString(indAlph)) != args.end())
81  {
82  map<string, string> args2;
83  args2["alphabet"] = args["alphabet" + TextTools::toString(indAlph)];
84 
85  vAlph.push_back(SequenceApplicationTools::getAlphabet(args2, suffix, suffixIsOptional, verbose, allowGeneric, warn));
86  indAlph++;
87  }
88 
89  if (vAlph.size() == 0)
90  throw Exception("SequenceApplicationTools::getAlphabet. At least one alphabet is compulsory for Word alphabet");
91 
92  chars = make_unique<WordAlphabet>(vAlph);
93  }
94  }
95  else if (alphabet == "RNY")
96  {
97  if (args.find("letter") == args.end())
98  throw Exception("Missing letter alphabet for RNY alphabet");
99 
100  args["alphabet"] = args["letter"];
101 
102  shared_ptr<Alphabet> inAlphabet = SequenceApplicationTools::getAlphabet(args, suffix, suffixIsOptional, verbose, allowGeneric, warn);
103 
104  if (AlphabetTools::isNucleicAlphabet(*inAlphabet))
105  {
106  auto inNuc = dynamic_pointer_cast<const NucleicAlphabet>(inAlphabet);
107  chars = make_unique<RNY>(inNuc);
108  alphabet = "RNY(" + alphabet + ")";
109  }
110  else
111  throw Exception("RNY needs a Nucleic Alphabet, instead of " + args["letter"]);
112  }
113  else if (alphabet == "Binary")
114  chars = make_unique<BinaryAlphabet>();
115  else if (alphabet == "Integer")
116  {
117  if (args.find("N") == args.end())
118  throw Exception("Missing 'N' argument in Integer for size:" + alphabet);
119 
120  uint N = TextTools::to<unsigned int>(args["N"]);
121  chars = make_unique<IntegerAlphabet>(N);
122  }
123  else if (alphabet == "Lexicon")
124  {
125  vector<string> vWord = ApplicationTools::getVectorParameter<string>("words", args, ',', "()");
126 
127  if (vWord.size() == 0)
128  throw Exception("SequenceApplicationTools::getAlphabet. 'words' list is missing or empty for Lexicon alphabet");
129 
130  chars = make_unique<LexicalAlphabet>(vWord);
131  }
132  else if (alphabet == "DNA")
133  {
134  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", args, false, "", true, warn + 1);
135  chars = make_unique<DNA>(mark);
136  }
137  else if (alphabet == "RNA")
138  {
139  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", args, false, "", true, warn + 1);
140  chars = make_unique<RNA>(mark);
141  }
142  else if (alphabet == "Protein")
143  chars = make_unique<ProteicAlphabet>();
144  else if (allowGeneric && alphabet == "Generic")
145  chars = make_unique<DefaultAlphabet>();
146  else if (alphabet == "Codon")
147  {
148  if (args.find("letter") == args.end())
149  throw Exception("Missing 'letter' argument in Codon :" + alphabet);
150  if (args.find("type") != args.end())
151  throw Exception("'type' argument in Codon is deprecated and has been superseded by the 'genetic_code' option.");
152 
153  string alphnDesc = ApplicationTools::getStringParameter("letter", args, "RNA");
154  string alphn;
155  map<string, string> alphnArgs;
156  KeyvalTools::parseProcedure(alphnDesc, alphn, alphnArgs);
157 
158  shared_ptr<const NucleicAlphabet> pnalph;
159  if (alphn == "RNA")
160  {
161  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", alphnArgs, false, "", true, warn + 1);
162  pnalph = make_shared<RNA>(mark);
163  }
164  else if (alphn == "DNA")
165  {
166  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", alphnArgs, false, "", true, warn + 1);
167  pnalph = make_shared<DNA>(mark);
168  }
169  else
170  throw Exception("Alphabet not known in Codon : " + alphn);
171 
172  chars = make_unique<CodonAlphabet>(pnalph);
173  }
174  else if (alphabet == "Allelic")
175  {
176  if (args.find("base") == args.end())
177  throw Exception("Missing 'base' argument in Allelic for sequence alphabet :" + alphabet);
178 
179  if (args.find("N") == args.end())
180  throw Exception("Missing 'N' argument in Allelic for number of alleles:" + alphabet);
181 
182  uint N = TextTools::to<unsigned int>(args["N"]);
183 
184  args["alphabet"] = args["base"];
185 
186  auto inAlphabet = std::shared_ptr<Alphabet>(SequenceApplicationTools::getAlphabet(args, suffix, suffixIsOptional, false, allowGeneric, warn + 1));
187 
188  chars = make_unique<AllelicAlphabet>(inAlphabet, N);
189  }
190  else
191  throw Exception("Alphabet not known: " + alphabet);
192 
193  if (verbose)
194  ApplicationTools::displayResult("Alphabet type ", chars->getAlphabetType());
195  return chars;
196 }
197 
198 /******************************************************************************/
199 
200 std::unique_ptr<GeneticCode> SequenceApplicationTools::getGeneticCode(
201  std::shared_ptr<const NucleicAlphabet> alphabet,
202  const string& description)
203 {
204  GeneticCode* geneCode;
205  if (description.find("Standard") != string::npos || description.find("1") != string::npos)
206  geneCode = new StandardGeneticCode(alphabet);
207  else if (description.find("VertebrateMitochondrial") != string::npos || description.find("2") != string::npos)
208  geneCode = new VertebrateMitochondrialGeneticCode(alphabet);
209  else if (description.find("YeastMitochondrial") != string::npos || description.find("3") != string::npos)
210  geneCode = new YeastMitochondrialGeneticCode(alphabet);
211  else if (description.find("MoldMitochondrial") != string::npos || description.find("4") != string::npos)
212  geneCode = new MoldMitochondrialGeneticCode(alphabet);
213  else if (description.find("InvertebrateMitochondrial") != string::npos || description.find("5") != string::npos)
214  geneCode = new InvertebrateMitochondrialGeneticCode(alphabet);
215  else if (description.find("CiliateNuclear") != string::npos || description.find("6") != string::npos)
216  geneCode = new CiliateNuclearGeneticCode(alphabet);
217  else if (description.find("EchinodermMitochondrial") != string::npos || description.find("9") != string::npos)
218  geneCode = new EchinodermMitochondrialGeneticCode(alphabet);
219  else if (description.find("AscidianMitochondrial") != string::npos || description.find("13") != string::npos)
220  geneCode = new AscidianMitochondrialGeneticCode(alphabet);
221  else
222  throw Exception("Unknown GeneticCode: " + description);
223  return unique_ptr<GeneticCode>(geneCode);
224 }
225 
226 /******************************************************************************/
227 
229  std::shared_ptr<const Alphabet> alphabet,
230  const string& description,
231  const string& message,
232  bool verbose)
233 {
234  BppOAlphabetIndex1Format reader(alphabet, message, verbose);
235  return reader.read(description);
236 }
237 
239  std::shared_ptr<const Alphabet> alphabet,
240  const string& description,
241  const string& message,
242  bool verbose)
243 {
244  BppOAlphabetIndex2Format reader(alphabet, message, verbose);
245  return reader.read(description);
246 }
247 
249  std::shared_ptr<const CodonAlphabet> alphabet,
250  std::shared_ptr<const GeneticCode> gencode,
251  const string& description,
252  const string& message,
253  bool verbose)
254 {
255  BppOAlphabetIndex1Format reader(alphabet, gencode, message, verbose);
256  return reader.read(description);
257 }
258 
260  std::shared_ptr<const CodonAlphabet> alphabet,
261  std::shared_ptr<const GeneticCode> gencode,
262  const string& description,
263  const string& message,
264  bool verbose)
265 {
266  BppOAlphabetIndex2Format reader(alphabet, gencode, message, verbose);
267  return reader.read(description);
268 }
269 
270 /******************************************************************************/
271 
272 std::unique_ptr<SequenceContainerInterface> SequenceApplicationTools::getSequenceContainer(
273  std::shared_ptr<const Alphabet> alpha,
274  const map<string, string>& params,
275  const string& suffix,
276  bool suffixIsOptional,
277  bool verbose,
278  int warn)
279 {
280  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
281  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
282  BppOSequenceReaderFormat bppoReader(warn);
283  unique_ptr<ISequence> iSeq(bppoReader.read(sequenceFormat));
284  if (verbose)
285  {
286  ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
287  ApplicationTools::displayResult("Sequence format " + suffix, iSeq->getFormatName());
288  }
289  std::unique_ptr<SequenceContainerInterface> sequences = iSeq->readSequences(sequenceFilePath, alpha);
290 
291  // Apply sequence selection:
292  restrictSelectedSequencesByName(*sequences, params, suffix, suffixIsOptional, verbose, warn);
293 
294  return sequences;
295 }
296 
297 
298 /******************************************************************************/
299 
300 map<size_t, unique_ptr<VectorSiteContainer>>
302  std::shared_ptr<const Alphabet> alpha,
303  const map<string, string>& params,
304  const string& prefix,
305  const string& suffix,
306  bool suffixIsOptional,
307  bool verbose,
308  int warn)
309 {
310  vector<string> vContName = ApplicationTools::matchingParameters(prefix + "data*", params);
311 
312  map<size_t, unique_ptr<VectorSiteContainer>> mCont;
313 
314  for (size_t nT = 0; nT < vContName.size(); nT++)
315  {
316  size_t poseq = vContName[nT].find("=");
317  size_t num = 0;
318  size_t len = (prefix + "data").size();
319 
320  string suff = vContName[nT].substr(len, poseq - len);
321 
322  if (TextTools::isDecimalInteger(suff, '$'))
323  num = TextTools::to<size_t>(suff);
324  else
325  num = 1;
326 
327  string contDesc = ApplicationTools::getStringParameter(vContName[nT], params, "", suffix, suffixIsOptional);
328 
329  string contName;
330 
331  map<string, string> args;
332 
333  KeyvalTools::parseProcedure(contDesc, contName, args);
334 
335  map<string, string> args2;
336 
337  if (contName == "alignment")
338  {
339  string format;
340 
341  if (args.find("file") != args.end())
342  args2["input.sequence.file"] = args["file"];
343  else
344  args2["input.sequence.file"] = "";
345 
346  if (args.find("format") != args.end())
347  args2["input.sequence.format"] = args["format"];
348 
349  if (args.find("selection") != args.end())
350  args2["input.site.selection"] = args["selection"];
351 
352  if (args.find("sites_to_use") != args.end())
353  args2["input.sequence.sites_to_use"] = args["sites_to_use"];
354 
355  if (args.find("max_gap_allowed") != args.end())
356  args2["input.sequence.max_gap_allowed"] = args["max_gap_allowed"];
357 
358  if (args.find("max_unresolved_allowed") != args.end())
359  args2["input.sequence.max_unresolved_allowed"] = args["max_unresolved_allowed"];
360 
361  if (args.find("remove_stop_codons") != args.end())
362  args2["input.sequence.remove_stop_codons"] = args["remove_stop_codons"];
363 
364  args2["genetic_code"] = ApplicationTools::getStringParameter("genetic_code", params, "", "", true, (AlphabetTools::isCodonAlphabet(*alpha) ? 0 : 1));
365 
366  auto vsC = getSiteContainer(alpha, args2, "", true, verbose, warn);
367 
370 
371  vsC = getSitesToAnalyse(*vsC, args2, "", true, false);
372 
373  if (mCont.find(num) != mCont.end())
374  {
375  ApplicationTools::displayWarning("Alignment " + TextTools::toString(num) + " already assigned, replaced by new one.");
376  }
377  mCont.emplace(num, std::move(vsC));
378  }
379  else
380  throw Exception("Unknown sequence container name " + contName);
381  }
382 
383  return mCont;
384 }
385 
386 /******************************************************************************/
387 
388 map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>>
390  std::shared_ptr<const Alphabet> alpha,
391  const map<string, string>& params,
392  const string& prefix,
393  const string& suffix,
394  bool suffixIsOptional,
395  bool verbose,
396  int warn)
397 {
398  vector<string> vContName = ApplicationTools::matchingParameters(prefix + "data*", params);
399 
400  map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>> mCont;
401 
402  for (size_t nT = 0; nT < vContName.size(); nT++)
403  {
404  size_t poseq = vContName[nT].find("=");
405  size_t num = 0;
406  size_t len = (prefix + "data").size();
407 
408  string suff = vContName[nT].substr(len, poseq - len);
409 
410  if (TextTools::isDecimalInteger(suff, '$'))
411  num = TextTools::to<size_t>(suff);
412  else
413  num = 1;
414 
415  string contDesc = ApplicationTools::getStringParameter(vContName[nT], params, "", suffix, suffixIsOptional);
416 
417  string contName;
418 
419  map<string, string> args;
420 
421  KeyvalTools::parseProcedure(contDesc, contName, args);
422 
423  map<string, string> args2;
424 
425  if (contName == "alignment")
426  {
427  string format;
428 
429  if (args.find("file") != args.end())
430  args2["input.sequence.file"] = args["file"];
431  else
432  args2["input.sequence.file"] = "";
433 
434  if (args.find("format") != args.end())
435  args2["input.sequence.format"] = args["format"];
436 
437  if (args.find("selection") != args.end())
438  args2["input.site.selection"] = args["selection"];
439 
440  if (args.find("sites_to_use") != args.end())
441  args2["input.sequence.sites_to_use"] = args["sites_to_use"];
442 
443  if (args.find("max_gap_allowed") != args.end())
444  args2["input.sequence.max_gap_allowed"] = args["max_gap_allowed"];
445 
446  if (args.find("max_unresolved_allowed") != args.end())
447  args2["input.sequence.max_unresolved_allowed"] = args["max_unresolved_allowed"];
448 
449  if (args.find("remove_stop_codons") != args.end())
450  args2["input.sequence.remove_stop_codons"] = args["remove_stop_codons"];
451 
452  args2["genetic_code"] = ApplicationTools::getStringParameter("genetic_code", params, "", "", true, (AlphabetTools::isCodonAlphabet(*alpha) ? 0 : 1));
453 
454  auto vsC = getProbabilisticSiteContainer(alpha, args2, "", true, verbose, warn);
455 
458 
459  vsC = getSitesToAnalyse(*vsC, args2, "", true, false);
460 
461  if (mCont.find(num) != mCont.end())
462  {
463  ApplicationTools::displayWarning("Alignment " + TextTools::toString(num) + " already assigned, replaced by new one.");
464  }
465  mCont.emplace(num, std::move(vsC));
466  }
467  else
468  throw Exception("Unknown sequence container name " + contName);
469 
470  }
471 
472  return mCont;
473 }
474 
475 /******************************************************************************/
476 
477 std::unique_ptr<VectorSiteContainer> SequenceApplicationTools::getSiteContainer(
478  std::shared_ptr<const Alphabet> alpha,
479  const map<string, string>& params,
480  const string& suffix,
481  bool suffixIsOptional,
482  bool verbose,
483  int warn)
484 {
485  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
486  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
487  BppOAlignmentReaderFormat bppoReader(warn);
488 
489  unique_ptr<IAlignment> iAln(bppoReader.read(sequenceFormat));
490  map<string, string> args(bppoReader.getUnparsedArguments());
491 
492  if (verbose)
493  {
494  ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
495  ApplicationTools::displayResult("Sequence format " + suffix, iAln->getFormatName());
496  }
497 
498  shared_ptr<const Alphabet> alpha2;
499  if (AlphabetTools::isRNYAlphabet(*alpha))
500  alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
501  else
502  alpha2 = alpha;
503 
504  auto sites2 = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2))); // We copy into a VectorSiteContainer, as most readers will generate an AlignedSequenceContainer)
505 
506  auto sites = unique_ptr<VectorSiteContainer>();
507 
509  if (AlphabetTools::isRNYAlphabet(*alpha))
510  {
511  sites = make_unique<VectorSiteContainer>(alpha);
512  for (size_t i = 0; i < sites2->getNumberOfSequences(); ++i)
513  {
514  auto seqP = SequenceTools::RNYslice(sites2->sequence(i));
515  sites->addSequence(sites2->sequenceKey(i), seqP);
516  }
517  }
518  else
519  sites = std::move(sites2);
520 
521  // Look for site selection:
522  if (iAln->getFormatName() == "MASE file")
523  {
524  // getting site set:
525  string siteSet = ApplicationTools::getStringParameter("siteSelection", args, "none", suffix, suffixIsOptional, warn + 1);
526  if (siteSet != "none")
527  {
528  unique_ptr<VectorSiteContainer> selectedSites;
529  try
530  {
531  auto sel = MaseTools::getSelectedSites(*sites, siteSet);
532  selectedSites = std::move(sel);
533  if (verbose)
534  ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");
535  }
536  catch (IOException& ioe)
537  {
538  throw ioe;
539  }
540  if (selectedSites->getNumberOfSites() == 0)
541  {
542  throw Exception("Site set '" + siteSet + "' is empty.");
543  }
544  sites = std::move(selectedSites);
545  }
546  }
547 
548  // getting site set:
549  size_t nbSites = sites->getNumberOfSites();
550 
551  string siteSet = ApplicationTools::getStringParameter("input.site.selection", params, "none", suffix, suffixIsOptional, warn + 1);
552 
553  unique_ptr<VectorSiteContainer> selectedSites;
554  if (siteSet != "none")
555  {
556  if (siteSet[0] == '(')
557  siteSet = siteSet.substr(1, siteSet.size() - 2);
558 
559  vector<size_t> vSite;
560  try
561  {
562  vector<int> vSite1 = NumCalcApplicationTools::seqFromString(siteSet, ",", ":");
563 
564  for (size_t i = 0; i < vSite1.size(); ++i)
565  {
566  int x = (vSite1[i] >= 0 ? vSite1[i] : static_cast<int>(nbSites) + vSite1[i] + 1);
567  if (x <= (int)nbSites)
568  {
569  if (x > 0)
570  vSite.push_back(static_cast<size_t>(x - 1));
571  else
572  throw Exception("SequenceApplicationTools::getSiteContainer(). Incorrect null index: " + TextTools::toString(x));
573  }
574  else
575  {
576  ApplicationTools::displayResult("Site selection too large index", TextTools::toString(x));
577  ApplicationTools::displayResult("Limit to max length", TextTools::toString(nbSites));
578  vSite.push_back(static_cast<size_t>(nbSites));
579  }
580  }
581  auto sel = SiteContainerTools::getSelectedSites(*sites, vSite);
582  selectedSites = std::move(sel);
583  selectedSites->reindexSites();
584  }
585  catch (Exception& e)
586  {
587  string seln;
588  map<string, string> selArgs;
589  KeyvalTools::parseProcedure(siteSet, seln, selArgs);
590  if (seln == "Sample")
591  {
592  size_t n = ApplicationTools::getParameter<size_t>("n", selArgs, nbSites, "", true, warn + 1);
593  bool replace = ApplicationTools::getBooleanParameter("replace", selArgs, false, "", true, warn + 1);
594 
595  vSite.resize(n);
596  vector<size_t> vPos;
597  for (size_t p = 0; p < nbSites; ++p)
598  {
599  vPos.push_back(p);
600  }
601 
602  RandomTools::getSample(vPos, vSite, replace);
603 
604  auto sel = SiteContainerTools::getSelectedSites(*sites, vSite);
605  selectedSites = std::move(sel);
606  if (replace)
607  selectedSites->reindexSites();
608  }
609  else
610  throw Exception("Unknown site selection description: " + siteSet);
611  }
612 
613  if (verbose)
614  ApplicationTools::displayResult("Selected sites", TextTools::toString(siteSet));
615 
616  if (selectedSites && (selectedSites->getNumberOfSites() == 0))
617  {
618  throw Exception("Site set '" + siteSet + "' is empty.");
619  }
620  sites = std::move(selectedSites);
621  }
622 
623  // Apply sequence selection:
624  restrictSelectedSequencesByName(*sites, params, suffix, suffixIsOptional, verbose, warn);
625 
626  return sites;
627 }
628 
629 /******************************************************************************/
630 
631 unique_ptr<ProbabilisticVectorSiteContainer> SequenceApplicationTools::getProbabilisticSiteContainer(
632  std::shared_ptr<const Alphabet> alpha,
633  const map<string, string>& params,
634  const string& suffix,
635  bool suffixIsOptional,
636  bool verbose,
637  int warn)
638 {
639  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
640  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
641  BppOAlignmentReaderFormat bppoReader(warn);
642 
643  unique_ptr<IAlignment> iAln;
644  unique_ptr<IProbabilisticAlignment> iProbAln;
645 
646  try
647  {
648  iAln.reset(bppoReader.read(sequenceFormat).release());
649  }
650  catch (Exception& e)
651  {
652  iProbAln.reset(bppoReader.readProbabilistic(sequenceFormat).release());
653  }
654 
655  // Probabilistic from Sequence format only possible for Allelic alphabet
656  if (iAln && !AlphabetTools::isAllelicAlphabet(*alpha))
657  throw IOException("Bad format");
658 
659 
660  map<string, string> args(bppoReader.getUnparsedArguments());
661  if (verbose)
662  {
663  ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
664  ApplicationTools::displayResult("Sequence format " + suffix, iAln ? iAln->getFormatName() : iProbAln->getFormatName());
665  }
666 
667  shared_ptr<const Alphabet> alpha2;
668  if (AlphabetTools::isRNYAlphabet(*alpha))
669  alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
670  else
671  {
673  alpha2 = dynamic_pointer_cast<const AllelicAlphabet>(alpha)->getStateAlphabet();
674  else
675  alpha2 = alpha;
676  }
677 
678  unique_ptr<VectorSiteContainer> sites;
679  unique_ptr<ProbabilisticVectorSiteContainer> psites;
680 
681  if (iAln)
682  sites = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2)));
683  else
684  psites = make_unique<ProbabilisticVectorSiteContainer>(*iProbAln->readAlignment(sequenceFilePath, alpha2));
685 
686  if (sites)
687  {
689  if (AlphabetTools::isRNYAlphabet(*alpha2))
690  {
691  unique_ptr<VectorSiteContainer> tmpsites(new VectorSiteContainer(alpha2));
692 
693  const SequenceTools ST;
694  for (const auto& name : sites->getSequenceNames())
695  {
696  auto sl = ST.RNYslice(sites->sequence(name));
697  tmpsites->addSequence(name, sl);
698  }
699  sites = std::move(tmpsites);
700  }
701 
702  // Look for site selection:
703 
704  if (iAln->getFormatName() == "MASE file")
705  {
706  // getting site set:
707  string siteSet = ApplicationTools::getStringParameter("siteSelection", args, "none", suffix, suffixIsOptional, warn + 1);
708  if (siteSet != "none")
709  {
710  unique_ptr<VectorSiteContainer> selectedSites;
711  try
712  {
713  selectedSites.reset(dynamic_cast<VectorSiteContainer*>(MaseTools::getSelectedSites(*sites, siteSet).release()));
714  if (verbose)
715  ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");
716  }
717  catch (IOException& ioe)
718  {
719  throw ioe;
720  }
721  if (selectedSites->getNumberOfSites() == 0)
722  {
723  throw Exception("Site set '" + siteSet + "' is empty.");
724  }
725  sites = std::move(selectedSites);
726  }
727  }
728 
729  // Then convert VectorSiteContainer in ProbabilisticVectorSiteContainer
730 
731  psites.reset(new ProbabilisticVectorSiteContainer(alpha2));
732 
733  auto names = psites->getSequenceNames();
734 
735  auto chars = alpha2->getResolvedChars();
736 
737  Table<double> dtable(chars.size(), sites->getNumberOfSites());
738  dtable.setRowNames(chars);
739 
740  for (const auto& name : names)
741  {
742  const auto& sequence = psites->sequence(name);
743  vector<double> vval(chars.size());
744 
745  for (size_t pos = 0; pos < sequence.size(); pos++)
746  {
747  size_t i = 0;
748  for (auto c:chars)
749  {
750  vval[i] = sequence.getStateValueAt(pos, alpha2->charToInt(c));
751  i++;
752  }
753 
754  dtable.setColumn(vval, pos);
755  }
756 
757  unique_ptr<ProbabilisticSequence> seq(new ProbabilisticSequence(name, dtable, sequence.getComments(), alpha2));
758 
759  psites->addSequence(name, seq);
760  }
761  }
762 
765  {
766  auto pallsites = unique_ptr<ProbabilisticVectorSiteContainer>(new ProbabilisticVectorSiteContainer(alpha));
767 
768  auto names = psites->getSequenceNames();
769 
770  for (const auto& name : names)
771  {
772  unique_ptr<ProbabilisticSequence> seq(dynamic_pointer_cast<const AllelicAlphabet>(alpha)->convertFromStateAlphabet(psites->sequence(name)));
773 
774  pallsites->addSequence(name, seq);
775  }
776 
777  psites = std::move(pallsites);
778  }
779 
780  // getting selection site set:
781 
782  size_t nbSites = sites ? sites->getNumberOfSites() : psites->getNumberOfSites();
783 
784  string siteSet = ApplicationTools::getStringParameter("input.site.selection", params, "none", suffix, suffixIsOptional, warn + 1);
785 
786 
787  if (siteSet != "none")
788  {
789  if (siteSet[0] == '(')
790  siteSet = siteSet.substr(1, siteSet.size() - 2);
791 
792  vector<size_t> vSite;
793  try
794  {
795  vector<int> vSite1 = NumCalcApplicationTools::seqFromString(siteSet, ",", ":");
796 
797  for (auto ps : vSite1)
798  {
799  int x = (ps >= 0 ? ps : static_cast<int>(nbSites) + ps + 1);
800  if (x <= (int)nbSites)
801  {
802  if (x > 0)
803  vSite.push_back(static_cast<size_t>(x - 1));
804  else
805  {
806  throw Exception("SequenceApplicationTools::getSiteContainer(). Incorrect null index");
807  }
808  }
809  else
810  {
811  ApplicationTools::displayMessage("Site selection too large index: " + TextTools::toString(x));
812  ApplicationTools::displayMessage("Limit to max length: " + TextTools::toString(nbSites));
813  vSite.push_back(static_cast<size_t>(nbSites - 1));
814  break;
815  }
816  }
817  }
818  catch (Exception& e)
819  {
820  string seln;
821  map<string, string> selArgs;
822  KeyvalTools::parseProcedure(siteSet, seln, selArgs);
823  if (seln == "Sample")
824  {
825  size_t n = ApplicationTools::getParameter<size_t>("n", selArgs, nbSites, "", true, warn + 1);
826  bool replace = ApplicationTools::getBooleanParameter("replace", selArgs, false, "", true, warn + 1);
827 
828  vSite.resize(n);
829  vector<size_t> vPos;
830  for (size_t p = 0; p < nbSites; ++p)
831  {
832  vPos.push_back(p);
833  }
834 
835  RandomTools::getSample(vPos, vSite, replace);
836  }
837  else
838  throw Exception("Unknown site selection description: " + siteSet);
839  }
840 
841 
842  if (verbose)
843  ApplicationTools::displayResult("Selected sites", TextTools::toString(siteSet));
844 
845  auto selectedSites = make_unique<ProbabilisticVectorSiteContainer>(alpha);
846 
847  SiteContainerTools::getSelectedSites(*psites, vSite, *selectedSites);
848 
849  if (selectedSites && (selectedSites->getNumberOfSites() == 0))
850  {
851  throw Exception("Site set '" + siteSet + "' is empty.");
852  }
853 
854  // if (replace)
855  // selectedSites->reindexSites();
856  psites = std::move(selectedSites);
857  }
858 
859  // Apply sequence selection:
860 // restrictSelectedSequencesByName(*psites, params, suffix, suffixIsOptional, verbose, warn);
861 
862  return psites;
863 }
864 
865 /******************************************************************************/
866 
868  SequenceContainerInterface& allSequences,
869  const map<std::string, std::string>& params,
870  string suffix,
871  bool suffixIsOptional,
872  bool verbose,
873  int warn)
874 {
875  string optionKeep = ApplicationTools::getStringParameter("input.sequence.keep_names", params, "all", suffix, suffixIsOptional, warn);
876  if (optionKeep != "all")
877  {
878  vector<string> selection = ApplicationTools::getVectorParameter<string>("input.sequence.keep_names", params, ',', optionKeep, suffix, suffixIsOptional, warn);
879  sort(selection.begin(), selection.end());
880  for (const auto& name: allSequences.getSequenceNames())
881  {
882  if (!binary_search(selection.begin(), selection.end(), name))
883  {
884  allSequences.removeSequence(name);
885  if (verbose)
886  {
887  ApplicationTools::displayResult("Discard sequence", name);
888  }
889  }
890  }
891  }
892  string optionRemove = ApplicationTools::getStringParameter("input.sequence.remove_names", params, "none", suffix, suffixIsOptional, warn);
893  if (optionRemove != "none")
894  {
895  vector<string> selection = ApplicationTools::getVectorParameter<string>("input.sequence.remove_names", params, ',', optionRemove, suffix, suffixIsOptional, warn);
896  vector<string> seqNames = allSequences.getSequenceNames();
897  sort(seqNames.begin(), seqNames.end());
898  for (const auto& name: selection)
899  {
900  if (binary_search(seqNames.begin(), seqNames.end(), name))
901  {
902  allSequences.removeSequence(name);
903  if (verbose)
904  {
905  ApplicationTools::displayResult("Discard sequence", name);
906  }
907  }
908  else
909  {
910  throw SequenceNotFoundException("No sequence with the specified name was found.", name);
911  }
912  }
913  }
914 }
915 
916 /******************************************************************************/
917 
919  const SequenceContainerInterface& sequences,
920  const map<string, string>& params,
921  const string& suffix,
922  bool verbose,
923  int warn)
924 {
925  string sequenceFilePath = ApplicationTools::getAFilePath("output.sequence.file", params, true, false, suffix, false, "none", warn);
926  string sequenceFormat = ApplicationTools::getStringParameter("output.sequence.format", params, "Fasta", suffix, false, warn);
927  BppOSequenceWriterFormat bppoWriter(warn);
928  unique_ptr<OSequence> oSeq(bppoWriter.read(sequenceFormat));
929  if (verbose)
930  {
931  ApplicationTools::displayResult("Output sequence file " + suffix, sequenceFilePath);
932  ApplicationTools::displayResult("Output sequence format " + suffix, oSeq->getFormatName());
933  }
934 
935  // Write sequences:
936  oSeq->writeSequences(sequenceFilePath, sequences, true);
937 }
938 
939 /******************************************************************************/
940 
942  const SiteContainerInterface& sites,
943  const map<string, string>& params,
944  const string& suffix,
945  bool verbose,
946  int warn)
947 {
948  string sequenceFilePath = ApplicationTools::getAFilePath("output.sequence.file", params, true, false, suffix, false, "none", warn);
949  string sequenceFormat = ApplicationTools::getStringParameter("output.sequence.format", params, "Fasta", suffix, false, warn);
950  BppOAlignmentWriterFormat bppoWriter(warn);
951  unique_ptr<OAlignment> oAln(bppoWriter.read(sequenceFormat));
952  if (verbose)
953  {
954  ApplicationTools::displayResult("Output alignment file " + suffix, sequenceFilePath);
955  ApplicationTools::displayResult("Output alignment format " + suffix, oAln->getFormatName());
956  }
957 
958  // Write sequences:
959  oAln->writeAlignment(sequenceFilePath, sites, true);
960 }
961 
962 /******************************************************************************/
static bool isRNYAlphabet(const Alphabet &alphabet)
static bool isNucleicAlphabet(const Alphabet &alphabet)
static bool isCodonAlphabet(const Alphabet &alphabet)
static bool isAllelicAlphabet(const Alphabet &alphabet)
static void displayMessage(const std::string &text)
static std::vector< std::string > matchingParameters(const std::string &pattern, const std::map< std::string, std::string > &params)
static bool getBooleanParameter(const std::string &parameterName, const std::map< std::string, std::string > &params, bool defaultValue, const std::string &suffix="", bool suffixIsOptional=true, int warn=0)
static std::string getStringParameter(const std::string &parameterName, const std::map< std::string, std::string > &params, const std::string &defaultValue, const std::string &suffix="", bool suffixIsOptional=true, int warn=0)
static void displayWarning(const std::string &text)
static void displayResult(const std::string &text, const T &result)
static std::string getAFilePath(const std::string &parameter, const std::map< std::string, std::string > &params, bool isRequired=true, bool mustExist=true, const std::string &suffix="", bool suffixIsOptional=false, const std::string &defaultPath="none", int warn=0)
This class implements the ascidian mitochondrial genetic code as describe on the NCBI web site: http:...
Sequence I/O in BppO format.
std::unique_ptr< IAlignment > read(const std::string &description)
Read a IAlignment object from a string.
virtual const std::map< std::string, std::string > & getUnparsedArguments() const
std::unique_ptr< IProbabilisticAlignment > readProbabilistic(const std::string &description)
Sequence I/O in BppO format.
std::unique_ptr< OAlignment > read(const std::string &description)
Read a OAlignment object from a string.
AlphabetIndex1 I/O in BppO format.
std::unique_ptr< AlphabetIndex1 > read(const std::string &description)
Read a AlphabetIndex1 object from a string.
AlphabetIndex2 I/O in BppO format.
std::unique_ptr< AlphabetIndex2 > read(const std::string &description)
Read a AlphabetIndex2 object from a string.
Sequence I/O in BppO format.
std::unique_ptr< ISequence > read(const std::string &description)
Read a ISequence object from a string.
Sequence I/O in BppO format.
std::unique_ptr< OSequence > read(const std::string &description)
Read a OSequence object from a string.
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
This class implements the Echinoderm and Faltworms Mitochondrial genetic code as describe on the NCBI...
Partial implementation of the Transliterator interface for genetic code object.
Definition: GeneticCode.h:50
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
static void parseProcedure(const std::string &desc, std::string &name, std::map< std::string, std::string > &args)
static std::unique_ptr< TemplateVectorSiteContainer< SiteType, SequenceType > > getSelectedSites(const TemplateSiteContainerInterface< SiteType, SequenceType, std::string > &sequences, const std::string &setName)
Create a new container corresponding to a site set given in the mase+ format.
Definition: MaseTools.h:62
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
static std::vector< int > seqFromString(const std::string &s, const std::string &delim=",", const std::string &seqdelim="-")
A basic implementation of the ProbabilisticSequence interface.
static void getSample(const std::vector< T > &vin, std::vector< T > &vout, bool replace=false)
static std::unique_ptr< ProbabilisticVectorSiteContainer > getProbabilisticSiteContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a ProbabilisticSiteContainer object according to the BppO syntax.
static void restrictSelectedSequencesByName(SequenceContainerInterface &allSequences, const std::map< std::string, std::string > &params, std::string suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Retrieves selected sequences (by name).
static std::map< size_t, std::unique_ptr< ProbabilisticVectorSiteContainer > > getProbabilisticSiteContainers(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &prefix="input.", const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build multiple ProbabilisticSiteContainer objects according to the BppO syntax.
static std::unique_ptr< GeneticCode > getGeneticCode(std::shared_ptr< const NucleicAlphabet > alphabet, const std::string &description)
Build a GeneticCode object according to options.
static std::map< size_t, std::unique_ptr< VectorSiteContainer > > getSiteContainers(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &prefix="input.", const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build multiple SiteContainer objects according to the BppO syntax.
static void writeAlignmentFile(const SiteContainerInterface &sequences, const std::map< std::string, std::string > &params, const std::string &suffix="", bool verbose=true, int warn=1)
Write a sequence alignment file according to options.
static void writeSequenceFile(const SequenceContainerInterface &sequences, const std::map< std::string, std::string > &params, const std::string &suffix="", bool verbose=true, int warn=1)
Write a sequence file according to options.
static std::unique_ptr< VectorSiteContainer > getSiteContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a SiteContainer object according to the BppO syntax.
static std::unique_ptr< SequenceContainerInterface > getSequenceContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a SequenceContainer object according to options.
static std::unique_ptr< AlphabetIndex2 > getAlphabetIndex2(std::shared_ptr< const Alphabet > alphabet, const std::string &description, const std::string &message="Alphabet distance:", bool verbose=true)
Build a AlphabetIndex2 object for a given alphabet.
static std::unique_ptr< Alphabet > getAlphabet(const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, bool allowGeneric=false, int warn=1)
Build an Alphabet object according to options.
static std::unique_ptr< AlphabetIndex1 > getAlphabetIndex1(std::shared_ptr< const Alphabet > alphabet, const std::string &description, const std::string &message="Alphabet measure:", bool verbose=true)
Build a AlphabetIndex1 object for a given alphabet.
Exception thrown when a sequence is not found The sequence not found exception base class.
SequenceTools static class.
Definition: SequenceTools.h:64
static std::unique_ptr< Sequence > RNYslice(const SequenceInterface &sequence, int ph)
Get the RNY decomposition of a DNA sequence.
static void getSelectedSites(const TemplateSiteContainerInterface< SiteType, SequenceType, HashType > &sites, const SiteSelection &selection, TemplateSiteContainerInterface< SiteType, SequenceType, HashType > &outputSites)
Extract a specified set of sites.
This class implements the standard genetic code as describe on the NCBI web site: http://www....
void setRowNames(const std::vector< std::string > &rowNames)
void setColumn(const std::vector< T > &newColumn, size_t pos)
The SequenceContainer interface.
virtual std::unique_ptr< SequenceType > removeSequence(const HashType &sequenceKey)=0
Remove a sequence from the container.
virtual std::vector< std::string > getSequenceNames() const =0
The VectorSiteContainer class.
This class implements the vertebrate mitochondrial genetic code as describe on the NCBI web site: htt...
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
bool isDecimalInteger(const std::string &s, char scientificNotation='e')
std::string toString(T t)
This alphabet is used to deal NumericAlphabet.
TemplateVectorSiteContainer< ProbabilisticSite, ProbabilisticSequence > ProbabilisticVectorSiteContainer
TemplateVectorSiteContainer< Site, Sequence > VectorSiteContainer