bpp-seq3  3.0.0
SequenceApplicationTools.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
8 #include <Bpp/Text/KeyvalTools.h>
9 #include <Bpp/Text/TextTools.h>
10 #include <algorithm>
11 
12 #include "../Alphabet/AlphabetTools.h"
13 #include "../Alphabet/AllelicAlphabet.h"
14 #include "../Alphabet/BinaryAlphabet.h"
15 #include "../Alphabet/CodonAlphabet.h"
16 #include "../Alphabet/DefaultAlphabet.h"
17 #include "../Alphabet/LexicalAlphabet.h"
18 #include "../Container/VectorSiteContainer.h"
19 #include "../GeneticCode/AscidianMitochondrialGeneticCode.h"
20 #include "../GeneticCode/CiliateNuclearGeneticCode.h"
21 #include "../GeneticCode/EchinodermMitochondrialGeneticCode.h"
22 #include "../GeneticCode/InvertebrateMitochondrialGeneticCode.h"
23 #include "../GeneticCode/MoldMitochondrialGeneticCode.h"
24 #include "../GeneticCode/StandardGeneticCode.h"
25 #include "../GeneticCode/VertebrateMitochondrialGeneticCode.h"
26 #include "../GeneticCode/YeastMitochondrialGeneticCode.h"
27 #include "../Io/BppOAlignmentReaderFormat.h"
28 #include "../Io/BppOAlignmentWriterFormat.h"
29 #include "../Io/BppOAlphabetIndex1Format.h"
30 #include "../Io/BppOAlphabetIndex2Format.h"
31 #include "../Io/BppOSequenceReaderFormat.h"
32 #include "../Io/BppOSequenceWriterFormat.h"
33 #include "../Io/MaseTools.h"
34 #include "../SequenceTools.h"
35 #include "../SymbolListTools.h"
37 
38 using namespace bpp;
39 using namespace std;
40 
41 /******************************************************************************/
42 
44  const map<string, string>& params,
45  const string& suffix,
46  bool suffixIsOptional,
47  bool verbose,
48  bool allowGeneric,
49  int warn)
50 {
51  unique_ptr<Alphabet> chars = nullptr;
52 
53  string alphtt = ApplicationTools::getStringParameter("alphabet", params, "DNA", suffix, suffixIsOptional, warn);
54 
55  map<string, string> args;
56 
57  string alphabet;
58  KeyvalTools::parseProcedure(alphtt, alphabet, args);
59 
60  if (alphabet == "Word")
61  {
62  if (args.find("letter") != args.end())
63  {
64  args["alphabet"] = args["letter"];
65 
66  unique_ptr<const Alphabet> inAlphabet = SequenceApplicationTools::getAlphabet(args, suffix, suffixIsOptional, false, allowGeneric, warn + 1);
67 
68  if (args.find("length") == args.end())
69  throw Exception("Missing length parameter for Word alphabet");
70 
71  size_t lg = TextTools::to<size_t>(args["length"]);
72 
73  chars = make_unique<WordAlphabet>(std::move(inAlphabet), lg);
74  }
75  else
76  {
77  size_t indAlph = 1;
78  vector< shared_ptr<const Alphabet>> vAlph;
79 
80  while (args.find("alphabet" + TextTools::toString(indAlph)) != args.end())
81  {
82  map<string, string> args2;
83  args2["alphabet"] = args["alphabet" + TextTools::toString(indAlph)];
84 
85  vAlph.push_back(SequenceApplicationTools::getAlphabet(args2, suffix, suffixIsOptional, verbose, allowGeneric, warn));
86  indAlph++;
87  }
88 
89  if (vAlph.size() == 0)
90  throw Exception("SequenceApplicationTools::getAlphabet. At least one alphabet is compulsory for Word alphabet");
91 
92  chars = make_unique<WordAlphabet>(vAlph);
93  }
94  }
95  else if (alphabet == "RNY")
96  {
97  if (args.find("letter") == args.end())
98  throw Exception("Missing letter alphabet for RNY alphabet");
99 
100  args["alphabet"] = args["letter"];
101 
102  shared_ptr<Alphabet> inAlphabet = SequenceApplicationTools::getAlphabet(args, suffix, suffixIsOptional, verbose, allowGeneric, warn);
103 
104  if (AlphabetTools::isNucleicAlphabet(*inAlphabet))
105  {
106  shared_ptr<Alphabet> charsTmp = std::move(chars); // unique -> shared
107  chars = make_unique<RNY>(dynamic_pointer_cast<NucleicAlphabet>(charsTmp));
108  alphabet = "RNY(" + alphabet + ")";
109  }
110  else
111  throw Exception("RNY needs a Nucleic Alphabet, instead of " + args["letter"]);
112  }
113  else if (alphabet == "Binary")
114  chars = make_unique<BinaryAlphabet>();
115  else if (alphabet == "Integer")
116  {
117  if (args.find("N") == args.end())
118  throw Exception("Missing 'N' argument in Integer for size:" + alphabet);
119 
120  uint N = TextTools::to<unsigned int>(args["N"]);
121  chars = make_unique<IntegerAlphabet>(N);
122  }
123  else if (alphabet == "Lexicon")
124  {
125  vector<string> vWord = ApplicationTools::getVectorParameter<string>("words", args, ',', "()");
126 
127  if (vWord.size() == 0)
128  throw Exception("SequenceApplicationTools::getAlphabet. 'words' list is missing or empty for Lexicon alphabet");
129 
130  chars = make_unique<LexicalAlphabet>(vWord);
131  }
132  else if (alphabet == "DNA")
133  {
134  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", args, false, "", true, warn + 1);
135  chars = make_unique<DNA>(mark);
136  }
137  else if (alphabet == "RNA")
138  {
139  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", args, false, "", true, warn + 1);
140  chars = make_unique<RNA>(mark);
141  }
142  else if (alphabet == "Protein")
143  chars = make_unique<ProteicAlphabet>();
144  else if (allowGeneric && alphabet == "Generic")
145  chars = make_unique<DefaultAlphabet>();
146  else if (alphabet == "Codon")
147  {
148  if (args.find("letter") == args.end())
149  throw Exception("Missing 'letter' argument in Codon :" + alphabet);
150  if (args.find("type") != args.end())
151  throw Exception("'type' argument in Codon is deprecated and has been superseded by the 'genetic_code' option.");
152 
153  string alphnDesc = ApplicationTools::getStringParameter("letter", args, "RNA");
154  string alphn;
155  map<string, string> alphnArgs;
156  KeyvalTools::parseProcedure(alphnDesc, alphn, alphnArgs);
157 
158  shared_ptr<const NucleicAlphabet> pnalph;
159  if (alphn == "RNA")
160  {
161  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", alphnArgs, false, "", true, warn + 1);
162  pnalph = make_shared<RNA>(mark);
163  }
164  else if (alphn == "DNA")
165  {
166  bool mark = ApplicationTools::getBooleanParameter("bangAsGap", alphnArgs, false, "", true, warn + 1);
167  pnalph = make_shared<DNA>(mark);
168  }
169  else
170  throw Exception("Alphabet not known in Codon : " + alphn);
171 
172  chars = make_unique<CodonAlphabet>(pnalph);
173  }
174  else if (alphabet == "Allelic")
175  {
176  if (args.find("base") == args.end())
177  throw Exception("Missing 'base' argument in Allelic for sequence alphabet :" + alphabet);
178 
179  if (args.find("N") == args.end())
180  throw Exception("Missing 'N' argument in Allelic for number of alleles:" + alphabet);
181 
182  uint N = TextTools::to<unsigned int>(args["N"]);
183 
184  args["alphabet"] = args["base"];
185 
186  auto inAlphabet = std::shared_ptr<Alphabet>(SequenceApplicationTools::getAlphabet(args, suffix, suffixIsOptional, false, allowGeneric, warn + 1));
187 
188  chars = make_unique<AllelicAlphabet>(inAlphabet, N);
189  }
190  else
191  throw Exception("Alphabet not known: " + alphabet);
192 
193  if (verbose)
194  ApplicationTools::displayResult("Alphabet type ", chars->getAlphabetType());
195  return chars;
196 }
197 
198 /******************************************************************************/
199 
200 std::unique_ptr<GeneticCode> SequenceApplicationTools::getGeneticCode(
201  std::shared_ptr<const NucleicAlphabet> alphabet,
202  const string& description)
203 {
204  GeneticCode* geneCode;
205  if (description.find("Standard") != string::npos || description.find("1") != string::npos)
206  geneCode = new StandardGeneticCode(alphabet);
207  else if (description.find("VertebrateMitochondrial") != string::npos || description.find("2") != string::npos)
208  geneCode = new VertebrateMitochondrialGeneticCode(alphabet);
209  else if (description.find("YeastMitochondrial") != string::npos || description.find("3") != string::npos)
210  geneCode = new YeastMitochondrialGeneticCode(alphabet);
211  else if (description.find("MoldMitochondrial") != string::npos || description.find("4") != string::npos)
212  geneCode = new MoldMitochondrialGeneticCode(alphabet);
213  else if (description.find("InvertebrateMitochondrial") != string::npos || description.find("5") != string::npos)
214  geneCode = new InvertebrateMitochondrialGeneticCode(alphabet);
215  else if (description.find("CiliateNuclear") != string::npos || description.find("6") != string::npos)
216  geneCode = new CiliateNuclearGeneticCode(alphabet);
217  else if (description.find("EchinodermMitochondrial") != string::npos || description.find("9") != string::npos)
218  geneCode = new EchinodermMitochondrialGeneticCode(alphabet);
219  else if (description.find("AscidianMitochondrial") != string::npos || description.find("13") != string::npos)
220  geneCode = new AscidianMitochondrialGeneticCode(alphabet);
221  else
222  throw Exception("Unknown GeneticCode: " + description);
223  return unique_ptr<GeneticCode>(geneCode);
224 }
225 
226 /******************************************************************************/
227 
229  shared_ptr<const Alphabet> alphabet,
230  const string& description,
231  const string& message,
232  bool verbose)
233 {
234  BppOAlphabetIndex1Format reader(alphabet, message, verbose);
235  return reader.read(description);
236 }
237 
239  shared_ptr<const Alphabet> alphabet,
240  const string& description,
241  const string& message,
242  bool verbose)
243 {
244  BppOAlphabetIndex2Format reader(alphabet, message, verbose);
245  return reader.read(description);
246 }
247 
249  shared_ptr<const CodonAlphabet> alphabet,
250  shared_ptr<const GeneticCode> gencode,
251  const string& description,
252  const string& message,
253  bool verbose)
254 {
255  BppOAlphabetIndex1Format reader(alphabet, gencode, message, verbose);
256  return reader.read(description);
257 }
258 
260  shared_ptr<const CodonAlphabet> alphabet,
261  shared_ptr<const GeneticCode> gencode,
262  const string& description,
263  const string& message,
264  bool verbose)
265 {
266  BppOAlphabetIndex2Format reader(alphabet, gencode, message, verbose);
267  return reader.read(description);
268 }
269 
270 /******************************************************************************/
271 
272 std::unique_ptr<SequenceContainerInterface> SequenceApplicationTools::getSequenceContainer(
273  std::shared_ptr<const Alphabet> alpha,
274  const map<string, string>& params,
275  const string& suffix,
276  bool suffixIsOptional,
277  bool verbose,
278  int warn)
279 {
280  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
281  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
282  BppOSequenceReaderFormat bppoReader(warn);
283  unique_ptr<ISequence> iSeq(bppoReader.read(sequenceFormat));
284  if (verbose)
285  {
286  ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
287  ApplicationTools::displayResult("Sequence format " + suffix, iSeq->getFormatName());
288  }
289  std::unique_ptr<SequenceContainerInterface> sequences = iSeq->readSequences(sequenceFilePath, alpha);
290 
291  // Apply sequence selection:
292  restrictSelectedSequencesByName(*sequences, params, suffix, suffixIsOptional, verbose, warn);
293 
294  return sequences;
295 }
296 
297 
298 /******************************************************************************/
299 
300 map<size_t, unique_ptr<VectorSiteContainer>>
302  std::shared_ptr<const Alphabet> alpha,
303  const map<string, string>& params,
304  const string& prefix,
305  const string& suffix,
306  bool suffixIsOptional,
307  bool verbose,
308  int warn)
309 {
310  vector<string> vContName = ApplicationTools::matchingParameters(prefix + "data*", params);
311 
312  map<size_t, unique_ptr<VectorSiteContainer>> mCont;
313 
314  for (size_t nT = 0; nT < vContName.size(); nT++)
315  {
316  size_t poseq = vContName[nT].find("=");
317  size_t num = 0;
318  size_t len = (prefix + "data").size();
319 
320  string suff = vContName[nT].substr(len, poseq - len);
321 
322  if (TextTools::isDecimalInteger(suff, '$'))
323  num = TextTools::to<size_t>(suff);
324  else
325  num = 1;
326 
327  string contDesc = ApplicationTools::getStringParameter(vContName[nT], params, "", suffix, suffixIsOptional);
328 
329  string contName;
330 
331  map<string, string> args;
332 
333  KeyvalTools::parseProcedure(contDesc, contName, args);
334 
335  map<string, string> args2;
336 
337  if (contName == "alignment")
338  {
339  string format;
340 
341  if (args.find("file") != args.end())
342  args2["input.sequence.file"] = args["file"];
343  else
344  args2["input.sequence.file"] = "";
345 
346  if (args.find("format") != args.end())
347  args2["input.sequence.format"] = args["format"];
348 
349  if (args.find("selection") != args.end())
350  args2["input.site.selection"] = args["selection"];
351 
352  if (args.find("sites_to_use") != args.end())
353  args2["input.sequence.sites_to_use"] = args["sites_to_use"];
354 
355  if (args.find("max_gap_allowed") != args.end())
356  args2["input.sequence.max_gap_allowed"] = args["max_gap_allowed"];
357 
358  if (args.find("max_unresolved_allowed") != args.end())
359  args2["input.sequence.max_unresolved_allowed"] = args["max_unresolved_allowed"];
360 
361  if (args.find("remove_stop_codons") != args.end())
362  args2["input.sequence.remove_stop_codons"] = args["remove_stop_codons"];
363 
364  args2["genetic_code"] = ApplicationTools::getStringParameter("genetic_code", params, "", "", true, (AlphabetTools::isCodonAlphabet(*alpha) ? 0 : 1));
365 
366  auto vsC = getSiteContainer(alpha, args2, "", true, verbose, warn);
367 
370 
371  vsC = getSitesToAnalyse(*vsC, args2, "", true, false);
372 
373  if (mCont.find(num) != mCont.end())
374  {
375  ApplicationTools::displayWarning("Alignment " + TextTools::toString(num) + " already assigned, replaced by new one.");
376  }
377  mCont.emplace(num, std::move(vsC));
378  }
379  }
380 
381  return mCont;
382 }
383 
384 /******************************************************************************/
385 
386 map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>>
388  std::shared_ptr<const Alphabet> alpha,
389  const map<string, string>& params,
390  const string& prefix,
391  const string& suffix,
392  bool suffixIsOptional,
393  bool verbose,
394  int warn)
395 {
396  vector<string> vContName = ApplicationTools::matchingParameters(prefix + "data*", params);
397 
398  map<size_t, unique_ptr<ProbabilisticVectorSiteContainer>> mCont;
399 
400  for (size_t nT = 0; nT < vContName.size(); nT++)
401  {
402  size_t poseq = vContName[nT].find("=");
403  size_t num = 0;
404  size_t len = (prefix + "data").size();
405 
406  string suff = vContName[nT].substr(len, poseq - len);
407 
408  if (TextTools::isDecimalInteger(suff, '$'))
409  num = TextTools::to<size_t>(suff);
410  else
411  num = 1;
412 
413  string contDesc = ApplicationTools::getStringParameter(vContName[nT], params, "", suffix, suffixIsOptional);
414 
415  string contName;
416 
417  map<string, string> args;
418 
419  KeyvalTools::parseProcedure(contDesc, contName, args);
420 
421  map<string, string> args2;
422 
423  if (contName == "alignment")
424  {
425  string format;
426 
427  if (args.find("file") != args.end())
428  args2["input.sequence.file"] = args["file"];
429  else
430  args2["input.sequence.file"] = "";
431 
432  if (args.find("format") != args.end())
433  args2["input.sequence.format"] = args["format"];
434 
435  if (args.find("selection") != args.end())
436  args2["input.site.selection"] = args["selection"];
437 
438  if (args.find("sites_to_use") != args.end())
439  args2["input.sequence.sites_to_use"] = args["sites_to_use"];
440 
441  if (args.find("max_gap_allowed") != args.end())
442  args2["input.sequence.max_gap_allowed"] = args["max_gap_allowed"];
443 
444  if (args.find("max_unresolved_allowed") != args.end())
445  args2["input.sequence.max_unresolved_allowed"] = args["max_unresolved_allowed"];
446 
447  if (args.find("remove_stop_codons") != args.end())
448  args2["input.sequence.remove_stop_codons"] = args["remove_stop_codons"];
449 
450  args2["genetic_code"] = ApplicationTools::getStringParameter("genetic_code", params, "", "", true, (AlphabetTools::isCodonAlphabet(*alpha) ? 0 : 1));
451 
452  auto vsC = getProbabilisticSiteContainer(alpha, args2, "", true, verbose, warn);
453 
456 
457  vsC = getSitesToAnalyse(*vsC, args2, "", true, false);
458 
459  if (mCont.find(num) != mCont.end())
460  {
461  ApplicationTools::displayWarning("Alignment " + TextTools::toString(num) + " already assigned, replaced by new one.");
462  }
463  mCont.emplace(num, std::move(vsC));
464  }
465  }
466 
467  return mCont;
468 }
469 
470 /******************************************************************************/
471 
472 std::unique_ptr<VectorSiteContainer> SequenceApplicationTools::getSiteContainer(
473  std::shared_ptr<const Alphabet> alpha,
474  const map<string, string>& params,
475  const string& suffix,
476  bool suffixIsOptional,
477  bool verbose,
478  int warn)
479 {
480  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
481  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
482  BppOAlignmentReaderFormat bppoReader(warn);
483 
484  unique_ptr<IAlignment> iAln(bppoReader.read(sequenceFormat));
485  map<string, string> args(bppoReader.getUnparsedArguments());
486 
487  if (verbose)
488  {
489  ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
490  ApplicationTools::displayResult("Sequence format " + suffix, iAln->getFormatName());
491  }
492 
493  shared_ptr<const Alphabet> alpha2;
494  if (AlphabetTools::isRNYAlphabet(*alpha))
495  alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
496  else
497  alpha2 = alpha;
498 
499  auto sites2 = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2))); // We copy into a VectorSiteContainer, as most readers will generate an AlignedSequenceContainer)
500 
501  auto sites = unique_ptr<VectorSiteContainer>();
502 
504  if (AlphabetTools::isRNYAlphabet(*alpha))
505  {
506  sites = make_unique<VectorSiteContainer>(alpha);
507  for (size_t i = 0; i < sites2->getNumberOfSequences(); ++i)
508  {
509  auto seqP = SequenceTools::RNYslice(sites2->sequence(i));
510  sites->addSequence(sites2->sequenceKey(i), seqP);
511  }
512  }
513  else
514  sites = std::move(sites2);
515 
516  // Look for site selection:
517  if (iAln->getFormatName() == "MASE file")
518  {
519  // getting site set:
520  string siteSet = ApplicationTools::getStringParameter("siteSelection", args, "none", suffix, suffixIsOptional, warn + 1);
521  if (siteSet != "none")
522  {
523  unique_ptr<VectorSiteContainer> selectedSites;
524  try
525  {
526  auto sel = MaseTools::getSelectedSites(*sites, siteSet);
527  selectedSites = std::move(sel);
528  if (verbose)
529  ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");
530  }
531  catch (IOException& ioe)
532  {
533  throw ioe;
534  }
535  if (selectedSites->getNumberOfSites() == 0)
536  {
537  throw Exception("Site set '" + siteSet + "' is empty.");
538  }
539  sites = std::move(selectedSites);
540  }
541  }
542 
543  // getting site set:
544  size_t nbSites = sites->getNumberOfSites();
545 
546  string siteSet = ApplicationTools::getStringParameter("input.site.selection", params, "none", suffix, suffixIsOptional, warn + 1);
547 
548  unique_ptr<VectorSiteContainer> selectedSites;
549  if (siteSet != "none")
550  {
551  if (siteSet[0] == '(')
552  siteSet = siteSet.substr(1, siteSet.size() - 2);
553 
554  vector<size_t> vSite;
555  try
556  {
557  vector<int> vSite1 = NumCalcApplicationTools::seqFromString(siteSet, ",", ":");
558 
559  for (size_t i = 0; i < vSite1.size(); ++i)
560  {
561  int x = (vSite1[i] >= 0 ? vSite1[i] : static_cast<int>(nbSites) + vSite1[i] + 1);
562  if (x <= (int)nbSites)
563  {
564  if (x > 0)
565  vSite.push_back(static_cast<size_t>(x - 1));
566  else
567  throw Exception("SequenceApplicationTools::getSiteContainer(). Incorrect null index: " + TextTools::toString(x));
568  }
569  else
570  {
571  ApplicationTools::displayResult("Site selection too large index", TextTools::toString(x));
572  ApplicationTools::displayResult("Limit to max length", TextTools::toString(nbSites));
573  vSite.push_back(static_cast<size_t>(nbSites));
574  }
575  }
576  auto sel = SiteContainerTools::getSelectedSites(*sites, vSite);
577  selectedSites = std::move(sel);
578  selectedSites->reindexSites();
579  }
580  catch (Exception& e)
581  {
582  string seln;
583  map<string, string> selArgs;
584  KeyvalTools::parseProcedure(siteSet, seln, selArgs);
585  if (seln == "Sample")
586  {
587  size_t n = ApplicationTools::getParameter<size_t>("n", selArgs, nbSites, "", true, warn + 1);
588  bool replace = ApplicationTools::getBooleanParameter("replace", selArgs, false, "", true, warn + 1);
589 
590  vSite.resize(n);
591  vector<size_t> vPos;
592  for (size_t p = 0; p < nbSites; ++p)
593  {
594  vPos.push_back(p);
595  }
596 
597  RandomTools::getSample(vPos, vSite, replace);
598 
599  auto sel = SiteContainerTools::getSelectedSites(*sites, vSite);
600  selectedSites = std::move(sel);
601  if (replace)
602  selectedSites->reindexSites();
603  }
604  else
605  throw Exception("Unknown site selection description: " + siteSet);
606  }
607 
608  if (verbose)
609  ApplicationTools::displayResult("Selected sites", TextTools::toString(siteSet));
610 
611  if (selectedSites && (selectedSites->getNumberOfSites() == 0))
612  {
613  throw Exception("Site set '" + siteSet + "' is empty.");
614  }
615  sites = std::move(selectedSites);
616  }
617 
618  // Apply sequence selection:
619  restrictSelectedSequencesByName(*sites, params, suffix, suffixIsOptional, verbose, warn);
620 
621  return sites;
622 }
623 
624 /******************************************************************************/
625 
626 unique_ptr<ProbabilisticVectorSiteContainer> SequenceApplicationTools::getProbabilisticSiteContainer(
627  shared_ptr<const Alphabet> alpha,
628  const map<string, string>& params,
629  const string& suffix,
630  bool suffixIsOptional,
631  bool verbose,
632  int warn)
633 {
634  string sequenceFilePath = ApplicationTools::getAFilePath("input.sequence.file", params, true, true, suffix, suffixIsOptional, "none", warn);
635  string sequenceFormat = ApplicationTools::getStringParameter("input.sequence.format", params, "Fasta()", suffix, suffixIsOptional, warn);
636  BppOAlignmentReaderFormat bppoReader(warn);
637 
638  unique_ptr<IAlignment> iAln;
639  unique_ptr<IProbabilisticAlignment> iProbAln;
640 
641  try
642  {
643  iAln.reset(bppoReader.read(sequenceFormat).release());
644  }
645  catch (Exception& e)
646  {
647  iProbAln.reset(bppoReader.readProbabilistic(sequenceFormat).release());
648  }
649 
650  // Probabilistic from Sequence format only possible for Allelic alphabet
651  if (iAln && !AlphabetTools::isAllelicAlphabet(*alpha))
652  throw IOException("Bad format");
653 
654 
655  map<string, string> args(bppoReader.getUnparsedArguments());
656  if (verbose)
657  {
658  ApplicationTools::displayResult("Sequence file " + suffix, sequenceFilePath);
659  ApplicationTools::displayResult("Sequence format " + suffix, iAln ? iAln->getFormatName() : iProbAln->getFormatName());
660  }
661 
662  shared_ptr<const Alphabet> alpha2;
663  if (AlphabetTools::isRNYAlphabet(*alpha))
664  alpha2 = dynamic_pointer_cast<const RNY>(alpha)->getLetterAlphabet();
665  else
666  {
668  alpha2 = dynamic_pointer_cast<const AllelicAlphabet>(alpha)->getStateAlphabet();
669  else
670  alpha2 = alpha;
671  }
672 
673  unique_ptr<VectorSiteContainer> sites;
674  unique_ptr<ProbabilisticVectorSiteContainer> psites;
675 
676  if (iAln)
677  sites = make_unique<VectorSiteContainer>(*(iAln->readAlignment(sequenceFilePath, alpha2)));
678  else
679  psites = make_unique<ProbabilisticVectorSiteContainer>(*iProbAln->readAlignment(sequenceFilePath, alpha2));
680 
681  if (sites)
682  {
684  if (AlphabetTools::isRNYAlphabet(*alpha2))
685  {
686  unique_ptr<VectorSiteContainer> tmpsites(new VectorSiteContainer(alpha2));
687 
688  const SequenceTools ST;
689  for (const auto& name : sites->getSequenceNames())
690  {
691  auto sl = ST.RNYslice(sites->sequence(name));
692  tmpsites->addSequence(name, sl);
693  }
694  sites = std::move(tmpsites);
695  }
696 
697  // Look for site selection:
698 
699  if (iAln->getFormatName() == "MASE file")
700  {
701  // getting site set:
702  string siteSet = ApplicationTools::getStringParameter("siteSelection", args, "none", suffix, suffixIsOptional, warn + 1);
703  if (siteSet != "none")
704  {
705  unique_ptr<VectorSiteContainer> selectedSites;
706  try
707  {
708  selectedSites.reset(dynamic_cast<VectorSiteContainer*>(MaseTools::getSelectedSites(*sites, siteSet).release()));
709  if (verbose)
710  ApplicationTools::displayResult("Set found", TextTools::toString(siteSet) + " sites.");
711  }
712  catch (IOException& ioe)
713  {
714  throw ioe;
715  }
716  if (selectedSites->getNumberOfSites() == 0)
717  {
718  throw Exception("Site set '" + siteSet + "' is empty.");
719  }
720  sites = std::move(selectedSites);
721  }
722  }
723 
724  // Then convert VectorSiteContainer in ProbabilisticVectorSiteContainer
725 
726  psites.reset(new ProbabilisticVectorSiteContainer(alpha2));
727 
728  auto names = psites->getSequenceNames();
729 
730  auto chars = alpha2->getResolvedChars();
731 
732  Table<double> dtable(chars.size(), sites->getNumberOfSites());
733  dtable.setRowNames(chars);
734 
735  for (const auto& name : names)
736  {
737  const auto& sequence = psites->sequence(name);
738  vector<double> vval(chars.size());
739 
740  for (size_t pos = 0; pos < sequence.size(); pos++)
741  {
742  size_t i = 0;
743  for (auto c:chars)
744  {
745  vval[i] = sequence.getStateValueAt(pos, alpha2->charToInt(c));
746  i++;
747  }
748 
749  dtable.setColumn(vval, pos);
750  }
751 
752  unique_ptr<ProbabilisticSequence> seq(new ProbabilisticSequence(name, dtable, sequence.getComments(), alpha2));
753 
754  psites->addSequence(name, seq);
755  }
756  }
757 
760  {
761  auto pallsites = unique_ptr<ProbabilisticVectorSiteContainer>(new ProbabilisticVectorSiteContainer(alpha));
762 
763  auto names = psites->getSequenceNames();
764 
765  for (const auto& name : names)
766  {
767  unique_ptr<ProbabilisticSequence> seq(dynamic_pointer_cast<const AllelicAlphabet>(alpha)->convertFromStateAlphabet(psites->sequence(name)));
768 
769  pallsites->addSequence(name, seq);
770  }
771 
772  psites = std::move(pallsites);
773  }
774 
775  // getting selection site set:
776 
777  size_t nbSites = sites ? sites->getNumberOfSites() : psites->getNumberOfSites();
778 
779  string siteSet = ApplicationTools::getStringParameter("input.site.selection", params, "none", suffix, suffixIsOptional, warn + 1);
780 
781 
782  if (siteSet != "none")
783  {
784  if (siteSet[0] == '(')
785  siteSet = siteSet.substr(1, siteSet.size() - 2);
786 
787  vector<size_t> vSite;
788  try
789  {
790  vector<int> vSite1 = NumCalcApplicationTools::seqFromString(siteSet, ",", ":");
791 
792  for (auto ps : vSite1)
793  {
794  int x = (ps >= 0 ? ps : static_cast<int>(nbSites) + ps + 1);
795  if (x <= (int)nbSites)
796  {
797  if (x > 0)
798  vSite.push_back(static_cast<size_t>(x - 1));
799  else
800  {
801  throw Exception("SequenceApplicationTools::getSiteContainer(). Incorrect null index");
802  }
803  }
804  else
805  {
806  ApplicationTools::displayMessage("Site selection too large index: " + TextTools::toString(x));
807  ApplicationTools::displayMessage("Limit to max length: " + TextTools::toString(nbSites));
808  vSite.push_back(static_cast<size_t>(nbSites - 1));
809  break;
810  }
811  }
812  }
813  catch (Exception& e)
814  {
815  string seln;
816  map<string, string> selArgs;
817  KeyvalTools::parseProcedure(siteSet, seln, selArgs);
818  if (seln == "Sample")
819  {
820  size_t n = ApplicationTools::getParameter<size_t>("n", selArgs, nbSites, "", true, warn + 1);
821  bool replace = ApplicationTools::getBooleanParameter("replace", selArgs, false, "", true, warn + 1);
822 
823  vSite.resize(n);
824  vector<size_t> vPos;
825  for (size_t p = 0; p < nbSites; ++p)
826  {
827  vPos.push_back(p);
828  }
829 
830  RandomTools::getSample(vPos, vSite, replace);
831  }
832  else
833  throw Exception("Unknown site selection description: " + siteSet);
834  }
835 
836 
837  if (verbose)
838  ApplicationTools::displayResult("Selected sites", TextTools::toString(siteSet));
839 
840  auto selectedSites = make_unique<ProbabilisticVectorSiteContainer>(alpha);
841 
842  SiteContainerTools::getSelectedSites(*psites, vSite, *selectedSites);
843 
844  if (selectedSites && (selectedSites->getNumberOfSites() == 0))
845  {
846  throw Exception("Site set '" + siteSet + "' is empty.");
847  }
848 
849  // if (replace)
850  // selectedSites->reindexSites();
851  psites = std::move(selectedSites);
852  }
853 
854  // Apply sequence selection:
855 // restrictSelectedSequencesByName(*psites, params, suffix, suffixIsOptional, verbose, warn);
856 
857  return psites;
858 }
859 
860 /******************************************************************************/
861 
863  SequenceContainerInterface& allSequences,
864  const map<std::string, std::string>& params,
865  string suffix,
866  bool suffixIsOptional,
867  bool verbose,
868  int warn)
869 {
870  string optionKeep = ApplicationTools::getStringParameter("input.sequence.keep_names", params, "all", suffix, suffixIsOptional, warn);
871  if (optionKeep != "all")
872  {
873  vector<string> selection = ApplicationTools::getVectorParameter<string>("input.sequence.keep_names", params, ',', optionKeep, suffix, suffixIsOptional, warn);
874  sort(selection.begin(), selection.end());
875  for (const auto& name: allSequences.getSequenceNames())
876  {
877  if (!binary_search(selection.begin(), selection.end(), name))
878  {
879  allSequences.removeSequence(name);
880  if (verbose)
881  {
882  ApplicationTools::displayResult("Discard sequence", name);
883  }
884  }
885  }
886  }
887  string optionRemove = ApplicationTools::getStringParameter("input.sequence.remove_names", params, "none", suffix, suffixIsOptional, warn);
888  if (optionRemove != "none")
889  {
890  vector<string> selection = ApplicationTools::getVectorParameter<string>("input.sequence.remove_names", params, ',', optionRemove, suffix, suffixIsOptional, warn);
891  vector<string> seqNames = allSequences.getSequenceNames();
892  sort(seqNames.begin(), seqNames.end());
893  for (const auto& name: selection)
894  {
895  if (binary_search(seqNames.begin(), seqNames.end(), name))
896  {
897  allSequences.removeSequence(name);
898  if (verbose)
899  {
900  ApplicationTools::displayResult("Discard sequence", name);
901  }
902  }
903  else
904  {
905  throw SequenceNotFoundException("No sequence with the specified name was found.", name);
906  }
907  }
908  }
909 }
910 
911 /******************************************************************************/
912 
914  const SequenceContainerInterface& sequences,
915  const map<string, string>& params,
916  const string& suffix,
917  bool verbose,
918  int warn)
919 {
920  string sequenceFilePath = ApplicationTools::getAFilePath("output.sequence.file", params, true, false, suffix, false, "none", warn);
921  string sequenceFormat = ApplicationTools::getStringParameter("output.sequence.format", params, "Fasta", suffix, false, warn);
922  BppOSequenceWriterFormat bppoWriter(warn);
923  unique_ptr<OSequence> oSeq(bppoWriter.read(sequenceFormat));
924  if (verbose)
925  {
926  ApplicationTools::displayResult("Output sequence file " + suffix, sequenceFilePath);
927  ApplicationTools::displayResult("Output sequence format " + suffix, oSeq->getFormatName());
928  }
929 
930  // Write sequences:
931  oSeq->writeSequences(sequenceFilePath, sequences, true);
932 }
933 
934 /******************************************************************************/
935 
937  const SiteContainerInterface& sites,
938  const map<string, string>& params,
939  const string& suffix,
940  bool verbose,
941  int warn)
942 {
943  string sequenceFilePath = ApplicationTools::getAFilePath("output.sequence.file", params, true, false, suffix, false, "none", warn);
944  string sequenceFormat = ApplicationTools::getStringParameter("output.sequence.format", params, "Fasta", suffix, false, warn);
945  BppOAlignmentWriterFormat bppoWriter(warn);
946  unique_ptr<OAlignment> oAln(bppoWriter.read(sequenceFormat));
947  if (verbose)
948  {
949  ApplicationTools::displayResult("Output alignment file " + suffix, sequenceFilePath);
950  ApplicationTools::displayResult("Output alignment format " + suffix, oAln->getFormatName());
951  }
952 
953  // Write sequences:
954  oAln->writeAlignment(sequenceFilePath, sites, true);
955 }
956 
957 /******************************************************************************/
static bool isRNYAlphabet(const Alphabet &alphabet)
static bool isNucleicAlphabet(const Alphabet &alphabet)
static bool isCodonAlphabet(const Alphabet &alphabet)
static bool isAllelicAlphabet(const Alphabet &alphabet)
static void displayMessage(const std::string &text)
static std::vector< std::string > matchingParameters(const std::string &pattern, const std::map< std::string, std::string > &params)
static bool getBooleanParameter(const std::string &parameterName, const std::map< std::string, std::string > &params, bool defaultValue, const std::string &suffix="", bool suffixIsOptional=true, int warn=0)
static std::string getStringParameter(const std::string &parameterName, const std::map< std::string, std::string > &params, const std::string &defaultValue, const std::string &suffix="", bool suffixIsOptional=true, int warn=0)
static void displayWarning(const std::string &text)
static void displayResult(const std::string &text, const T &result)
static std::string getAFilePath(const std::string &parameter, const std::map< std::string, std::string > &params, bool isRequired=true, bool mustExist=true, const std::string &suffix="", bool suffixIsOptional=false, const std::string &defaultPath="none", int warn=0)
This class implements the ascidian mitochondrial genetic code as describe on the NCBI web site: http:...
Sequence I/O in BppO format.
std::unique_ptr< IAlignment > read(const std::string &description)
Read a IAlignment object from a string.
virtual const std::map< std::string, std::string > & getUnparsedArguments() const
std::unique_ptr< IProbabilisticAlignment > readProbabilistic(const std::string &description)
Sequence I/O in BppO format.
std::unique_ptr< OAlignment > read(const std::string &description)
Read a OAlignment object from a string.
AlphabetIndex1 I/O in BppO format.
std::unique_ptr< AlphabetIndex1 > read(const std::string &description)
Read a AlphabetIndex1 object from a string.
AlphabetIndex2 I/O in BppO format.
std::unique_ptr< AlphabetIndex2 > read(const std::string &description)
Read a AlphabetIndex2 object from a string.
Sequence I/O in BppO format.
std::unique_ptr< ISequence > read(const std::string &description)
Read a ISequence object from a string.
Sequence I/O in BppO format.
std::unique_ptr< OSequence > read(const std::string &description)
Read a OSequence object from a string.
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
This class implements the Echinoderm and Faltworms Mitochondrial genetic code as describe on the NCBI...
Partial implementation of the Transliterator interface for genetic code object.
Definition: GeneticCode.h:50
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
static void parseProcedure(const std::string &desc, std::string &name, std::map< std::string, std::string > &args)
static std::unique_ptr< TemplateVectorSiteContainer< SiteType, SequenceType > > getSelectedSites(const TemplateSiteContainerInterface< SiteType, SequenceType, std::string > &sequences, const std::string &setName)
Create a new container corresponding to a site set given in the mase+ format.
Definition: MaseTools.h:62
This class implements the mold, protozoan, and coelenterate mitochondrial code and the Mycoplasma/Spi...
static std::vector< int > seqFromString(const std::string &s, const std::string &delim=",", const std::string &seqdelim="-")
A basic implementation of the ProbabilisticSequence interface.
static void getSample(const std::vector< T > &vin, std::vector< T > &vout, bool replace=false)
static std::unique_ptr< ProbabilisticVectorSiteContainer > getProbabilisticSiteContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a ProbabilisticSiteContainer object according to the BppO syntax.
static void restrictSelectedSequencesByName(SequenceContainerInterface &allSequences, const std::map< std::string, std::string > &params, std::string suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Retrieves selected sequences (by name).
static std::map< size_t, std::unique_ptr< ProbabilisticVectorSiteContainer > > getProbabilisticSiteContainers(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &prefix="input.", const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build multiple ProbabilisticSiteContainer objects according to the BppO syntax.
static std::unique_ptr< GeneticCode > getGeneticCode(std::shared_ptr< const NucleicAlphabet > alphabet, const std::string &description)
Build a GeneticCode object according to options.
static std::map< size_t, std::unique_ptr< VectorSiteContainer > > getSiteContainers(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &prefix="input.", const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build multiple SiteContainer objects according to the BppO syntax.
static void writeAlignmentFile(const SiteContainerInterface &sequences, const std::map< std::string, std::string > &params, const std::string &suffix="", bool verbose=true, int warn=1)
Write a sequence alignment file according to options.
static void writeSequenceFile(const SequenceContainerInterface &sequences, const std::map< std::string, std::string > &params, const std::string &suffix="", bool verbose=true, int warn=1)
Write a sequence file according to options.
static std::unique_ptr< VectorSiteContainer > getSiteContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a SiteContainer object according to the BppO syntax.
static std::unique_ptr< SequenceContainerInterface > getSequenceContainer(std::shared_ptr< const Alphabet > alpha, const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, int warn=1)
Build a SequenceContainer object according to options.
static std::unique_ptr< AlphabetIndex2 > getAlphabetIndex2(std::shared_ptr< const Alphabet > alphabet, const std::string &description, const std::string &message="Alphabet distance:", bool verbose=true)
Build a AlphabetIndex2 object for a given alphabet.
static std::unique_ptr< Alphabet > getAlphabet(const std::map< std::string, std::string > &params, const std::string &suffix="", bool suffixIsOptional=true, bool verbose=true, bool allowGeneric=false, int warn=1)
Build an Alphabet object according to options.
static std::unique_ptr< AlphabetIndex1 > getAlphabetIndex1(std::shared_ptr< const Alphabet > alphabet, const std::string &description, const std::string &message="Alphabet measure:", bool verbose=true)
Build a AlphabetIndex1 object for a given alphabet.
Exception thrown when a sequence is not found The sequence not found exception base class.
SequenceTools static class.
Definition: SequenceTools.h:64
static std::unique_ptr< Sequence > RNYslice(const SequenceInterface &sequence, int ph)
Get the RNY decomposition of a DNA sequence.
static void getSelectedSites(const TemplateSiteContainerInterface< SiteType, SequenceType, HashType > &sites, const SiteSelection &selection, TemplateSiteContainerInterface< SiteType, SequenceType, HashType > &outputSites)
Extract a specified set of sites.
This class implements the standard genetic code as describe on the NCBI web site: http://www....
void setRowNames(const std::vector< std::string > &rowNames)
void setColumn(const std::vector< T > &newColumn, size_t pos)
The SequenceContainer interface.
virtual std::unique_ptr< SequenceType > removeSequence(const HashType &sequenceKey)=0
Remove a sequence from the container.
virtual std::vector< std::string > getSequenceNames() const =0
The VectorSiteContainer class.
This class implements the vertebrate mitochondrial genetic code as describe on the NCBI web site: htt...
This class implements the Invertebrate Mitochondrial genetic code as describe on the NCBI website: ht...
bool isDecimalInteger(const std::string &s, char scientificNotation='e')
std::string toString(T t)
This alphabet is used to deal NumericAlphabet.
TemplateVectorSiteContainer< ProbabilisticSite, ProbabilisticSequence > ProbabilisticVectorSiteContainer
TemplateVectorSiteContainer< Site, Sequence > VectorSiteContainer