bpp-phyl3  3.0.0
SimpleSubstitutionProcessSiteSimulator.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
8 #include <algorithm>
9 
11 
12 // From bpp-seq:
14 
16 
17 using namespace bpp;
18 using namespace std;
19 
20 /******************************************************************************/
21 
23  std::shared_ptr<const SubstitutionProcessInterface> process) :
24  process_(process),
25  phyloTree_(process_->getParametrizablePhyloTree()),
26  tree_(ProcessComputationTree(process_)),
27  qRates_(),
28  qRoots_(),
29  seqIndexes_(),
30  seqNames_(phyloTree_->getAllLeavesNames()),
31  speciesNodes_(),
32  nbNodes_(),
33  nbClasses_(process_->getNumberOfClasses()),
34  nbStates_(process_->getNumberOfStates()),
35  continuousRates_(false),
36  outputInternalSites_(false)
37 {
38  init();
39 }
40 
41 /******************************************************************************/
42 
44 {
45  // Initialize sons & fathers of tree_ Nodes
46  // set sequence names
47 
49 
50  // Set up cumsum rates
51 
52  const auto dRate = process_->getRateDistribution();
53  qRates_ = VectorTools::cumSum(dRate->getProbabilities());
54 
55  // Initialize root frequencies
56 
57  auto cr = VectorTools::cumSum(process_->getRootFrequencies());
58 
59  qRoots_.resize(nbClasses_);
60  for (auto& roots : qRoots_)
61  {
62  roots = cr;
63  }
64 
65  // Initialize cumulative pxy for edges that have models
66  auto edges = tree_.getAllEdges();
67 
68  for (auto& edge : edges)
69  {
70  const auto model = edge->getModel();
71  if (edge->useProb())
72  continue;
73 
74  const auto transmodel = dynamic_pointer_cast<const TransitionModelInterface>(model);
75  if (!transmodel)
76  throw Exception("SubstitutionProcessSiteSimulator::init : model " + model->getName() + " on branch " + TextTools::toString(tree_.getEdgeIndex(edge)) + " is not a TransitionModel.");
77 
78  VVVdouble* cumpxy_node_ = &edge->cumpxy_;
79  cumpxy_node_->resize(nbClasses_);
80 
81  for (size_t c = 0; c < nbClasses_; c++)
82  {
83  double brlen = dRate->getCategory(c) * phyloTree_->getEdge(edge->getSpeciesIndex())->getLength();
84 
85  VVdouble* cumpxy_node_c_ = &(*cumpxy_node_)[c];
86 
87  cumpxy_node_c_->resize(nbStates_);
88 
89  // process transition probabilities already consider rates &
90  // branch length
91 
92 
93  const Matrix<double>* P;
94 
95  const auto& vSub(edge->subModelNumbers());
96 
97  if (vSub.size() == 0)
98  P = &transmodel->getPij_t(brlen);
99  else
100  {
101  if (vSub.size() > 1)
102  throw Exception("SubstitutionProcessSiteSimulator::init : only 1 submodel can be used.");
103 
104  const auto mmodel = dynamic_pointer_cast<const MixedTransitionModelInterface>(transmodel);
105 
106  const auto model2 = mmodel->getNModel(vSub[0]);
107 
108  P = &model2->getPij_t(brlen);
109  }
110 
111  for (size_t x = 0; x < nbStates_; x++)
112  {
113  Vdouble* cumpxy_node_c_x_ = &(*cumpxy_node_c_)[x];
114  cumpxy_node_c_x_->resize(nbStates_);
115  (*cumpxy_node_c_x_)[0] = (*P)(x, 0);
116  for (size_t y = 1; y < nbStates_; y++)
117  {
118  (*cumpxy_node_c_x_)[y] = (*cumpxy_node_c_x_)[y - 1] + (*P)(x, y);
119  }
120  }
121  }
122  }
123 
124  // Initialize cumulative prob for mixture nodes
125  auto nodes = tree_.getAllNodes();
126 
127  for (auto node:nodes)
128  {
129  if (node->isMixture()) // set probas to chose
130  {
131  auto outEdges = tree_.getOutgoingEdges(node);
132  Vdouble vprob(0);
133 
134  for (auto edge : outEdges)
135  {
136  auto model = dynamic_pointer_cast<const MixedTransitionModelInterface>(edge->getModel());
137  if (!model)
138  throw Exception("SubstitutionProcessSiteSimulator::init : model in edge " + TextTools::toString(tree_.getEdgeIndex(edge)) + " is not a mixture.");
139 
140  const auto& vNb(edge->subModelNumbers());
141 
142  double x = 0.;
143  for (auto nb:vNb)
144  {
145  x += model->getNProbability(nb);
146  }
147 
148  vprob.push_back(x);
149  node->sons_.push_back(tree_.getSon(edge));
150  }
151 
152  vprob /= VectorTools::sum(vprob);
153 
154  // Here there is no use to have one cumProb_ per class, but this
155  // is used for a posteriori simulations
156 
157  node->cumProb_.resize(nbClasses_);
158  for (size_t c = 0; c < nbClasses_; c++)
159  {
160  node->cumProb_[c] = VectorTools::cumSum(vprob);
161  }
162  }
163  }
164 }
165 
166 /******************************************************************************/
167 
169 {
170  if (continuousRates_ && process_->getRateDistribution())
171  {
172  double rate = process_->getRateDistribution()->randC();
173  return simulateSite(rate);
174  }
175  else
176  {
177  size_t rateClass = RandomTools::pickFromCumSum(qRates_);
178  return simulateSite(rateClass);
179  }
180 }
181 
182 
183 unique_ptr<Site> SimpleSubstitutionProcessSiteSimulator::simulateSite(double rate) const
184 {
185  // Draw an initial state randomly according to equilibrium frequencies:
186  // Use rate class 0
187 
188  size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[0]);
189 
190  shared_ptr<SimProcessNode> root = tree_.getRoot();
191  root->state_ = initialStateIndex;
192 
193  evolveInternal(root, rate);
194 
195  // Now create a Site object:
196  Vint site(seqNames_.size());
197  for (size_t i = 0; i < seqNames_.size(); ++i)
198  {
199  site[i] = process_->stateMap().getAlphabetStateAsInt(speciesNodes_.at(seqIndexes_[i])->state_);
200  }
201  auto alphabet = getAlphabet();
202  return make_unique<Site>(site, alphabet);
203 }
204 
205 
206 unique_ptr<Site> SimpleSubstitutionProcessSiteSimulator::simulateSite(size_t rateClass) const
207 {
208  // Draw an initial state randomly according to equilibrium frequencies:
209 
210  size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[rateClass]);
211 
212  shared_ptr<SimProcessNode> root = tree_.getRoot();
213  root->state_ = initialStateIndex;
214 
215  evolveInternal(root, rateClass);
216 
217  // Now create a Site object:
218  Vint site(seqNames_.size());
219  for (size_t i = 0; i < seqNames_.size(); ++i)
220  {
221  site[i] = process_->stateMap().getAlphabetStateAsInt(speciesNodes_.at(seqIndexes_[i])->state_);
222  }
223 
224  auto alphabet = getAlphabet();
225  return make_unique<Site>(site, alphabet);
226 }
227 
228 
229 unique_ptr<Site> SimpleSubstitutionProcessSiteSimulator::simulateSite(size_t ancestralStateIndex, double rate) const
230 {
231  shared_ptr<SimProcessNode> root = tree_.getRoot();
232  root->state_ = ancestralStateIndex;
233 
234  evolveInternal(root, rate);
235 
236  // Now create a Site object:
237  Vint site(seqNames_.size());
238  for (size_t i = 0; i < seqNames_.size(); ++i)
239  {
240  site[i] = process_->stateMap().getAlphabetStateAsInt(speciesNodes_.at(seqIndexes_[i])->state_);
241  }
242 
243  auto alphabet = getAlphabet();
244  return make_unique<Site>(site, alphabet);
245 }
246 
247 /******************************************************************************/
248 
249 unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite() const
250 {
251  if (continuousRates_ && process_->getRateDistribution())
252  {
253  double rate = process_->getRateDistribution()->randC();
254  return dSimulateSite(rate);
255  }
256  else
257  {
258  size_t rateClass = RandomTools::pickFromCumSum(qRates_);
259  return dSimulateSite(rateClass);
260  }
261 }
262 
263 
264 unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite(double rate) const
265 {
266  // Draw an initial state randomly according to equilibrium frequencies:
267  // Use rate class 0
268 
269  size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[0]);
270 
271  shared_ptr<SimProcessNode> root = tree_.getRoot();
272  root->state_ = initialStateIndex;
273 
274  auto ssr = make_unique<SiteSimulationResult>(phyloTree_, process_->getStateMap(), initialStateIndex);
275 
276  evolveInternal(root, rate, ssr.get());
277 
278  return ssr;
279 }
280 
281 unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite(size_t rateClass) const
282 {
283  // Draw an initial state randomly according to equilibrium frequencies:
284  // Use rate class 0
285 
286  size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[rateClass]);
287 
288  shared_ptr<SimProcessNode> root = tree_.getRoot();
289  root->state_ = initialStateIndex;
290 
291  auto ssr = make_unique<SiteSimulationResult>(phyloTree_, process_->getStateMap(), initialStateIndex);
292 
293  evolveInternal(root, rateClass, ssr.get());
294 
295  return ssr;
296 }
297 
298 unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite(size_t ancestralStateIndex, double rate) const
299 {
300  shared_ptr<SimProcessNode> root = tree_.getRoot();
301  root->state_ = ancestralStateIndex;
302 
303  auto ssr = make_unique<SiteSimulationResult>(phyloTree_, process_->getStateMap(), ancestralStateIndex);
304 
305  evolveInternal(root, rate, ssr.get());
306 
307  return ssr;
308 }
309 
310 
311 /******************************************************************************/
312 
314  std::shared_ptr<SimProcessNode> node,
315  size_t rateClass,
316  SiteSimulationResult* ssr) const
317 {
318  speciesNodes_[node->getSpeciesIndex()] = node;
319 
320  if (node->isSpeciation())
321  {
322  auto vEdge = tree_.getOutgoingEdges(node);
323 
324  for (auto edge : vEdge)
325  {
326  auto son = tree_.getSon(edge);
327 
328  if (edge->getModel())
329  {
330  if (ssr) // Detailed simulation
331  {
332  auto tm = dynamic_pointer_cast<const SubstitutionModelInterface>(edge->getModel());
333 
334  if (!tm)
335  throw Exception("SimpleSubstitutionProcessSiteSimulator::EvolveInternal : detailed simulation not possible for non-markovian model on edge " + TextTools::toString(son->getSpeciesIndex()) + " for model " + edge->getModel()->getName());
336 
337  SimpleMutationProcess process(tm);
338 
339  double brlen = process_->getRateDistribution()->getCategory(rateClass) * phyloTree_->getEdge(edge->getSpeciesIndex())->getLength();
340 
341  MutationPath mp(tm->getAlphabet(), node->state_, brlen);
342  if (dynamic_cast<const GivenDataSubstitutionProcessSiteSimulator*>(this) == 0)
343  {
344  mp = process.detailedEvolve(node->state_, brlen);
345  son->state_ = mp.getFinalState();
346  }
347  else // First get final state
348  {
349  son->state_ = RandomTools::pickFromCumSum(edge->cumpxy_[rateClass][node->state_]);
350  mp = process.detailedEvolve(node->state_, son->state_, brlen);
351  }
352 
353  // Now append infos in ssr:
354  ssr->addNode(edge->getSpeciesIndex(), mp);
355  }
356  else
357  son->state_ = RandomTools::pickFromCumSum(edge->cumpxy_[rateClass][node->state_]);
358  }
359  else
360  son->state_ = node->state_;
361 
362  evolveInternal(son, rateClass, ssr);
363  }
364  }
365  else if (node->isMixture())
366  {
367  const auto& cumProb = node->cumProb_[rateClass];
368 
369  size_t y = RandomTools::pickFromCumSum(cumProb);
370  auto son = node->sons_[y];
371  son->state_ = node->state_;
372  evolveInternal(son, rateClass, ssr);
373  }
374  else
375  throw Exception("SimpleSubstitutionProcessSiteSimulator::evolveInternal : unknown property for node " + TextTools::toString(tree_.getNodeIndex(node)));
376 }
377 
378 /******************************************************************************/
379 
381  std::shared_ptr<SimProcessNode> node,
382  double rate,
383  SiteSimulationResult* ssr) const
384 {
385  speciesNodes_[node->getSpeciesIndex()] = node;
386 
387  if (node->isSpeciation())
388  {
389  auto vEdge = tree_.getOutgoingEdges(node);
390 
391  for (auto edge : vEdge)
392  {
393  auto son = tree_.getSon(edge);
394 
395  if (edge->getModel())
396  {
397  auto tm = dynamic_pointer_cast<const TransitionModelInterface>(edge->getModel());
398 
399  double brlen = rate * phyloTree_->getEdge(edge->getSpeciesIndex())->getLength();
400 
401 
402  if (ssr) // Detailed simulation
403  {
404  auto sm = dynamic_pointer_cast<const SubstitutionModelInterface>(edge->getModel());
405 
406  if (!sm)
407  throw Exception("SimpleSubstitutionProcessSiteSimulator::EvolveInternal : detailed simulation not possible for non-markovian model on edge " + TextTools::toString(son->getSpeciesIndex()) + " for model " + tm->getName());
408 
409  SimpleMutationProcess process(sm);
410 
411  MutationPath mp(tm->getAlphabet(), node->state_, brlen);
412 
413  if (dynamic_cast<const GivenDataSubstitutionProcessSiteSimulator*>(this) == 0)
414  {
415  mp = process.detailedEvolve(node->state_, brlen);
416  son->state_ = mp.getFinalState();
417  }
418  else // First get final state
419  {
420  // Look for the rateClass where rate is (approximation)
421  size_t rateClass = process_->getRateDistribution()->getCategoryIndex(rate);
422 
423  son->state_ = RandomTools::pickFromCumSum(edge->cumpxy_[rateClass][node->state_]);
424  mp = process.detailedEvolve(node->state_, son->state_, brlen);
425  }
426 
427  // Now append infos in ssr:
428  ssr->addNode(edge->getSpeciesIndex(), mp);
429  }
430  else
431  {
432  // process transition probabilities already consider rates &
433  // branch length
434 
435  const Matrix<double>* P;
436 
437  const auto& vSub(edge->subModelNumbers());
438 
439  if (vSub.size() == 0)
440  P = &tm->getPij_t(brlen);
441  else
442  {
443  if (vSub.size() > 1)
444  throw Exception("SubstitutionProcessSiteSimulator::init : only 1 submodel can be used.");
445 
446  const auto mmodel = dynamic_pointer_cast<const MixedTransitionModelInterface>(tm);
447  const auto model = mmodel->getNModel(vSub[0]);
448 
449  P = &model->getPij_t(brlen);
450  }
451 
453  for (size_t y = 0; y < nbStates_; y++)
454  {
455  rand -= (*P)(node->state_, y);
456  if (rand <= 0)
457  {
458  son->state_ = y;
459  break;
460  }
461  }
462  }
463  }
464  else
465  {
466  son->state_ = node->state_;
467  }
468 
469  evolveInternal(son, rate, ssr);
470  }
471  }
472  else if (node->isMixture())
473  {
474  const auto& cumProb = node->cumProb_[0]; // index 0 because it is only possible in a priori simulations, ie all class mixture probabilities are the same
475 
476  size_t y = RandomTools::pickFromCumSum(cumProb);
477  auto son = node->sons_[y];
478  son->state_ = node->state_;
479  evolveInternal(son, rate, ssr);
480  }
481  else
482  throw Exception("SimpleSubstitutionProcessSiteSimulator::evolveInternal : unknown property for node " + TextTools::toString(tree_.getNodeIndex(node)));
483 }
484 
485 
487 {
489 
491  {
492  auto vCN = phyloTree_->getAllNodes();
493  seqNames_.resize(vCN.size());
494  seqIndexes_.resize(vCN.size());
495  for (size_t i = 0; i < seqNames_.size(); i++)
496  {
497  auto index = phyloTree_->getNodeIndex(vCN[i]);
498  seqNames_[i] = (phyloTree_->isLeaf(vCN[i])) ? vCN[i]->getName() : TextTools::toString(index);
499  seqIndexes_[i] = index;
500  }
501  }
502  else
503  {
504  auto vCN = phyloTree_->getAllLeaves();
505  seqNames_.resize(vCN.size());
506  seqIndexes_.resize(vCN.size());
507  for (size_t i = 0; i < seqNames_.size(); i++)
508  {
509  seqNames_[i] = vCN[i]->getName();
510  seqIndexes_[i] = phyloTree_->getNodeIndex(vCN[i]);
511  }
512  }
513 }
MutationPath detailedEvolve(size_t initialState, double time) const
Simulation a character evolution during a specified time according to the given substitution model an...
virtual std::vector< std::shared_ptr< E > > getAllEdges() const=0
virtual std::shared_ptr< N > getRoot() const=0
virtual EdgeIndex getEdgeIndex(const std::shared_ptr< E > edgeObject) const=0
virtual std::vector< std::shared_ptr< N > > getAllNodes() const=0
virtual NodeIndex getNodeIndex(const std::shared_ptr< N > nodeObject) const=0
virtual std::vector< std::shared_ptr< E > > getOutgoingEdges(const std::shared_ptr< N > node) const=0
std::shared_ptr< N > getSon(const std::shared_ptr< E > edge) const
Site simulation under a unique substitution process, given data.
virtual void resize(size_t nRows, size_t nCols)=0
This class is used by MutationProcess to store detailed results of simulations.
size_t getFinalState() const
Retrieve the final state of this path.
static size_t pickFromCumSum(const std::vector< double > &w)
static double giveRandomNumberBetweenZeroAndEntry(double entry)
Generally used mutation process model.
std::shared_ptr< const ParametrizablePhyloTree > phyloTree_
std::shared_ptr< const SubstitutionProcessInterface > process_
void evolveInternal(std::shared_ptr< SimProcessNode > node, size_t rateClass, SiteSimulationResult *ssr=nullptr) const
SPTree tree_
To store states & transition probabilities of the simulator.
SimpleSubstitutionProcessSiteSimulator(std::shared_ptr< const SubstitutionProcessInterface > process)
std::map< size_t, std::shared_ptr< SimProcessNode > > speciesNodes_
Map between species Indexes & used nodes, may change at each simulation.
std::vector< size_t > seqIndexes_
Vector of indexes of sequenced output species.
void outputInternalSites(bool yn) override
Sets whether we will output the internal sequences or not.
std::vector< std::string > seqNames_
Vector of names of sequenced output species.
std::shared_ptr< const Alphabet > getAlphabet() const override
std::unique_ptr< SiteSimulationResult > dSimulateSite() const override
Get a detailed simulation result for one site.
Vdouble qRates_
cumsum probas of the substitution rates
Data structure to store the result of a DetailedSiteSimulator.
virtual void addNode(unsigned int nodeId, MutationPath path)
static T sum(const std::vector< T > &v1)
static std::vector< T > cumSum(const std::vector< T > &v1)
std::string toString(T t)
Defines the basic types of data flow nodes.
std::vector< double > Vdouble
std::vector< int > Vint
std::vector< VVdouble > VVVdouble
std::vector< Vdouble > VVdouble