bpp-phyl3 3.0.0
SimpleSubstitutionProcessSiteSimulator.cpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: The Bio++ Development Group
2//
3// SPDX-License-Identifier: CECILL-2.1
4
8#include <algorithm>
9
11
12// From bpp-seq:
14
16
17using namespace bpp;
18using namespace std;
19
20/******************************************************************************/
21
22SimpleSubstitutionProcessSiteSimulator::SimpleSubstitutionProcessSiteSimulator(
23 std::shared_ptr<const SubstitutionProcessInterface> process) :
24 process_(process),
25 phyloTree_(process_->getParametrizablePhyloTree()),
26 tree_(ProcessComputationTree(process_)),
27 qRates_(),
28 qRoots_(),
29 seqIndexes_(),
30 seqNames_(phyloTree_->getAllLeavesNames()),
31 speciesNodes_(),
32 nbNodes_(),
33 nbClasses_(process_->getNumberOfClasses()),
34 nbStates_(process_->getNumberOfStates()),
35 continuousRates_(false),
36 outputInternalSites_(false)
37{
38 init();
39}
40
41/******************************************************************************/
42
44{
45 // Initialize sons & fathers of tree_ Nodes
46 // set sequence names
47
49
50 // Set up cumsum rates
51
52 const auto dRate = process_->getRateDistribution();
53 qRates_ = VectorTools::cumSum(dRate->getProbabilities());
54
55 // Initialize root frequencies
56
57 auto cr = VectorTools::cumSum(process_->getRootFrequencies());
58
59 qRoots_.resize(nbClasses_);
60 for (auto& roots : qRoots_)
61 {
62 roots = cr;
63 }
64
65 // Initialize cumulative pxy for edges that have models
66 auto edges = tree_.getAllEdges();
67
68 for (auto& edge : edges)
69 {
70 const auto model = edge->getModel();
71 if (edge->useProb())
72 continue;
73
74 const auto transmodel = dynamic_pointer_cast<const TransitionModelInterface>(model);
75 if (!transmodel)
76 throw Exception("SubstitutionProcessSiteSimulator::init : model " + model->getName() + " on branch " + TextTools::toString(tree_.getEdgeIndex(edge)) + " is not a TransitionModel.");
77
78 VVVdouble* cumpxy_node_ = &edge->cumpxy_;
79 cumpxy_node_->resize(nbClasses_);
80
81 for (size_t c = 0; c < nbClasses_; c++)
82 {
83 double brlen = dRate->getCategory(c) * phyloTree_->getEdge(edge->getSpeciesIndex())->getLength();
84
85 VVdouble* cumpxy_node_c_ = &(*cumpxy_node_)[c];
86
87 cumpxy_node_c_->resize(nbStates_);
88
89 // process transition probabilities already consider rates &
90 // branch length
91
92
93 const Matrix<double>* P;
94
95 const auto& vSub(edge->subModelNumbers());
96
97 if (vSub.size() == 0)
98 P = &transmodel->getPij_t(brlen);
99 else
100 {
101 if (vSub.size() > 1)
102 throw Exception("SubstitutionProcessSiteSimulator::init : only 1 submodel can be used.");
103
104 const auto mmodel = dynamic_pointer_cast<const MixedTransitionModelInterface>(transmodel);
105
106 const auto model2 = mmodel->getNModel(vSub[0]);
107
108 P = &model2->getPij_t(brlen);
109 }
110
111 for (size_t x = 0; x < nbStates_; x++)
112 {
113 Vdouble* cumpxy_node_c_x_ = &(*cumpxy_node_c_)[x];
114 cumpxy_node_c_x_->resize(nbStates_);
115 (*cumpxy_node_c_x_)[0] = (*P)(x, 0);
116 for (size_t y = 1; y < nbStates_; y++)
117 {
118 (*cumpxy_node_c_x_)[y] = (*cumpxy_node_c_x_)[y - 1] + (*P)(x, y);
119 }
120 }
121 }
122 }
123
124 // Initialize cumulative prob for mixture nodes
125 auto nodes = tree_.getAllNodes();
126
127 for (auto node:nodes)
128 {
129 if (node->isMixture()) // set probas to chose
130 {
131 auto outEdges = tree_.getOutgoingEdges(node);
132 Vdouble vprob(0);
133
134 for (auto edge : outEdges)
135 {
136 auto model = dynamic_pointer_cast<const MixedTransitionModelInterface>(edge->getModel());
137 if (!model)
138 throw Exception("SubstitutionProcessSiteSimulator::init : model in edge " + TextTools::toString(tree_.getEdgeIndex(edge)) + " is not a mixture.");
139
140 const auto& vNb(edge->subModelNumbers());
141
142 double x = 0.;
143 for (auto nb:vNb)
144 {
145 x += model->getNProbability(nb);
146 }
147
148 vprob.push_back(x);
149 node->sons_.push_back(tree_.getSon(edge));
150 }
151
152 vprob /= VectorTools::sum(vprob);
153
154 // Here there is no use to have one cumProb_ per class, but this
155 // is used for a posteriori simulations
156
157 node->cumProb_.resize(nbClasses_);
158 for (size_t c = 0; c < nbClasses_; c++)
159 {
160 node->cumProb_[c] = VectorTools::cumSum(vprob);
161 }
162 }
163 }
164}
165
166/******************************************************************************/
167
169{
170 if (continuousRates_ && process_->getRateDistribution())
171 {
172 double rate = process_->getRateDistribution()->randC();
173 return simulateSite(rate);
174 }
175 else
176 {
177 size_t rateClass = RandomTools::pickFromCumSum(qRates_);
178 return simulateSite(rateClass);
179 }
180}
181
182
184{
185 // Draw an initial state randomly according to given frequencies:
186 // Use rate class 0
187
188 size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[0]);
189
190 shared_ptr<SimProcessNode> root = tree_.getRoot();
191 root->state_ = initialStateIndex;
192
193 evolveInternal(root, rate);
194
195 // Now create a Site object:
196 Vint site(seqNames_.size());
197 for (size_t i = 0; i < seqNames_.size(); ++i)
198 {
199 site[i] = process_->stateMap().getAlphabetStateAsInt(speciesNodes_.at(seqIndexes_[i])->state_);
200 }
201 auto alphabet = getAlphabet();
202 return make_unique<Site>(site, alphabet);
203}
204
205
206unique_ptr<Site> SimpleSubstitutionProcessSiteSimulator::simulateSite(size_t rateClass) const
207{
208 // Draw an initial state randomly according to equilibrium frequencies:
209
210 size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[rateClass]);
211
212 shared_ptr<SimProcessNode> root = tree_.getRoot();
213 root->state_ = initialStateIndex;
214
215 evolveInternal(root, rateClass);
216
217 // Now create a Site object:
218 Vint site(seqNames_.size());
219 for (size_t i = 0; i < seqNames_.size(); ++i)
220 {
221 site[i] = process_->stateMap().getAlphabetStateAsInt(speciesNodes_.at(seqIndexes_[i])->state_);
222 }
223
224 auto alphabet = getAlphabet();
225 return make_unique<Site>(site, alphabet);
226}
227
228
229unique_ptr<Site> SimpleSubstitutionProcessSiteSimulator::simulateSite(size_t ancestralStateIndex, double rate) const
230{
231 shared_ptr<SimProcessNode> root = tree_.getRoot();
232 root->state_ = ancestralStateIndex;
233
234 evolveInternal(root, rate);
235
236 // Now create a Site object:
237 Vint site(seqNames_.size());
238 for (size_t i = 0; i < seqNames_.size(); ++i)
239 {
240 site[i] = process_->stateMap().getAlphabetStateAsInt(speciesNodes_.at(seqIndexes_[i])->state_);
241 }
242
243 auto alphabet = getAlphabet();
244 return make_unique<Site>(site, alphabet);
245}
246
247/******************************************************************************/
248
249unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite() const
250{
251 if (continuousRates_ && process_->getRateDistribution())
252 {
253 double rate = process_->getRateDistribution()->randC();
254 return dSimulateSite(rate);
255 }
256 else
257 {
258 size_t rateClass = RandomTools::pickFromCumSum(qRates_);
259 return dSimulateSite(rateClass);
260 }
261}
262
263
264unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite(double rate) const
265{
266 // Draw an initial state randomly according to equilibrium frequencies:
267 // Use rate class 0
268
269 size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[0]);
270
271 shared_ptr<SimProcessNode> root = tree_.getRoot();
272 root->state_ = initialStateIndex;
273
274 auto ssr = make_unique<SiteSimulationResult>(phyloTree_, process_->getStateMap(), initialStateIndex);
275
276 evolveInternal(root, rate, ssr.get());
277
278 return ssr;
279}
280
281unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite(size_t rateClass) const
282{
283 // Draw an initial state randomly according to equilibrium frequencies:
284 // Use rate class 0
285
286 size_t initialStateIndex = RandomTools::pickFromCumSum(qRoots_[rateClass]);
287
288 shared_ptr<SimProcessNode> root = tree_.getRoot();
289 root->state_ = initialStateIndex;
290
291 auto ssr = make_unique<SiteSimulationResult>(phyloTree_, process_->getStateMap(), initialStateIndex);
292
293 evolveInternal(root, rateClass, ssr.get());
294
295 return ssr;
296}
297
298unique_ptr<SiteSimulationResult> SimpleSubstitutionProcessSiteSimulator::dSimulateSite(size_t ancestralStateIndex, double rate) const
299{
300 shared_ptr<SimProcessNode> root = tree_.getRoot();
301 root->state_ = ancestralStateIndex;
302
303 auto ssr = make_unique<SiteSimulationResult>(phyloTree_, process_->getStateMap(), ancestralStateIndex);
304
305 evolveInternal(root, rate, ssr.get());
306
307 return ssr;
308}
309
310
311/******************************************************************************/
312
314 std::shared_ptr<SimProcessNode> node,
315 size_t rateClass,
316 SiteSimulationResult* ssr) const
317{
318 speciesNodes_[node->getSpeciesIndex()] = node;
319
320 if (node->isSpeciation())
321 {
322 auto vEdge = tree_.getOutgoingEdges(node);
323
324 for (auto edge : vEdge)
325 {
326 auto son = tree_.getSon(edge);
327
328 if (edge->getModel())
329 {
330 if (ssr) // Detailed simulation
331 {
332 auto tm = dynamic_pointer_cast<const SubstitutionModelInterface>(edge->getModel());
333
334 if (!tm)
335 throw Exception("SimpleSubstitutionProcessSiteSimulator::EvolveInternal : detailed simulation not possible for non-markovian model on edge " + TextTools::toString(son->getSpeciesIndex()) + " for model " + edge->getModel()->getName());
336
337 SimpleMutationProcess process(tm);
338
339 double brlen = process_->getRateDistribution()->getCategory(rateClass) * phyloTree_->getEdge(edge->getSpeciesIndex())->getLength();
340 MutationPath mp(tm->getAlphabet(), node->state_, brlen);
341 if (dynamic_cast<const GivenDataSubstitutionProcessSiteSimulator*>(this) == 0)
342 {
343 mp = process.detailedEvolve(node->state_, brlen);
344 son->state_ = mp.getFinalState();
345 }
346 else // First get final state
347 {
348 son->state_ = RandomTools::pickFromCumSum(edge->cumpxy_[rateClass][node->state_]);
349 mp = process.detailedEvolve(node->state_, son->state_, brlen);
350 }
351
352 // Now append infos in ssr:
353 ssr->addNode(tree_.getNodeIndex(tree_.getSon(edge)), mp);
354 }
355 else
356 son->state_ = RandomTools::pickFromCumSum(edge->cumpxy_[rateClass][node->state_]);
357 }
358 else
359 son->state_ = node->state_;
360
361 evolveInternal(son, rateClass, ssr);
362 }
363 }
364 else if (node->isMixture())
365 {
366 const auto& cumProb = node->cumProb_[rateClass];
367
368 size_t y = RandomTools::pickFromCumSum(cumProb);
369 auto son = node->sons_[y];
370 son->state_ = node->state_;
371 evolveInternal(son, rateClass, ssr);
372 }
373 else
374 throw Exception("SimpleSubstitutionProcessSiteSimulator::evolveInternal : unknown property for node " + TextTools::toString(tree_.getNodeIndex(node)));
375}
376
377/******************************************************************************/
378
380 std::shared_ptr<SimProcessNode> node,
381 double rate,
382 SiteSimulationResult* ssr) const
383{
384 speciesNodes_[node->getSpeciesIndex()] = node;
385
386 if (node->isSpeciation())
387 {
388 auto vEdge = tree_.getOutgoingEdges(node);
389
390 for (auto edge : vEdge)
391 {
392 auto son = tree_.getSon(edge);
393
394 if (edge->getModel())
395 {
396 auto tm = dynamic_pointer_cast<const TransitionModelInterface>(edge->getModel());
397
398 double brlen = rate * phyloTree_->getEdge(edge->getSpeciesIndex())->getLength();
399
400
401 if (ssr) // Detailed simulation
402 {
403 auto sm = dynamic_pointer_cast<const SubstitutionModelInterface>(edge->getModel());
404
405 if (!sm)
406 throw Exception("SimpleSubstitutionProcessSiteSimulator::EvolveInternal : detailed simulation not possible for non-markovian model on edge " + TextTools::toString(son->getSpeciesIndex()) + " for model " + tm->getName());
407
408 SimpleMutationProcess process(sm);
409
410 MutationPath mp(tm->getAlphabet(), node->state_, brlen);
411
412 if (dynamic_cast<const GivenDataSubstitutionProcessSiteSimulator*>(this) == 0)
413 {
414 mp = process.detailedEvolve(node->state_, brlen);
415 son->state_ = mp.getFinalState();
416 }
417 else // First get final state
418 {
419 // Look for the rateClass where rate is (approximation)
420 size_t rateClass = process_->getRateDistribution()->getCategoryIndex(rate);
421
422 son->state_ = RandomTools::pickFromCumSum(edge->cumpxy_[rateClass][node->state_]);
423 mp = process.detailedEvolve(node->state_, son->state_, brlen);
424 }
425
426 // Now append infos in ssr:
427 ssr->addNode(edge->getSpeciesIndex(), mp);
428 }
429 else
430 {
431 // process transition probabilities already consider rates &
432 // branch length
433
434 const Matrix<double>* P;
435
436 const auto& vSub(edge->subModelNumbers());
437
438 if (vSub.size() == 0)
439 P = &tm->getPij_t(brlen);
440 else
441 {
442 if (vSub.size() > 1)
443 throw Exception("SubstitutionProcessSiteSimulator::init : only 1 submodel can be used.");
444
445 const auto mmodel = dynamic_pointer_cast<const MixedTransitionModelInterface>(tm);
446 const auto model = mmodel->getNModel(vSub[0]);
447
448 P = &model->getPij_t(brlen);
449 }
450
452 for (size_t y = 0; y < nbStates_; y++)
453 {
454 rand -= (*P)(node->state_, y);
455 if (rand <= 0)
456 {
457 son->state_ = y;
458 break;
459 }
460 }
461 }
462 }
463 else
464 {
465 son->state_ = node->state_;
466 }
467
468 evolveInternal(son, rate, ssr);
469 }
470 }
471 else if (node->isMixture())
472 {
473 const auto& cumProb = node->cumProb_[0]; // index 0 because it is only possible in a priori simulations, ie all class mixture probabilities are the same
474
475 size_t y = RandomTools::pickFromCumSum(cumProb);
476 auto son = node->sons_[y];
477 son->state_ = node->state_;
478 evolveInternal(son, rate, ssr);
479 }
480 else
481 throw Exception("SimpleSubstitutionProcessSiteSimulator::evolveInternal : unknown property for node " + TextTools::toString(tree_.getNodeIndex(node)));
482}
483
484
486{
488
490 {
491 auto vCN = phyloTree_->getAllNodes();
492 seqNames_.resize(vCN.size());
493 seqIndexes_.resize(vCN.size());
494 for (size_t i = 0; i < seqNames_.size(); i++)
495 {
496 auto index = phyloTree_->getNodeIndex(vCN[i]);
497 seqNames_[i] = (phyloTree_->isLeaf(vCN[i])) ? vCN[i]->getName() : TextTools::toString(index);
498 seqIndexes_[i] = index;
499 }
500 }
501 else
502 {
503 auto vCN = phyloTree_->getAllLeaves();
504 seqNames_.resize(vCN.size());
505 seqIndexes_.resize(vCN.size());
506 for (size_t i = 0; i < seqNames_.size(); i++)
507 {
508 seqNames_[i] = vCN[i]->getName();
509 seqIndexes_[i] = phyloTree_->getNodeIndex(vCN[i]);
510 }
511 }
512}
MutationPath detailedEvolve(size_t initialState, double time) const
Simulation a character evolution during a specified time according to the given substitution model an...
virtual std::vector< std::shared_ptr< E > > getAllEdges() const=0
virtual std::shared_ptr< N > getRoot() const=0
virtual EdgeIndex getEdgeIndex(const std::shared_ptr< E > edgeObject) const=0
virtual std::vector< std::shared_ptr< N > > getAllNodes() const=0
virtual NodeIndex getNodeIndex(const std::shared_ptr< N > nodeObject) const=0
virtual std::vector< std::shared_ptr< E > > getOutgoingEdges(const std::shared_ptr< N > node) const=0
std::shared_ptr< N > getSon(const std::shared_ptr< E > edge) const
Site simulation under a unique substitution process, given data.
virtual void resize(size_t nRows, size_t nCols)=0
This class is used by MutationProcess to store detailed results of simulations.
size_t getFinalState() const
Retrieve the final state of this path.
static size_t pickFromCumSum(const std::vector< double > &w)
static double giveRandomNumberBetweenZeroAndEntry(double entry)
Generally used mutation process model.
std::shared_ptr< const ParametrizablePhyloTree > phyloTree_
std::shared_ptr< const SubstitutionProcessInterface > process_
void evolveInternal(std::shared_ptr< SimProcessNode > node, size_t rateClass, SiteSimulationResult *ssr=nullptr) const
SPTree tree_
To store states & transition probabilities of the simulator.
std::map< size_t, std::shared_ptr< SimProcessNode > > speciesNodes_
Map between species Indexes & used nodes, may change at each simulation.
std::vector< size_t > seqIndexes_
Vector of indexes of sequenced output species.
void outputInternalSites(bool yn) override
Sets whether we will output the internal sequences or not.
std::vector< std::string > seqNames_
Vector of names of sequenced output species.
std::unique_ptr< SiteSimulationResult > dSimulateSite() const override
Get a detailed simulation result for one site.
Vdouble qRates_
cumsum probas of the substitution rates
std::shared_ptr< const Alphabet > getAlphabet() const override
Data structure to store the result of a DetailedSiteSimulator.
virtual void addNode(unsigned int nodeId, MutationPath path)
static T sum(const std::vector< T > &v1)
static std::vector< T > cumSum(const std::vector< T > &v1)
std::string toString(T t)
Defines the basic types of data flow nodes.
std::vector< double > Vdouble
std::vector< int > Vint
std::vector< VVdouble > VVVdouble
std::vector< Vdouble > VVdouble