bpp-phyl3  3.0.0
RewardMappingTools.cpp
// SPDX-FileCopyrightText: The Bio++ Development Group
//
// SPDX-License-Identifier: CECILL-2.1

#include "RewardMappingTools.h"
#include "../Likelihood/DRTreeLikelihoodTools.h"
#include "../Likelihood/MarginalAncestralStateReconstruction.h"

#include <Bpp/Text/TextTools.h>
#include <Bpp/App/ApplicationTools.h>
#include <Bpp/Numeric/VectorTools.h>
#include <Bpp/Numeric/DataTable.h>

using namespace bpp;

// From the STL:
#include <iomanip>

using namespace std;

/******************************************************************************/

std::unique_ptr<LegacyProbabilisticRewardMapping> LegacyRewardMappingTools::computeRewardVectors(
  std::shared_ptr<const DRTreeLikelihoodInterface> drtl,
  const vector<int>& nodeIds,
  std::shared_ptr<Reward> reward,
  bool verbose)
{
  // Preamble:
  if (!drtl->isInitialized())
    throw Exception("RewardMappingTools::computeRewardVectors(). Likelihood object is not initialized.");

  // A few variables we'll need:

  const TreeTemplate<Node> tree(drtl->tree());
  const auto& sequences = drtl->data();
  auto& rDist = drtl->rateDistribution();

  size_t nbSites = sequences.getNumberOfSites();
  size_t nbDistinctSites = drtl->likelihoodData().getNumberOfDistinctSites();
  size_t nbStates = sequences.alphabet().getSize();
  size_t nbClasses = rDist.getNumberOfCategories();
  vector<const Node*> nodes = tree.getNodes();
  const vector<size_t>& rootPatternLinks = drtl->likelihoodData().getRootArrayPositions();
  nodes.pop_back(); // Remove root node.
  size_t nbNodes = nodes.size();

  // We create a new ProbabilisticRewardMapping object:
  auto rewards = make_unique<LegacyProbabilisticRewardMapping>(tree, reward, nbSites);

  // Store likelihood for each rate for each site:
  VVVdouble lik;
  drtl->computeLikelihoodAtNode(tree.getRootId(), lik);
  Vdouble Lr(nbDistinctSites, 0);
  Vdouble rcProbs = rDist.getProbabilities();
  Vdouble rcRates = rDist.getCategories();
  for (size_t i = 0; i < nbDistinctSites; i++)
  {
    VVdouble& lik_i = lik[i];
    for (size_t c = 0; c < nbClasses; c++)
    {
      Vdouble& lik_i_c = lik_i[c];
      double rc = rDist.getProbability(c);
      for (size_t s = 0; s < nbStates; s++)
      {
        Lr[i] += lik_i_c[s] * rc;
      }
    }
  }

  // Compute the reward for each class and each branch in the tree:
  if (verbose)
    ApplicationTools::displayTask("Compute joint node-pairs likelihood", true);

  for (size_t l = 0; l < nbNodes; ++l)
  {
    // For each node,
    const Node* currentNode = nodes[l];
    if (nodeIds.size() > 0 && !VectorTools::contains(nodeIds, currentNode->getId()))
      continue;

    const Node* father = currentNode->getFather();

    double d = currentNode->getDistanceToFather();

    if (verbose)
      ApplicationTools::displayGauge(l, nbNodes - 1);
    Vdouble rewardsForCurrentNode(nbDistinctSites);

    // Now we've got to compute likelihoods in a smart manner... ;)
    VVVdouble likelihoodsFatherConstantPart(nbDistinctSites);
    for (size_t i = 0; i < nbDistinctSites; i++)
    {
      VVdouble& likelihoodsFatherConstantPart_i = likelihoodsFatherConstantPart[i];
      likelihoodsFatherConstantPart_i.resize(nbClasses);
      for (size_t c = 0; c < nbClasses; c++)
      {
        Vdouble& likelihoodsFatherConstantPart_i_c = likelihoodsFatherConstantPart_i[c];
        likelihoodsFatherConstantPart_i_c.resize(nbStates);
        double rc = rDist.getProbability(c);
        for (size_t s = 0; s < nbStates; s++)
        {
          // (* likelihoodsFatherConstantPart_i_c)[s] = rc * model->freq(s);
          // freq is already accounted in the array
          likelihoodsFatherConstantPart_i_c[s] = rc;
        }
      }
    }

    // First, what will remain constant:
    size_t nbSons = father->getNumberOfSons();
    for (size_t n = 0; n < nbSons; n++)
    {
      const Node* currentSon = father->getSon(n);
      if (currentSon->getId() != currentNode->getId())
      {
        const VVVdouble& likelihoodsFather_son = drtl->likelihoodData().getLikelihoodArray(father->getId(), currentSon->getId());

        // Now iterate over all site partitions:
        unique_ptr<TreeLikelihoodInterface::ConstBranchModelIterator> mit(drtl->getNewBranchModelIterator(currentSon->getId()));
        VVVdouble pxy;
        bool first;
        while (mit->hasNext())
        {
          auto bmd = mit->next();
          unique_ptr<TreeLikelihoodInterface::SiteIterator> sit(bmd->getNewSiteIterator());
          first = true;
          while (sit->hasNext())
          {
            size_t i = sit->next();
            // We retrieve the transition probabilities for this site partition:
            if (first)
            {
              pxy = drtl->getTransitionProbabilitiesPerRateClass(currentSon->getId(), i);
              first = false;
            }
            const VVdouble& likelihoodsFather_son_i = likelihoodsFather_son[i];
            VVdouble& likelihoodsFatherConstantPart_i = likelihoodsFatherConstantPart[i];
            for (size_t c = 0; c < nbClasses; c++)
            {
              const Vdouble& likelihoodsFather_son_i_c = likelihoodsFather_son_i[c];
              Vdouble& likelihoodsFatherConstantPart_i_c = likelihoodsFatherConstantPart_i[c];
              VVdouble& pxy_c = pxy[c];
              for (size_t x = 0; x < nbStates; x++)
              {
                Vdouble& pxy_c_x = pxy_c[x];
                double likelihood = 0.;
                for (size_t y = 0; y < nbStates; y++)
                {
                  likelihood += pxy_c_x[y] * likelihoodsFather_son_i_c[y];
                }
                likelihoodsFatherConstantPart_i_c[x] *= likelihood;
              }
            }
          }
        }
      }
    }
    if (father->hasFather())
    {
      const Node* currentSon = father->getFather();
      const VVVdouble& likelihoodsFather_son = drtl->likelihoodData().getLikelihoodArray(father->getId(), currentSon->getId());
      // Now iterate over all site partitions:
      unique_ptr<TreeLikelihoodInterface::ConstBranchModelIterator> mit(drtl->getNewBranchModelIterator(father->getId()));
      VVVdouble pxy;
      bool first;
      while (mit->hasNext())
      {
        auto bmd = mit->next();
        unique_ptr<TreeLikelihoodInterface::SiteIterator> sit(bmd->getNewSiteIterator());
        first = true;
        while (sit->hasNext())
        {
          size_t i = sit->next();
          // We retrieve the transition probabilities for this site partition:
          if (first)
          {
            pxy = drtl->getTransitionProbabilitiesPerRateClass(father->getId(), i);
            first = false;
          }
          const VVdouble& likelihoodsFather_son_i = likelihoodsFather_son[i];
          VVdouble& likelihoodsFatherConstantPart_i = likelihoodsFatherConstantPart[i];
          for (size_t c = 0; c < nbClasses; c++)
          {
            const Vdouble& likelihoodsFather_son_i_c = likelihoodsFather_son_i[c];
            Vdouble& likelihoodsFatherConstantPart_i_c = likelihoodsFatherConstantPart_i[c];
            VVdouble& pxy_c = pxy[c];
            for (size_t x = 0; x < nbStates; x++)
            {
              double likelihood = 0.;
              for (size_t y = 0; y < nbStates; y++)
              {
                Vdouble& pxy_c_x = pxy_c[y];
                likelihood += pxy_c_x[x] * likelihoodsFather_son_i_c[y];
              }
              likelihoodsFatherConstantPart_i_c[x] *= likelihood;
            }
          }
        }
      }
    }
    else
    {
      // Account for root frequencies:
      for (size_t i = 0; i < nbDistinctSites; i++)
      {
        vector<double> freqs = drtl->getRootFrequencies(i);
        VVdouble* likelihoodsFatherConstantPart_i = &likelihoodsFatherConstantPart[i];
        for (size_t c = 0; c < nbClasses; c++)
        {
          Vdouble* likelihoodsFatherConstantPart_i_c = &(*likelihoodsFatherConstantPart_i)[c];
          for (size_t x = 0; x < nbStates; x++)
          {
            (*likelihoodsFatherConstantPart_i_c)[x] *= freqs[x];
          }
        }
      }
    }

    // Then, we deal with the node of interest.
    // We first average upon 'y' to save computations, and then upon 'x'.
    // ('y' is the state at 'node' and 'x' the state at 'father'.)

    // Iterate over all site partitions:
    const VVVdouble& likelihoodsFather_node = drtl->likelihoodData().getLikelihoodArray(father->getId(), currentNode->getId());
    unique_ptr<TreeLikelihoodInterface::ConstBranchModelIterator> mit(drtl->getNewBranchModelIterator(currentNode->getId()));
    VVVdouble pxy;
    bool first;
    while (mit->hasNext())
    {
      auto bmd = mit->next();
      reward->setSubstitutionModel(bmd->getSubstitutionModel());
      // compute all nxy first:
      VVVdouble nxy(nbClasses);
      for (size_t c = 0; c < nbClasses; ++c)
      {
        VVdouble& nxy_c = nxy[c];
        double rc = rcRates[c];
        auto nij = reward->getAllRewards(d * rc);
        nxy_c.resize(nbStates);
        for (size_t x = 0; x < nbStates; ++x)
        {
          Vdouble& nxy_c_x = nxy_c[x];
          nxy_c_x.resize(nbStates);
          for (size_t y = 0; y < nbStates; ++y)
          {
            nxy_c_x[y] = (*nij)(x, y);
          }
        }
      }

      // Now loop over sites:
      unique_ptr<TreeLikelihoodInterface::SiteIterator> sit(bmd->getNewSiteIterator());
      first = true;
      while (sit->hasNext())
      {
        size_t i = sit->next();
        // We retrieve the transition probabilities and substitution counts for this site partition:
        if (first)
        {
          pxy = drtl->getTransitionProbabilitiesPerRateClass(currentNode->getId(), i);
          first = false;
        }
        const VVdouble& likelihoodsFather_node_i = likelihoodsFather_node[i];
        VVdouble& likelihoodsFatherConstantPart_i = likelihoodsFatherConstantPart[i];
        for (size_t c = 0; c < nbClasses; ++c)
        {
          const Vdouble& likelihoodsFather_node_i_c = likelihoodsFather_node_i[c];
          Vdouble& likelihoodsFatherConstantPart_i_c = likelihoodsFatherConstantPart_i[c];
          const VVdouble& pxy_c = pxy[c];
          VVdouble& nxy_c = nxy[c];
          for (size_t x = 0; x < nbStates; ++x)
          {
            double& likelihoodsFatherConstantPart_i_c_x = likelihoodsFatherConstantPart_i_c[x];
            const Vdouble& pxy_c_x = pxy_c[x];
            for (size_t y = 0; y < nbStates; ++y)
            {
              double likelihood_cxy = likelihoodsFatherConstantPart_i_c_x
                  * pxy_c_x[y]
                  * likelihoodsFather_node_i_c[y];

              // Now the vector computation:
              rewardsForCurrentNode[i] += likelihood_cxy * nxy_c[x][y];
              //                          <------------>   <--------->
              // Posterior probability          |               |
              // for site i and rate class c *  |               |
              // likelihood for this site ------+               |
              //                                                |
              // Reward function for site i and rate class c ---+
            }
          }
        }
      }
    }

    // Now we just have to copy the substitutions into the result vector:
    for (size_t i = 0; i < nbSites; ++i)
    {
      (*rewards)(l, i) = rewardsForCurrentNode[rootPatternLinks[i]] / Lr[rootPatternLinks[i]];
    }
  }
  if (verbose)
  {
    ApplicationTools::displayTaskDone();
  }
  return rewards;
}
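
// Usage sketch (illustrative only, not part of the original file): assuming
// `drtl` is an already initialized DRTreeLikelihoodInterface, `reward` a
// configured Reward, and `sites` the alignment the likelihood was built on,
// reward vectors can be computed for every branch and written out as a
// tab-separated table with writeToStream(). The helper name and the output
// file name are hypothetical. Note that the second column of the written
// table (headed "Mean") holds the branch length.

#include <fstream>

void exampleComputeAndWriteRewards(
  std::shared_ptr<const DRTreeLikelihoodInterface> drtl,
  std::shared_ptr<Reward> reward,
  const SiteContainerInterface& sites)
{
  // An empty id list means that every branch is processed (see the nodeIds test above):
  std::vector<int> nodeIds;
  auto rewards = LegacyRewardMappingTools::computeRewardVectors(drtl, nodeIds, reward, true);

  // One row per branch, one column per site:
  std::ofstream out("rewards.tsv");
  LegacyRewardMappingTools::writeToStream(*rewards, sites, out);
}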

/**************************************************************************************************/

void LegacyRewardMappingTools::writeToStream(
  const LegacyProbabilisticRewardMapping& rewards,
  const SiteContainerInterface& sites,
  ostream& out)
{
  if (!out)
    throw IOException("RewardMappingTools::writeToFile. Can't write to stream.");
  out << "Branches";
  out << "\tMean";
  for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
  {
    out << "\tSite" << sites.site(i).getCoordinate();
  }
  out << endl;

  for (size_t j = 0; j < rewards.getNumberOfBranches(); j++)
  {
    out << rewards.getNode(j)->getId() << "\t" << rewards.getNode(j)->getDistanceToFather();
    for (size_t i = 0; i < rewards.getNumberOfSites(); i++)
    {
      out << "\t" << rewards(j, i);
    }
    out << endl;
  }
}

/**************************************************************************************************/

void LegacyRewardMappingTools::readFromStream(std::istream& in, LegacyProbabilisticRewardMapping& rewards)
{
  try
  {
    auto data = DataTable::read(in, "\t", true, -1);
    vector<string> ids = data->getColumn(0);
    data->deleteColumn(0); // Remove ids
    data->deleteColumn(0); // Remove means
    // Now parse the table:
    size_t nbSites = data->getNumberOfColumns();
    rewards.setNumberOfSites(nbSites);
    size_t nbBranches = data->getNumberOfRows();
    for (size_t i = 0; i < nbBranches; i++)
    {
      int id = TextTools::toInt(ids[i]);
      size_t br = rewards.getNodeIndex(id);
      for (size_t j = 0; j < nbSites; j++)
      {
        rewards(br, j) = TextTools::toDouble((*data)(i, j));
      }
    }
    // Parse the header:
    for (size_t i = 0; i < nbSites; i++)
    {
      string siteTxt = data->getColumnName(i);
      int site = 0;
      if (siteTxt.substr(0, 4) == "Site")
        site = TextTools::to<int>(siteTxt.substr(4));
      else
        site = TextTools::to<int>(siteTxt);
      rewards.setSitePosition(i, site);
    }
  }
  catch (Exception& e)
  {
    throw IOException(string("Bad input file. ") + e.what());
  }
}
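
// Sketch (illustrative only): reading a previously written table back into an
// existing mapping. The mapping must already describe the same tree, since the
// node ids in the first column are resolved through rewards.getNodeIndex().
// The helper name and the input file name are hypothetical.

void exampleReadRewards(LegacyProbabilisticRewardMapping& rewards)
{
  std::ifstream in("rewards.tsv"); // requires <fstream>
  if (!in)
    throw IOException("exampleReadRewards. Cannot open input file.");
  LegacyRewardMappingTools::readFromStream(in, rewards);
}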

/**************************************************************************************************/

double LegacyRewardMappingTools::computeSumForBranch(const LegacyRewardMappingInterface& smap, size_t branchIndex)
{
  size_t nbSites = smap.getNumberOfSites();
  double v = 0;
  for (size_t i = 0; i < nbSites; ++i)
  {
    v += smap(branchIndex, i);
  }
  return v;
}

/**************************************************************************************************/

double LegacyRewardMappingTools::computeSumForSite(const LegacyRewardMappingInterface& smap, size_t siteIndex)
{
  size_t nbBranches = smap.getNumberOfBranches();
  double v = 0;
  for (size_t i = 0; i < nbBranches; ++i)
  {
    v += smap(i, siteIndex);
  }
  return v;
}
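
// Sketch (illustrative only): per-branch and per-site totals of a computed
// mapping, printed to standard output. The helper name is hypothetical.

#include <iostream>

void exampleRewardSummaries(const LegacyProbabilisticRewardMapping& rewards)
{
  for (size_t b = 0; b < rewards.getNumberOfBranches(); ++b)
  {
    std::cout << "Branch " << rewards.getNode(b)->getId() << ": "
              << LegacyRewardMappingTools::computeSumForBranch(rewards, b) << std::endl;
  }
  for (size_t i = 0; i < rewards.getNumberOfSites(); ++i)
  {
    std::cout << "Site " << i << ": "
              << LegacyRewardMappingTools::computeSumForSite(rewards, i) << std::endl;
  }
}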

/**************************************************************************************************/