bpp-phyl3  3.0.0
HierarchicalClustering.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include "../Tree/NodeTemplate.h"
7 
8 using namespace bpp;
9 using namespace std;
10 
11 const string HierarchicalClustering::COMPLETE = "Complete";
12 const string HierarchicalClustering::SINGLE = "Single";
13 const string HierarchicalClustering::AVERAGE = "Average";
14 const string HierarchicalClustering::MEDIAN = "Median";
15 const string HierarchicalClustering::WARD = "Ward";
16 const string HierarchicalClustering::CENTROID = "Centroid";
17 
19 {
20  vector<size_t> bestPair(2);
21  double distMin = -std::log(0.);
22  for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++)
23  {
24  size_t id = i->first;
25  map<size_t, Node*>::iterator j = i;
26  j++;
27  for ( ; j != currentNodes_.end(); j++)
28  {
29  size_t jd = j->first;
30  double dist = matrix_(id, jd);
31  if (dist < distMin)
32  {
33  distMin = dist;
34  bestPair[0] = id;
35  bestPair[1] = jd;
36  }
37  }
38  }
39  if (distMin == -std::log(0.))
40  {
41  cout << "---------------------------------------------------------------------------------" << endl;
42  for (map<size_t, Node*>::iterator i = currentNodes_.begin(); i != currentNodes_.end(); i++)
43  {
44  size_t id = i->first;
45  map<size_t, Node*>::iterator j = i;
46  j++;
47  for ( ; j != currentNodes_.end(); j++)
48  {
49  size_t jd = j->first;
50  double dist = matrix_(id, jd);
51  cout << dist << "\t";
52  }
53  cout << endl;
54  }
55  cout << "---------------------------------------------------------------------------------" << endl;
56 
57  throw Exception("Unexpected error: no minimum found in the distance matrix.");
58  }
59 
60  return bestPair;
61 }
62 vector<double> HierarchicalClustering::computeBranchLengthsForPair(const vector<size_t>& pair)
63 {
64  vector<double> d(2);
65  double dist = matrix_(pair[0], pair[1]) / 2.;
66  d[0] = dist - dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().length;
67  d[1] = dist - dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().length;
68  return d;
69 }
70 
71 double HierarchicalClustering::computeDistancesFromPair(const vector<size_t>& pair, const vector<double>& branchLengths, size_t pos)
72 {
73  double w1, w2, w3, w4;
74  if (method_ == "Single")
75  {
76  w1 = .5;
77  w2 = .5;
78  w3 = 0.;
79  w4 = -.5;
80  }
81  else if (method_ == "Complete")
82  {
83  w1 = .5;
84  w2 = .5;
85  w3 = 0.;
86  w4 = .5;
87  }
88  else if (method_ == "Median")
89  {
90  w1 = .5;
91  w2 = .5;
92  w3 = -0.25;
93  w4 = 0.;
94  }
95  else if (method_ == "Average")
96  {
97  double n1 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().numberOfLeaves);
98  double n2 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().numberOfLeaves);
99  w1 = n1 / (n1 + n2);
100  w2 = n2 / (n1 + n2);
101  w3 = 0.;
102  w4 = 0.;
103  }
104  else if (method_ == "Ward")
105  {
106  double n1 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().numberOfLeaves);
107  double n2 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().numberOfLeaves);
108  double n3 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pos])->getInfos().numberOfLeaves);
109  w1 = (n1 + n3) / (n1 + n2 + n3);
110  w2 = (n2 + n3) / (n1 + n2 + n3);
111  w3 = -n3 / (n1 + n2 + n3);
112  w4 = 0.;
113  }
114  else if (method_ == "Centroid")
115  {
116  double n1 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[0]])->getInfos().numberOfLeaves);
117  double n2 = static_cast<double>(dynamic_cast<NodeTemplate<ClusterInfos>*>(currentNodes_[pair[1]])->getInfos().numberOfLeaves);
118  w1 = n1 / (n1 + n2);
119  w2 = n2 / (n1 + n2);
120  w3 = -n1 * n2 / pow(n1 + n2, 2.);
121  w4 = 0.;
122  }
123  else
124  throw Exception("HierarchicalClustering::computeBranchLengthsForPair. unknown method '" + method_ + "'.");
125  double d1 = matrix_(pair[0], pos);
126  double d2 = matrix_(pair[1], pos);
127  double d3 = matrix_(pair[0], pair[1]);
128  return w1 * d1 + w2 * d2 + w3 * d3 + w4 * std::abs(d1 - d2);
129 }
130 
132 {
134  map<size_t, Node*>::iterator it = currentNodes_.begin();
135  size_t i1 = it->first;
136  Node* n1 = it->second;
137  it++;
138  size_t i2 = it->first;
139  Node* n2 = it->second;
140  double d = matrix_(i1, i2) / 2;
141  root->addSon(n1);
142  root->addSon(n2);
143  n1->setDistanceToFather(d - dynamic_cast<NodeTemplate<ClusterInfos>*>(n1)->getInfos().length);
144  n2->setDistanceToFather(d - dynamic_cast<NodeTemplate<ClusterInfos>*>(n2)->getInfos().length);
145  tree_.reset(new TreeTemplate<NodeTemplate<ClusterInfos>>(root));
146 }
147 
148 Node* HierarchicalClustering::getLeafNode(int id, const string& name)
149 {
150  ClusterInfos infos;
151  infos.numberOfLeaves = 1;
152  infos.length = 0.;
154  leaf->setInfos(infos);
155  return leaf;
156 }
157 
159 {
160  ClusterInfos infos;
161  infos.numberOfLeaves =
162  dynamic_cast<NodeTemplate<ClusterInfos>*>(son1)->getInfos().numberOfLeaves
163  + dynamic_cast<NodeTemplate<ClusterInfos>*>(son2)->getInfos().numberOfLeaves;
164  infos.length = dynamic_cast<NodeTemplate<ClusterInfos>*>(son1)->getInfos().length + son1->getDistanceToFather();
165  Node* parent = new NodeTemplate<ClusterInfos>(id);
166  dynamic_cast<NodeTemplate<ClusterInfos>*>(parent)->setInfos(infos);
167  parent->addSon(son1);
168  parent->addSon(son2);
169  return parent;
170 }
double computeDistancesFromPair(const std::vector< size_t > &pair, const std::vector< double > &branchLengths, size_t pos)
Actualizes the distance matrix according to a given pair and the corresponding branch lengths.
std::vector< double > computeBranchLengthsForPair(const std::vector< size_t > &pair)
Compute the branch lengths for two nodes to agglomerate.
static const std::string AVERAGE
static const std::string CENTROID
virtual Node * getLeafNode(int id, const std::string &name)
Get a leaf node.
std::vector< size_t > getBestPair()
Get the best pair of nodes to agglomerate.
static const std::string COMPLETE
static const std::string MEDIAN
virtual Node * getParentNode(int id, Node *son1, Node *son2)
Get an inner node.
void finalStep(int idRoot)
Method called when there ar eonly three remaining node to agglomerate, and creates the root node of t...
static const std::string SINGLE
static const std::string WARD
The NodeTemplate class.
Definition: NodeTemplate.h:39
virtual void setInfos(const NodeInfos &infos)
Set the information to be associated to this node.
Definition: NodeTemplate.h:156
The phylogenetic node class.
Definition: Node.h:59
virtual void setDistanceToFather(double distance)
Set or update the distance toward the father node.
Definition: Node.h:266
virtual void addSon(size_t pos, Node *node)
Definition: Node.h:374
virtual double getDistanceToFather() const
Get the distance to the father node is there is one, otherwise throw a NodeException.
Definition: Node.h:250
The phylogenetic tree class.
Definition: TreeTemplate.h:59
Defines the basic types of data flow nodes.
ExtendedFloat pow(const ExtendedFloat &ef, double exp)
ExtendedFloat abs(const ExtendedFloat &ef)