bpp-phyl3  3.0.0
PatternTools.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include "PatternTools.h"
6 
7 // From bpp-seq:
8 #include <Bpp/Seq/SiteTools.h>
9 
10 using namespace bpp;
11 
12 // From the STL:
13 #include <iostream>
14 #include <algorithm>
15 
16 using namespace std;
17 
18 /******************************************************************************/
19 
20 std::unique_ptr<AlignmentDataInterface> PatternTools::getSequenceSubset(
21  const AlignmentDataInterface& sequenceSet,
22  const Node& node)
23 {
24  try
25  {
26  const auto& siteContainer = dynamic_cast<const SiteContainerInterface&>(sequenceSet);
27  return getSequenceSubset(siteContainer, node);
28  }
29  catch (std::bad_cast& e)
30  {}
31 
32  try
33  {
34  const auto& siteContainer = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequenceSet);
35  return getSequenceSubset(siteContainer, node);
36  }
37  catch (std::bad_cast& e)
38  {}
39 
40  throw Exception("PatternTools::getSequenceSubset : unsupported sequence type.");
41 }
42 
43 std::unique_ptr<SiteContainerInterface> PatternTools::getSequenceSubset(
44  const SiteContainerInterface& sequenceSet,
45  const Node& node)
46 {
47  auto alphabet = sequenceSet.getAlphabet();
48  size_t nbSites = sequenceSet.getNumberOfSites();
49  auto sequenceSubset = std::make_unique<VectorSiteContainer>(alphabet);
50 
51  auto leaves = TreeTemplateTools::getLeaves(node);
52 
53  for (auto i : leaves)
54  {
55  if (i->hasName())
56  {
57  // Use sequence name as key.
58  try
59  {
60  auto newSeq = std::make_unique<Sequence>(sequenceSet.sequence(i->getName()));
61  sequenceSubset->addSequence(i->getName(), newSeq);
62  }
63  catch (std::exception& e)
64  {
65  ApplicationTools::displayWarning("PatternTools::getSequenceSubset : Leaf name not found in sequence file: " + i->getName() + " : Replaced with unknown sequence");
66 
67  auto seq = std::make_unique<Sequence>(i->getName(), "", alphabet);
68  seq->setToSizeR(nbSites);
70  sequenceSubset->addSequence(i->getName(), seq);
71  }
72  }
73  }
74  sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
75  return sequenceSubset;
76 }
77 
78 std::unique_ptr<ProbabilisticSiteContainerInterface> PatternTools::getSequenceSubset(
79  const ProbabilisticSiteContainerInterface& sequenceSet,
80  const Node& node)
81 {
82  auto alphabet = sequenceSet.getAlphabet();
83  size_t nbSites = sequenceSet.getNumberOfSites();
84  auto sequenceSubset = std::make_unique<ProbabilisticVectorSiteContainer>(alphabet);
85 
86  auto leaves = TreeTemplateTools::getLeaves(node);
87 
88  for (auto i : leaves)
89  {
90  if (i->hasName())
91  {
92  // Use sequence name as key.
93  try
94  {
95  auto newSeq = std::make_unique<ProbabilisticSequence>(sequenceSet.sequence(i->getName()));
96  sequenceSubset->addSequence(i->getName(), newSeq);
97  }
98  catch (std::exception const& e)
99  {
100  ApplicationTools::displayWarning("PatternTools::getSequenceSubset : Leaf name not found in sequence file: " + i->getName() + " : Replaced with unknown sequence");
101 
102  auto newSeq = std::make_unique<ProbabilisticSequence>(i->getName(), Table<double>(alphabet->getSize(), 0), alphabet);
103  newSeq->setToSizeR(nbSites);
105  sequenceSubset->addSequence(i->getName(), newSeq);
106  }
107  }
108  }
109  sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
110  return sequenceSubset;
111 }
112 
113 /******************************************************************************/
114 
115 std::unique_ptr<AlignmentDataInterface> PatternTools::getSequenceSubset(
116  const AlignmentDataInterface& sequenceSet,
117  const std::vector<std::string>& names)
118 {
119  auto alphabet = sequenceSet.getAlphabet();
120 
121  try
122  {
123  const auto& siteContainer = dynamic_cast<const SiteContainerInterface&>(sequenceSet);
124  return getSequenceSubset(siteContainer, names);
125  }
126  catch (std::bad_cast& e)
127  {}
128 
129  try
130  {
131  const auto& siteContainer = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequenceSet);
132  return getSequenceSubset(siteContainer, names);
133  }
134  catch (std::bad_cast& e)
135  {}
136 
137  throw Exception("PatternTools::getSequenceSubset : unsupported sequence type.");
138 }
139 
140 
141 std::unique_ptr<SiteContainerInterface> PatternTools::getSequenceSubset(
142  const SiteContainerInterface& sequenceSet,
143  const std::vector<std::string>& names)
144 {
145  auto alphabet = sequenceSet.getAlphabet();
146  auto sequenceSubset = std::make_unique<VectorSiteContainer>(alphabet);
147 
148  for (auto& i : names)
149  {
150  if (sequenceSet.hasSequence(i))
151  {
152  auto newSeq = std::make_unique<Sequence>(sequenceSet.sequence(i));
153  sequenceSubset->addSequence(i, newSeq);
154  }
155  else
156  throw SequenceNotFoundException("PatternTools ERROR: name not found in sequence container: ", i);
157  }
158  sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
159  return std::move(sequenceSubset);
160 }
161 
162 std::unique_ptr<ProbabilisticSiteContainerInterface> PatternTools::getSequenceSubset(
163  const ProbabilisticSiteContainerInterface& sequenceSet,
164  const std::vector<std::string>& names)
165 {
166  auto alphabet = sequenceSet.getAlphabet();
167  auto sequenceSubset = std::make_unique<ProbabilisticVectorSiteContainer>(alphabet);
168 
169  for (auto& i : names)
170  {
171  if (sequenceSet.hasSequence(i))
172  {
173  auto newSeq = std::make_unique<ProbabilisticSequence>(sequenceSet.sequence(i));
174  sequenceSubset->addSequence(i, newSeq);
175  }
176  else
177  throw SequenceNotFoundException("PatternTools ERROR: name not found in sequence container: ", i);
178  }
179  sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
180  return std::move(sequenceSubset);
181 }
182 
183 /******************************************************************************/
184 
185 std::unique_ptr<AlignmentDataInterface> PatternTools::shrinkSiteSet(
186  const AlignmentDataInterface& siteSet)
187 {
188  try
189  {
190  const auto& sc = dynamic_cast<const SiteContainerInterface&>(siteSet);
191  return shrinkSiteSet(sc);
192  }
193  catch (std::bad_cast& e)
194  {}
195 
196  try
197  {
198  const auto& psc = dynamic_cast<const ProbabilisticSiteContainerInterface&>(siteSet);
199  return shrinkSiteSet(psc);
200  }
201  catch (std::bad_cast& e)
202  {}
203 
204  throw Exception("PatternTools::shrinkSiteSet : unsupported sequence type.");
205 }
206 
207 std::unique_ptr<SiteContainerInterface> PatternTools::shrinkSiteSet(
208  const SiteContainerInterface& siteSet)
209 {
210  auto alphabet = siteSet.getAlphabet();
211 
212  if (siteSet.getNumberOfSites() == 0)
213  throw Exception("PatternTools::shrinkSiteSet siteSet is void.");
214 
215  vector<std::unique_ptr<Site>> sites;
216 
217  for (unsigned int i = 0; i < siteSet.getNumberOfSites(); ++i)
218  {
219  const auto& currentSite = siteSet.site(i);
220  bool siteExists = false;
221  for (unsigned int j = 0; !siteExists && j < sites.size(); ++j)
222  {
223  if (SiteTools::areSymbolListsIdentical(currentSite, *sites[j]))
224  siteExists = true;
225  }
226  if (!siteExists)
227  sites.push_back(make_unique<Site>(currentSite));
228  }
229  auto result = make_unique<VectorSiteContainer>(sites, alphabet, false);
230  result->setSequenceNames(siteSet.getSequenceNames(), true); // Update keys too
231  return result;
232 }
233 
234 std::unique_ptr<ProbabilisticSiteContainerInterface> PatternTools::shrinkSiteSet(
236 {
237  auto alphabet = siteSet.getAlphabet();
238 
239  if (siteSet.getNumberOfSites() == 0)
240  throw Exception("PatternTools::shrinkSiteSet siteSet is void.");
241 
242  vector<unique_ptr<ProbabilisticSite>> sites;
243 
244  for (unsigned int i = 0; i < siteSet.getNumberOfSites(); ++i)
245  {
246  const auto& currentSite = siteSet.site(i);
247  bool siteExists = false;
248  for (unsigned int j = 0; !siteExists && j < sites.size(); ++j)
249  {
250  if (SiteTools::areSymbolListsIdentical(currentSite, *sites[j]))
251  siteExists = true;
252  }
253  if (!siteExists)
254  sites.push_back(make_unique<ProbabilisticSite>(currentSite));
255  }
256  auto result = make_unique<ProbabilisticVectorSiteContainer>(sites, alphabet, false);
257  result->setSequenceNames(siteSet.getSequenceNames(), false);
258  return result;
259 }
260 
261 /******************************************************************************/
262 
264  const AlignmentDataInterface& sequences1,
265  const AlignmentDataInterface& sequences2)
266 {
267  size_t nbSites = sequences1.getNumberOfSites();
268  Vint indexes(nbSites);
269 
270  try
271  {
272  const auto& sc1 = dynamic_cast<const SiteContainerInterface&>(sequences1);
273  const auto& sc2 = dynamic_cast<const SiteContainerInterface&>(sequences2);
274 
275  for (size_t i = 0; i < nbSites; ++i)
276  {
277  // For each site in sequences1,
278  indexes[i] = -1;
279  const auto& site1 = dynamic_cast<const Site&>(sc1.site(i));
280  for (size_t j = 0; j < sequences2.getNumberOfSites(); ++j)
281  {
282  const auto& site2 = dynamic_cast<const Site&>(sc2.site(i));
283  if (SiteTools::areSymbolListsIdentical(site1, site2))
284  {
285  indexes[i] = static_cast<int>(j);
286  break;
287  }
288  }
289  }
290  return indexes;
291  }
292  catch (std::bad_cast& e)
293  {}
294 
295  try
296  {
297  const auto& psc1 = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequences1);
298  const auto& psc2 = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequences2);
299 
300  for (size_t i = 0; i < nbSites; ++i)
301  {
302  // For each site in sequences1,
303  indexes[i] = -1;
304  const auto& site1 = dynamic_cast<const ProbabilisticSite&>(psc1.site(i));
305  for (size_t j = 0; j < sequences2.getNumberOfSites(); ++j)
306  {
307  const auto& site2 = dynamic_cast<const ProbabilisticSite&>(psc2.site(i));
308  if (SiteTools::areSymbolListsIdentical(site1, site2))
309  {
310  indexes[i] = static_cast<int>(j);
311  break;
312  }
313  }
314  }
315  return indexes;
316  }
317  catch (std::bad_cast& e)
318  {}
319 
320  throw Exception("PatternTools::shrinkSiteSet : unsupported sequence type.");
321 }
322 
323 /******************************************************************************/
static void displayWarning(const std::string &text)
The phylogenetic node class.
Definition: Node.h:59
static std::unique_ptr< AlignmentDataInterface > shrinkSiteSet(const AlignmentDataInterface &siteSet)
Compress a site container by removing duplicated sites.
static std::unique_ptr< AlignmentDataInterface > getSequenceSubset(const AlignmentDataInterface &sequenceSet, const std::shared_ptr< N > node, const AssociationTreeGraphImplObserver< N, E, I > &tree)
Extract the sequences corresponding to a given subtree.
Definition: PatternTools.h:44
static Vint getIndexes(const AlignmentDataInterface &sequences1, const AlignmentDataInterface &sequences2)
Look for the occurrence of each site in sequences1 in sequences2 and send the position of the first o...
static bool areSymbolListsIdentical(const IntSymbolListInterface &list1, const IntSymbolListInterface &list2)
static void changeGapsToUnknownCharacters(IntSymbolListInterface &l)
virtual size_t getNumberOfSites() const=0
virtual bool hasSequence(const std::string &sequenceKey) const =0
virtual std::vector< std::string > getSequenceNames() const =0
virtual const CoreSequenceInterface & sequence(const std::string &sequenceKey) const =0
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
virtual const Site & site(size_t sitePosition) const override=0
static std::vector< N * > getLeaves(N &node)
Retrieve all leaves from a subtree.
Defines the basic types of data flow nodes.
std::vector< int > Vint