bpp-phyl3 3.0.0
PatternTools.cpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: The Bio++ Development Group
2//
3// SPDX-License-Identifier: CECILL-2.1
4
5#include "PatternTools.h"
6
7// From bpp-seq:
8#include <Bpp/Seq/SiteTools.h>
9
10using namespace bpp;
11
12// From the STL:
13#include <iostream>
14#include <algorithm>
15
16using namespace std;
17
18/******************************************************************************/
19
20std::unique_ptr<AlignmentDataInterface> PatternTools::getSequenceSubset(
21 const AlignmentDataInterface& sequenceSet,
22 const Node& node)
23{
24 try
25 {
26 const auto& siteContainer = dynamic_cast<const SiteContainerInterface&>(sequenceSet);
27 return getSequenceSubset(siteContainer, node);
28 }
29 catch (std::bad_cast& e)
30 {}
31
32 try
33 {
34 const auto& siteContainer = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequenceSet);
35 return getSequenceSubset(siteContainer, node);
36 }
37 catch (std::bad_cast& e)
38 {}
39
40 throw Exception("PatternTools::getSequenceSubset : unsupported sequence type.");
41}
42
43std::unique_ptr<SiteContainerInterface> PatternTools::getSequenceSubset(
44 const SiteContainerInterface& sequenceSet,
45 const Node& node)
46{
47 auto alphabet = sequenceSet.getAlphabet();
48 size_t nbSites = sequenceSet.getNumberOfSites();
49 auto sequenceSubset = std::make_unique<VectorSiteContainer>(alphabet);
50
51 auto leaves = TreeTemplateTools::getLeaves(node);
52
53 for (auto i : leaves)
54 {
55 if (i->hasName())
56 {
57 // Use sequence name as key.
58 try
59 {
60 auto newSeq = std::make_unique<Sequence>(sequenceSet.sequence(i->getName()));
61 sequenceSubset->addSequence(i->getName(), newSeq);
62 }
63 catch (std::exception& e)
64 {
65 ApplicationTools::displayWarning("PatternTools::getSequenceSubset : Leaf name not found in sequence file: " + i->getName() + " : Replaced with unknown sequence");
66
67 auto seq = std::make_unique<Sequence>(i->getName(), "", alphabet);
68 seq->setToSizeR(nbSites);
70 sequenceSubset->addSequence(i->getName(), seq);
71 }
72 }
73 }
74 sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
75 return sequenceSubset;
76}
77
78std::unique_ptr<ProbabilisticSiteContainerInterface> PatternTools::getSequenceSubset(
79 const ProbabilisticSiteContainerInterface& sequenceSet,
80 const Node& node)
81{
82 auto alphabet = sequenceSet.getAlphabet();
83 size_t nbSites = sequenceSet.getNumberOfSites();
84 auto sequenceSubset = std::make_unique<ProbabilisticVectorSiteContainer>(alphabet);
85
86 auto leaves = TreeTemplateTools::getLeaves(node);
87
88 for (auto i : leaves)
89 {
90 if (i->hasName())
91 {
92 // Use sequence name as key.
93 try
94 {
95 auto newSeq = std::make_unique<ProbabilisticSequence>(sequenceSet.sequence(i->getName()));
96 sequenceSubset->addSequence(i->getName(), newSeq);
97 }
98 catch (std::exception const& e)
99 {
100 ApplicationTools::displayWarning("PatternTools::getSequenceSubset : Leaf name not found in sequence file: " + i->getName() + " : Replaced with unknown sequence");
101
102 auto newSeq = std::make_unique<ProbabilisticSequence>(i->getName(), Table<double>(alphabet->getSize(), 0), alphabet);
103 newSeq->setToSizeR(nbSites);
105 sequenceSubset->addSequence(i->getName(), newSeq);
106 }
107 }
108 }
109 sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
110 return sequenceSubset;
111}
112
113/******************************************************************************/
114
115std::unique_ptr<AlignmentDataInterface> PatternTools::getSequenceSubset(
116 const AlignmentDataInterface& sequenceSet,
117 const std::vector<std::string>& names)
118{
119 auto alphabet = sequenceSet.getAlphabet();
120
121 try
122 {
123 const auto& siteContainer = dynamic_cast<const SiteContainerInterface&>(sequenceSet);
124 return getSequenceSubset(siteContainer, names);
125 }
126 catch (std::bad_cast& e)
127 {}
128
129 try
130 {
131 const auto& siteContainer = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequenceSet);
132 return getSequenceSubset(siteContainer, names);
133 }
134 catch (std::bad_cast& e)
135 {}
136
137 throw Exception("PatternTools::getSequenceSubset : unsupported sequence type.");
138}
139
140
141std::unique_ptr<SiteContainerInterface> PatternTools::getSequenceSubset(
142 const SiteContainerInterface& sequenceSet,
143 const std::vector<std::string>& names)
144{
145 auto alphabet = sequenceSet.getAlphabet();
146 auto sequenceSubset = std::make_unique<VectorSiteContainer>(alphabet);
147
148 for (auto& i : names)
149 {
150 if (sequenceSet.hasSequence(i))
151 {
152 auto newSeq = std::make_unique<Sequence>(sequenceSet.sequence(i));
153 sequenceSubset->addSequence(i, newSeq);
154 }
155 else
156 throw SequenceNotFoundException("PatternTools ERROR: name not found in sequence container: ", i);
157 }
158 sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
159 return std::move(sequenceSubset);
160}
161
162std::unique_ptr<ProbabilisticSiteContainerInterface> PatternTools::getSequenceSubset(
163 const ProbabilisticSiteContainerInterface& sequenceSet,
164 const std::vector<std::string>& names)
165{
166 auto alphabet = sequenceSet.getAlphabet();
167 auto sequenceSubset = std::make_unique<ProbabilisticVectorSiteContainer>(alphabet);
168
169 for (auto& i : names)
170 {
171 if (sequenceSet.hasSequence(i))
172 {
173 auto newSeq = std::make_unique<ProbabilisticSequence>(sequenceSet.sequence(i));
174 sequenceSubset->addSequence(i, newSeq);
175 }
176 else
177 throw SequenceNotFoundException("PatternTools ERROR: name not found in sequence container: ", i);
178 }
179 sequenceSubset->setSiteCoordinates(sequenceSet.getSiteCoordinates());
180 return std::move(sequenceSubset);
181}
182
183/******************************************************************************/
184
185std::unique_ptr<AlignmentDataInterface> PatternTools::shrinkSiteSet(
186 const AlignmentDataInterface& siteSet)
187{
188 try
189 {
190 const auto& sc = dynamic_cast<const SiteContainerInterface&>(siteSet);
191 return shrinkSiteSet(sc);
192 }
193 catch (std::bad_cast& e)
194 {}
195
196 try
197 {
198 const auto& psc = dynamic_cast<const ProbabilisticSiteContainerInterface&>(siteSet);
199 return shrinkSiteSet(psc);
200 }
201 catch (std::bad_cast& e)
202 {}
203
204 throw Exception("PatternTools::shrinkSiteSet : unsupported sequence type.");
205}
206
207std::unique_ptr<SiteContainerInterface> PatternTools::shrinkSiteSet(
208 const SiteContainerInterface& siteSet)
209{
210 auto alphabet = siteSet.getAlphabet();
211
212 if (siteSet.getNumberOfSites() == 0)
213 throw Exception("PatternTools::shrinkSiteSet siteSet is void.");
214
215 vector<std::unique_ptr<Site>> sites;
216
217 for (unsigned int i = 0; i < siteSet.getNumberOfSites(); ++i)
218 {
219 const auto& currentSite = siteSet.site(i);
220 bool siteExists = false;
221 for (unsigned int j = 0; !siteExists && j < sites.size(); ++j)
222 {
223 if (SiteTools::areSymbolListsIdentical(currentSite, *sites[j]))
224 siteExists = true;
225 }
226 if (!siteExists)
227 sites.push_back(make_unique<Site>(currentSite));
228 }
229 auto result = make_unique<VectorSiteContainer>(sites, alphabet, false);
230 result->setSequenceNames(siteSet.getSequenceNames(), true); // Update keys too
231 return result;
232}
233
234std::unique_ptr<ProbabilisticSiteContainerInterface> PatternTools::shrinkSiteSet(
236{
237 auto alphabet = siteSet.getAlphabet();
238
239 if (siteSet.getNumberOfSites() == 0)
240 throw Exception("PatternTools::shrinkSiteSet siteSet is void.");
241
242 vector<unique_ptr<ProbabilisticSite>> sites;
243
244 for (unsigned int i = 0; i < siteSet.getNumberOfSites(); ++i)
245 {
246 const auto& currentSite = siteSet.site(i);
247 bool siteExists = false;
248 for (unsigned int j = 0; !siteExists && j < sites.size(); ++j)
249 {
250 if (SiteTools::areSymbolListsIdentical(currentSite, *sites[j]))
251 siteExists = true;
252 }
253 if (!siteExists)
254 sites.push_back(make_unique<ProbabilisticSite>(currentSite));
255 }
256 auto result = make_unique<ProbabilisticVectorSiteContainer>(sites, alphabet, false);
257 result->setSequenceNames(siteSet.getSequenceNames(), false);
258 return result;
259}
260
261/******************************************************************************/
262
264 const AlignmentDataInterface& sequences1,
265 const AlignmentDataInterface& sequences2)
266{
267 size_t nbSites = sequences1.getNumberOfSites();
268 Vint indexes(nbSites);
269
270 try
271 {
272 const auto& sc1 = dynamic_cast<const SiteContainerInterface&>(sequences1);
273 const auto& sc2 = dynamic_cast<const SiteContainerInterface&>(sequences2);
274
275 for (size_t i = 0; i < nbSites; ++i)
276 {
277 // For each site in sequences1,
278 indexes[i] = -1;
279 const auto& site1 = dynamic_cast<const Site&>(sc1.site(i));
280 for (size_t j = 0; j < sequences2.getNumberOfSites(); ++j)
281 {
282 const auto& site2 = dynamic_cast<const Site&>(sc2.site(i));
283 if (SiteTools::areSymbolListsIdentical(site1, site2))
284 {
285 indexes[i] = static_cast<int>(j);
286 break;
287 }
288 }
289 }
290 return indexes;
291 }
292 catch (std::bad_cast& e)
293 {}
294
295 try
296 {
297 const auto& psc1 = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequences1);
298 const auto& psc2 = dynamic_cast<const ProbabilisticSiteContainerInterface&>(sequences2);
299
300 for (size_t i = 0; i < nbSites; ++i)
301 {
302 // For each site in sequences1,
303 indexes[i] = -1;
304 const auto& site1 = dynamic_cast<const ProbabilisticSite&>(psc1.site(i));
305 for (size_t j = 0; j < sequences2.getNumberOfSites(); ++j)
306 {
307 const auto& site2 = dynamic_cast<const ProbabilisticSite&>(psc2.site(i));
308 if (SiteTools::areSymbolListsIdentical(site1, site2))
309 {
310 indexes[i] = static_cast<int>(j);
311 break;
312 }
313 }
314 }
315 return indexes;
316 }
317 catch (std::bad_cast& e)
318 {}
319
320 throw Exception("PatternTools::shrinkSiteSet : unsupported sequence type.");
321}
322
323/******************************************************************************/
static void displayWarning(const std::string &text)
The phylogenetic node class.
Definition: Node.h:59
static std::unique_ptr< AlignmentDataInterface > shrinkSiteSet(const AlignmentDataInterface &siteSet)
Compress a site container by removing duplicated sites.
static std::unique_ptr< AlignmentDataInterface > getSequenceSubset(const AlignmentDataInterface &sequenceSet, const std::shared_ptr< N > node, const AssociationTreeGraphImplObserver< N, E, I > &tree)
Extract the sequences corresponding to a given subtree.
Definition: PatternTools.h:44
static Vint getIndexes(const AlignmentDataInterface &sequences1, const AlignmentDataInterface &sequences2)
Look for the occurrence of each site in sequences1 in sequences2 and send the position of the first o...
static bool areSymbolListsIdentical(const IntSymbolListInterface &list1, const IntSymbolListInterface &list2)
static void changeGapsToUnknownCharacters(IntSymbolListInterface &l)
virtual size_t getNumberOfSites() const=0
virtual bool hasSequence(const std::string &sequenceKey) const =0
virtual std::vector< std::string > getSequenceNames() const =0
virtual const CoreSequenceInterface & sequence(const std::string &sequenceKey) const =0
virtual std::shared_ptr< const Alphabet > getAlphabet() const =0
virtual const Site & site(size_t sitePosition) const override=0
static std::vector< N * > getLeaves(N &node)
Retrieve all leaves from a subtree.
Defines the basic types of data flow nodes.
std::vector< int > Vint