bpp-seq3  3.0.0
NexusIoSequence.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include <Bpp/Io/FileTools.h>
6 #include <Bpp/Text/KeyvalTools.h>
7 #include <Bpp/Text/TextTools.h>
8 
9 #include "../Alphabet/AlphabetTools.h"
10 #include "../Container/SiteContainerTools.h"
11 #include "NexusIoSequence.h"
12 #include "NexusTools.h"
13 
14 using namespace bpp;
15 
16 // From the STL:
17 #include <sstream>
18 
19 using namespace std;
20 
21 /******************************************************************************/
22 
23 const std::vector<std::string> NexusIOSequence::splitNameAndSequence_(const std::string& s) const
24 {
25  vector<string> v(2);
26  string::size_type index = s.find(" ");
27  if (index == string::npos)
28  throw Exception("NexusIOSequence::splitNameAndSequence_(). No sequence name found.");
29  v[0] = TextTools::removeSurroundingWhiteSpaces(s.substr(0, index));
30  v[1] = TextTools::removeFirstWhiteSpaces(s.substr(index + 1));
31  return v;
32 }
33 
34 
35 /******************************************************************************/
36 
38 {
39  // Checking the existence of specified file
40  if (!input)
41  {
42  throw IOException ("NexusIOSequence::read(). Fail to open file");
43  }
44 
45  // Look for the DATA block:
46  string line = "";
47  while (TextTools::toUpper(line) != "BEGIN DATA;")
48  {
49  if (input.eof())
50  throw Exception("NexusIOSequence::appendFromStream(). No data block was found.");
52  }
53 
54  // Look for the DIMENSIONS command:
55  string cmdName = "", cmdArgs = "";
56  while (cmdName != "DIMENSIONS")
57  {
58  if (input.eof())
59  throw Exception("NexusIOSequence::appendFromStream(). No DIMENSIONS command was found.");
60  NexusTools::getNextCommand(input, cmdName, cmdArgs);
61  cmdName = TextTools::toUpper(cmdName);
62  }
63  map<string, string> args;
64  KeyvalTools::multipleKeyvals(cmdArgs, args, " ");
65  map<string, string> argsUp;
66  for (map<string, string>::iterator it = args.begin(); it != args.end(); it++)
67  {
68  argsUp[TextTools::toUpper(it->first)] = it->second;
69  }
70  if (argsUp["NTAX"] == "")
71  throw Exception("NexusIOSequence::appendFromStream(). DIMENSIONS command does not have a NTAX argument.");
72  size_t ntax = TextTools::to<size_t>(argsUp["NTAX"]);
73 
74  // Look for the FORMAT command:
75  while (cmdName != "FORMAT")
76  {
77  if (input.eof())
78  throw Exception("NexusIOSequence::appendFromStream(). No FORMAT command was found.");
79  NexusTools::getNextCommand(input, cmdName, cmdArgs);
80  cmdName = TextTools::toUpper(cmdName);
81  }
82  if (TextTools::hasSubstring(cmdArgs, "TRANSPOSE"))
83  throw Exception("NexusIOSequence::appendFromStream(). TRANSPOSE option is not supported.");
84 
85  // Check if the alignment is dotted or not:
86  bool matchChar = TextTools::hasSubstring(TextTools::toUpper(cmdArgs), "MATCHCHAR");
87 
88  auto alphaPtr = std::dynamic_pointer_cast<const Alphabet>(AlphabetTools::DEFAULT_ALPHABET);
89  auto alignment = make_unique<AlignedSequenceContainer>(alphaPtr);
90 
91  // Look for the MATRIX command:
92  line = "";
93  while (!TextTools::startsWith(TextTools::toUpper(line), "MATRIX"))
94  {
95  if (input.eof())
96  throw Exception("NexusIOSequence::appendFromStream(). No MATRIX command was found.");
98  }
99  line = FileTools::getNextLine(input);
100 
101  vector<string> names, seqs;
102  // Read first block:
103  bool commandFinished = false;
104  for (size_t i = 0; i < ntax && !input.eof(); ++i)
105  {
106  if (TextTools::endsWith(line, ";"))
107  {
108  if (i < ntax - 1)
109  throw IOException("NexusIOSequence::appendFromStream. Early end of MATRIX command, some sequences are missing.");
110  else
111  {
112  commandFinished = true;
113  line = line.substr(0, line.size() - 1); // Remove trailing semi-colon.
114  }
115  }
116  vector<string> v = splitNameAndSequence_(line);
117  names.push_back(v[0]);
118  seqs.push_back(v[1]);
119  line = FileTools::getNextLine(input);
120  }
121 
122  // Then read all other blocks:
123  commandFinished = TextTools::removeSurroundingWhiteSpaces(line) == ";"; // In case the end of command is on a separate line.
124  while (!commandFinished)
125  {
126  for (size_t i = 0; i < ntax && !input.eof(); ++i)
127  {
128  if (TextTools::endsWith(line, ";"))
129  {
130  if (i < ntax - 1)
131  throw IOException("NexusIOSequence::appendFromStream. Early end of MATRIX command, some sequences are missing.");
132  else
133  {
134  commandFinished = true;
135  line = line.substr(0, line.size() - 1); // Remove trailing semi-colon.
136  }
137  }
138 
139  vector<string> v = splitNameAndSequence_(line);
140  if (v[0] != names[i])
141  throw IOException("NexusIOSequence::appendFromStream. Bad file, the sequences are not in the same order in interleaved blocks, or one taxon is missing.");
142  seqs[i] += v[1];
143  line = FileTools::getNextLine(input);
144  commandFinished = TextTools::removeSurroundingWhiteSpaces(line) == ";"; // In case the end of command is on a separate line.
145  }
146  }
147  for (size_t i = 0; i < names.size(); ++i)
148  {
149  auto seqPtr = make_unique<Sequence>(names[i], seqs[i], alphaPtr);
150  alignment->addSequence(seqPtr->getName(), seqPtr);
151  }
152 
153  if (matchChar)
154  {
155  // Now we resolve the alignment:
156  auto resolvedAlignment =
157  SiteContainerTools::resolveDottedAlignment(*alignment, alphaPtr);
158  for (size_t i = 0; i < resolvedAlignment->getNumberOfSequences(); ++i)
159  {
160  auto seqPtr = unique_ptr<Sequence>(resolvedAlignment->sequence(i).clone());
161  vsc.addSequence(seqPtr->getName(), seqPtr);
162  }
163  }
164  else
165  {
166  for (size_t i = 0; i < alignment->getNumberOfSequences(); ++i)
167  {
168  auto seqPtr = unique_ptr<Sequence>(alignment->sequence(i).clone());
169  vsc.addSequence(seqPtr->getName(), seqPtr);
170  }
171  }
172 }
173 
174 /******************************************************************************/
175 
176 const std::string NexusIOSequence::getFormatName() const { return "Nexus"; }
177 
178 /******************************************************************************/
179 
180 const std::string NexusIOSequence::getFormatDescription() const
181 {
182  return "Nexus file format.";
183 }
184 
185 /******************************************************************************/
static std::shared_ptr< const DefaultAlphabet > DEFAULT_ALPHABET
Definition: AlphabetTools.h:39
static std::string getNextLine(std::istream &in)
static void multipleKeyvals(const std::string &desc, std::map< std::string, std::string > &keyvals, const std::string &split=",", bool nested=true)
void appendAlignmentFromStream(std::istream &input, SequenceContainerInterface &sc) const override
Append sequences to a container from a stream.
const std::vector< std::string > splitNameAndSequence_(const std::string &s) const
const std::string getFormatDescription() const override
const std::string getFormatName() const override
static bool getNextCommand(std::istream &input, std::string &name, std::string &arguments, bool lineBrk=true)
parse the next command name within a block.
Definition: NexusTools.cpp:38
static std::unique_ptr< SiteContainerInterface > resolveDottedAlignment(const SiteContainerInterface &dottedAln, std::shared_ptr< const Alphabet > &resolvedAlphabet)
Resolve a container with "." notations.
The SequenceContainer interface.
virtual void addSequence(const HashType &sequenceKey, std::unique_ptr< SequenceType > &sequencePtr)=0
Add a sequence to the container.
std::string removeSurroundingWhiteSpaces(const std::string &s)
std::string toUpper(const std::string &s)
bool hasSubstring(const std::string &s, const std::string &pattern)
bool startsWith(const std::string &s, const std::string &pattern)
bool endsWith(const std::string &s, const std::string &pattern)
std::string removeFirstWhiteSpaces(const std::string &s)
This alphabet is used to deal with NumericAlphabet.