21 missing_data_symbol_(
'$') {}
24 const std::string& data_separator) :
26 missing_data_symbol_(
'$')
43 if (missing_data_symbol.size() != 1 || isdigit(missing_data_symbol[0])
47 throw Exception(
"PopgenlibIO::setMissingData: not expected value for missing_data_symbol.");
56 else if (data_separator ==
TAB)
58 else if (data_separator ==
COMA)
64 if (isdigit(data_separator[0])
67 throw Exception(
"PopgenlibIO::setDataSeparator: not expected value for data_separator.");
82 case (
'\t'):
return TAB;
83 case (
','):
return COMA;
102 throw IOException(
"PopgenlibIO::read: fail to open stream.");
104 vector<string> temp_v;
108 vector<LocusInfo> tmp_locinf;
110 bool section1 =
true;
111 bool section2 =
true;
112 bool section3 =
true;
113 bool section4 =
true;
114 bool section5 =
true;
115 size_t current_section = 0;
116 size_t previous_section = 0;
124 if (temp.find(
"[General]", 0) != string::npos)
126 previous_section = current_section;
130 else if (temp.find(
"[Localities]", 0) != string::npos)
132 previous_section = current_section;
136 else if (temp.find(
"[Sequences]", 0) != string::npos)
138 previous_section = current_section;
142 else if (temp.find(
"[Loci]", 0) != string::npos)
144 previous_section = current_section;
148 else if (temp.find(
"[Individuals]", 0) != string::npos)
150 previous_section = current_section;
155 if (current_section == 1 && previous_section < 1)
157 temp_v.push_back(temp);
159 if (section1 && current_section != 1 && previous_section == 1)
169 if (current_section == 2 && previous_section < 2)
171 if (temp.find(
">", 0) != string::npos)
175 temp_v.push_back(temp);
178 temp_v.push_back(temp);
180 if (section2 && current_section != 2 && previous_section == 2)
188 if (current_section == 3 && previous_section < 3)
190 if (temp.find(
">", 0) != string::npos)
194 temp_v.push_back(temp);
197 temp_v.push_back(temp);
199 if (section3 && current_section != 3 && previous_section == 3)
207 if (current_section == 4 && previous_section < 4)
209 if (temp.find(
">", 0) != string::npos)
213 temp_v.push_back(temp);
216 temp_v.push_back(temp);
218 if (section4 && current_section != 4 && previous_section == 4)
224 for (
size_t i = 0; i < tmp_locinf.size(); i++)
232 if (current_section == 5 && previous_section < 5)
234 if (temp.find(
">", 0) != string::npos)
238 temp_v.push_back(temp);
241 temp_v.push_back(temp);
243 if (section5 && current_section != 5 && previous_section == 5)
251 if (section2 && current_section == 2)
253 if (section3 && current_section == 3)
255 if (section5 && current_section == 5)
263 for (
size_t i = 0; i < in.size(); i++)
268 while (!is.eof() && in.size() != 0)
271 if (temp.find(
"MissingData", 0) != string::npos)
273 if (temp.find(
"DataSeparator", 0) != string::npos)
275 if (temp.find(
"SequenceType", 0) != string::npos)
283 for (
size_t i = 0; i < in.size(); i++)
289 while (!is.eof() && in.size() != 0)
293 if (temp.find(
">", 0) != string::npos)
297 if (temp.find(
"Coord", 0) != string::npos)
304 if (tmp_locality.
getName() !=
"")
312 for (
size_t i = 0; i < in.size(); i++)
322 for (
size_t i = 0; i < in.size(); i++)
326 string locinf_name =
"";
332 if (temp.find(
">", 0) != string::npos)
336 if (temp.find(
"Ploidy", 0) != string::npos)
344 else if (tmp_str_ploidy ==
HAPLOID)
348 else if (tmp_str_ploidy ==
UNKNOWN)
351 if (temp.find(
"NbAlleles", 0) != string::npos)
356 if (locinf_name !=
"")
357 locus_info.push_back(
LocusInfo(locinf_name, locinf_ploidy));
363 size_t tmp_group_pos = 0;
365 for (
size_t i = 0; i < in.size(); i++)
368 if (in[i].find(
">", 0) != string::npos)
373 if (in[i].find(
"Group", 0) != string::npos)
376 tmp_group_pos = TextTools::to<size_t>(
getValues_(temp,
"=")[0]);
385 if (in[i].find(
"Locality", 0) != string::npos)
388 size_t sep_pos = temp.find(
"=", 0);
398 if (in[i].find(
"Coord", 0) != string::npos)
404 if (in[i].find(
"Date", 0) != string::npos)
415 if (in[i].find(
"SequenceData", 0) != string::npos)
419 vector<string> seq_pos_str =
getValues_(temp,
"");
420 for (
size_t j = 0; j < seq_pos_str.size(); ++j)
426 auto tmpSeq = unique_ptr<Sequence>(vsc.
sequence(TextTools::to<size_t>(seq_pos_str[j]) - 1).clone());
435 if (in[i].find(
"AllelicData", 0) != string::npos)
437 string temp1 = in[++i];
438 string temp2 = in[++i];
439 vector<string> allele_pos_str1 =
getValues_(temp1,
"");
440 vector<string> allele_pos_str2 =
getValues_(temp2,
"");
447 if (allele_pos_str1.size() == allele_pos_str2.size())
449 for (
size_t j = 0; j < allele_pos_str1.size(); j++)
453 vector<string> tmp_alleles_id;
463 tmp_alleles_id.push_back(allele_pos_str1[j]);
475 tmp_alleles_id.push_back(allele_pos_str2[j]);
487 if (tmpIndiv.
getId() !=
"")
517 os <<
"[General]" << endl;
523 os <<
"SequenceType = " << seq_type << endl;
528 os << endl <<
"[Localities]" << endl;
541 os << endl <<
"[Sequences]" << endl;
554 os << endl <<
"[Loci]" << endl;
558 os <<
">" << tmp_locus_info.
getName() << endl;
574 os << endl <<
"[Individuals]" << endl;
582 os <<
">" << tmpInd.
getId() << endl;
584 if (tmpInd.hasLocality())
585 os <<
"Locality = " << tmpInd.locality().getName() << endl;
586 if (tmpInd.hasCoord())
587 os <<
"Coord = " << tmpInd.getX() <<
" " << tmpInd.getY() << endl;
588 if (tmpInd.hasDate())
589 os <<
"Date = " << tmpInd.date().getDateStr() << endl;
590 if (tmpInd.hasSequences())
592 size_t nbss = tmpInd.getNumberOfSequences();
593 os <<
"SequenceData = {" << endl;
594 for (
size_t k = 0; k < nbss; k++)
598 tmpInd.sequenceAtPosition(k);
612 if (tmpInd.hasGenotype())
615 vector<vector<string>> output(tmp_genotype.
size());
616 os <<
"AllelicData = {" << endl;
617 for (
size_t k = 0; k < tmp_genotype.
size(); k++)
629 if (tmp_all_ind.size() > 1)
635 for (
size_t k = 0; k < output.size(); k++)
638 if (k < output.size() - 1)
643 for (
size_t k = 0; k < output.size(); k++)
646 if (k < output.size() - 1)
664 vector<string> values;
665 size_t limit = param_line.find(delim, 0);
666 if (limit != string::npos)
667 param_line = string(param_line.begin() +
static_cast<ptrdiff_t
>(limit + delim.size()), param_line.end());
674 values.push_back(
string(param_line.begin() +
static_cast<ptrdiff_t
>(bi), param_line.begin() +
static_cast<ptrdiff_t
>(bs)));
678 values.push_back(
string(param_line.begin() +
static_cast<ptrdiff_t
>(bi), param_line.end()));
virtual void read(std::istream &is, DataSet &data_set)=0
Read a DataSet on istream.
void readSequences(std::istream &input, SequenceContainerInterface &sc) const override
virtual void write(std::ostream &os, const DataSet &data_set) const =0
Write a DataSet on ostream.
virtual const std::string & getId() const =0
Get the identifier of the allele.
void setLocusInfo(size_t locusPosition, const LocusInfo &locus)
Set a LocusInfo.
The BasicAlleleInfo class.
const Group & getGroupAtPosition(size_t groupPosition) const
Get a group by position.
size_t getNumberOfGroups() const
Get the number of Groups.
std::string getAlphabetType() const
Get the alphabet type as a string.
bool hasAlleleicData() const
Tell if there is allelic data.
void addAlleleInfoByLocusPosition(size_t locus_position, const AlleleInfo &allele)
Add an AlleleInfo to a LocusInfo.
std::shared_ptr< const Alphabet > getAlphabet() const
Get a pointer to the alphabet if there is sequence data.
const Individual & getIndividualAtPositionFromGroup(size_t groupPosition, size_t individualPosition) const
Get an Individual from a Group.
void setAnalyzedLoci(const AnalyzedLoci &analyzedLoci)
Set the AnalyzedLoci to the DataSet.
size_t getGroupPosition(size_t group_id) const
Get the position of a Group.
size_t getNumberOfIndividualsInGroup(size_t groupPosition) const
Get the number of Individuals in a Group.
bool hasSequenceData() const
Tell if at least one individual has at least one sequence.
void setAlphabet(std::shared_ptr< const Alphabet > alpha)
Set the alphabet of the AnalyzedSequences.
const Locality< double > & localityAtPosition(size_t localityPosition) const
Get a Locality by localityPosition.
size_t getNumberOfLocalities() const
Get the number of Localities.
std::shared_ptr< const Locality< double > > getLocalityByName(const std::string &name) const
Get a Locality by name.
const LocusInfo & getLocusInfoAtPosition(size_t locus_position) const
Get a LocusInfo by its position.
size_t getNumberOfLoci() const
Get the number of loci.
void addIndividualToGroup(size_t groupPosition, const Individual &individual)
Add an Individual to a Group.
bool hasLocality() const
Tell if there is at least one locality.
void addEmptyGroup(size_t group_id)
Add an empty Group to the DataSet.
void addLocality(const Locality< double > &locality)
Add a locality to the DataSet.
void writeSequences(std::ostream &output, const SequenceContainerInterface &sc) const override
void addSequence(size_t sequenceKey, std::unique_ptr< Sequence > &sequence)
Add a sequence to the Individual.
const std::string & getId() const
Get the id of the Individual.
void setCoord(const Point2D< double > &coord)
Set the coordinates of the Individual.
void setId(const std::string &id)
Set the id of the Individual.
void setLocality(std::shared_ptr< const Locality< double >> locality)
Set the locality of the Individual.
void setDate(const Date &date)
Set the date of the Individual.
void initGenotype(size_t lociNumber)
Init the genotype.
const SequenceContainerInterface & sequences() const
Get a reference to the sequence container.
void setMonolocusGenotypeByAlleleId(size_t locusPosition, const std::vector< std::string > alleleId, const LocusInfo &locusInfo)
Set a MonolocusGenotype.
const std::string & getName() const
Get the name of the locality.
void setName(const std::string &name)
Set the name of the locality.
static unsigned int UNKNOWN
unsigned int getPloidy() const
Get the ploidy of the locus.
size_t getNumberOfAlleles() const
Get the number of alleles at this locus.
static unsigned int DIPLOID
const AlleleInfo & getAlleleInfoByKey(size_t key) const
Retrieve an AlleleInfo object of the LocusInfo.
static unsigned int HAPLOID
static unsigned int HAPLODIPLOID
const std::string & getName() const
Get the name of the locus.
virtual std::vector< size_t > getAlleleIndex() const =0
Get the alleles' index.
The MultilocusGenotype class.
const MonolocusGenotypeInterface & monolocusGenotype(size_t locusPosition) const
Get a MonolocusGenotype.
bool isMonolocusGenotypeMissing(size_t locusPosition) const
Tell if a MonolocusGenotype is a missing data.
size_t size() const
Count the number of loci.
static const std::string UNKNOWN
void parseIndividual_(const std::vector< std::string > &in, DataSet &data_set, const VectorSequenceContainer &vsc)
static const std::string HAPLODIPLOID
void parseLocality_(const std::vector< std::string > &in, DataSet &data_set)
static const std::string COMA
void write(std::ostream &os, const DataSet &data_set) const
Write a DataSet on ostream.
char missing_data_symbol_
static const std::string TAB
static const std::string SEMICOLON
void read(std::istream &is, DataSet &data_set)
Read a DataSet on istream.
std::string getDataSeparator() const
Get the code for data separator.
static const std::string DIPLOID
std::string getMissingDataSymbol() const
Get the code for missing data.
static const std::string WHITESPACE
std::vector< std::string > getValues_(std::string &param_line, const std::string &delim)
char getMissingDataChar() const
Get the character for missing data.
void parseLoci_(const std::vector< std::string > &in, std::vector< LocusInfo > &locus_info)
void setDataSeparator(const std::string &data_separator)
Set the code for data separator.
static const std::string HAPLOID
void parseGeneral_(const std::vector< std::string > &in, DataSet &data_set)
void setMissingDataSymbol(const std::string &missing_data_symbol)
Set the code for missing data.
void parseSequence_(const std::vector< std::string > &in, VectorSequenceContainer &vsc)
char getDataSeparatorChar() const
Get the data separator char.
const SequenceType & sequence(const std::string &sequenceKey) const override
int toInt(const std::string &s, char scientificNotation='e')
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
std::string removeSurroundingWhiteSpaces(const std::string &s)
std::string toUpper(const std::string &s)
bool isWhiteSpaceCharacter(char c)
std::string toString(T t)
TemplateVectorSequenceContainer< Sequence > VectorSequenceContainer