bpp-popgen3  3.0.0
GeneMapperCsvExport.cpp
Go to the documentation of this file.
1 // SPDX-FileCopyrightText: The Bio++ Development Group
2 //
3 // SPDX-License-Identifier: CECILL-2.1
4 
5 #include "GeneMapperCsvExport.h"
6 
7 using namespace bpp;
8 using namespace std;
9 
10 const std::string GeneMapperCsvExport::SAMPLE_FILE_H = "Sample File";
11 const std::string GeneMapperCsvExport::SAMPLE_NAME_H = "Sample Name";
12 const std::string GeneMapperCsvExport::PANEL_H = "Panel";
13 const std::string GeneMapperCsvExport::MARKER_H = "Marker";
14 const std::string GeneMapperCsvExport::DYE_H = "Dye";
15 const std::string GeneMapperCsvExport::ALLELE_H = "Allele ";
16 const std::string GeneMapperCsvExport::SIZE_H = "Size ";
17 const std::string GeneMapperCsvExport::HEIGHT_H = "Height ";
18 const std::string GeneMapperCsvExport::PEAK_AREA_H = "Peak Area ";
19 const std::string GeneMapperCsvExport::DAC_H = "DAC";
20 const std::string GeneMapperCsvExport::AN_H = "AN";
21 
22 // GeneMapperCsvExport::GeneMapperCsvExport(bool ia) : IndependentAlleles_(ia) {}
23 
25 
26 void GeneMapperCsvExport::read(std::istream& is, DataSet& dataset)
27 {
28  if (!is)
29  throw IOException("GeneMapperCsvExport::read: fail to open stream.");
30 
31  /*
32  * Feed a DataTable with the data
33  */
34  auto dtp = DataTable::read(is, "\t", true, -1);
35  DataTable& dt = *dtp;
36 
37  /*
38  * Fixes the individuals' name if there is duplicate in the file
39  */
40  vector<string> ind_names;
41  vector<string> markers;
42  try
43  {
44  ind_names = dt.getColumn(SAMPLE_NAME_H);
45  markers = dt.getColumn(MARKER_H);
46  }
47  catch (Exception& e)
48  {
49  throw e;
50  }
51  map<string, int> indname_marker;
52  for (size_t i = 0; i < dt.getNumberOfRows(); i++)
53  {
54  string test_lab = dt(i, SAMPLE_NAME_H) + dt(i, MARKER_H);
55  if (indname_marker.find(test_lab) != indname_marker.end())
56  {
57  string new_lab = dt(i, SAMPLE_NAME_H) + "_" + TextTools::toString(indname_marker[test_lab] + 1);
58  dt (i, SAMPLE_NAME_H) = new_lab;
59  }
60  indname_marker[test_lab]++;
61  }
62  ind_names = dt.getColumn(SAMPLE_NAME_H);
63 
64  map<string, size_t> ind_count = VectorTools::countValues(ind_names);
65  ind_names = VectorTools::unique(ind_names);
66  markers = VectorTools::unique(markers);
67  size_t loc_nbr = markers.size();
68 
69  /*
70  * Loci number
71  */
72  dataset.initAnalyzedLoci(loc_nbr);
73 
74  /*
75  * Group of individuals
76  */
77  dataset.addEmptyGroup(0);
78  for (unsigned int i = 0; i < ind_names.size(); i++)
79  {
80  Individual ind(ind_names[i]);
81  dataset.addIndividualToGroup(dataset.getGroupPosition(0), ind);
82  }
83 
84  /*
85  * Loci data
86  */
87  AnalyzedLoci al(markers.size());
88  vector<string> col_names = dt.getColumnNames();
89 
90  // Finds columns containing allele data
91  vector<size_t> alleles_cols;
92  for (size_t i = 0; i < col_names.size(); i++)
93  {
94  if (TextTools::startsWith(col_names[i], ALLELE_H))
95  alleles_cols.push_back(i);
96  }
97  // Set LocusInfo
98  vector<vector<size_t>> alleles_pos;
99  for (size_t i = 0; i < markers.size(); i++)
100  {
101  al.setLocusInfo(i, LocusInfo(markers[i], LocusInfo::UNKNOWN));
102  }
103  std::map< std::string, std::set< std::string >> markerAlleles;
104  for (size_t i = 0; i < dt.getNumberOfRows(); ++i)
105  {
106  for (size_t j = 0; j < alleles_cols.size(); ++j)
107  {
108  if (dt(i, alleles_cols[j]) != "")
109  {
110  markerAlleles[dt(i, MARKER_H)].insert(dt(i, alleles_cols[j]));
111  }
112  }
113  }
114  for (std::map< std::string, std::set< std::string >>::iterator itm = markerAlleles.begin(); itm != markerAlleles.end(); itm++)
115  {
116  std::set< std::string >& s = itm->second;
117  for (std::set< std::string >::iterator its = s.begin(); its != s.end(); its++)
118  {
119  al.addAlleleInfoByLocusName(itm->first, BasicAlleleInfo(*its));
120  }
121  }
122  dataset.setAnalyzedLoci(al);
123 
124  /*
125  * Individuals information
126  */
127  size_t ind_col_index = VectorTools::which(dt.getColumnNames(), SAMPLE_NAME_H);
128  size_t mark_col_index = VectorTools::which(dt.getColumnNames(), MARKER_H);
129  for (size_t i = 0; i < dt.getNumberOfRows(); i++)
130  {
131  vector<size_t> alleles;
132  for (size_t j = 0; j < alleles_cols.size(); j++)
133  {
134  if (!TextTools::isEmpty(dt(i, alleles_cols[j])))
135  {
136  unsigned int num = (dataset.getLocusInfoByName(dt(i, mark_col_index))).getAlleleInfoKey(dt(i, alleles_cols[j]));
137  alleles.push_back(num);
138  }
139  }
140  alleles = VectorTools::unique(alleles);
141  MultiAlleleMonolocusGenotype ma(alleles);
142  if (!dataset.getIndividualByIdFromGroup(0, dt(i, ind_col_index)).hasGenotype())
143  dataset.initIndividualGenotypeInGroup(0, dataset.getIndividualPositionInGroup(0, dt(i, ind_col_index)));
144  if (alleles.size())
145  dataset.setIndividualMonolocusGenotypeInGroup(0, dataset.getIndividualPositionInGroup(0, dt(i, ind_col_index)), dataset.analyzedLoci().getLocusInfoPosition(dt(i, mark_col_index)), ma);
146  }
147 }
148 
149 void GeneMapperCsvExport::read(const std::string& path, DataSet& dataset)
150 {
151  AbstractIDataSet::read(path, dataset);
152 }
153 
155 {
156  return AbstractIDataSet::read(is);
157 }
158 
159 DataSet* GeneMapperCsvExport::read(const std::string& path)
160 {
161  return AbstractIDataSet::read(path);
162 }
163 
164 // --- GeneMapperCsvExport::Record ---
165 GeneMapperCsvExport::Record::Record(const std::string& row) : sampleFile_(),
166  sampleName_(),
167  panel_(),
168  markerName_(),
169  dye_(),
170  alleles_(),
171  dac_(),
172  an_(0.)
173 {
174  StringTokenizer st(row, "\t", true, false);
175  /*
176  if (st.numberOfRemainingTokens() != 7 + 4 * alleleNumber) {
177  throw Exception("GeneMapperCsvExport::Record::Record: bad number of allele");
178  }
179  */
180  size_t itemNum = st.numberOfRemainingTokens();
181  size_t alleleNum = (itemNum - 7) / 4;
182  sampleFile_ = st.getToken(0);
183  sampleName_ = st.getToken(1);
184  panel_ = st.getToken(2);
185  markerName_ = st.getToken(3);
186  dye_ = st.getToken(4);
187  dac_ = st.getToken(itemNum - 2);
188  an_ = TextTools::toDouble(st.getToken(itemNum - 1));
189  for (unsigned int i = 0; i < alleleNum; ++i)
190  {
192  st.getToken(5 + i),
193  TextTools::toDouble(st.getToken(5 + alleleNum + i)),
194  TextTools::to<unsigned int>(st.getToken(5 + (2 * alleleNum) + i)),
195  TextTools::toDouble(st.getToken(5 + (3 * alleleNum) + i))
196  );
197  alleles_.push_back(al);
198  }
199 }
virtual void read(std::istream &is, DataSet &data_set)=0
Read a DataSet from an input stream.
The AnalyzedLoci class.
Definition: AnalyzedLoci.h:31
void addAlleleInfoByLocusName(const std::string &locusName, const AlleleInfo &allele)
Add an AlleleInfo to a LocusInfo by LocusInfo name.
void setLocusInfo(size_t locusPosition, const LocusInfo &locus)
Set a LocusInfo.
size_t getLocusInfoPosition(const std::string &locusName) const
Get the position of a LocusInfo.
The BasicAlleleInfo class.
The DataSet class.
Definition: DataSet.h:37
void initAnalyzedLoci(size_t numberOfLoci)
Initialize the AnalyzedLoci for number of loci.
Definition: DataSet.h:609
size_t getIndividualPositionInGroup(size_t groupPosition, const std::string &individual_id) const
Get the position of an Individual in a Group.
Definition: DataSet.cpp:399
const AnalyzedLoci & analyzedLoci() const
Get the AnalyzedLoci if there is one.
Definition: DataSet.h:621
void setAnalyzedLoci(const AnalyzedLoci &analyzedLoci)
Set the AnalyzedLoci to the DataSet.
Definition: DataSet.h:599
size_t getGroupPosition(size_t group_id) const
Get the position of a Group.
Definition: DataSet.cpp:221
const Individual & getIndividualByIdFromGroup(size_t groupPosition, const std::string &individualId) const
Get an Individual from a Group.
Definition: DataSet.cpp:431
const LocusInfo & getLocusInfoByName(const std::string &locus_name) const
Get a LocusInfo by its name.
Definition: DataSet.cpp:1014
void addIndividualToGroup(size_t groupPosition, const Individual &individual)
Add an Individual to a Group.
Definition: DataSet.cpp:356
void setIndividualMonolocusGenotypeInGroup(size_t groupPosition, size_t individualPosition, size_t locusPosition, const MonolocusGenotypeInterface &monogen)
Set a MonolocusGenotype of an Individual from a group.
Definition: DataSet.cpp:872
void initIndividualGenotypeInGroup(size_t groupPosition, size_t individualPosition)
Initialize the genotype of an Individual in a Group.
Definition: DataSet.cpp:828
void addEmptyGroup(size_t group_id)
Add an empty Group to the DataSet.
Definition: DataSet.cpp:166
static std::unique_ptr< DataTable > read(std::istream &in, const std::string &sep="\t", bool header=true, int rowNames=-1)
std::vector< std::string > getColumnNames() const
std::vector< std::string > & getColumn(size_t index)
size_t getNumberOfRows() const
Store data for one allele.
Record(const std::string &row)
Constructor.
std::vector< GeneMapperCsvExport::Allele > alleles_
static const std::string PEAK_AREA_H
static const std::string SAMPLE_NAME_H
static const std::string SIZE_H
static const std::string MARKER_H
static const std::string ALLELE_H
static const std::string HEIGHT_H
static const std::string AN_H
static const std::string DYE_H
void read(std::istream &is, DataSet &data_set)
Read a DataSet from an input stream.
static const std::string SAMPLE_FILE_H
static const std::string PANEL_H
static const std::string DAC_H
The Individual class.
Definition: Individual.h:40
The LocusInfo class.
Definition: LocusInfo.h:31
static unsigned int UNKNOWN
Definition: LocusInfo.h:41
The MultiAlleleMonolocusGenotype class.
size_t numberOfRemainingTokens() const
const std::string & getToken(size_t pos) const
static size_t which(const std::vector< T > &v, const T &which)
static std::vector< T > unique(const std::vector< T > &v)
static std::map< T, size_t > countValues(const std::vector< T > &v)
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
bool isEmpty(const std::string &s)
bool startsWith(const std::string &s, const std::string &pattern)
std::string toString(T t)