bpp-popgen  3.0.0
GeneMapperCsvExport.cpp
Go to the documentation of this file.
1 //
2 // File: GeneMapperCsvExport.cpp
3 // Author: Sylvain Gaillard
4 // Created: April 2, 2008
5 //
6 
7 /*
8  Copyright or © or Copr. Bio++ Development Team, (April 2, 2008)
9 
10  This software is a computer program whose purpose is to provide classes
11  for population genetics analysis.
12 
13  This software is governed by the CeCILL license under French law and
14  abiding by the rules of distribution of free software. You can use,
15  modify and/ or redistribute the software under the terms of the CeCILL
16  license as circulated by CEA, CNRS and INRIA at the following URL
17  "http://www.cecill.info".
18 
19  As a counterpart to the access to the source code and rights to copy,
20  modify and redistribute granted by the license, users are provided only
21  with a limited warranty and the software's author, the holder of the
22  economic rights, and the successive licensors have only limited
23  liability.
24 
25  In this respect, the user's attention is drawn to the risks associated
26  with loading, using, modifying and/or developing or reproducing the
27  software by the user in light of its specific status of free software,
28  that may mean that it is complicated to manipulate, and that also
29  therefore means that it is reserved for developers and experienced
30  professionals having in-depth computer knowledge. Users are therefore
31  encouraged to load and test the software's suitability as regards their
32  requirements in conditions enabling the security of their systems and/or
33  data to be ensured and, more generally, to use and operate it in the
34  same conditions as regards security.
35 
36  The fact that you are presently reading this means that you have had
37  knowledge of the CeCILL license and that you accept its terms.
38  */
39 
#include "GeneMapperCsvExport.h"

#include <memory>
#include <utility>
41 
42 using namespace bpp;
43 using namespace std;
44 
45 const std::string GeneMapperCsvExport::SAMPLE_FILE_H = "Sample File";
46 const std::string GeneMapperCsvExport::SAMPLE_NAME_H = "Sample Name";
47 const std::string GeneMapperCsvExport::PANEL_H = "Panel";
48 const std::string GeneMapperCsvExport::MARKER_H = "Marker";
49 const std::string GeneMapperCsvExport::DYE_H = "Dye";
50 const std::string GeneMapperCsvExport::ALLELE_H = "Allele ";
51 const std::string GeneMapperCsvExport::SIZE_H = "Size ";
52 const std::string GeneMapperCsvExport::HEIGHT_H = "Height ";
53 const std::string GeneMapperCsvExport::PEAK_AREA_H = "Peak Area ";
54 const std::string GeneMapperCsvExport::DAC_H = "DAC";
55 const std::string GeneMapperCsvExport::AN_H = "AN";
56 
57 //GeneMapperCsvExport::GeneMapperCsvExport(bool ia) : IndependentAlleles_(ia) {}
58 
60 
61 void GeneMapperCsvExport::read(std::istream& is, DataSet& data_set)
62 {
63  if (!is)
64  throw IOException("GeneMapperCsvExport::read: fail to open stream.");
65 
66  /*
67  * Feed a DataTable with the data
68  */
69  DataTable* dtp = DataTable::read(is, "\t", true, -1);
70  DataTable& dt = *dtp;
71 
72  /*
73  * Fixe the individuals' name if there is duplicate in the file
74  */
75  vector<string> ind_names;
76  vector<string> markers;
77  try
78  {
79  ind_names = dt.getColumn(SAMPLE_NAME_H);
80  markers = dt.getColumn(MARKER_H);
81  }
82  catch (Exception& e)
83  {
84  throw e;
85  }
86  map<string, int> indname_marker;
87  for (size_t i = 0; i < dt.getNumberOfRows(); i++)
88  {
89  string test_lab = dt(i, SAMPLE_NAME_H) + dt(i, MARKER_H);
90  if (indname_marker.find(test_lab) != indname_marker.end())
91  {
92  string new_lab = dt(i, SAMPLE_NAME_H) + "_" + TextTools::toString(indname_marker[test_lab] + 1);
93  dt (i, SAMPLE_NAME_H) = new_lab;
94  }
95  indname_marker[test_lab]++;
96  }
97  ind_names = dt.getColumn(SAMPLE_NAME_H);
98 
99  map<string, size_t> ind_count = VectorTools::countValues(ind_names);
100  ind_names = VectorTools::unique(ind_names);
101  markers = VectorTools::unique(markers);
102  size_t loc_nbr = markers.size();
103 
104  /*
105  * Loci number
106  */
107  data_set.initAnalyzedLoci(loc_nbr);
108 
109  /*
110  * Group of individuals
111  */
112  data_set.addEmptyGroup(0);
113  for (unsigned int i = 0; i < ind_names.size(); i++)
114  {
115  Individual ind(ind_names[i]);
116  data_set.addIndividualToGroup(data_set.getGroupPosition(0), ind);
117  }
118 
119  /*
120  * Loci data
121  */
122  AnalyzedLoci al(markers.size());
123  vector<string> col_names = dt.getColumnNames();
124 
125  // Finds columns containing allele data
126  vector<size_t> alleles_cols;
127  for (size_t i = 0; i < col_names.size(); i++)
128  {
129  if (TextTools::startsWith(col_names[i], ALLELE_H))
130  alleles_cols.push_back(i);
131  }
132  // Set LocusInfo
133  vector<vector<size_t> > alleles_pos;
134  for (size_t i = 0; i < markers.size(); i++)
135  {
136  al.setLocusInfo(i, LocusInfo(markers[i], LocusInfo::UNKNOWN));
137  }
138  std::map< std::string, std::set< std::string > > markerAlleles;
139  for (size_t i = 0; i < dt.getNumberOfRows(); ++i)
140  {
141  for (size_t j = 0; j < alleles_cols.size(); ++j)
142  {
143  if (dt(i, alleles_cols[j]) != "")
144  {
145  markerAlleles[dt(i, MARKER_H)].insert(dt(i, alleles_cols[j]));
146  }
147  }
148  }
149  for (std::map< std::string, std::set< std::string > >::iterator itm = markerAlleles.begin(); itm != markerAlleles.end(); itm++)
150  {
151  std::set< std::string >& s = itm->second;
152  for (std::set< std::string >::iterator its = s.begin(); its != s.end(); its++)
153  {
154  al.addAlleleInfoByLocusName(itm->first, BasicAlleleInfo(*its));
155  }
156  }
157  data_set.setAnalyzedLoci(al);
158 
159  /*
160  * Individuals informations
161  */
162  size_t ind_col_index = VectorTools::which(dt.getColumnNames(), SAMPLE_NAME_H);
163  size_t mark_col_index = VectorTools::which(dt.getColumnNames(), MARKER_H);
164  for (size_t i = 0; i < dt.getNumberOfRows(); i++)
165  {
166  vector<size_t> alleles;
167  for (size_t j = 0; j < alleles_cols.size(); j++)
168  {
169  if (!TextTools::isEmpty(dt(i, alleles_cols[j])))
170  {
171  unsigned int num = (data_set.getLocusInfoByName(dt(i, mark_col_index))).getAlleleInfoKey(dt(i, alleles_cols[j]));
172  alleles.push_back(num);
173  }
174  }
175  alleles = VectorTools::unique(alleles);
176  MultiAlleleMonolocusGenotype ma(alleles);
177  if (!data_set.getIndividualByIdFromGroup(0, dt(i, ind_col_index))->hasGenotype())
178  data_set.initIndividualGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0, dt(i, ind_col_index)));
179  if (alleles.size())
180  data_set.setIndividualMonolocusGenotypeInGroup(0, data_set.getIndividualPositionInGroup(0, dt(i, ind_col_index)), data_set.getAnalyzedLoci()->getLocusInfoPosition(dt(i, mark_col_index)), ma);
181  }
182  delete dtp;
183 }
184 
185 void GeneMapperCsvExport::read(const std::string& path, DataSet& data_set)
186 {
187  AbstractIDataSet::read(path, data_set);
188 }
189 
191 {
192  return AbstractIDataSet::read(is);
193 }
194 
195 DataSet* GeneMapperCsvExport::read(const std::string& path)
196 {
197  return AbstractIDataSet::read(path);
198 }
199 
200 // --- GeneMapperCsvExport::Record ---
201 GeneMapperCsvExport::Record::Record(const std::string& row) : sampleFile_(),
202  sampleName_(),
203  panel_(),
204  markerName_(),
205  dye_(),
206  alleles_(),
207  dac_(),
208  an_(0.)
209 {
210  StringTokenizer st(row, "\t", true, false);
211  /*
212  if (st.numberOfRemainingTokens() != 7 + 4 * alleleNumber) {
213  throw Exception("GeneMapperCsvExport::Record::Record: bad number of allele");
214  }
215  */
216  size_t itemNum = st.numberOfRemainingTokens();
217  size_t alleleNum = (itemNum - 7) / 4;
218  sampleFile_ = st.getToken(0);
219  sampleName_ = st.getToken(1);
220  panel_ = st.getToken(2);
221  markerName_ = st.getToken(3);
222  dye_ = st.getToken(4);
223  dac_ = st.getToken(itemNum - 2);
224  an_ = TextTools::toDouble(st.getToken(itemNum - 1));
225  for (unsigned int i = 0; i < alleleNum; ++i)
226  {
228  st.getToken(5 + i),
229  TextTools::toDouble(st.getToken(5 + alleleNum + i)),
230  TextTools::to<unsigned int>(st.getToken(5 + (2 * alleleNum) + i)),
231  TextTools::toDouble(st.getToken(5 + (3 * alleleNum) + i))
232  );
233  alleles_.push_back(al);
234  }
235 }
virtual void read(std::istream &is, DataSet &data_set)=0
Read a DataSet on istream.
The AnalyzedLoci class.
Definition: AnalyzedLoci.h:65
size_t getLocusInfoPosition(const std::string &locus_name) const
Get the position of a LocusInfo.
void addAlleleInfoByLocusName(const std::string &locus_name, const AlleleInfo &allele)
Add an AlleleInfo to a LocusInfo by LocusInfo name.
void setLocusInfo(size_t locus_position, const LocusInfo &locus)
Set a LocusInfo.
The BasicAlleleInfo class.
The DataSet class.
Definition: DataSet.h:73
void setAnalyzedLoci(const AnalyzedLoci &analyzedLoci)
Set the AnalyzedLoci to the DataSet.
Definition: DataSet.cpp:1066
void setIndividualMonolocusGenotypeInGroup(size_t group_position, size_t individual_position, size_t locus_position, const MonolocusGenotype &monogen)
Set a MonolocusGenotype of an Individual from a group.
Definition: DataSet.cpp:921
void initIndividualGenotypeInGroup(size_t group_position, size_t individual_position)
Initialize the genotype of an Individual in a Group.
Definition: DataSet.cpp:877
void initAnalyzedLoci(size_t number_of_loci)
Initialize the AnalyzedLoci for number of loci.
Definition: DataSet.cpp:1084
size_t getGroupPosition(size_t group_id) const
Get the position of a Group.
Definition: DataSet.cpp:269
const Individual * getIndividualByIdFromGroup(size_t group_position, const std::string &individual_id) const
Get an Individual from a Group.
Definition: DataSet.cpp:480
const LocusInfo & getLocusInfoByName(const std::string &locus_name) const
Get a LocusInfo by its name.
Definition: DataSet.cpp:1126
const AnalyzedLoci * getAnalyzedLoci() const
Get the AnalyzedLoci if there is one.
Definition: DataSet.cpp:1093
void addIndividualToGroup(size_t group_position, const Individual &individual)
Add an Individual to a Group.
Definition: DataSet.cpp:405
size_t getIndividualPositionInGroup(size_t group_position, const std::string &individual_id) const
Get the position of an Individual in a Group.
Definition: DataSet.cpp:448
void addEmptyGroup(size_t group_id)
Add an empty Group to the DataSet.
Definition: DataSet.cpp:217
std::vector< std::string > getColumnNames() const
std::vector< std::string > & getColumn(size_t index)
static DataTable * read(std::istream &in, const std::string &sep="\t", bool header=true, int rowNames=-1)
size_t getNumberOfRows() const
Store data for one allele.
Record(const std::string &row)
Constructor.
std::vector< GeneMapperCsvExport::Allele > alleles_
static const std::string PEAK_AREA_H
static const std::string SAMPLE_NAME_H
static const std::string SIZE_H
static const std::string MARKER_H
static const std::string ALLELE_H
static const std::string HEIGHT_H
static const std::string AN_H
static const std::string DYE_H
void read(std::istream &is, DataSet &data_set)
Read a DataSet on istream.
static const std::string SAMPLE_FILE_H
static const std::string PANEL_H
static const std::string DAC_H
The Individual class.
Definition: Individual.h:76
The LocusInfo class.
Definition: LocusInfo.h:64
static unsigned int UNKNOWN
Definition: LocusInfo.h:74
The MultiAlleleMonolocusGenotype class.
size_t numberOfRemainingTokens() const
const std::string & getToken(size_t pos) const
static size_t which(const std::vector< T > &v, const T &which)
static std::vector< T > unique(const std::vector< T > &v)
static std::map< T, size_t > countValues(const std::vector< T > &v)
double toDouble(const std::string &s, char dec='.', char scientificNotation='e')
bool isEmpty(const std::string &s)
bool startsWith(const std::string &s, const std::string &pattern)
std::string toString(T t)