Csv.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Support for importing and exporting data from and to character separated value (CSV) files
6  *
7  *
8  * \par
9  * The most important application of the methods provided in this
10  * file is the import of data from CSV files into Shark data
11  * containers.
12  *
13  *
14  *
15  *
16  * \author T. Voss, M. Tuma
17  * \date 2010
18  *
19  *
20  * \par Copyright 1995-2015 Shark Development Team
21  *
22  * <BR><HR>
23  * This file is part of Shark.
24  * <http://image.diku.dk/shark/>
25  *
26  * Shark is free software: you can redistribute it and/or modify
27  * it under the terms of the GNU Lesser General Public License as published
28  * by the Free Software Foundation, either version 3 of the License, or
29  * (at your option) any later version.
30  *
31  * Shark is distributed in the hope that it will be useful,
32  * but WITHOUT ANY WARRANTY; without even the implied warranty of
33  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34  * GNU Lesser General Public License for more details.
35  *
36  * You should have received a copy of the GNU Lesser General Public License
37  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
38  *
39  */
40 //===========================================================================
41 
42 #ifndef SHARK_DATA_CSV_H
43 #define SHARK_DATA_CSV_H
44 
45 #include <shark/Core/DLLSupport.h>
46 #include <shark/Data/Dataset.h>
47 
48 #include <boost/algorithm/string.hpp>
49 #include <boost/algorithm/string/trim.hpp>
50 #include <boost/format.hpp>
51 #include <boost/iostreams/copy.hpp>
52 #include <boost/iostreams/filter/newline.hpp>
53 #include <boost/iostreams/filtering_stream.hpp>
54 #include <boost/lexical_cast.hpp>
55 #include <boost/type_traits.hpp>
56 
57 #include <exception>
58 #include <fstream>
59 #include <map>
60 #include <string>
61 
62 namespace shark {
63 
64 /**
65  * \ingroup shark_globals
66  *
67  * @{
68  */
69 
70 
71 /// \brief Position of the label in a CSV file
72 ///
73 /// \par
74 /// This type describes the position of the label in a record of a CSV file.
75 /// The label can be positioned either in the first or the last column, or
76 /// there can be no label present at all.
80 };
81 
82 namespace detail {
83 
/// \brief Writes unlabeled records to a stream, one record per line.
///
/// The components of each record are printed in order and joined by
/// \a separator; every record is terminated by a newline. Values are written
/// with a precision of 10 digits. The stream's format flags and precision are
/// put back to their entry values before the function returns normally.
///
/// \param data        Container that holds the samples; iterated via
///                    begin()/end(), each record is accessed through size()
///                    and operator()(index)
/// \param out         The stream to be written to
/// \param separator   The separator between elements
/// \param scientific  Write floating point values in scientific notation?
/// \param fieldwidth  Argument handed to std::setw for every value
/// \throws std::invalid_argument if \a out is in a failed state on entry
template<typename T, typename Stream>
void exportCSV(
	const T &data,
	Stream &out,
	char separator,
	bool scientific = true,
	unsigned int fieldwidth = 0
) {
	if (!out) {
		throw(std::invalid_argument("[exportCSV (1)] Stream cannot be opened for writing."));
	}

	// remember the caller's formatting so that all of it can be restored
	const std::ios_base::fmtflags previousFlags = out.flags();
	const std::streamsize previousPrecision = out.precision();

	// set output format; the floatfield mask clears a previously set
	// "fixed" flag instead of combining with it
	if (scientific)
		out.setf(std::ios_base::scientific, std::ios_base::floatfield);
	out.precision(10);

	// write out
	for (typename T::const_iterator it = data.begin(); it != data.end(); ++it) {
		SHARK_CHECK(it->begin() != it->end(), "[exportCSV (1)] record must not be empty");
		// The separator is emitted in front of every value but the first, so
		// the loop bound never relies on size()-1 (which wraps around for an
		// empty record).
		const std::size_t dimension = it->size();
		for (std::size_t i = 0; i != dimension; ++i) {
			if (i != 0)
				out << separator;
			out << std::setw(fieldwidth) << (*it)(i);
		}
		out << '\n';
	}
	// flush once at the end instead of once per record
	out.flush();

	// restore output format
	out.flags(previousFlags);
	out.precision(previousPrecision);
}
115 
116  // export function for labeled data
117 
118  template<typename T, typename U, typename Stream>
119  void exportCSV_labeled(const T &input, // Container that holds the samples
120  const U &labels, // Container that holds the labels
121  Stream &out, // The file to be read from
122  LabelPosition lp, // The position of the label
123  char separator, // The separator between elements
124  bool scientific = true, //scientific notation?
125  unsigned int fieldwidth = 0, //column-align using this field width
126  typename boost::enable_if<
127  boost::is_arithmetic<typename boost::range_value<U>::type>
128  >::type* dummy = 0//enable this only for arithmetic types
129  ) {
130 
131  if (!out) {
132  throw(std::invalid_argument("[exportCSV (2)] Stream cannot be opened for writing."));
133  }
134 
135 
136  if (scientific)
137  out.setf(std::ios_base::scientific);
138  std::streamsize ss = out.precision();
139  out.precision(10);
140 
141  typename T::const_iterator iti = input.begin();
142  typename U::const_iterator itl = labels.begin();
143 
144 
145  for (; iti != input.end(); ++iti, ++itl) {
146  SHARK_CHECK(iti->begin() != iti->end(), "[exportCSV (2)] record must not be empty");
147  if (lp == FIRST_COLUMN)
148  out << *itl << separator;
149  for (std::size_t i=0; i<(*iti).size()-1; i++) {
150  out << std::setw(fieldwidth) << (*iti)(i) << separator;
151  }
152  if (lp == FIRST_COLUMN) {
153  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
154  } else {
155  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << separator << *itl << std::endl;
156  }
157  }
158  out.precision(ss);
159  }
160 
161  // export function for data with vector labels
162  template<typename T, typename U, typename Stream>
163  void exportCSV_labeled(
164  const T &input, // Container that holds the samples
165  const U &labels, // Container that holds the labels
166  Stream &out, // The file to be read from
167  LabelPosition lp, // The position of the label
168  char separator, // The separator between elements
169  bool scientific = true, //scientific notation?
170  unsigned int fieldwidth = 0, //column-align using this field width
171  typename boost::disable_if<
172  boost::is_arithmetic<typename boost::range_value<U>::type>
173  >::type* dummy = 0//enable this only for complex types
174  ) {
175 
176  if (!out) {
177  throw(std::invalid_argument("[exportCSV (2)] Stream cannot be opened for writing."));
178  }
179 
180 
181  if (scientific)
182  out.setf(std::ios_base::scientific);
183  std::streamsize ss = out.precision();
184  out.precision(10);
185 
186  typename T::const_iterator iti = input.begin();
187  typename U::const_iterator itl = labels.begin();
188 
189  for (; iti != input.end(); ++iti, ++itl) {
190  SHARK_CHECK(iti->begin() != iti->end(), "[exportCSV (2)] record must not be empty");
191  if (lp == FIRST_COLUMN) {
192  for (std::size_t j = 0; j < itl->size(); j++) out << std::setw(fieldwidth) << (*itl)(j) << separator;
193  }
194  for (std::size_t i=0; i<(*iti).size()-1; i++) {
195  out << std::setw(fieldwidth) << (*iti)(i) << separator;
196  }
197  if (lp == FIRST_COLUMN) {
198  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
199  } else {
200  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1);
201  for (std::size_t j = 0; j < itl->size(); j++) out << std::setw(fieldwidth) << separator << (*itl)(j);
202  out << std::endl;
203  }
204  }
205  out.precision(ss);
206  }
207 } // namespace detail
208 
209 
210 
211 // ACTUAL READ IN ROUTINES BELOW
212 
213 /// \brief Import unlabeled vectors from a read-in character-separated value file.
214 ///
215 /// \param data Container storing the loaded data
216 /// \param contents The read in csv-file
217 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
218 /// \param comment Trailing character indicating comment line. By default it is '#'
219 /// \param maximumBatchSize Size of batches in the dataset
221  Data<FloatVector> &data,
222  std::string const& contents,
223  char separator = ',',
224  char comment = '#',
225  std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
226 );
227 
228 /// \brief Import unlabeled vectors from a read-in character-separated value file.
229 ///
230 /// \param data Container storing the loaded data
231 /// \param contents The read in csv-file
232 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
233 /// \param comment Trailing character indicating comment line. By default it is '#'
234 /// \param maximumBatchSize Size of batches in the dataset
236  Data<RealVector> &data,
237  std::string const& contents,
238  char separator = ',',
239  char comment = '#',
240  std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
241 );
242 
243 /// \brief Import "csv" from string consisting only of a single unsigned int per row
244 ///
245 /// \param data Container storing the loaded data
246 /// \param contents The read in csv-file
247 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
248 /// \param comment Trailing characters indicating comment line. By default it is "#"
249 /// \param maximumBatchSize Size of batches in the dataset
251  Data<unsigned int> &data,
252  std::string const& contents,
253  char separator = ',',
254  char comment = '#',
255  std::size_t maximumBatchSize = Data<unsigned int>::DefaultBatchSize
256 );
257 
258 /// \brief Import "csv" from string consisting only of a single int per row
259 ///
260 /// \param data Container storing the loaded data
261 /// \param contents The read in csv-file
262 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
263 /// \param comment Trailing characters indicating comment line. By default it is "#"
264 /// \param maximumBatchSize Size of batches in the dataset
266  Data<int> &data,
267  std::string const& contents,
268  char separator = ',',
269  char comment = '#',
270  std::size_t maximumBatchSize = Data<int>::DefaultBatchSize
271 );
272 
273 /// \brief Import "csv" from string consisting only of a single float per row
274 ///
275 /// \param data Container storing the loaded data
276 /// \param contents The read in csv-file
277 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
278 /// \param comment Trailing characters indicating comment line. By default it is "#"
279 /// \param maximumBatchSize Size of batches in the dataset
281  Data<float> &data,
282  std::string const& contents,
283  char separator = ',',
284  char comment = '#',
285  std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
286 );
287 
288 /// \brief Import "csv" from string consisting only of a single double per row
289 ///
290 /// \param data Container storing the loaded data
291 /// \param contents The read in csv-file
292 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
293 /// \param comment Trailing characters indicating comment line. By default it is "#"
294 /// \param maximumBatchSize Size of batches in the dataset
296  Data<double> &data,
297  std::string const& contents,
298  char separator = ',',
299  char comment = '#',
300  std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
301 );
302 
303 /// \brief Import labeled data from a character-separated value file.
304 ///
305 /// \param dataset Container storing the loaded data
306 /// \param contents the read-in file contents.
307 /// \param lp Position of the label in the record, either first or last column
308 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
309 /// \param comment Character for indicating a comment, by default '#'
310 /// \param maximumBatchSize maximum size of a batch in the dataset after import
313  std::string const& contents,
314  LabelPosition lp,
315  char separator = ',',
316  char comment = '#',
317  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
318 );
319 
320 /// \brief Import labeled data from a character-separated value file.
321 ///
322 /// \param dataset Container storing the loaded data
323 /// \param contents the read-in file contents.
324 /// \param lp Position of the label in the record, either first or last column
325 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
326 /// \param comment Character for indicating a comment, by default '#'
327 /// \param maximumBatchSize maximum size of a batch in the dataset after import
330  std::string const& contents,
331  LabelPosition lp,
332  char separator = ',',
333  char comment = '#',
334  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
335 );
336 
337 
338 /// \brief Import regression data from a read-in character-separated value file.
339 ///
340 /// \param dataset Container storing the loaded data
341 /// \param contents The read in csv-file
342 /// \param lp Position of the label in the record, either first or last column
343 /// \param separator Separator between entries, typically a comma or a space
344 /// \param comment Character for indicating a comment, by default '#'
345 /// \param numberOfOutputs Dimensionality of label/output
346 /// \param maximumBatchSize maximum size of a batch in the dataset after import
349  std::string const& contents,
350  LabelPosition lp,
351  std::size_t numberOfOutputs = 1,
352  char separator = ',',
353  char comment = '#',
354  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
355 );
356 
357 /// \brief Import regression data from a read-in character-separated value file.
358 ///
359 /// \param dataset Container storing the loaded data
360 /// \param contents The read in csv-file
361 /// \param lp Position of the label in the record, either first or last column
362 /// \param separator Separator between entries, typically a comma or a space
363 /// \param comment Character for indicating a comment, by default '#'
364 /// \param numberOfOutputs Dimensionality of label/output
365 /// \param maximumBatchSize maximum size of a batch in the dataset after import
368  std::string const& contents,
369  LabelPosition lp,
370  std::size_t numberOfOutputs = 1,
371  char separator = ',',
372  char comment = '#',
373  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
374 );
375 
376 
377 
378 /// \brief Import a Dataset from a csv file
379 ///
380 /// \param data Container storing the loaded data
381 /// \param fn The file to be read from
382 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
383 /// \param comment Trailing character indicating comment line. By default it is '#'
384 /// \param maximumBatchSize Size of batches in the dataset
385 /// \param titleLines Specifies a number of lines to be skipped in the beginning of the file
386 template<class T>
388  Data<T>& data,
389  std::string fn,
390  char separator = ',',
391  char comment = '#',
392  std::size_t maximumBatchSize = Data<T>::DefaultBatchSize,
393  std::size_t titleLines = 0
394 ){
395  std::ifstream stream(fn.c_str());
396  stream.unsetf(std::ios::skipws);
397 
398  for(std::size_t i=0; i < titleLines; ++i) // ignoring the first lines
399  stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
400 
401  std::istream_iterator<char> streamBegin(stream);
402  std::string contents(//read contents of file in string
403  streamBegin,
404  std::istream_iterator<char>()
405  );
406  //call the actual parser
407  csvStringToData(data,contents,separator,comment,maximumBatchSize);
408 }
409 
410 /// \brief Import a labeled Dataset from a csv file
411 ///
412 /// \param data Container storing the loaded data
413 /// \param fn The file to be read from
414 /// \param lp Position of the label in the record, either first or last column
415 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
416 /// \param comment Trailing character indicating comment line. By default it is '#'
417 /// \param maximumBatchSize Size of batches in the dataset
418 template<class T>
420  LabeledData<blas::vector<T>, unsigned int>& data,
421  std::string fn,
422  LabelPosition lp,
423  char separator = ',',
424  char comment = '#',
425  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
426 ){
427  std::ifstream stream(fn.c_str());
428  stream.unsetf(std::ios::skipws);
429  std::istream_iterator<char> streamBegin(stream);
430  std::string contents(//read contents of file in string
431  streamBegin,
432  std::istream_iterator<char>()
433  );
434  //call the actual parser
435  csvStringToData(data,contents,lp,separator,comment,maximumBatchSize);
436 }
437 
438 /// \brief Import a labeled Dataset from a csv file
439 ///
440 /// \param data Container storing the loaded data
441 /// \param fn The file to be read from
442 /// \param lp Position of the label in the record, either first or last column
443 /// \param numberOfOutputs dimensionality of the labels
444 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
445 /// \param comment Trailing character indicating comment line. By default it is '#'
446 /// \param maximumBatchSize Size of batches in the dataset
447 template<class T>
450  std::string fn,
451  LabelPosition lp,
452  std::size_t numberOfOutputs = 1,
453  char separator = ',',
454  char comment = '#',
455  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
456 ){
457  std::ifstream stream(fn.c_str());
458  stream.unsetf(std::ios::skipws);
459  std::istream_iterator<char> streamBegin(stream);
460  std::string contents(//read contents of file in string
461  streamBegin,
462  std::istream_iterator<char>()
463  );
464  //call the actual parser
465  csvStringToData(data,contents,lp, numberOfOutputs, separator,comment,maximumBatchSize);
466 }
467 
468 /// \brief Format unlabeled data into a character-separated value file.
469 ///
470 /// \param set Container to be exported
471 /// \param fn The file to be written to
472 /// \param separator Separator between entries, typically a comma or a space
473 /// \param sci should the output be in scientific notation?
474 /// \param width argument to std::setw when writing the output
475 template<typename Type>
477  Data<Type> const& set,
478  std::string fn,
479  char separator = ',',
480  bool sci = true,
481  unsigned int width = 0
482 ) {
483  std::ofstream ofs(fn.c_str());
484  detail::exportCSV(set.elements(), ofs, separator, sci, width);
485 }
486 
487 
488 /// \brief Format labeled data into a character-separated value file.
489 ///
490 /// \param dataset Container to be exported
491 /// \param fn The file to be written to
492 /// \param lp Position of the label in the record, either first or last column
493 /// \param separator Separator between entries, typically a comma or a space
494 /// \param sci should the output be in scientific notation?
495 /// \param width argument to std::setw when writing the output
496 template<typename InputType, typename LabelType>
498  LabeledData<InputType, LabelType> const &dataset,
499  std::string fn,
500  LabelPosition lp,
501  char separator = ',',
502  bool sci = true,
503  unsigned int width = 0
504 ) {
505  std::ofstream ofs(fn.c_str());
506  detail::exportCSV_labeled(dataset.inputs().elements(), dataset.labels().elements(), ofs, lp, separator, sci, width);
507 }
508 
509 
510 /** @}*/
511 
512 } // namespace shark
513 #endif // SHARK_DATA_CSV_H