28 #include <unordered_map>
44 class csv_reader_options_builder;
// Data members holding the CSV reader settings. Each one is written by the
// matching setter or builder method visible elsewhere in this listing.

// Number of bytes to skip from the start of the source. The row-based setters
// treat a non-zero value as meaning that a byte range has been set.
58 std::size_t _byte_range_offset = 0;
// Number of bytes to read. The row-based setters treat a non-zero value as
// meaning that a byte range has been set.
60 std::size_t _byte_range_size = 0;
// Names of the columns.
62 std::vector<std::string> _names;
// Whether to rename duplicate column names.
66 bool _mangle_dupe_cols =
true;
// Names of the columns to be read.
71 std::vector<std::string> _use_cols_names;
// Indexes of the columns to be read.
73 std::vector<int> _use_cols_indexes;
// Line terminator character.
86 char _lineterminator =
'\n';
// Field delimiter character.
88 char _delimiter =
',';
// Thousands separator for numeric data; defaults to the NUL character.
90 char _thousands =
'\0';
// Whether to treat a carriage return followed by a line feed as the line terminator.
95 bool _windowslinetermination =
false;
// Whether to treat whitespace as the field delimiter.
97 bool _delim_whitespace =
false;
// Whether to skip whitespace after the delimiter.
99 bool _skipinitialspace =
false;
// Whether to ignore empty lines or parse line values as invalid.
101 bool _skip_blank_lines =
true;
// Quoting character.
105 char _quotechar =
'"';
// Whether a quote inside a value is double-quoted.
107 bool _doublequote =
true;
// Names of columns to read as datetime.
109 std::vector<std::string> _parse_dates_names;
// Indexes of columns to read as datetime.
111 std::vector<int> _parse_dates_indexes;
// Names of columns to parse as hexadecimal.
113 std::vector<std::string> _parse_hex_names;
// Indexes of columns to parse as hexadecimal.
115 std::vector<int> _parse_hex_indexes;
// Per-column types, held either as a vector or as a map keyed by string
// (presumably positional types versus types keyed by column name; TODO confirm).
120 std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
// Values recognized as boolean true. The setter and builder append further
// values to this default list rather than replacing it.
122 std::vector<std::string> _true_values{
"True",
"TRUE",
"true"};
// Values recognized as boolean false. The setter and builder append further
// values to this default list rather than replacing it.
124 std::vector<std::string> _false_values{
"False",
"FALSE",
"false"};
// Additional values to recognize as null values.
126 std::vector<std::string> _na_values;
// Whether to keep the built-in default NA values.
128 bool _keep_default_na =
true;
// Null filtering flag. While it is false, the na_values setter rejects a
// non-empty list.
130 bool _na_filter =
true;
// Whether to parse dates as DD/MM versus MM/DD.
132 bool _dayfirst =
false;
196 if (_byte_range_size == 0) {
210 auto const num_names = _names.size();
211 auto const num_dtypes = std::visit([](
auto const& dtypes) {
return dtypes.size(); }, _dtypes);
212 auto const num_columns = std::max(num_dtypes, num_names);
214 auto const max_row_bytes = 16 * 1024;
215 auto const column_bytes = 64;
216 auto const base_padding = 1024;
218 if (num_columns == 0) {
220 return max_row_bytes;
224 return base_padding + num_columns * column_bytes;
/**
 * @brief Returns names of the columns.
 *
 * @return Const reference to the stored `_names` vector; no copy is made
 */
232 [[nodiscard]] std::vector<std::string>
const&
get_names()
const {
return _names; }
/**
 * @brief Returns prefix to be used for column ID.
 *
 * @return Copy of the stored `_prefix` string (returned by value)
 */
239 [[nodiscard]] std::string
get_prefix()
const {
return _prefix; }
255 return _use_cols_names;
384 return _parse_dates_names;
394 return _parse_dates_indexes;
404 return _parse_hex_names;
419 std::variant<std::vector<data_type>, std::map<std::string, data_type>>
const&
get_dtypes()
const
/**
 * @brief Returns additional values to recognize as null values.
 *
 * @return Const reference to the stored `_na_values` vector; no copy is made
 */
443 std::vector<std::string>
const&
get_na_values()
const {
return _na_values; }
487 if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
489 "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
492 _byte_range_offset = offset;
502 if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
504 "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
507 _byte_range_size = size;
/**
 * @brief Sets names of the columns.
 *
 * The vector is taken by value and moved into `_names`, replacing any
 * previously stored names.
 *
 * @param col_names Vector of column names
 */
515 void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
538 _use_cols_names = std::move(col_names);
548 _use_cols_indexes = std::move(col_indices);
558 CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0),
"Cannot use both `nrows` and `skipfooter`");
559 if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
561 "nrows can't be a non negative value if range offset and/or range size has been set");
574 if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
575 CUDF_FAIL(
"skiprows must be zero if range offset or range size has been set",
576 std::invalid_argument);
578 _skiprows = skiprows;
589 "Cannot use both `nrows` and `skipfooter`",
590 std::invalid_argument);
591 if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
592 CUDF_FAIL(
"skipfooter must be zero if range offset or range size has been set",
593 std::invalid_argument);
596 _skipfooter = skipfooter;
682 "Only MINIMAL and NONE are supported for quoting.");
707 _parse_dates_names = std::move(col_names);
717 _parse_dates_indexes = std::move(col_indices);
727 _parse_hex_names = std::move(col_names);
/**
 * @brief Sets indexes of columns to parse as hexadecimal.
 *
 * The vector is taken by value and moved into `_parse_hex_indexes`, replacing
 * any previously stored indexes.
 *
 * @param col_indices Vector of column indexes
 */
735 void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
/**
 * @brief Sets per-column types from a map.
 *
 * Moves the map into the `_dtypes` variant, so the variant holds the map
 * alternative afterwards and any previously stored vector of types is dropped.
 *
 * @param types Map from string key to data type (keys are presumably column
 * names; TODO confirm)
 */
742 void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
/**
 * @brief Sets per-column types from a vector.
 *
 * Moves the vector into the `_dtypes` variant, so the variant holds the vector
 * alternative afterwards and any previously stored map of types is dropped.
 *
 * @param types Vector of data types (presumably in column order; TODO confirm)
 */
749 void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
758 _true_values.insert(_true_values.end(), vals.begin(), vals.end());
768 _false_values.insert(_false_values.end(), vals.begin(), vals.end());
778 if ((!vals.empty()) and (!_na_filter)) {
779 CUDF_FAIL(
"Can't set na_values when na_filtering is disabled");
782 _na_values = std::move(vals);
799 if (!val) { _na_values.clear(); }
848 options._compression = comp;
884 options._names = std::move(col_names);
896 options._prefix = pfx;
908 options._mangle_dupe_cols = val;
920 options._use_cols_names = std::move(col_names);
932 options._use_cols_indexes = std::move(col_indices);
980 options._header = hdr;
992 options._lineterminator = term;
1004 options._delimiter = delim;
1016 options._thousands = val;
1028 options._decimal = val;
1040 options._comment = val;
1052 options._windowslinetermination = val;
1064 options._delim_whitespace = val;
1076 options._skipinitialspace = val;
1088 options._skip_blank_lines = val;
1100 options._quoting = style;
1112 options._quotechar = ch;
1124 options._doublequote = val;
1136 options._parse_dates_names = std::move(col_names);
1148 options._parse_dates_indexes = std::move(col_indices);
1160 options._parse_hex_names = std::move(col_names);
1172 options._parse_hex_indexes = std::move(col_indices);
1184 options._dtypes = std::move(types);
1196 options._dtypes = std::move(types);
1208 options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1220 options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1268 options._dayfirst = val;
1280 options._timestamp_type = type;
// Data members holding the CSV writer settings. Each one is written by the
// matching builder method visible elsewhere in this listing.

// String used for null entries; empty by default.
1342 std::string _na_rep =
"";
// Whether headers are written to the csv output.
1344 bool _include_header =
true;
// Maximum number of rows to process for each file write; defaults to the
// largest size_type value.
1346 size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
// String used for separating lines.
1348 std::string _line_terminator =
"\n";
// Character used for separating column values.
1350 char _inter_column_delimiter =
',';
// String used for values != 0 in INT8 types.
1352 std::string _true_value = std::string{
"true"};
// String used for values == 0 in INT8 types.
1354 std::string _false_value = std::string{
"false"};
// Optional column names.
1356 std::vector<std::string> _names;
/**
 * @brief Returns names of the columns.
 *
 * @return Const reference to the stored `_names` vector; no copy is made
 */
1410 [[nodiscard]] std::vector<std::string>
const&
get_names()
const {
return _names; }
/**
 * @brief Sets optional associated column names.
 *
 * The vector is taken by value and moved into `_names`, replacing any
 * previously stored names.
 *
 * @param names Vector of column names
 */
1479 void set_names(std::vector<std::string> names) { _names = std::move(names); }
1550 "Only MINIMAL and NONE are supported for quoting.");
1576 : options{sink,
table}
1588 options._names =
names;
1600 options._na_rep = val;
1612 options._include_header = val;
1624 options._rows_per_chunk = val;
1636 options._line_terminator = term;
1648 options._inter_column_delimiter = delim;
1660 options._true_value = val;
1672 options._false_value = val;
Indicator for the logical data type of an element in a column.
Builder to build options for read_csv().
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
csv_reader_options_builder & na_filter(bool val)
Sets whether to enable null filtering.
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
csv_reader_options_builder(source_info src)
Constructor from source info.
csv_reader_options_builder & comment(char val)
Sets comment line start character.
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Settings to use for read_csv().
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
size_type get_skiprows() const
Returns number of rows to skip from start.
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
quote_style get_quoting() const
Returns quoting style.
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
char get_delimiter() const
Returns field delimiter.
char get_lineterminator() const
Returns line terminator.
csv_reader_options()=default
Default constructor.
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
void set_decimal(char val)
Sets decimal point character.
std::string get_prefix() const
Returns prefix to be used for column ID.
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
char get_thousands() const
Returns numeric data thousands separator.
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
bool is_enabled_na_filter() const
Whether null filtering is enabled.
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
char get_comment() const
Returns comment line start character.
void set_lineterminator(char term)
Sets line terminator.
void set_quotechar(char ch)
Sets quoting character.
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
void set_compression(compression_type comp)
Sets compression format of the source.
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
std::vector< std::string > const & get_names() const
Returns names of the columns.
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
source_info const & get_source() const
Returns source info.
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
compression_type get_compression() const
Returns compression format of the source.
char get_quotechar() const
Returns quoting character.
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
void set_header(size_type hdr)
Sets header row index.
char get_decimal() const
Returns decimal point character.
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
void set_thousands(char val)
Sets numeric data thousands separator.
void enable_na_filter(bool val)
Sets whether to enable null filtering.
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
void set_delimiter(char delim)
Sets field delimiter.
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
size_type get_nrows() const
Returns number of rows to read.
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
size_type get_skipfooter() const
Returns number of rows to skip from end.
void set_nrows(size_type nrows)
Sets number of rows to read.
void set_comment(char val)
Sets comment line start character.
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
size_type get_header() const
Returns header row index.
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Builder to build options for write_csv().
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
csv_writer_options_builder & na_rep(std::string val)
Sets string to use for null entries.
csv_writer_options_builder & line_terminator(std::string term)
Sets character used for separating lines.
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Settings to use for write_csv().
void set_table(table_view const &table)
(Re)sets the table being written.
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
std::string get_false_value() const
Returns string used for values == 0 in INT8 types.
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
std::string get_line_terminator() const
Returns character used for separating lines.
void set_line_terminator(std::string term)
Sets character used for separating lines.
csv_writer_options()=default
Default constructor.
std::string get_true_value() const
Returns string used for values != 0 in INT8 types.
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
std::string get_na_rep() const
Returns string used for null entries.
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
bool is_enabled_include_header() const
Whether to write headers to csv.
void set_na_rep(std::string val)
Sets string to use for null entries.
char get_inter_column_delimiter() const
Returns character used for separating column values.
sink_info const & get_sink() const
Returns sink used for writer output.
std::vector< std::string > const & get_names() const
Returns names of the columns.
quote_style get_quoting() const
Returns the quote style for the writer.
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
void set_names(std::vector< std::string > names)
Sets optional associated column names.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
size_type num_rows() const noexcept
Returns the number of rows.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads a CSV dataset into a set of columns.
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Writes a set of columns to CSV format.
device_memory_resource * get_current_device_resource()
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
int32_t size_type
Row index type for columns and tables.
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
compression_type
Compression algorithms.
@ AUTO
Automatically detect or select compression format.
quote_style
Behavior when handling quotations in field data.
@ MINIMAL
Quote only fields which contain special characters.
@ NONE
Never quote fields; disable quotation parsing.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
Destination information for write interfaces.
Source information for read interfaces.
Class definitions for (mutable)_table_view
Type declarations for libcudf.