libcudf  23.12.00
orc.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2023, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/detail/orc.hpp>
20 #include <cudf/io/types.hpp>
22 #include <cudf/types.hpp>
23 
25 
26 #include <memory>
27 #include <optional>
28 #include <string>
29 #include <unordered_map>
30 #include <vector>
31 
32 namespace cudf {
33 namespace io {
// Default maximum stripe size in bytes: 64 * 1024 * 1024 (64MB).
40 constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024;
// Default maximum number of rows per stripe: 1,000,000 (1M rows).
41 constexpr size_type default_stripe_size_rows = 1000000;
43 
48 
// Data members backing the read_orc() settings.
// Source information for the read
53  source_info _source;
54 
55  // Names of columns to read; `nullopt` means all columns
56  std::optional<std::vector<std::string>> _columns;
57 
58  // List of individual stripes to read (ignored if empty)
59  std::vector<std::vector<size_type>> _stripes;
60  // Rows to skip from the start; ORC stores the number of rows as uint64_t
61  uint64_t _skip_rows = 0;
62  // Rows to read; `nullopt` means all rows
63  std::optional<size_type> _num_rows;
64 
65  // Whether to use row index to speed-up reading
66  bool _use_index = true;
67 
68  // Whether to use numpy-compatible dtypes
69  bool _use_np_dtypes = true;
70  // Cast timestamp columns to a specific type
71  data_type _timestamp_type{type_id::EMPTY};
72 
73  // Columns that should be read as Decimal128
74  std::vector<std::string> _decimal128_columns;
75 
77 
/// @brief Constructor from source info.
/// @param src Source information; moved into `_source`
83  explicit orc_reader_options(source_info src) : _source{std::move(src)} {}
84 
85  public:
/// @brief Default constructor; all members keep their in-class default values.
91  orc_reader_options() = default;
92 
100 
/// @brief Returns source info.
/// @return Const reference to the stored source info
106  [[nodiscard]] source_info const& get_source() const { return _source; }
107 
/// @brief Returns names of the columns to read, if set.
/// @return Const reference to the optional list of column names; `nullopt` means all columns
113  [[nodiscard]] auto const& get_columns() const { return _columns; }
114 
/// @brief Returns vector of vectors, stripes to read for each input source.
/// @return Const reference to the stored stripe lists (ignored by the reader if empty)
120  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
121 
/// @brief Returns number of rows to skip from the start.
/// @return Number of rows to skip (0 by default)
127  [[nodiscard]] uint64_t get_skip_rows() const { return _skip_rows; }
128 
/// @brief Returns number of rows to read.
/// @return Const reference to the optional row count; `nullopt` means all rows
135  [[nodiscard]] std::optional<size_type> const& get_num_rows() const { return _num_rows; }
136 
/// @brief Whether to use row index to speed-up reading.
/// @return `true` if the row index is used (the default)
142  [[nodiscard]] bool is_enabled_use_index() const { return _use_index; }
143 
/// @brief Whether to use numpy-compatible dtypes.
/// @return `true` if numpy-compatible dtypes are used (the default)
149  [[nodiscard]] bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
150 
/// @brief Returns timestamp type to which timestamp column will be cast.
/// @return The stored timestamp data type (initialized to `type_id::EMPTY`)
156  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
157 
/// @brief Returns fully qualified names of columns that should be read as 128-bit Decimal.
/// @return Const reference to the stored list of column names
163  [[nodiscard]] std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
164 
165  // Setters
166 
/// @brief Sets names of the columns to read.
/// @param col_names Column names; moved into the options object
172  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
173 
/// @brief Sets list of stripes to read for each input source.
/// @param stripes Vector of vectors of stripes to read; moved into the options object
/// CUDF_EXPECTS throws if `stripes` is non-empty while skip_rows is non-zero or num_rows is set;
/// an empty `stripes` is always accepted.
184  void set_stripes(std::vector<std::vector<size_type>> stripes)
185  {
186  CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0), "Can't set stripes along with skip_rows");
187  CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
188  "Can't set stripes along with num_rows");
189  _stripes = std::move(stripes);
190  }
191 
/// @brief Sets number of rows to skip from the start.
/// @param rows Number of rows to skip
/// CUDF_EXPECTS throws if `rows` is non-zero while a non-empty stripe list is set;
/// passing 0 is always accepted.
200  void set_skip_rows(uint64_t rows)
201  {
202  CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes");
203  _skip_rows = rows;
204  }
205 
215  {
216  CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
217  CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
218  _num_rows = nrows;
219  }
220 
/// @brief Enable/Disable use of row index to speed-up reading.
/// @param use Boolean value to enable/disable row index use
226  void enable_use_index(bool use) { _use_index = use; }
227 
/// @brief Enable/Disable use of numpy-compatible dtypes.
/// @param use Boolean value to enable/disable numpy-compatible dtypes
233  void enable_use_np_dtypes(bool use) { _use_np_dtypes = use; }
234 
/// @brief Sets timestamp type to which timestamp column will be cast.
/// @param type Type of timestamp
240  void set_timestamp_type(data_type type) { _timestamp_type = type; }
241 
/// @brief Set columns that should be read as 128-bit Decimal.
/// @param val Column names; moved into the options object
247  void set_decimal128_columns(std::vector<std::string> val)
248  {
249  _decimal128_columns = std::move(val);
250  }
251 };
252 
// Options object being assembled; handed out by build() and the conversion operator.
257  orc_reader_options options;
258 
259  public:
/// @brief Default constructor; `options` is default-constructed.
265  explicit orc_reader_options_builder() = default;
266 
/// @brief Constructor from source info.
/// @param src The source information used to read the ORC data; moved into `options`
272  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}
273 
/// @brief Sets names of the columns to read.
/// @param col_names Column names; moved into the options being built
/// @return this for chaining
280  orc_reader_options_builder& columns(std::vector<std::string> col_names)
281  {
282  options._columns = std::move(col_names);
283  return *this;
284  }
285 
/// @brief Sets list of individual stripes to read per source.
/// @param stripes Vector of vectors of stripes to read
/// @return this for chaining
/// Delegates to orc_reader_options::set_stripes, so the same CUDF_EXPECTS checks apply.
292  orc_reader_options_builder& stripes(std::vector<std::vector<size_type>> stripes)
293  {
294  options.set_stripes(std::move(stripes));
295  return *this;
296  }
297 
305  {
306  options.set_skip_rows(rows);
307  return *this;
308  }
309 
317  {
318  options.set_num_rows(nrows);
319  return *this;
320  }
321 
329  {
330  options._use_index = use;
331  return *this;
332  }
333 
341  {
342  options._use_np_dtypes = use;
343  return *this;
344  }
345 
353  {
354  options._timestamp_type = type;
355  return *this;
356  }
357 
/// @brief Columns that should be read as 128-bit Decimal.
/// @param val Column names; moved into the options being built
/// @return this for chaining
364  orc_reader_options_builder& decimal128_columns(std::vector<std::string> val)
365  {
366  options._decimal128_columns = std::move(val);
367  return *this;
368  }
369 
/// @brief Implicit conversion that yields the built orc_reader_options as an rvalue reference.
373  operator orc_reader_options&&() { return std::move(options); }
374 
/// @brief Moves the orc_reader_options member once it's built.
/// @return Rvalue reference to the builder's `options` member
382  orc_reader_options&& build() { return std::move(options); }
383 };
384 
402  orc_reader_options const& options,
404  // end of group
416 
// ORC stripe-level statistics are expressed with the generic per-rowgroup statistics setting.
426 static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
// ORC row-group-level statistics are expressed with the generic per-page statistics setting.
427 static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
428 
// Data members backing the write_orc() settings.
433  // Specify the sink to use for writer output
434  sink_info _sink;
435  // Specify the compression format to use
437  // Specify frequency of statistics collection
438  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
439  // Maximum size of each stripe (unless smaller than a single row group)
440  size_t _stripe_size_bytes = default_stripe_size_bytes;
441  // Maximum number of rows in stripe (unless smaller than a single row group)
442  size_type _stripe_size_rows = default_stripe_size_rows;
443  // Row index stride (maximum number of rows in each row group)
444  size_type _row_index_stride = default_row_index_stride;
445  // Set of columns to output
446  table_view _table;
447  // Optional associated metadata
448  std::optional<table_input_metadata> _metadata;
449  // Optional footer key_value_metadata
450  std::map<std::string, std::string> _user_data;
451  // Optional compression statistics
452  std::shared_ptr<writer_compression_statistics> _compression_stats;
453  // Specify whether string dictionaries should be alphabetically sorted
454  bool _enable_dictionary_sort = true;
455 
457 
/// @brief Constructor from sink and table.
/// @param sink The sink used for writer output; copied into `_sink`
/// @param table Table to be written to output; copied into `_table`
464  explicit orc_writer_options(sink_info const& sink, table_view const& table)
465  : _sink(sink), _table(table)
466  {
467  }
468 
469  public:
/// @brief Default constructor; all members keep their in-class default values.
475  explicit orc_writer_options() = default;
476 
486 
/// @brief Returns sink info.
/// @return Const reference to the stored sink info
492  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
493 
/// @brief Returns compression type.
499  [[nodiscard]] compression_type get_compression() const { return _compression; }
500 
/// @brief Whether writing column statistics is enabled/disabled.
/// @return `true` unless the statistics frequency is `STATISTICS_NONE`
506  [[nodiscard]] bool is_enabled_statistics() const
507  {
508  return _stats_freq != statistics_freq::STATISTICS_NONE;
509  }
510 
/// @brief Returns frequency of statistics collection.
516  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
517 
/// @brief Returns maximum stripe size, in bytes.
523  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
524 
/// @brief Returns maximum stripe size, in rows.
530  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
531 
/// @brief Returns the row index stride.
/// @return The stored stride capped at the maximum stripe size in rows, then rounded down
/// to a multiple of 8
537  [[nodiscard]] auto get_row_index_stride() const
538  {
539  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
540  return unaligned_stride - unaligned_stride % 8;
541  }
542 
/// @brief Returns table to be written to output.
/// @return Copy of the stored table view
548  [[nodiscard]] table_view get_table() const { return _table; }
549 
/// @brief Returns associated metadata.
/// @return Const reference to the optional metadata
555  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
556 
/// @brief Returns Key-Value footer metadata information.
/// @return Const reference to the stored key-value map
562  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
563  {
564  return _user_data;
565  }
566 
/// @brief Returns a shared pointer to the user-provided compression statistics.
/// @return Copy of the stored shared pointer
572  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
573  {
574  return _compression_stats;
575  }
576 
/// @brief Returns whether string dictionaries should be sorted.
/// @return `true` if dictionaries are sorted (the default)
582  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
583 
584  // Setters
585 
/// @brief Sets compression type.
/// @param comp Compression type to use
591  void set_compression(compression_type comp) { _compression = comp; }
592 
/// @brief Choose granularity of statistics collection.
/// @param val Frequency of statistics collection; `STATISTICS_NONE` disables statistics
603  void enable_statistics(statistics_freq val) { _stats_freq = val; }
604 
/// @brief Sets the maximum stripe size, in bytes.
/// @param size_bytes Maximum stripe size, in bytes
/// CUDF_EXPECTS throws if `size_bytes` is below 64 << 10 (64KB).
612  void set_stripe_size_bytes(size_t size_bytes)
613  {
614  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
615  _stripe_size_bytes = size_bytes;
616  }
617 
629  {
630  CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512");
631  _stripe_size_rows = size_rows;
632  }
633 
644  {
645  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
646  _row_index_stride = stride;
647  }
648 
/// @brief Sets table to be written to output.
/// @param tbl Table for the output
654  void set_table(table_view tbl) { _table = tbl; }
655 
/// @brief Sets associated metadata.
/// @param meta Associated metadata; moved into the options object
661  void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); }
662 
/// @brief Sets Key-Value footer metadata.
/// @param metadata Key-Value footer metadata; moved into the options object
668  void set_key_value_metadata(std::map<std::string, std::string> metadata)
669  {
670  _user_data = std::move(metadata);
671  }
672 
/// @brief Sets the pointer to the output compression statistics.
/// @param comp_stats Shared pointer to the statistics object; moved into the options object
678  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
679  {
680  _compression_stats = std::move(comp_stats);
681  }
682 
/// @brief Sets whether string dictionaries should be sorted.
/// @param val Boolean value to enable/disable dictionary sorting
688  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
689 };
690 
// Options object being assembled; handed out by build() and the conversion operator.
695  orc_writer_options options;
696 
697  public:
704 
/// @brief Constructor from sink and table.
/// @param sink The sink used for writer output
/// @param table Table to be written to output
711  orc_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table}
712  {
713  }
714 
722  {
723  options._compression = comp;
724  return *this;
725  }
726 
739  {
740  options._stats_freq = val;
741  return *this;
742  }
743 
751  {
752  options.set_stripe_size_bytes(val);
753  return *this;
754  }
755 
763  {
764  options.set_stripe_size_rows(val);
765  return *this;
766  }
767 
775  {
776  options.set_row_index_stride(val);
777  return *this;
778  }
779 
787  {
788  options._table = tbl;
789  return *this;
790  }
791 
799  {
800  options._metadata = std::move(meta);
801  return *this;
802  }
803 
/// @brief Sets Key-Value footer metadata.
/// @param metadata Key-Value footer metadata; moved into the options being built
/// @return this for chaining
810  orc_writer_options_builder& key_value_metadata(std::map<std::string, std::string> metadata)
811  {
812  options._user_data = std::move(metadata);
813  return *this;
814  }
815 
823  std::shared_ptr<writer_compression_statistics> const& comp_stats)
824  {
825  options._compression_stats = comp_stats;
826  return *this;
827  }
828 
836  {
837  options._enable_dictionary_sort = val;
838  return *this;
839  }
840 
/// @brief Implicit conversion that yields the built orc_writer_options as an rvalue reference.
844  operator orc_writer_options&&() { return std::move(options); }
845 
/// @brief Moves the orc_writer_options member once it's built.
/// @return Rvalue reference to the builder's `options` member
853  orc_writer_options&& build() { return std::move(options); }
854 };
855 
/// @brief Writes a set of columns to ORC format.
/// @param options Settings to use for write_orc()
868 void write_orc(orc_writer_options const& options);
869 
874 
// Data members backing the write_orc_chunked() settings.
879  // Specify the sink to use for writer output
880  sink_info _sink;
881  // Specify the compression format to use
883  // Specify granularity of statistics collection
884  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
885  // Maximum size of each stripe (unless smaller than a single row group)
886  size_t _stripe_size_bytes = default_stripe_size_bytes;
887  // Maximum number of rows in stripe (unless smaller than a single row group)
888  size_type _stripe_size_rows = default_stripe_size_rows;
889  // Row index stride (maximum number of rows in each row group)
890  size_type _row_index_stride = default_row_index_stride;
891  // Optional associated metadata
892  std::optional<table_input_metadata> _metadata;
893  // Optional footer key_value_metadata
894  std::map<std::string, std::string> _user_data;
895  // Optional compression statistics
896  std::shared_ptr<writer_compression_statistics> _compression_stats;
897  // Specify whether string dictionaries should be alphabetically sorted
898  bool _enable_dictionary_sort = true;
899 
901 
/// @brief Constructor from sink.
/// @param sink The sink used for writer output; copied into `_sink`
/// Marked `explicit`, like the other single-argument options constructors in this file,
/// so a sink_info is never implicitly converted into an options object.
907  explicit chunked_orc_writer_options(sink_info const& sink) : _sink(sink) {}
908 
909  public:
/// @brief Default constructor; all members keep their in-class default values.
915  explicit chunked_orc_writer_options() = default;
916 
925 
/// @brief Returns sink info.
/// @return Const reference to the stored sink info
931  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
932 
/// @brief Returns compression type.
938  [[nodiscard]] compression_type get_compression() const { return _compression; }
939 
/// @brief Returns granularity of statistics collection.
945  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
946 
/// @brief Returns maximum stripe size, in bytes.
952  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
953 
/// @brief Returns maximum stripe size, in rows.
959  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
960 
/// @brief Returns the row index stride.
/// @return The stored stride capped at the maximum stripe size in rows, then rounded down
/// to a multiple of 8
966  [[nodiscard]] auto get_row_index_stride() const
967  {
968  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
969  return unaligned_stride - unaligned_stride % 8;
970  }
971 
/// @brief Returns associated metadata.
/// @return Const reference to the optional metadata
977  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
978 
/// @brief Returns Key-Value footer metadata information.
/// @return Const reference to the stored key-value map
984  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
985  {
986  return _user_data;
987  }
988 
/// @brief Returns a shared pointer to the user-provided compression statistics.
/// @return Copy of the stored shared pointer
994  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
995  {
996  return _compression_stats;
997  }
998 
/// @brief Returns whether string dictionaries should be sorted.
/// @return `true` if dictionaries are sorted (the default)
1004  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
1005 
1006  // Setters
1007 
/// @brief Sets compression type.
/// @param comp Compression type to use
1013  void set_compression(compression_type comp) { _compression = comp; }
1014 
/// @brief Choose granularity of statistics collection.
/// @param val Frequency of statistics collection
1025  void enable_statistics(statistics_freq val) { _stats_freq = val; }
1026 
/// @brief Sets the maximum stripe size, in bytes.
/// @param size_bytes Maximum stripe size, in bytes
/// CUDF_EXPECTS throws if `size_bytes` is below 64 << 10 (64KB).
1034  void set_stripe_size_bytes(size_t size_bytes)
1035  {
1036  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
1037  _stripe_size_bytes = size_bytes;
1038  }
1039 
1051  {
1052  CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
1053  _stripe_size_rows = size_rows;
1054  }
1055 
1066  {
1067  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
1068  _row_index_stride = stride;
1069  }
1070 
/// @brief Sets associated metadata.
/// @param meta Associated metadata; moved into the options object
// NOTE(review): orc_writer_options names the equivalent setter `set_metadata`; this one lacks
// the `set_` prefix used by every other setter here. Renaming would break callers.
1076  void metadata(table_input_metadata meta) { _metadata = std::move(meta); }
1077 
/// @brief Sets Key-Value footer metadata.
/// @param metadata Key-Value footer metadata; moved into the options object
1083  void set_key_value_metadata(std::map<std::string, std::string> metadata)
1084  {
1085  _user_data = std::move(metadata);
1086  }
1087 
/// @brief Sets the pointer to the output compression statistics.
/// @param comp_stats Shared pointer to the statistics object; moved into the options object
1093  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
1094  {
1095  _compression_stats = std::move(comp_stats);
1096  }
1097 
/// @brief Sets whether string dictionaries should be sorted.
/// @param val Boolean value to enable/disable dictionary sorting
1103  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
1104 };
1105 
1111 
1112  public:
1119 
/// @brief Constructor from sink.
/// @param sink The sink used for writer output
1125  explicit chunked_orc_writer_options_builder(sink_info const& sink) : options{sink} {}
1126 
1134  {
1135  options._compression = comp;
1136  return *this;
1137  }
1138 
1151  {
1152  options._stats_freq = val;
1153  return *this;
1154  }
1155 
1163  {
1164  options.set_stripe_size_bytes(val);
1165  return *this;
1166  }
1167 
1175  {
1176  options.set_stripe_size_rows(val);
1177  return *this;
1178  }
1179 
1187  {
1188  options.set_row_index_stride(val);
1189  return *this;
1190  }
1191 
1199  {
1200  options._metadata = std::move(meta);
1201  return *this;
1202  }
1203 
1211  std::map<std::string, std::string> metadata)
1212  {
1213  options._user_data = std::move(metadata);
1214  return *this;
1215  }
1216 
1224  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1225  {
1226  options._compression_stats = comp_stats;
1227  return *this;
1228  }
1229 
1237  {
1238  options._enable_dictionary_sort = val;
1239  return *this;
1240  }
1241 
/// @brief Implicit conversion that yields the built chunked_orc_writer_options as an rvalue reference.
1245  operator chunked_orc_writer_options&&() { return std::move(options); }
1246 
/// @brief Moves the chunked_orc_writer_options member once it's built.
/// @return Rvalue reference to the builder's `options` member
1254  chunked_orc_writer_options&& build() { return std::move(options); }
1255 };
1256 
1279  public:
/// @brief Default constructor; this should never be used. It exists only to satisfy Cython.
1284  orc_chunked_writer() = default;
1285 
1292 
1300 
/// @brief Finishes the chunked/streamed write process.
1304  void close();
1305 
/// Unique pointer to impl writer class.
1307  std::unique_ptr<cudf::io::detail::orc::writer> writer;
1308 };
1309  // end of group
1311 } // namespace io
1312 } // namespace cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:227
Builds settings to use for write_orc_chunked().
Definition: orc.hpp:1109
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1236
chunked_orc_writer_options && build()
Moves the chunked_orc_writer_options member once it's built.
Definition: orc.hpp:1254
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1162
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:1174
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1223
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1210
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1133
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1198
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
Definition: orc.hpp:1125
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1150
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:1186
Settings to use for write_orc_chunked().
Definition: orc.hpp:878
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1034
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1076
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1083
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1093
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:931
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:959
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:966
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:1065
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
Definition: orc.hpp:945
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1013
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:984
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1103
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:977
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:1004
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:938
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:1050
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:994
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:952
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1025
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked orc writer class writes an ORC file in a chunked/stream form.
Definition: orc.hpp:1278
std::unique_ptr< cudf::io::detail::orc::writer > writer
Unique pointer to impl writer class.
Definition: orc.hpp:1307
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
orc_chunked_writer()=default
Default constructor; this should never be used. It exists only to satisfy Cython.
orc_chunked_writer(chunked_orc_writer_options const &options)
Constructor with chunked writer options.
Builds settings to use for read_orc().
Definition: orc.hpp:256
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:328
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
Definition: orc.hpp:364
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:340
orc_reader_options_builder & skip_rows(uint64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:304
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
Definition: orc.hpp:272
orc_reader_options_builder & num_rows(size_type nrows)
Sets number of rows to read.
Definition: orc.hpp:316
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
Definition: orc.hpp:292
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the columns to read.
Definition: orc.hpp:280
orc_reader_options && build()
Moves the orc_reader_options member once it's built.
Definition: orc.hpp:382
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:352
Settings to use for read_orc().
Definition: orc.hpp:52
std::optional< size_type > const & get_num_rows() const
Returns number of rows to read.
Definition: orc.hpp:135
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:233
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
Definition: orc.hpp:120
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
Definition: orc.hpp:247
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:226
void set_columns(std::vector< std::string > col_names)
Sets names of the columns to read.
Definition: orc.hpp:172
uint64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: orc.hpp:127
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
Definition: orc.hpp:184
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
Definition: orc.hpp:156
void set_num_rows(size_type nrows)
Sets number of rows to read.
Definition: orc.hpp:214
auto const & get_columns() const
Returns names of the columns to read, if set.
Definition: orc.hpp:113
void set_skip_rows(uint64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:200
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
source_info const & get_source() const
Returns source info.
Definition: orc.hpp:106
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
Definition: orc.hpp:149
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
Definition: orc.hpp:142
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
Definition: orc.hpp:163
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:240
Builds settings to use for write_orc().
Definition: orc.hpp:694
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:786
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:774
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
Definition: orc.hpp:738
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:798
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: orc.hpp:711
orc_writer_options && build()
Moves the orc_writer_options member once it's built.
Definition: orc.hpp:853
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:810
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:822
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:762
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:835
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:721
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:750
Settings to use for write_orc().
Definition: orc.hpp:432
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:603
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:555
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:562
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:572
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
Definition: orc.hpp:506
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:523
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:628
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:668
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:530
table_view get_table() const
Returns table to be written to output.
Definition: orc.hpp:548
void set_metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:661
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
Definition: orc.hpp:516
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:678
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:537
void set_table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:654
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:591
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:688
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:643
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:499
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:582
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:612
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:492
Metadata for a table.
Definition: io/types.hpp:803
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:187
A set of cudf::column's of the same size.
Definition: table.hpp:40
table_with_metadata read_orc(orc_reader_options const &options, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads an ORC dataset into a set of columns.
constexpr size_type default_stripe_size_rows
1M rows default orc stripe rows
Definition: orc.hpp:41
constexpr size_type default_row_index_stride
10K rows default orc row index stride
Definition: orc.hpp:42
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
Definition: orc.hpp:40
void write_orc(orc_writer_options const &options)
Writes a set of columns to ORC format.
device_memory_resource * get_current_device_resource()
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:170
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:80
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
compression_type
Compression algorithms.
Definition: io/types.hpp:50
@ AUTO
Automatically detect or select compression format.
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:89
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:91
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:90
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:92
cuDF interfaces
Definition: aggregation.hpp:34
Destination information for write interfaces.
Definition: io/types.hpp:463
Source information for read interfaces.
Definition: io/types.hpp:288
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:243
Class definitions for (mutable)_table_view
Type declarations for libcudf.