libcudf  24.04.00
parquet.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/ast/expressions.hpp>
20 #include <cudf/io/detail/parquet.hpp>
21 #include <cudf/io/types.hpp>
23 #include <cudf/types.hpp>
24 
25 #include <rmm/mr/device/per_device_resource.hpp>
26 
27 #include <iostream>
28 #include <memory>
29 #include <optional>
30 #include <string>
31 #include <vector>
32 
33 namespace cudf::io {
40 constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024;
42 constexpr size_t default_max_page_size_bytes = 512 * 1024;
44 constexpr int32_t default_column_index_truncate_length = 64;
45 constexpr size_t default_max_dictionary_size = 1024 * 1024;
47 
49 
54  source_info _source;
55 
56  // Path in schema of column to read; `nullopt` is all
57  std::optional<std::vector<std::string>> _columns;
58 
59  // List of individual row groups to read (ignored if empty)
60  std::vector<std::vector<size_type>> _row_groups;
61  // Number of rows to skip from the start; Parquet stores the number of rows as int64_t
62  int64_t _skip_rows = 0;
63  // Number of rows to read; `nullopt` is all
64  std::optional<size_type> _num_rows;
65 
66  // Predicate filter as AST to filter output rows.
67  std::optional<std::reference_wrapper<ast::expression const>> _filter;
68 
69  // Whether to store string data as categorical type
70  bool _convert_strings_to_categories = false;
71  // Whether to use PANDAS metadata to load columns
72  bool _use_pandas_metadata = true;
73  // Cast timestamp columns to a specific type
74  data_type _timestamp_type{type_id::EMPTY};
75 
76  std::optional<std::vector<reader_column_schema>> _reader_column_schema;
77 
83  explicit parquet_reader_options(source_info src) : _source{std::move(src)} {}
84 
86 
87  public:
93  explicit parquet_reader_options() = default;
94 
102 
// Returns a const reference to the stored source info (`_source`).
108  [[nodiscard]] source_info const& get_source() const { return _source; }
109 
// Returns true if strings should be converted to categories, false otherwise.
// Reads the `_convert_strings_to_categories` flag, which is initialized to false.
116  [[nodiscard]] bool is_enabled_convert_strings_to_categories() const
117  {
118  return _convert_strings_to_categories;
119  }
120 
126  [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; }
127 
// Returns the optional reader column schema (`_reader_column_schema`).
// The optional is returned by value, so when a schema has been set each call
// copies the stored vector.
133  [[nodiscard]] std::optional<std::vector<reader_column_schema>> get_column_schema() const
134  {
135  return _reader_column_schema;
136  }
137 
143  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
144 
// Returns the number of rows to read as a const reference to `_num_rows`.
// An empty optional (`nullopt`) means "read all rows".
151  [[nodiscard]] std::optional<size_type> const& get_num_rows() const { return _num_rows; }
152 
158  [[nodiscard]] auto const& get_columns() const { return _columns; }
159 
165  [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
166 
172  [[nodiscard]] auto const& get_filter() const { return _filter; }
173 
// Returns the timestamp type used to cast timestamp columns (`_timestamp_type`).
// Marked [[nodiscard]] like the other getters of this class: calling a pure
// accessor and discarding its result is always a mistake.
179  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
180 
186  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
187 
193  void set_row_groups(std::vector<std::vector<size_type>> row_groups);
194 
// Sets the AST based filter used for predicate pushdown.
// Only a reference is stored (`_filter` is an optional `std::reference_wrapper`),
// not a copy of the expression, so the caller must keep `filter` alive for as
// long as it may be read through these options.
200  void set_filter(ast::expression const& filter) { _filter = filter; }
201 
207  void enable_convert_strings_to_categories(bool val) { _convert_strings_to_categories = val; }
208 
214  void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; }
215 
// Sets the reader column schema.
// `val` is taken by value and moved into `_reader_column_schema`, so callers can
// pass an rvalue to avoid a copy.
222  void set_column_schema(std::vector<reader_column_schema> val)
223  {
224  _reader_column_schema = std::move(val);
225  }
226 
232  void set_skip_rows(int64_t val);
233 
240 
// Sets the timestamp type used to cast timestamp columns.
246  void set_timestamp_type(data_type type) { _timestamp_type = type; }
247 };
248 
253  parquet_reader_options options;
254 
255  public:
262 
// Constructor from source info: initializes the wrapped `options` from `src`,
// which is taken by value and moved in.
268  explicit parquet_reader_options_builder(source_info src) : options{std::move(src)} {}
269 
// Sets the names of the columns to be read.
// Writes `options._columns` directly and returns `*this` so calls can be chained.
276  parquet_reader_options_builder& columns(std::vector<std::string> col_names)
277  {
278  options._columns = std::move(col_names);
279  return *this;
280  }
281 
// Sets the vector of individual row groups to read.
// Unlike `columns()`, this forwards to `parquet_reader_options::set_row_groups()`
// rather than assigning the member directly. Returns `*this` for chaining.
288  parquet_reader_options_builder& row_groups(std::vector<std::vector<size_type>> row_groups)
289  {
290  options.set_row_groups(std::move(row_groups));
291  return *this;
292  }
293 
301  {
302  options.set_filter(filter);
303  return *this;
304  }
305 
313  {
314  options._convert_strings_to_categories = val;
315  return *this;
316  }
317 
325  {
326  options._use_pandas_metadata = val;
327  return *this;
328  }
329 
// Sets the reader column schema on the options being built.
// `val` is moved into `options._reader_column_schema`; returns `*this` for chaining.
336  parquet_reader_options_builder& set_column_schema(std::vector<reader_column_schema> val)
337  {
338  options._reader_column_schema = std::move(val);
339  return *this;
340  }
341 
349  {
350  options.set_skip_rows(val);
351  return *this;
352  }
353 
361  {
362  options.set_num_rows(val);
363  return *this;
364  }
365 
373  {
374  options._timestamp_type = type;
375  return *this;
376  }
377 
// Implicit conversion to `parquet_reader_options&&`: hands out the builder's
// `options` member as an rvalue so it can be moved from. The returned reference
// refers to a member of this builder.
381  operator parquet_reader_options&&() { return std::move(options); }
382 
// Returns the built `parquet_reader_options` as an rvalue reference to the
// builder's `options` member, so the caller can move it out once building is done.
390  parquet_reader_options&& build() { return std::move(options); }
391 };
392 
411  parquet_reader_options const& options,
412  rmm::cuda_stream_view stream = cudf::get_default_stream(),
413  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
414 
425  public:
432 
447  std::size_t chunk_read_limit,
448  parquet_reader_options const& options,
449  rmm::cuda_stream_view stream = cudf::get_default_stream(),
450  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
451 
472  std::size_t chunk_read_limit,
473  std::size_t pass_read_limit,
474  parquet_reader_options const& options,
475  rmm::cuda_stream_view stream = cudf::get_default_stream(),
476  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
477 
486 
// Checks whether there is any data in the given file that has not yet been read.
// Declaration only; the definition is not part of this listing.
492  [[nodiscard]] bool has_next() const;
493 
// Reads a chunk of rows in the given Parquet file and returns it as a
// `table_with_metadata`. Declaration only; the definition is not part of this listing.
505  [[nodiscard]] table_with_metadata read_chunk() const;
506 
507  private:
508  std::unique_ptr<cudf::io::parquet::detail::chunked_reader> reader;
509 };
510  // end of group
519 
524  // Specify the sink to use for writer output
525  sink_info _sink;
526  // Specify the compression format to use
528  // Specify the level of statistics in the output file
530  // Sets of columns to output
531  table_view _table;
532  // Partitions described as {start_row, num_rows} pairs
533  std::vector<partition_info> _partitions;
534  // Optional associated metadata
535  std::optional<table_input_metadata> _metadata;
536  // Optional footer key_value_metadata
537  std::vector<std::map<std::string, std::string>> _user_data;
538  // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
539  // If true then overrides any per-column setting in _metadata.
540  bool _write_timestamps_as_int96 = false;
541  // Parquet writer can write timestamps as UTC
542  // Defaults to true because libcudf timestamps are implicitly UTC
543  bool _write_timestamps_as_UTC = true;
544  // Column chunks file paths to be set in the raw output metadata. One per output file
545  std::vector<std::string> _column_chunks_file_paths;
546  // Maximum size of each row group (unless smaller than a single page)
547  size_t _row_group_size_bytes = default_row_group_size_bytes;
548  // Maximum number of rows in row group (unless smaller than a single page)
549  size_type _row_group_size_rows = default_row_group_size_rows;
550  // Maximum size of each page (uncompressed)
551  size_t _max_page_size_bytes = default_max_page_size_bytes;
552  // Maximum number of rows in a page
553  size_type _max_page_size_rows = default_max_page_size_rows;
554  // Maximum size of min or max values in column index
555  int32_t _column_index_truncate_length = default_column_index_truncate_length;
556  // When to use dictionary encoding for data
557  dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS;
558  // Maximum size of column chunk dictionary (in bytes)
559  size_t _max_dictionary_size = default_max_dictionary_size;
560  // Maximum number of rows in a page fragment
561  std::optional<size_type> _max_page_fragment_size;
562  // Optional compression statistics
563  std::shared_ptr<writer_compression_statistics> _compression_stats;
564  // write V2 page headers?
565  bool _v2_page_headers = false;
566 
573  explicit parquet_writer_options(sink_info const& sink, table_view const& table)
574  : _sink(sink), _table(table)
575  {
576  }
577 
579 
580  public:
587 
597 
604 
610  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
611 
617  [[nodiscard]] compression_type get_compression() const { return _compression; }
618 
624  [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; }
625 
631  [[nodiscard]] table_view get_table() const { return _table; }
632 
638  [[nodiscard]] std::vector<partition_info> const& get_partitions() const { return _partitions; }
639 
645  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
646 
// Returns the optional footer key_value_metadata (`_user_data`) by const reference.
// Marked [[nodiscard]] for consistency with the neighbouring getters.
652  [[nodiscard]] std::vector<std::map<std::string, std::string>> const& get_key_value_metadata() const
653  {
654  return _user_data;
655  }
656 
// Returns true if timestamps will be written as INT96 (`_write_timestamps_as_int96`).
// Marked [[nodiscard]] to match `is_enabled_utc_timestamps()` directly below.
662  [[nodiscard]] bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
663 
669  [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }
670 
// Returns the column chunks file paths to be set in the raw output metadata
// (`_column_chunks_file_paths`, one per output file) by const reference.
// Marked [[nodiscard]] for consistency with the neighbouring getters.
676  [[nodiscard]] std::vector<std::string> const& get_column_chunks_file_paths() const
677  {
678  return _column_chunks_file_paths;
679  }
680 
// Returns the maximum row group size, in bytes (`_row_group_size_bytes`).
// Marked [[nodiscard]] for consistency with the other `auto` getters of this class.
686  [[nodiscard]] auto get_row_group_size_bytes() const { return _row_group_size_bytes; }
687 
// Returns the maximum row group size, in rows (`_row_group_size_rows`).
// Marked [[nodiscard]] for consistency with the other `auto` getters of this class.
693  [[nodiscard]] auto get_row_group_size_rows() const { return _row_group_size_rows; }
694 
703  {
704  return std::min(_max_page_size_bytes, get_row_group_size_bytes());
705  }
706 
715  {
716  return std::min(_max_page_size_rows, get_row_group_size_rows());
717  }
718 
// Returns the maximum size of min or max values in the column index
// (`_column_index_truncate_length`).
// Marked [[nodiscard]] for consistency with the other `auto` getters of this class.
724  [[nodiscard]] auto get_column_index_truncate_length() const { return _column_index_truncate_length; }
725 
731  [[nodiscard]] dictionary_policy get_dictionary_policy() const { return _dictionary_policy; }
732 
738  [[nodiscard]] auto get_max_dictionary_size() const { return _max_dictionary_size; }
739 
745  [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; }
746 
752  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
753  {
754  return _compression_stats;
755  }
756 
762  [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; }
763 
770  void set_partitions(std::vector<partition_info> partitions);
771 
777  void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); }
778 
784  void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
785 
791  void set_stats_level(statistics_freq sf) { _stats_level = sf; }
792 
798  void set_compression(compression_type compression) { _compression = compression; }
799 
806  void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }
807 
813  void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }
814 
821  void set_column_chunks_file_paths(std::vector<std::string> file_paths);
822 
828  void set_row_group_size_bytes(size_t size_bytes);
829 
836 
842  void set_max_page_size_bytes(size_t size_bytes);
843 
850 
856  void set_column_index_truncate_length(int32_t size_bytes);
857 
864 
870  void set_max_dictionary_size(size_t size_bytes);
871 
878 
884  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
885  {
886  _compression_stats = std::move(comp_stats);
887  }
888 
894  void enable_write_v2_headers(bool val) { _v2_page_headers = val; }
895 };
896 
901  parquet_writer_options options;
902 
903  public:
909  explicit parquet_writer_options_builder() = default;
910 
918  : options(sink, table)
919  {
920  }
921 
929  parquet_writer_options_builder& partitions(std::vector<partition_info> partitions);
930 
938  {
939  options._metadata = std::move(metadata);
940  return *this;
941  }
942 
950  std::vector<std::map<std::string, std::string>> metadata);
951 
959  {
960  options._stats_level = sf;
961  return *this;
962  }
963 
971  {
972  options._compression = compression;
973  return *this;
974  }
975 
983  parquet_writer_options_builder& column_chunks_file_paths(std::vector<std::string> file_paths);
984 
992  {
993  options.set_row_group_size_bytes(val);
994  return *this;
995  }
996 
1004  {
1005  options.set_row_group_size_rows(val);
1006  return *this;
1007  }
1008 
1020  {
1021  options.set_max_page_size_bytes(val);
1022  return *this;
1023  }
1024 
1033  {
1034  options.set_max_page_size_rows(val);
1035  return *this;
1036  }
1037 
1052  {
1053  options.set_column_index_truncate_length(val);
1054  return *this;
1055  }
1056 
1075 
1088 
1100 
1108  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1109  {
1110  options._compression_stats = comp_stats;
1111  return *this;
1112  }
1113 
1121  {
1122  options._write_timestamps_as_int96 = enabled;
1123  return *this;
1124  }
1125 
1133  {
1134  options._write_timestamps_as_UTC = enabled;
1135  return *this;
1136  }
1137 
1145 
1149  operator parquet_writer_options&&() { return std::move(options); }
1150 
1158  parquet_writer_options&& build() { return std::move(options); }
1159 };
1160 
// Writes using the settings in `options`, on `stream` (defaults to
// `cudf::get_default_stream()`). Declaration only; the definition is not part of
// this listing.
// NOTE(review): the returned byte vector is presumably the serialized file
// metadata that `merge_row_group_metadata()` consumes - confirm against the
// implementation, including when (if ever) the pointer is null.
1177 std::unique_ptr<std::vector<uint8_t>> write_parquet(
1178  parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream());
1179 
// Takes a list of owned byte blobs (`metadata_list`) and returns a single owned
// byte blob. Declaration only; the definition is not part of this listing.
// NOTE(review): going by the name, the inputs are per-file row group metadata
// blobs and the result is their merged form - confirm against the implementation.
1189 std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(
1190  std::vector<std::unique_ptr<std::vector<uint8_t>>> const& metadata_list);
1191 
1193 
1198  // Specify the sink to use for writer output
1199  sink_info _sink;
1200  // Specify the compression format to use
1202  // Specify the level of statistics in the output file
1204  // Optional associated metadata.
1205  std::optional<table_input_metadata> _metadata;
1206  // Optional footer key_value_metadata
1207  std::vector<std::map<std::string, std::string>> _user_data;
1208  // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
1209  // If true then overrides any per-column setting in _metadata.
1210  bool _write_timestamps_as_int96 = false;
1211  // Parquet writer can write timestamps as UTC. Defaults to true.
1212  bool _write_timestamps_as_UTC = true;
1213  // Maximum size of each row group (unless smaller than a single page)
1214  size_t _row_group_size_bytes = default_row_group_size_bytes;
1215  // Maximum number of rows in row group (unless smaller than a single page)
1216  size_type _row_group_size_rows = default_row_group_size_rows;
1217  // Maximum size of each page (uncompressed)
1218  size_t _max_page_size_bytes = default_max_page_size_bytes;
1219  // Maximum number of rows in a page
1220  size_type _max_page_size_rows = default_max_page_size_rows;
1221  // Maximum size of min or max values in column index
1222  int32_t _column_index_truncate_length = default_column_index_truncate_length;
1223  // When to use dictionary encoding for data
1224  dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS;
1225  // Maximum size of column chunk dictionary (in bytes)
1226  size_t _max_dictionary_size = default_max_dictionary_size;
1227  // Maximum number of rows in a page fragment
1228  std::optional<size_type> _max_page_fragment_size;
1229  // Optional compression statistics
1230  std::shared_ptr<writer_compression_statistics> _compression_stats;
1231  // write V2 page headers?
1232  bool _v2_page_headers = false;
1233 
1239  explicit chunked_parquet_writer_options(sink_info const& sink) : _sink(sink) {}
1240 
1242 
1243  public:
1250 
1256  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1257 
1263  [[nodiscard]] compression_type get_compression() const { return _compression; }
1264 
1270  [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; }
1271 
1277  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
1278 
// Returns Key-Value footer metadata information (`_user_data`) by const reference.
// Marked [[nodiscard]] for consistency with the neighbouring getters.
1284  [[nodiscard]] std::vector<std::map<std::string, std::string>> const& get_key_value_metadata() const
1285  {
1286  return _user_data;
1287  }
1288 
// Returns true if timestamps will be written as INT96 (`_write_timestamps_as_int96`).
// Marked [[nodiscard]] to match `is_enabled_utc_timestamps()` directly below.
1294  [[nodiscard]] bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
1295 
1301  [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }
1302 
// Returns maximum row group size, in bytes (`_row_group_size_bytes`).
// Marked [[nodiscard]] for consistency with the other `auto` getters of this class.
1308  [[nodiscard]] auto get_row_group_size_bytes() const { return _row_group_size_bytes; }
1309 
// Returns maximum row group size, in rows (`_row_group_size_rows`).
// Marked [[nodiscard]] for consistency with the other `auto` getters of this class.
1315  [[nodiscard]] auto get_row_group_size_rows() const { return _row_group_size_rows; }
1316 
1326  {
1327  return std::min(_max_page_size_bytes, get_row_group_size_bytes());
1328  }
1329 
1338  {
1339  return std::min(_max_page_size_rows, get_row_group_size_rows());
1340  }
1341 
// Returns maximum length of min or max values in column index, in bytes
// (`_column_index_truncate_length`).
// Marked [[nodiscard]] for consistency with the other `auto` getters of this class.
1347  [[nodiscard]] auto get_column_index_truncate_length() const { return _column_index_truncate_length; }
1348 
1354  [[nodiscard]] dictionary_policy get_dictionary_policy() const { return _dictionary_policy; }
1355 
1361  [[nodiscard]] auto get_max_dictionary_size() const { return _max_dictionary_size; }
1362 
1368  [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; }
1369 
1375  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
1376  {
1377  return _compression_stats;
1378  }
1379 
1385  [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; }
1386 
1392  void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); }
1393 
1399  void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
1400 
1406  void set_stats_level(statistics_freq sf) { _stats_level = sf; }
1407 
1413  void set_compression(compression_type compression) { _compression = compression; }
1414 
1422  void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }
1423 
1429  void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }
1430 
1436  void set_row_group_size_bytes(size_t size_bytes);
1437 
1444 
1450  void set_max_page_size_bytes(size_t size_bytes);
1451 
1458 
1464  void set_column_index_truncate_length(int32_t size_bytes);
1465 
1472 
1478  void set_max_dictionary_size(size_t size_bytes);
1479 
1486 
1492  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
1493  {
1494  _compression_stats = std::move(comp_stats);
1495  }
1496 
1502  void enable_write_v2_headers(bool val) { _v2_page_headers = val; }
1503 
1512 };
1513 
1519 
1520  public:
1527 
// Constructor from sink: initializes the wrapped `options` from `sink`.
// The stray `;` that followed the constructor body was removed; it was an empty
// declaration that only triggers extra-semicolon warnings.
// NOTE(review): unlike the other single-argument constructors in this file this
// one is not `explicit`; left unchanged because adding it would reject callers
// that rely on the implicit conversion from `sink_info`.
1533  chunked_parquet_writer_options_builder(sink_info const& sink) : options(sink) {}
1534 
1542  {
1543  options._metadata = std::move(metadata);
1544  return *this;
1545  }
1546 
1554  std::vector<std::map<std::string, std::string>> metadata);
1555 
1563  {
1564  options._stats_level = sf;
1565  return *this;
1566  }
1567 
1575  {
1576  options._compression = compression;
1577  return *this;
1578  }
1579 
1590  {
1591  options._write_timestamps_as_int96 = enabled;
1592  return *this;
1593  }
1594 
1602  {
1603  options._write_timestamps_as_UTC = enabled;
1604  return *this;
1605  }
1606 
1614 
1622  {
1623  options.set_row_group_size_bytes(val);
1624  return *this;
1625  }
1626 
1634  {
1635  options.set_row_group_size_rows(val);
1636  return *this;
1637  }
1638 
1649  {
1650  options.set_max_page_size_bytes(val);
1651  return *this;
1652  }
1653 
1662  {
1663  options.set_max_page_size_rows(val);
1664  return *this;
1665  }
1666 
1681  {
1682  options.set_column_index_truncate_length(val);
1683  return *this;
1684  }
1685 
1704 
1717 
1729 
1737  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1738  {
1739  options._compression_stats = comp_stats;
1740  return *this;
1741  }
1742 
1746  operator chunked_parquet_writer_options&&() { return std::move(options); }
1747 
1755  chunked_parquet_writer_options&& build() { return std::move(options); }
1756 };
1757 
1778  public:
1784 
1792  rmm::cuda_stream_view stream = cudf::get_default_stream());
1793 
1806  std::vector<partition_info> const& partitions = {});
1807 
1816  std::unique_ptr<std::vector<uint8_t>> close(
1817  std::vector<std::string> const& column_chunks_file_paths = {});
1818 
1820  std::unique_ptr<parquet::detail::writer> writer;
1821 };
1822  // end of group
1824 
1825 } // namespace cudf::io
Indicator for the logical data type of an element in a column.
Definition: types.hpp:241
The chunked parquet reader class to read a Parquet file iteratively into a series of tables,...
Definition: parquet.hpp:424
table_with_metadata read_chunk() const
Read a chunk of rows in the given Parquet file.
bool has_next() const
Check if there is any data in the given file that has not yet been read.
chunked_parquet_reader()=default
Default constructor, this should never be used.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Constructor for chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Constructor for chunked reader.
~chunked_parquet_reader()
Destructor, destroying the internal reader instance.
Builds options for chunked_parquet_writer_options.
Definition: parquet.hpp:1517
chunked_parquet_writer_options_builder & max_dictionary_size(size_t val)
Sets the maximum dictionary size, in bytes.
chunked_parquet_writer_options_builder & stats_level(statistics_freq sf)
Sets the level of statistics in chunked_parquet_writer_options.
Definition: parquet.hpp:1562
chunked_parquet_writer_options_builder & max_page_fragment_size(size_type val)
Sets the maximum page fragment size, in rows.
chunked_parquet_writer_options && build()
Move chunked_parquet_writer_options member once it's built.
Definition: parquet.hpp:1755
chunked_parquet_writer_options_builder & max_page_size_rows(size_type val)
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting....
Definition: parquet.hpp:1661
chunked_parquet_writer_options_builder & int96_timestamps(bool enabled)
Set to true if timestamps should be written as int96 types instead of int64 types....
Definition: parquet.hpp:1589
chunked_parquet_writer_options_builder()=default
Default constructor.
chunked_parquet_writer_options_builder & row_group_size_bytes(size_t val)
Sets the maximum row group size, in bytes.
Definition: parquet.hpp:1621
chunked_parquet_writer_options_builder & row_group_size_rows(size_type val)
Sets the maximum number of rows in output row groups.
Definition: parquet.hpp:1633
chunked_parquet_writer_options_builder & dictionary_policy(enum dictionary_policy val)
Sets the policy for dictionary use.
chunked_parquet_writer_options_builder & utc_timestamps(bool enabled)
Set to true if timestamps are to be written as UTC.
Definition: parquet.hpp:1601
chunked_parquet_writer_options_builder & key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata in chunked_parquet_writer_options.
chunked_parquet_writer_options_builder & metadata(table_input_metadata metadata)
Sets metadata to chunked_parquet_writer_options.
Definition: parquet.hpp:1541
chunked_parquet_writer_options_builder & compression(compression_type compression)
Sets compression type to chunked_parquet_writer_options.
Definition: parquet.hpp:1574
chunked_parquet_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: parquet.hpp:1736
chunked_parquet_writer_options_builder & column_index_truncate_length(int32_t val)
Sets the desired maximum size in bytes for min and max values in the column index.
Definition: parquet.hpp:1680
chunked_parquet_writer_options_builder & write_v2_headers(bool enabled)
Set to true if V2 page headers are to be written.
chunked_parquet_writer_options_builder & max_page_size_bytes(size_t val)
Sets the maximum uncompressed page size, in bytes.
Definition: parquet.hpp:1648
chunked_parquet_writer_options_builder(sink_info const &sink)
Constructor from sink.
Definition: parquet.hpp:1533
Settings for write_parquet_chunked().
Definition: parquet.hpp:1197
std::vector< std::map< std::string, std::string > > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: parquet.hpp:1284
auto const & get_metadata() const
Returns metadata information.
Definition: parquet.hpp:1277
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: parquet.hpp:1492
auto get_column_index_truncate_length() const
Returns maximum length of min or max values in column index, in bytes.
Definition: parquet.hpp:1347
auto get_max_page_fragment_size() const
Returns maximum page fragment size, in rows.
Definition: parquet.hpp:1368
void set_metadata(table_input_metadata metadata)
Sets metadata.
Definition: parquet.hpp:1392
void enable_int96_timestamps(bool req)
Sets timestamp writing preferences.
Definition: parquet.hpp:1422
static chunked_parquet_writer_options_builder builder(sink_info const &sink)
creates builder to build chunked_parquet_writer_options.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: parquet.hpp:1375
void set_key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata.
auto get_row_group_size_rows() const
Returns maximum row group size, in rows.
Definition: parquet.hpp:1315
void set_stats_level(statistics_freq sf)
Sets the level of statistics in chunked_parquet_writer_options.
Definition: parquet.hpp:1406
void set_max_page_size_rows(size_type size_rows)
Sets the maximum page size, in rows.
auto get_row_group_size_bytes() const
Returns maximum row group size, in bytes.
Definition: parquet.hpp:1308
void set_row_group_size_rows(size_type size_rows)
Sets the maximum row group size, in rows.
void enable_utc_timestamps(bool val)
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
Definition: parquet.hpp:1429
auto get_max_dictionary_size() const
Returns maximum dictionary size, in bytes.
Definition: parquet.hpp:1361
void set_max_page_fragment_size(size_type size_rows)
Sets the maximum page fragment size, in rows.
auto get_max_page_size_rows() const
Returns maximum page size, in rows.
Definition: parquet.hpp:1337
dictionary_policy get_dictionary_policy() const
Returns policy for dictionary use.
Definition: parquet.hpp:1354
void set_max_page_size_bytes(size_t size_bytes)
Sets the maximum uncompressed page size, in bytes.
statistics_freq get_stats_level() const
Returns level of statistics requested in output file.
Definition: parquet.hpp:1270
auto get_max_page_size_bytes() const
Returns maximum uncompressed page size, in bytes.
Definition: parquet.hpp:1325
auto is_enabled_write_v2_headers() const
Returns true if V2 page headers should be written.
Definition: parquet.hpp:1385
bool is_enabled_int96_timestamps() const
Returns true if timestamps will be written as INT96.
Definition: parquet.hpp:1294
chunked_parquet_writer_options()=default
Default constructor.
compression_type get_compression() const
Returns compression format used.
Definition: parquet.hpp:1263
void set_column_index_truncate_length(int32_t size_bytes)
Sets the maximum length of min or max values in column index, in bytes.
void set_row_group_size_bytes(size_t size_bytes)
Sets the maximum row group size, in bytes.
void set_dictionary_policy(dictionary_policy policy)
Sets the policy for dictionary use.
void enable_write_v2_headers(bool val)
Sets preference for V2 page headers. Write V2 page headers if set to true.
Definition: parquet.hpp:1502
void set_compression(compression_type compression)
Sets compression type.
Definition: parquet.hpp:1413
void set_max_dictionary_size(size_t size_bytes)
Sets the maximum dictionary size, in bytes.
sink_info const & get_sink() const
Returns sink info.
Definition: parquet.hpp:1256
auto is_enabled_utc_timestamps() const
Returns true if timestamps will be written as UTC.
Definition: parquet.hpp:1301
chunked parquet writer class to handle options and write tables in chunks.
Definition: parquet.hpp:1777
parquet_chunked_writer()=default
Default constructor, this should never be used. This is added just to satisfy cython.
std::unique_ptr< std::vector< uint8_t > > close(std::vector< std::string > const &column_chunks_file_paths={})
Finishes the chunked/streamed write process.
std::unique_ptr< parquet::detail::writer > writer
Unique pointer to impl writer class.
Definition: parquet.hpp:1820
parquet_chunked_writer & write(table_view const &table, std::vector< partition_info > const &partitions={})
Writes table to output.
parquet_chunked_writer(chunked_parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
Builds parquet_reader_options to use for read_parquet().
Definition: parquet.hpp:252
parquet_reader_options_builder(source_info src)
Constructor from source info.
Definition: parquet.hpp:268
parquet_reader_options_builder & skip_rows(int64_t val)
Sets number of rows to skip.
Definition: parquet.hpp:348
parquet_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: parquet.hpp:276
parquet_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type used to cast timestamp columns.
Definition: parquet.hpp:372
parquet_reader_options_builder & use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
Definition: parquet.hpp:324
parquet_reader_options_builder()=default
Default constructor.
parquet_reader_options_builder & row_groups(std::vector< std::vector< size_type >> row_groups)
Sets vector of individual row groups to read.
Definition: parquet.hpp:288
parquet_reader_options_builder & set_column_schema(std::vector< reader_column_schema > val)
Sets reader column schema.
Definition: parquet.hpp:336
parquet_reader_options && build()
Move parquet_reader_options member once it's built.
Definition: parquet.hpp:390
parquet_reader_options_builder & filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
Definition: parquet.hpp:300
parquet_reader_options_builder & num_rows(size_type val)
Sets number of rows to read.
Definition: parquet.hpp:360
parquet_reader_options_builder & convert_strings_to_categories(bool val)
Sets enable/disable conversion of strings to categories.
Definition: parquet.hpp:312
Settings for read_parquet().
Definition: parquet.hpp:53
data_type get_timestamp_type() const
Returns timestamp type used to cast timestamp columns.
Definition: parquet.hpp:179
parquet_reader_options()=default
Default constructor.
static parquet_reader_options_builder builder(source_info src)
Creates a parquet_reader_options_builder which will build parquet_reader_options.
void set_skip_rows(int64_t val)
Sets number of rows to skip.
void set_columns(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: parquet.hpp:186
void enable_convert_strings_to_categories(bool val)
Sets to enable/disable conversion of strings to categories.
Definition: parquet.hpp:207
std::optional< std::vector< reader_column_schema > > get_column_schema() const
Returns optional tree of metadata.
Definition: parquet.hpp:133
source_info const & get_source() const
Returns source info.
Definition: parquet.hpp:108
auto const & get_row_groups() const
Returns list of individual row groups to be read.
Definition: parquet.hpp:165
std::optional< size_type > const & get_num_rows() const
Returns number of rows to read.
Definition: parquet.hpp:151
void set_row_groups(std::vector< std::vector< size_type >> row_groups)
Sets vector of individual row groups to read.
void set_num_rows(size_type val)
Sets number of rows to read.
auto const & get_columns() const
Returns names of columns to be read, if set.
Definition: parquet.hpp:158
void set_timestamp_type(data_type type)
Sets timestamp_type used to cast timestamp columns.
Definition: parquet.hpp:246
bool is_enabled_convert_strings_to_categories() const
Returns true/false depending on whether strings should be converted to categories or not.
Definition: parquet.hpp:116
void enable_use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
Definition: parquet.hpp:214
bool is_enabled_use_pandas_metadata() const
Returns true/false depending on whether to use pandas metadata or not while reading.
Definition: parquet.hpp:126
void set_column_schema(std::vector< reader_column_schema > val)
Sets reader column schema.
Definition: parquet.hpp:222
void set_filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
Definition: parquet.hpp:200
auto const & get_filter() const
Returns AST based filter for predicate pushdown.
Definition: parquet.hpp:172
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: parquet.hpp:143
Class to build parquet_writer_options.
Definition: parquet.hpp:900
parquet_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: parquet.hpp:917
parquet_writer_options_builder & metadata(table_input_metadata metadata)
Sets metadata in parquet_writer_options.
Definition: parquet.hpp:937
parquet_writer_options_builder & dictionary_policy(enum dictionary_policy val)
Sets the policy for dictionary use.
parquet_writer_options_builder & max_page_size_bytes(size_t val)
Sets the maximum uncompressed page size, in bytes.
Definition: parquet.hpp:1019
parquet_writer_options_builder & stats_level(statistics_freq sf)
Sets the level of statistics in parquet_writer_options.
Definition: parquet.hpp:958
parquet_writer_options_builder & row_group_size_rows(size_type val)
Sets the maximum number of rows in output row groups.
Definition: parquet.hpp:1003
parquet_writer_options_builder & key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata in parquet_writer_options.
parquet_writer_options_builder()=default
Default constructor.
parquet_writer_options && build()
Moves the parquet_writer_options member once it's built.
Definition: parquet.hpp:1158
parquet_writer_options_builder & row_group_size_bytes(size_t val)
Sets the maximum row group size, in bytes.
Definition: parquet.hpp:991
parquet_writer_options_builder & max_page_size_rows(size_type val)
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting....
Definition: parquet.hpp:1032
parquet_writer_options_builder & utc_timestamps(bool enabled)
Set to true if timestamps are to be written as UTC.
Definition: parquet.hpp:1132
parquet_writer_options_builder & max_page_fragment_size(size_type val)
Sets the maximum page fragment size, in rows.
parquet_writer_options_builder & compression(compression_type compression)
Sets compression type in parquet_writer_options.
Definition: parquet.hpp:970
parquet_writer_options_builder & write_v2_headers(bool enabled)
Set to true if V2 page headers are to be written.
parquet_writer_options_builder & partitions(std::vector< partition_info > partitions)
Sets partitions in parquet_writer_options.
parquet_writer_options_builder & max_dictionary_size(size_t val)
Sets the maximum dictionary size, in bytes.
parquet_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: parquet.hpp:1107
parquet_writer_options_builder & int96_timestamps(bool enabled)
Sets whether int96 timestamps are written or not in parquet_writer_options.
Definition: parquet.hpp:1120
parquet_writer_options_builder & column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file paths to be set in the raw output metadata.
parquet_writer_options_builder & column_index_truncate_length(int32_t val)
Sets the desired maximum size in bytes for min and max values in the column index.
Definition: parquet.hpp:1051
Settings for write_parquet().
Definition: parquet.hpp:523
void enable_write_v2_headers(bool val)
Sets preference for V2 page headers. Write V2 page headers if set to true.
Definition: parquet.hpp:894
void set_partitions(std::vector< partition_info > partitions)
Sets partitions.
statistics_freq get_stats_level() const
Returns level of statistics requested in output file.
Definition: parquet.hpp:624
static parquet_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create parquet_writer_options.
void set_dictionary_policy(dictionary_policy policy)
Sets the policy for dictionary use.
parquet_writer_options()=default
Default constructor.
auto const & get_metadata() const
Returns associated metadata.
Definition: parquet.hpp:645
std::vector< std::map< std::string, std::string > > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: parquet.hpp:652
void set_max_dictionary_size(size_t size_bytes)
Sets the maximum dictionary size, in bytes.
auto get_row_group_size_bytes() const
Returns maximum row group size, in bytes.
Definition: parquet.hpp:686
void set_max_page_fragment_size(size_type size_rows)
Sets the maximum page fragment size, in rows.
compression_type get_compression() const
Returns compression format used.
Definition: parquet.hpp:617
auto get_max_dictionary_size() const
Returns maximum dictionary size, in bytes.
Definition: parquet.hpp:738
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: parquet.hpp:884
auto get_max_page_size_bytes() const
Returns the maximum uncompressed page size, in bytes.
Definition: parquet.hpp:702
bool is_enabled_int96_timestamps() const
Returns true if timestamps will be written as INT96.
Definition: parquet.hpp:662
sink_info const & get_sink() const
Returns sink info.
Definition: parquet.hpp:610
void set_compression(compression_type compression)
Sets compression type.
Definition: parquet.hpp:798
std::vector< std::string > const & get_column_chunks_file_paths() const
Returns column chunks file paths to be set in the raw output metadata.
Definition: parquet.hpp:676
auto get_column_index_truncate_length() const
Returns maximum length of min or max values in column index, in bytes.
Definition: parquet.hpp:724
void set_max_page_size_rows(size_type size_rows)
Sets the maximum page size, in rows.
auto is_enabled_utc_timestamps() const
Returns true if timestamps will be written as UTC.
Definition: parquet.hpp:669
auto get_max_page_fragment_size() const
Returns maximum page fragment size, in rows.
Definition: parquet.hpp:745
void set_row_group_size_bytes(size_t size_bytes)
Sets the maximum row group size, in bytes.
void enable_utc_timestamps(bool val)
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
Definition: parquet.hpp:813
void set_max_page_size_bytes(size_t size_bytes)
Sets the maximum uncompressed page size, in bytes.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: parquet.hpp:752
void set_stats_level(statistics_freq sf)
Sets the level of statistics.
Definition: parquet.hpp:791
dictionary_policy get_dictionary_policy() const
Returns policy for dictionary use.
Definition: parquet.hpp:731
auto get_row_group_size_rows() const
Returns maximum row group size, in rows.
Definition: parquet.hpp:693
table_view get_table() const
Returns table_view.
Definition: parquet.hpp:631
void set_column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file paths to be set in the raw output metadata.
void enable_int96_timestamps(bool req)
Sets timestamp writing preferences. INT96 timestamps will be written if true and TIMESTAMP_MICROS wil...
Definition: parquet.hpp:806
auto is_enabled_write_v2_headers() const
Returns true if V2 page headers should be written.
Definition: parquet.hpp:762
void set_row_group_size_rows(size_type size_rows)
Sets the maximum row group size, in rows.
void set_key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata.
auto get_max_page_size_rows() const
Returns maximum page size, in rows.
Definition: parquet.hpp:714
void set_metadata(table_input_metadata metadata)
Sets metadata.
Definition: parquet.hpp:777
void set_column_index_truncate_length(int32_t size_bytes)
Sets the maximum length of min or max values in column index, in bytes.
static parquet_writer_options_builder builder()
Create builder to create parquet_writer_options.
std::vector< partition_info > const & get_partitions() const
Returns partitions.
Definition: parquet.hpp:638
Metadata for a table.
Definition: io/types.hpp:858
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:187
A set of cudf::column's of the same size.
Definition: table.hpp:40
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_row_group_size_rows
1 million rows per row group
Definition: parquet.hpp:41
constexpr int32_t default_column_index_truncate_length
truncate to 64 bytes
Definition: parquet.hpp:44
constexpr size_t default_row_group_size_bytes
128MB per row group
Definition: parquet.hpp:40
constexpr size_type default_max_page_fragment_size
5000 rows per page fragment
Definition: parquet.hpp:46
table_with_metadata read_parquet(parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads a Parquet dataset into a set of columns.
constexpr size_t default_max_dictionary_size
1MB dictionary size
Definition: parquet.hpp:45
constexpr size_t default_max_page_size_bytes
512KB per page
Definition: parquet.hpp:42
constexpr size_type default_max_page_size_rows
20k rows per page
Definition: parquet.hpp:43
compression_type
Compression algorithms.
Definition: io/types.hpp:56
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:95
dictionary_policy
Control use of dictionary encoding for parquet writer.
Definition: io/types.hpp:223
@ SNAPPY
Snappy format, using byte-oriented LZ77.
@ AUTO
Automatically detect or select compression format.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:97
@ ALWAYS
Use dictionary regardless of impact on compression.
Definition: io/types.hpp:226
std::unique_ptr< std::vector< uint8_t > > merge_row_group_metadata(std::vector< std::unique_ptr< std::vector< uint8_t >>> const &metadata_list)
Merges multiple raw metadata blobs that were previously created by write_parquet into a single metada...
std::unique_ptr< std::vector< uint8_t > > write_parquet(parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to parquet format.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:93
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
IO interfaces.
A generic expression that can be evaluated to return a value.
Definition: expressions.hpp:46
Destination information for write interfaces.
Definition: io/types.hpp:489
Source information for read interfaces.
Definition: io/types.hpp:314
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:269
Class definitions for (mutable)_table_view
Type declarations for libcudf.