parquet_schema.hpp
1 /*
2  * Copyright (c) 2018-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/types.hpp>
20 
21 #include <cuda/std/optional>
22 
23 #include <cstdint>
24 #include <optional>
25 #include <string>
26 #include <vector>
27 
28 namespace CUDF_EXPORT cudf {
29 
30 namespace io::parquet {
31 
/**
 * @brief Basic data types in Parquet, determines how data is physically stored.
 *
 * Every enumerator keeps an explicit numeric value; the underlying type is int8_t.
 */
enum class Type : int8_t {
  UNDEFINED            = -1,  ///< Undefined for non-leaf nodes
  BOOLEAN              = 0,
  INT32                = 1,
  INT64                = 2,
  INT96                = 3,  ///< Deprecated
  FLOAT                = 4,
  DOUBLE               = 5,
  BYTE_ARRAY           = 6,
  FIXED_LEN_BYTE_ARRAY = 7,
};
46 
/**
 * @brief High-level data types in Parquet, determines how data is logically interpreted.
 *
 * Values are explicit and non-contiguous at the end (21 is followed by 25).
 */
enum class ConvertedType : int8_t {
  /// No type information present
  UNKNOWN = -1,
  /// A BYTE_ARRAY may contain UTF8 encoded chars
  UTF8 = 0,
  /// A map is converted as an optional field containing a repeated key/value pair
  MAP = 1,
  /// A key/value pair is converted into a group of two fields
  MAP_KEY_VALUE = 2,
  /// A list is converted into an optional field containing a repeated field for its values
  LIST = 3,
  /// An enum is converted into a binary field
  ENUM = 4,
  /// A decimal value. 10^(-scale) encoded as 2's complement big endian
  /// (precision=number of digits, scale=location of decimal point)
  DECIMAL = 5,
  /// A date, stored as days since Unix epoch, encoded as the INT32 physical type
  DATE = 6,
  /// A time. The total number of milliseconds since midnight. The value is stored as an INT32
  /// physical type
  TIME_MILLIS = 7,
  /// A time. The total number of microseconds since midnight. The value is stored as an INT64
  /// physical type
  TIME_MICROS = 8,
  /// A date/time combination, recorded as milliseconds since the Unix epoch using physical type
  /// of INT64
  TIMESTAMP_MILLIS = 9,
  /// A date/time combination, microseconds since the Unix epoch as INT64
  TIMESTAMP_MICROS = 10,
  UINT_8           = 11,  ///< An unsigned integer 8-bit value as INT32
  UINT_16          = 12,  ///< An unsigned integer 16-bit value as INT32
  UINT_32          = 13,  ///< An unsigned integer 32-bit value as INT32
  UINT_64          = 14,  ///< An unsigned integer 64-bit value as INT64
  INT_8            = 15,  ///< A signed integer 8-bit value as INT32
  INT_16           = 16,  ///< A signed integer 16-bit value as INT32
  INT_32           = 17,  ///< A signed integer 32-bit value as INT32
  INT_64           = 18,  ///< A signed integer 64-bit value as INT64
  JSON             = 19,  ///< A JSON document embedded within a single UTF8 column
  BSON             = 20,  ///< A BSON document embedded within a single BINARY column
  /// This type annotates a time interval stored as a FIXED_LEN_BYTE_ARRAY of length 12 for
  /// 3 integers {months,days,milliseconds}
  INTERVAL = 21,
  /// No type information, e.g. all-nulls
  NA = 25,
};
82 
/**
 * @brief Encoding types for the actual data stream.
 *
 * NUM_ENCODINGS is one past the last listed encoding value and is not itself an encoding.
 */
enum class Encoding : uint8_t {
  PLAIN                   = 0,
  GROUP_VAR_INT           = 1,  ///< Deprecated, never used
  PLAIN_DICTIONARY        = 2,
  RLE                     = 3,
  BIT_PACKED              = 4,  ///< Deprecated by parquet-format in 2013, superseded by RLE
  DELTA_BINARY_PACKED     = 5,
  DELTA_LENGTH_BYTE_ARRAY = 6,
  DELTA_BYTE_ARRAY        = 7,
  RLE_DICTIONARY          = 8,
  BYTE_STREAM_SPLIT       = 9,
  NUM_ENCODINGS           = 10,
};
99 
/**
 * @brief Compression codec used for compressed data pages.
 */
enum class Compression : uint8_t {
  UNCOMPRESSED = 0,
  SNAPPY       = 1,
  GZIP         = 2,
  LZO          = 3,
  BROTLI       = 4,  ///< Added in 2.3.2
  /// Deprecated; based on LZ4, but with an additional undocumented framing scheme
  LZ4     = 5,
  ZSTD    = 6,  ///< Added in 2.3.2
  LZ4_RAW = 7,  ///< "Standard" LZ4 block format
};
113 
/**
 * @brief Repetition type of a schema field: required, optional, or repeated.
 */
enum class FieldRepetitionType : int8_t {
  UNSPECIFIED = -1,
  /// This field is required (can not be null) and each record has exactly 1 value
  REQUIRED = 0,
  /// The field is optional (can be null) and each record has 0 or 1 values
  OPTIONAL = 1,
  /// The field is repeated and can contain 0 or more values
  REPEATED = 2,
};
123 
/**
 * @brief Types of pages.
 */
enum class PageType : uint8_t {
  DATA_PAGE       = 0,
  INDEX_PAGE      = 1,
  DICTIONARY_PAGE = 2,
  DATA_PAGE_V2    = 3,
};
133 
/**
 * @brief Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered
 * and, if so, in which direction.
 */
enum class BoundaryOrder : uint8_t {
  UNORDERED  = 0,
  ASCENDING  = 1,
  DESCENDING = 2,
};
143 
/**
 * @brief Thrift compact protocol struct field types.
 *
 * Numbering starts at 1; no enumerator has the value 0.
 */
enum class FieldType : uint8_t {
  BOOLEAN_TRUE  = 1,
  BOOLEAN_FALSE = 2,
  I8            = 3,
  I16           = 4,
  I32           = 5,
  I64           = 6,
  DOUBLE        = 7,
  BINARY        = 8,
  LIST          = 9,
  SET           = 10,
  MAP           = 11,
  STRUCT        = 12,
  UUID          = 13,
};
162 
168  uint32_t magic;
169 };
170 
/**
 * @brief Struct that describes the Parquet file data postscript.
 *
 * Members are intentionally left without default initializers.
 */
struct file_ender_s {
  std::uint32_t footer_len;  ///< Length of the footer
  std::uint32_t magic;       ///< Parquet 4-byte magic number "PAR1"
};
180 
/**
 * @brief Struct that describes the decimal logical type annotation.
 *
 * Both members default to zero.
 */
struct DecimalType {
  int32_t scale{0};      ///< Scale (location of the decimal point)
  int32_t precision{0};  ///< Precision (number of digits)
};
192 
196 struct TimeUnit {
198  enum Type { UNDEFINED, MILLIS, MICROS, NANOS };
201 };
202 
208 struct TimeType {
211  bool isAdjustedToUTC = true;
213  TimeUnit unit = {TimeUnit::MILLIS};
214 };
215 
224  bool isAdjustedToUTC = true;
226  TimeUnit unit = {TimeUnit::MILLIS};
227 };
228 
/**
 * @brief Struct that describes the integer logical type annotation.
 *
 * Defaults to a bit width of 0 and unsigned.
 */
struct IntType {
  int8_t bitWidth{0};    ///< Bit width of the integer
  bool isSigned{false};  ///< Whether the integer is signed
};
240 
244 struct LogicalType {
246  enum Type {
247  UNDEFINED,
248  STRING,
249  MAP,
250  LIST,
251  ENUM,
252  DECIMAL,
253  DATE,
254  TIME,
255  TIMESTAMP,
256  // 9 is reserved
257  INTEGER = 10,
258  UNKNOWN,
259  JSON,
260  BSON
261  };
262 
266  cuda::std::optional<DecimalType> decimal_type;
268  cuda::std::optional<TimeType> time_type;
270  cuda::std::optional<TimestampType> timestamp_type;
272  cuda::std::optional<IntType> int_type;
273 
279  LogicalType(Type tp = UNDEFINED) : type(tp) {}
280 
286  LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
287 
293  LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {}
294 
300  LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {}
301 
307  LogicalType(IntType&& it) : type(INTEGER), int_type(it) {}
308 
314  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const
315  {
316  return type == TIME and time_type->unit.type == TimeUnit::MILLIS;
317  }
318 
324  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const
325  {
326  return type == TIME and time_type->unit.type == TimeUnit::MICROS;
327  }
328 
334  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const
335  {
336  return type == TIME and time_type->unit.type == TimeUnit::NANOS;
337  }
338 
344  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const
345  {
346  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS;
347  }
348 
354  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const
355  {
356  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS;
357  }
358 
364  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const
365  {
366  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS;
367  }
368 
374  [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const
375  {
376  return type == INTEGER ? int_type->bitWidth : -1;
377  }
378 
384  [[nodiscard]] constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; }
385 
391  [[nodiscard]] constexpr int32_t scale() const
392  {
393  return type == DECIMAL ? decimal_type->scale : -1;
394  }
395 
401  [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const
402  {
403  return type == DECIMAL ? decimal_type->precision : -1;
404  }
405 };
406 
410 struct ColumnOrder {
412  enum Type { UNDEFINED, TYPE_ORDER };
415 };
416 
425  Type type = Type::UNDEFINED;
427  int32_t type_length = 0;
429  FieldRepetitionType repetition_type = FieldRepetitionType::REQUIRED;
431  std::string name = "";
433  int32_t num_children = 0;
435  std::optional<ConvertedType> converted_type;
437  int32_t decimal_scale = 0;
439  int32_t decimal_precision = 0;
441  std::optional<int32_t> field_id;
443  std::optional<LogicalType> logical_type;
444 
446  bool output_as_byte_array = false;
447 
449  std::optional<type_id> arrow_type;
450 
451  // The following fields are filled in later during schema initialization
452 
454  int max_definition_level = 0;
456  int max_repetition_level = 0;
458  size_type parent_idx = 0;
460  std::vector<size_type> children_idx;
461 
468  bool operator==(SchemaElement const& other) const
469  {
470  return type == other.type && converted_type == other.converted_type &&
471  type_length == other.type_length && name == other.name &&
472  num_children == other.num_children && decimal_scale == other.decimal_scale &&
473  decimal_precision == other.decimal_precision && field_id == other.field_id;
474  }
475 
476  // the parquet format is a little squishy when it comes to interpreting
477  // repeated fields. sometimes repeated fields act as "stubs" in the schema
478  // that don't represent a true nesting level.
479  //
480  // this is the case with plain lists:
481  //
482  // optional group my_list (LIST) {
483  // repeated group element { <-- not part of the output hierarchy
484  // required binary str (UTF8);
485  // };
486  // }
487  //
488  // However, for backwards compatibility reasons, there are a few special cases, namely
489  // List<Struct<>> (which also corresponds to how the map type is specified), where
490  // this does not hold true
491  //
492  // optional group my_list (LIST) {
493  // repeated group element { <-- part of the hierarchy because it represents a struct
494  // required binary str (UTF8);
495  // required int32 num;
496  // };
497  // }
498 
504  [[nodiscard]] bool is_stub() const
505  {
506  return repetition_type == FieldRepetitionType::REPEATED && num_children == 1;
507  }
508 
519  [[nodiscard]] bool is_one_level_list(SchemaElement const& parent) const
520  {
521  return repetition_type == FieldRepetitionType::REPEATED and num_children == 0 and
522  not parent.is_list();
523  }
524 
530  [[nodiscard]] bool is_list() const { return converted_type == ConvertedType::LIST; }
531 
540  [[nodiscard]] bool is_struct() const
541  {
542  return type == Type::UNDEFINED &&
543  // this assumption might be a little weak.
544  ((repetition_type != FieldRepetitionType::REPEATED) ||
545  (repetition_type == FieldRepetitionType::REPEATED && num_children > 1));
546  }
547 };
548 
/**
 * @brief Thrift-derived struct describing column chunk statistics.
 *
 * Every member is optional and starts out empty.
 */
struct Statistics {
  /// Deprecated max value in signed comparison order
  std::optional<std::vector<uint8_t>> max = std::nullopt;
  /// Deprecated min value in signed comparison order
  std::optional<std::vector<uint8_t>> min = std::nullopt;
  /// Count of null values in the column
  std::optional<int64_t> null_count = std::nullopt;
  /// Count of distinct values occurring
  std::optional<int64_t> distinct_count = std::nullopt;
  /// Max value for column determined by ColumnOrder
  std::optional<std::vector<uint8_t>> max_value = std::nullopt;
  /// Min value for column determined by ColumnOrder
  std::optional<std::vector<uint8_t>> min_value = std::nullopt;
  /// If true, max_value is the actual maximum value for a column
  std::optional<bool> is_max_value_exact = std::nullopt;
  /// If true, min_value is the actual minimum value for a column
  std::optional<bool> is_min_value_exact = std::nullopt;
};
570 
577  std::optional<int64_t> unencoded_byte_array_data_bytes;
586  std::optional<std::vector<int64_t>> repetition_level_histogram;
587 
593  std::optional<std::vector<int64_t>> definition_level_histogram;
594 };
595 
600 struct PageLocation {
602  int64_t offset;
608 };
609 
613 struct OffsetIndex {
615  std::vector<PageLocation> page_locations;
618  std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
619 };
620 
624 struct ColumnIndex {
626  std::vector<bool> null_pages;
628  std::vector<std::vector<uint8_t>> min_values;
630  std::vector<std::vector<uint8_t>> max_values;
632  BoundaryOrder boundary_order = BoundaryOrder::UNORDERED;
634  std::optional<std::vector<int64_t>> null_counts;
636  std::optional<std::vector<int64_t>> repetition_level_histogram;
638  std::optional<std::vector<int64_t>> definition_level_histogram;
639 };
640 
650  int32_t count;
651 };
652 
658  int32_t column_idx;
663 };
664 
670  Type type = Type::BOOLEAN;
673  std::vector<Encoding> encodings;
675  std::vector<std::string> path_in_schema;
677  Compression codec = Compression::UNCOMPRESSED;
679  int64_t num_values = 0;
681  int64_t total_uncompressed_size = 0;
683  int64_t total_compressed_size = 0;
685  int64_t data_page_offset = 0;
687  int64_t index_page_offset = 0;
689  int64_t dictionary_page_offset = 0;
694  std::optional<std::vector<PageEncodingStats>> encoding_stats;
696  std::optional<int64_t> bloom_filter_offset;
701  std::optional<int32_t> bloom_filter_length;
705  std::optional<SizeStatistics> size_statistics;
706 };
707 
713  enum Algorithm : uint8_t { UNDEFINED, SPLIT_BLOCK };
715  Algorithm algorithm{SPLIT_BLOCK};
716 };
717 
723  enum Hash : uint8_t { UNDEFINED, XXHASH };
725  Hash hash{XXHASH};
726 };
727 
733  enum Compression : uint8_t { UNDEFINED, UNCOMPRESSED };
735  Compression compression{UNCOMPRESSED};
736 };
737 
746  int32_t num_bytes;
753 };
754 
763 struct ColumnChunk {
766  std::string file_path = "";
768  int64_t file_offset = 0;
773  int64_t offset_index_offset = 0;
775  int32_t offset_index_length = 0;
777  int64_t column_index_offset = 0;
779  int32_t column_index_length = 0;
780 
781  // Following fields are derived from other fields
782 
784  int schema_idx = -1;
785 
786  // The indexes don't really live here, but it's a convenient place to hang them.
787 
789  std::optional<OffsetIndex> offset_index;
791  std::optional<ColumnIndex> column_index;
792 };
793 
800 struct RowGroup {
802  std::vector<ColumnChunk> columns;
804  int64_t total_byte_size = 0;
806  int64_t num_rows = 0;
808  std::optional<std::vector<SortingColumn>> sorting_columns;
810  std::optional<int64_t> file_offset;
812  std::optional<int64_t> total_compressed_size;
814  std::optional<int16_t> ordinal;
815 };
816 
/**
 * @brief Thrift-derived struct describing a key-value pair, for user metadata.
 */
struct KeyValue {
  std::string key{};    ///< String key
  std::string value{};  ///< String value
};
826 
834 struct FileMetaData {
836  int32_t version = 0;
841  std::vector<SchemaElement> schema;
843  int64_t num_rows = 0;
845  std::vector<RowGroup> row_groups;
847  std::vector<KeyValue> key_value_metadata;
849  std::string created_by = "";
852  std::optional<std::vector<ColumnOrder>> column_orders;
853 };
854 
860  int32_t num_values = 0;
862  Encoding encoding = Encoding::PLAIN;
864  Encoding definition_level_encoding = Encoding::PLAIN;
866  Encoding repetition_level_encoding = Encoding::PLAIN;
867 };
868 
874  int32_t num_values = 0;
876  int32_t num_nulls = 0;
879  int32_t num_rows = 0;
881  Encoding encoding = Encoding::PLAIN;
883  int32_t definition_levels_byte_length = 0;
885  int32_t repetition_levels_byte_length = 0;
887  bool is_compressed = true;
888 };
889 
895  int32_t num_values = 0;
897  Encoding encoding = Encoding::PLAIN;
898 };
899 
909 struct PageHeader {
911  PageType type = PageType::DATA_PAGE;
913  int32_t uncompressed_page_size = 0;
915  int32_t compressed_page_size = 0;
916 
917  // Headers for page specific data. One only will be set.
918 
925 };
926 
927 } // namespace io::parquet
928 } // namespace CUDF_EXPORT cudf
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
ConvertedType
High-level data types in Parquet, determines how data is logically interpreted.
FieldRepetitionType
Repetition type of a schema field (required, optional, or repeated).
PageType
Types of pages.
Encoding
Encoding types for the actual data stream.
Type
Basic data types in Parquet, determines how data is physically stored.
FieldType
Thrift compact protocol struct field types.
Compression
Compression codec used for compressed data pages.
BoundaryOrder
Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and if so, in which direction.
cuDF interfaces
Definition: host_udf.hpp:37
The algorithm used in bloom filter.
Algorithm
Available bloom filter algorithms.
The compression used in the bloom filter.
Compression
Available bloom filter compression types.
The hash function used in Bloom filter.
Hash
Available bloom filter hashers.
Bloom filter header struct.
BloomFilterCompression compression
The compression used in the bloom filter.
BloomFilterHash hash
The hash function used for bloom filter.
BloomFilterAlgorithm algorithm
The algorithm for setting bits.
int32_t num_bytes
The size of bitset in bytes.
Thrift-derived struct describing a column chunk.
std::optional< std::vector< PageEncodingStats > > encoding_stats
Statistics statistics
Optional statistics for this column chunk.
std::optional< int64_t > bloom_filter_offset
Byte offset from beginning of file to Bloom filter data.
std::optional< int32_t > bloom_filter_length
std::optional< SizeStatistics > size_statistics
std::vector< std::string > path_in_schema
Path in schema.
Thrift-derived struct describing a chunk of data for a particular column.
std::optional< ColumnIndex > column_index
ColumnIndex for this column chunk
std::optional< OffsetIndex > offset_index
OffsetIndex for this column chunk
Thrift-derived struct describing the column index.
std::optional< std::vector< int64_t > > definition_level_histogram
Definition level histogram for the column chunk.
std::optional< std::vector< int64_t > > null_counts
Optional count of null values per page.
std::vector< std::vector< uint8_t > > max_values
Upper bound for values in each page.
std::optional< std::vector< int64_t > > repetition_level_histogram
Repetition level histogram for the column chunk.
std::vector< bool > null_pages
Boolean used to determine if a page contains only null values.
std::vector< std::vector< uint8_t > > min_values
Lower bound for values in each page.
Union to specify the order used for the min_value and max_value fields for a column.
Type type
Column order type.
Type
Available column order types.
Thrift-derived struct describing the header for a V2 data page.
Thrift-derived struct describing the header for a data page.
Struct that describes the decimal logical type annotation.
Thrift-derived struct describing the header for a dictionary page.
Thrift-derived struct describing file-level metadata.
std::optional< std::vector< ColumnOrder > > column_orders
std::vector< RowGroup > row_groups
Row groups in this file.
std::vector< KeyValue > key_value_metadata
Optional key/value metadata.
std::vector< SchemaElement > schema
Struct that describes the integer logical type annotation.
Thrift-derived struct describing a key-value pair, for user metadata.
std::string key
string key
std::string value
string value
Struct that describes the logical type annotation.
constexpr CUDF_HOST_DEVICE bool is_time_millis() const
Check if the time is in milliseconds.
LogicalType(Type tp=UNDEFINED)
Default constructor.
cuda::std::optional< IntType > int_type
Integer type.
cuda::std::optional< TimeType > time_type
Time type.
constexpr CUDF_HOST_DEVICE bool is_timestamp_millis() const
Check if the timestamp is in milliseconds.
constexpr CUDF_HOST_DEVICE bool is_timestamp_micros() const
Check if the timestamp is in microseconds.
constexpr int32_t scale() const
Get the scale of the decimal type.
constexpr CUDF_HOST_DEVICE bool is_time_nanos() const
Check if the time is in nanoseconds.
LogicalType(IntType &&it)
Constructor for Integer logical type.
constexpr bool is_signed() const
Check if the integer is signed.
Type
Logical type annotations to replace ConvertedType.
constexpr CUDF_HOST_DEVICE bool is_timestamp_nanos() const
Check if the timestamp is in nanoseconds.
cuda::std::optional< DecimalType > decimal_type
Decimal type.
LogicalType(TimeType &&tt)
Constructor for Time logical type.
cuda::std::optional< TimestampType > timestamp_type
Timestamp type.
LogicalType(TimestampType &&tst)
Constructor for Timestamp logical type.
constexpr CUDF_HOST_DEVICE int32_t precision() const
Get the precision of the decimal type.
constexpr CUDF_HOST_DEVICE int8_t bit_width() const
Get the bit width of the integer type.
constexpr CUDF_HOST_DEVICE bool is_time_micros() const
Check if the time is in microseconds.
LogicalType(DecimalType &&dt)
Constructor for Decimal logical type.
Thrift-derived struct describing the offset index.
std::vector< PageLocation > page_locations
Page locations.
std::optional< std::vector< int64_t > > unencoded_byte_array_data_bytes
Thrift-derived struct describing page encoding statistics.
Encoding encoding
Encoding of the page.
int32_t count
Number of pages of this type with this encoding.
PageType page_type
The page type (data/dic/...)
Thrift-derived struct describing the page header.
DataPageHeader data_page_header
Data page header.
DictionaryPageHeader dictionary_page_header
Dictionary page header.
DataPageHeaderV2 data_page_header_v2
V2 data page header.
Thrift-derived struct describing page location information stored in the offsets index.
int32_t compressed_page_size
Compressed page size in bytes plus the header length.
int64_t offset
Offset of the page in the file.
Thrift-derived struct describing a group of row data.
std::optional< int16_t > ordinal
Row group ordinal in the file.
std::optional< int64_t > file_offset
Byte offset from beginning of file to first page (data or dictionary) in this row group.
std::optional< std::vector< SortingColumn > > sorting_columns
If set, specifies a sort ordering of the rows in this RowGroup.
std::vector< ColumnChunk > columns
Metadata for each column chunk in this row group.
std::optional< int64_t > total_compressed_size
Total byte size of all compressed (and potentially encrypted) column data in this row group.
Struct for describing an element/field in the Parquet format schema.
Type type
1: parquet physical type for output
std::optional< type_id > arrow_type
cudf type determined from arrow:schema
int32_t decimal_precision
8: DEPRECATED: record the precision for DECIMAL converted type
std::optional< LogicalType > logical_type
10: replaces converted type
std::optional< int32_t > field_id
9: save field_id from original schema
bool is_struct() const
Check if the schema element is a struct.
std::string name
4: name of the field
int32_t decimal_scale
7: DEPRECATED: record the scale for DECIMAL converted type
bool is_stub() const
Check if the schema element is a stub.
bool is_one_level_list(SchemaElement const &parent) const
Check if the schema element is a one-level list.
int32_t num_children
5: nested fields
int32_t type_length
2: byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types
std::vector< size_type > children_idx
Children indices.
std::optional< ConvertedType > converted_type
6: DEPRECATED: record the original type before conversion to parquet type
bool operator==(SchemaElement const &other) const
Check if two schema elements are equal.
bool is_list() const
Check if the schema element is a list.
Thrift-derived struct containing statistics used to estimate page and column chunk sizes.
std::optional< std::vector< int64_t > > repetition_level_histogram
std::optional< std::vector< int64_t > > definition_level_histogram
std::optional< int64_t > unencoded_byte_array_data_bytes
Thrift-derived struct describing column sort order.
bool nulls_first
If true, nulls will come before non-null values.
bool descending
If true, indicates this column is sorted in descending order.
int32_t column_idx
The column index (in this row group)
Thrift-derived struct describing column chunk statistics.
std::optional< bool > is_min_value_exact
If true, min_value is the actual minimum value for a column.
std::optional< std::vector< uint8_t > > max_value
max value for column determined by ColumnOrder
std::optional< int64_t > null_count
count of null values in the column
std::optional< std::vector< uint8_t > > max
deprecated max value in signed comparison order
std::optional< std::vector< uint8_t > > min_value
min value for column determined by ColumnOrder
std::optional< bool > is_max_value_exact
If true, max_value is the actual maximum value for a column.
std::optional< std::vector< uint8_t > > min
deprecated min value in signed comparison order
std::optional< int64_t > distinct_count
count of distinct values occurring
Struct that describes the time logical type annotation.
Time units for temporal logical types.
Type
Available time units.
Struct that describes the timestamp logical type annotation.
Struct that describes the Parquet file data postscript.
uint32_t footer_len
Length of the footer.
uint32_t magic
Parquet 4-byte magic number "PAR1".
Struct that describes the Parquet file data header.
uint32_t magic
Parquet 4-byte magic number "PAR1".
Type declarations for libcudf.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:32