libcudf: parquet_schema.hpp Source File

 /*

  * Copyright (c) 2018-2025, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */


 #pragma once


 #include <cudf/types.hpp>


 #include <cuda/std/optional>


 #include <cstdint>

 #include <optional>

 #include <string>

 #include <vector>


 namespace CUDF_EXPORT cudf {


 namespace io::parquet {


 /**
  * @brief Basic data types in Parquet, determines how data is physically stored
  */
 enum class Type : int8_t {
   UNDEFINED            = -1,  // Undefined for non-leaf nodes
   BOOLEAN              = 0,
   INT32                = 1,
   INT64                = 2,
   INT96                = 3,  // Deprecated
   FLOAT                = 4,
   DOUBLE               = 5,
   BYTE_ARRAY           = 6,
   FIXED_LEN_BYTE_ARRAY = 7,
 };


 /**
  * @brief High-level data types in Parquet, determines how data is logically interpreted
  */
 enum class ConvertedType : int8_t {
   UNKNOWN = -1,  // No type information present
   UTF8    = 0,   // a BYTE_ARRAY may contain UTF8 encoded chars
   MAP     = 1,   // a map is converted as an optional field containing a repeated key/value pair
   MAP_KEY_VALUE = 2,  // a key/value pair is converted into a group of two fields
   LIST =
     3,  // a list is converted into an optional field containing a repeated field for its values
   ENUM    = 4,      // an enum is converted into a binary field
   DECIMAL = 5,      // A decimal value. 10^(-scale) encoded as 2's complement big endian
                     // (precision=number of digits, scale=location of decimal point)
   DATE        = 6,  // A Date, stored as days since Unix epoch, encoded as the INT32 physical type.
   TIME_MILLIS = 7,  // A time. The total number of milliseconds since midnight. The value is stored
                     // as an INT32 physical type.
   TIME_MICROS = 8,  // A time. The total number of microseconds since midnight. The value is stored
                     // as an INT64 physical type.
   TIMESTAMP_MILLIS = 9,   // A date/time combination, recorded as milliseconds since the Unix epoch
                           // using physical type of INT64.
   TIMESTAMP_MICROS = 10,  // A date/time combination, microseconds since the Unix epoch as INT64
   UINT_8           = 11,  // An unsigned integer 8-bit value as INT32
   UINT_16          = 12,  // An unsigned integer 16-bit value as INT32
   UINT_32          = 13,  // An unsigned integer 32-bit value as INT32
   UINT_64          = 14,  // An unsigned integer 64-bit value as INT64
   INT_8            = 15,  // A signed integer 8-bit value as INT32
   INT_16           = 16,  // A signed integer 16-bit value as INT32
   INT_32           = 17,  // A signed integer 32-bit value as INT32
   INT_64           = 18,  // A signed integer 64-bit value as INT64
   JSON             = 19,  // A JSON document embedded within a single UTF8 column.
   BSON             = 20,  // A BSON document embedded within a single BINARY column.
   INTERVAL = 21,  // This type annotates a time interval stored as a FIXED_LEN_BYTE_ARRAY of length
                   // 12 for 3 integers {months,days,milliseconds}
   NA = 25,        // No type information, e.g. for all-null columns.
 };


 /**
  * @brief Encoding types for the actual data stream
  */
 enum class Encoding : uint8_t {
   PLAIN                   = 0,
   GROUP_VAR_INT           = 1,  // Deprecated, never used
   PLAIN_DICTIONARY        = 2,
   RLE                     = 3,
   BIT_PACKED              = 4,  // Deprecated by parquet-format in 2013, superseded by RLE
   DELTA_BINARY_PACKED     = 5,
   DELTA_LENGTH_BYTE_ARRAY = 6,
   DELTA_BYTE_ARRAY        = 7,
   RLE_DICTIONARY          = 8,
   BYTE_STREAM_SPLIT       = 9,
   // One past the last encoding value listed above; a count, not an encoding.
   NUM_ENCODINGS           = 10,
 };


 /**
  * @brief Compression codec used for compressed data pages
  */
 enum class Compression : uint8_t {
   UNCOMPRESSED = 0,
   SNAPPY       = 1,
   GZIP         = 2,
   LZO          = 3,
   BROTLI       = 4,  // Added in 2.3.2
   LZ4          = 5,  // deprecated; based on LZ4, but with an additional undocumented framing scheme
   ZSTD         = 6,  // Added in 2.3.2
   LZ4_RAW      = 7,  // "standard" LZ4 block format
 };


 /**
  * @brief Repetition type of a schema field: required, optional, or repeated
  */
 enum class FieldRepetitionType : int8_t {
   UNSPECIFIED = -1,
   REQUIRED    = 0,  // This field is required (can not be null) and each record has exactly 1 value.
   OPTIONAL    = 1,  // The field is optional (can be null) and each record has 0 or 1 values.
   REPEATED    = 2,  // The field is repeated and can contain 0 or more values
 };


 /**
  * @brief Types of pages
  */
 enum class PageType : uint8_t {
   DATA_PAGE       = 0,
   INDEX_PAGE      = 1,
   DICTIONARY_PAGE = 2,
   DATA_PAGE_V2    = 3,
 };


 /**
  * @brief Enum to annotate whether lists of min/max elements inside ColumnIndex
  * are ordered and, if so, in which direction
  */
 enum class BoundaryOrder : uint8_t {
   UNORDERED  = 0,
   ASCENDING  = 1,
   DESCENDING = 2,
 };


 /**
  * @brief Thrift compact protocol struct field types
  *
  * Values start at 1; no enumerator is defined for 0.
  */
 enum class FieldType : uint8_t {
   BOOLEAN_TRUE  = 1,
   BOOLEAN_FALSE = 2,
   I8            = 3,
   I16           = 4,
   I32           = 5,
   I64           = 6,
   DOUBLE        = 7,
   BINARY        = 8,
   LIST          = 9,
   SET           = 10,
   MAP           = 11,
   STRUCT        = 12,
   UUID          = 13,
 };


 /**
  * @brief Struct holding the `magic` word of a file header
  *
  * NOTE(review): presumably the 4-byte magic number at the start of a Parquet file — confirm
  * against the reader. The member has no default initializer.
  */
 struct file_header_s {
   /// Magic word of the header
   uint32_t magic;
 };


 /**
  * @brief Struct holding the footer length and `magic` word of a file ender
  *
  * NOTE(review): presumably mirrors the trailing bytes of a Parquet file (footer length
  * followed by the magic number) — confirm against the reader. Members have no default
  * initializers.
  */
 struct file_ender_s {
   /// Length of the footer
   uint32_t footer_len;
   /// Magic word of the ender
   uint32_t magic;
 };


 /**
  * @brief Struct that describes the decimal logical type annotation
  */
 struct DecimalType {
   /// Scale of the decimal (location of the decimal point); defaults to 0
   int32_t scale = 0;
   /// Precision of the decimal (number of digits); defaults to 0
   int32_t precision = 0;
 };


 /**
  * @brief Time unit carried by the `TimeType` and `TimestampType` annotations
  */
 struct TimeUnit {
   /// Available time units
   enum Type { UNDEFINED, MILLIS, MICROS, NANOS };
   /// Selected time unit; has no default initializer
   Type type;
 };


 /**
  * @brief Struct that describes the time logical type annotation
  */
 struct TimeType {
   /// Whether the time is adjusted to UTC; defaults to true
   bool isAdjustedToUTC = true;
   /// Time unit; defaults to milliseconds
   TimeUnit unit = {TimeUnit::MILLIS};
 };


 /**
  * @brief Struct that describes the timestamp logical type annotation
  */
 struct TimestampType {
   /// Whether the timestamp is adjusted to UTC; defaults to true
   bool isAdjustedToUTC = true;
   /// Timestamp unit; defaults to milliseconds
   TimeUnit unit = {TimeUnit::MILLIS};
 };


 /**
  * @brief Struct that describes the integer logical type annotation
  */
 struct IntType {
   /// Bit width of the integer; defaults to 0
   int8_t bitWidth = 0;
   /// Whether the integer is signed; defaults to false
   bool isSigned = false;
 };


 /**
  * @brief Struct that describes the logical type annotation
  *
  * `type` selects the annotation; at most one of the optional payloads
  * (`decimal_type`, `time_type`, `timestamp_type`, `int_type`) is filled in by the
  * payload-taking constructors. Constructing from a bare `Type` leaves every payload
  * empty, so all accessors below check that the payload is present before reading it
  * and fall back to `false` / `-1` when it is not.
  */
 struct LogicalType {
   /**
    * @brief Logical type annotations to replace ConvertedType
    */
   enum Type {
     UNDEFINED,
     STRING,
     MAP,
     LIST,
     ENUM,
     DECIMAL,
     DATE,
     TIME,
     TIMESTAMP,
     // 9 is reserved
     INTEGER = 10,
     UNKNOWN,
     JSON,
     BSON
   };

   Type type;                                          ///< Logical type
   cuda::std::optional<DecimalType> decimal_type;      ///< Decimal type
   cuda::std::optional<TimeType> time_type;            ///< Time type
   cuda::std::optional<TimestampType> timestamp_type;  ///< Timestamp type
   cuda::std::optional<IntType> int_type;              ///< Integer type

   /**
    * @brief Default constructor
    *
    * @param tp Logical type; no payload is set
    */
   LogicalType(Type tp = UNDEFINED) : type(tp) {}

   /**
    * @brief Constructor for Decimal logical type
    *
    * @param dt Decimal type
    */
   LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}

   /**
    * @brief Constructor for Time logical type
    *
    * @param tt Time type
    */
   LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {}

   /**
    * @brief Constructor for Timestamp logical type
    *
    * @param tst Timestamp type
    */
   LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {}

   /**
    * @brief Constructor for Integer logical type
    *
    * @param it Integer type
    */
   LogicalType(IntType&& it) : type(INTEGER), int_type(it) {}

   /**
    * @brief Check if the time is in milliseconds
    *
    * @return True if this is a TIME annotation with a payload whose unit is MILLIS
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const
   {
     return type == TIME and time_type.has_value() and time_type->unit.type == TimeUnit::MILLIS;
   }

   /**
    * @brief Check if the time is in microseconds
    *
    * @return True if this is a TIME annotation with a payload whose unit is MICROS
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const
   {
     return type == TIME and time_type.has_value() and time_type->unit.type == TimeUnit::MICROS;
   }

   /**
    * @brief Check if the time is in nanoseconds
    *
    * @return True if this is a TIME annotation with a payload whose unit is NANOS
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const
   {
     return type == TIME and time_type.has_value() and time_type->unit.type == TimeUnit::NANOS;
   }

   /**
    * @brief Check if the timestamp is in milliseconds
    *
    * @return True if this is a TIMESTAMP annotation with a payload whose unit is MILLIS
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const
   {
     return type == TIMESTAMP and timestamp_type.has_value() and
            timestamp_type->unit.type == TimeUnit::MILLIS;
   }

   /**
    * @brief Check if the timestamp is in microseconds
    *
    * @return True if this is a TIMESTAMP annotation with a payload whose unit is MICROS
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const
   {
     return type == TIMESTAMP and timestamp_type.has_value() and
            timestamp_type->unit.type == TimeUnit::MICROS;
   }

   /**
    * @brief Check if the timestamp is in nanoseconds
    *
    * @return True if this is a TIMESTAMP annotation with a payload whose unit is NANOS
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const
   {
     return type == TIMESTAMP and timestamp_type.has_value() and
            timestamp_type->unit.type == TimeUnit::NANOS;
   }

   /**
    * @brief Get the bit width of the integer type
    *
    * @return Bit width of the integer payload, or -1 if this is not an INTEGER annotation
    * or the payload is absent
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const
   {
     return (type == INTEGER and int_type.has_value()) ? int_type->bitWidth : -1;
   }

   /**
    * @brief Check if the integer is signed
    *
    * @return True if this is an INTEGER annotation with a payload marked as signed
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_signed() const
   {
     return type == INTEGER and int_type.has_value() and int_type->isSigned;
   }

   /**
    * @brief Get the scale of the decimal type
    *
    * @return Scale of the decimal payload, or -1 if this is not a DECIMAL annotation
    * or the payload is absent
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t scale() const
   {
     return (type == DECIMAL and decimal_type.has_value()) ? decimal_type->scale : -1;
   }

   /**
    * @brief Get the precision of the decimal type
    *
    * @return Precision of the decimal payload, or -1 if this is not a DECIMAL annotation
    * or the payload is absent
    */
   [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const
   {
     return (type == DECIMAL and decimal_type.has_value()) ? decimal_type->precision : -1;
   }
 };


 /**
  * @brief Union to specify the order used for the min_value and max_value fields for a column
  */
 struct ColumnOrder {
   /// Available column order types
   enum Type { UNDEFINED, TYPE_ORDER };
   /// Column order type; has no default initializer
   Type type;
 };


 struct SchemaElement {

   Type type = Type::UNDEFINED;

   int32_t type_length = 0;

   FieldRepetitionType repetition_type = FieldRepetitionType::REQUIRED;

   std::string name = "";

   int32_t num_children = 0;

   std::optional<ConvertedType> converted_type;

   int32_t decimal_scale = 0;

   int32_t decimal_precision = 0;

   std::optional<int32_t> field_id;

   std::optional<LogicalType> logical_type;


   bool output_as_byte_array = false;


   std::optional<type_id> arrow_type;


   // The following fields are filled in later during schema initialization


   int max_definition_level = 0;

   int max_repetition_level = 0;

   size_type parent_idx = 0;

   std::vector<size_type> children_idx;


   bool operator==(SchemaElement const& other) const

   {

     return type == other.type && converted_type == other.converted_type &&

            type_length == other.type_length && name == other.name &&

            num_children == other.num_children && decimal_scale == other.decimal_scale &&

            decimal_precision == other.decimal_precision && field_id == other.field_id;

   }


   // the parquet format is a little squishy when it comes to interpreting

   // repeated fields. sometimes repeated fields act as "stubs" in the schema

   // that don't represent a true nesting level.

   //

   // this is the case with plain lists:

   //

   // optional group my_list (LIST) {

   //   repeated group element {        <-- not part of the output hierarchy

   //     required binary str (UTF8);

   //   };

   // }

   //

   // However, for backwards compatibility reasons, there are a few special cases, namely

   // List<Struct<>> (which also corresponds to how the map type is specified), where

   // this does not hold true

   //

   // optional group my_list (LIST) {

   //   repeated group element {        <-- part of the hierarchy because it represents a struct

   //     required binary str (UTF8);

   //     required int32 num;

   //  };

   // }


   [[nodiscard]] bool is_stub() const

   {

     return repetition_type == FieldRepetitionType::REPEATED && num_children == 1;

   }


   [[nodiscard]] bool is_one_level_list(SchemaElement const& parent) const

   {

     return repetition_type == FieldRepetitionType::REPEATED and num_children == 0 and

            not parent.is_list();

   }


   [[nodiscard]] bool is_list() const { return converted_type == ConvertedType::LIST; }


   [[nodiscard]] bool is_struct() const

   {

     return type == Type::UNDEFINED &&

            // this assumption might be a little weak.

            ((repetition_type != FieldRepetitionType::REPEATED) ||

             (repetition_type == FieldRepetitionType::REPEATED && num_children > 1));

   }

 };


 /**
  * @brief Struct holding optional statistics; every member may be absent
  *
  * NOTE(review): `max`/`min` look like older counterparts of `max_value`/`min_value` —
  * confirm against the Parquet thrift definition.
  */
 struct Statistics {
   /// Maximum value, as raw bytes
   std::optional<std::vector<uint8_t>> max;
   /// Minimum value, as raw bytes
   std::optional<std::vector<uint8_t>> min;
   /// Count of null values
   std::optional<int64_t> null_count;
   /// Count of distinct values
   std::optional<int64_t> distinct_count;
   /// Maximum value, as raw bytes
   std::optional<std::vector<uint8_t>> max_value;
   /// Minimum value, as raw bytes
   std::optional<std::vector<uint8_t>> min_value;
   /// Whether `max_value` is exact
   std::optional<bool> is_max_value_exact;
   /// Whether `min_value` is exact
   std::optional<bool> is_min_value_exact;
 };


 /**
  * @brief Thrift-derived struct containing statistics used to estimate page and column chunk sizes
  *
  * Every member is optional.
  */
 struct SizeStatistics {
   /// Number of unencoded BYTE_ARRAY data bytes
   std::optional<int64_t> unencoded_byte_array_data_bytes;
   /// Histogram of repetition levels
   std::optional<std::vector<int64_t>> repetition_level_histogram;

   /// Histogram of definition levels
   std::optional<std::vector<int64_t>> definition_level_histogram;
 };


 /**
  * @brief Thrift-derived struct describing page location information stored
  * in the offsets index
  *
  * Members have no default initializers.
  */
 struct PageLocation {
   /// Offset of the page in the file
   int64_t offset;
   /// Compressed page size in bytes plus the header length
   int32_t compressed_page_size;
   /// Index of the first row of the page
   /// NOTE(review): presumably relative to the start of the row group — confirm
   int64_t first_row_index;
 };


 /**
  * @brief Thrift-derived struct describing the offset index
  */
 struct OffsetIndex {
   /// Page locations
   std::vector<PageLocation> page_locations;
   /// Optional list of unencoded BYTE_ARRAY data byte counts
   /// NOTE(review): presumably one entry per page in `page_locations` — confirm
   std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
 };


 /**
  * @brief Thrift-derived struct describing the column index
  */
 struct ColumnIndex {
   /// Boolean used to determine if a page contains only null values
   std::vector<bool> null_pages;
   /// Lower bound for values in each page
   std::vector<std::vector<uint8_t>> min_values;
   /// Upper bound for values in each page
   std::vector<std::vector<uint8_t>> max_values;
   /// Ordering of `min_values`/`max_values`; defaults to unordered
   BoundaryOrder boundary_order = BoundaryOrder::UNORDERED;
   /// Optional count of null values per page
   std::optional<std::vector<int64_t>> null_counts;
   /// Repetition level histogram for the column chunk
   std::optional<std::vector<int64_t>> repetition_level_histogram;
   /// Definition level histogram for the column chunk
   std::optional<std::vector<int64_t>> definition_level_histogram;
 };


 /**
  * @brief Thrift-derived struct describing page encoding statistics
  *
  * Members have no default initializers.
  */
 struct PageEncodingStats {
   /// The page type (data/dic/...)
   PageType page_type;
   /// Encoding of the page
   Encoding encoding;
   /// Number of pages of this type with this encoding
   int32_t count;
 };


 /**
  * @brief Thrift-derived struct describing column sort order
  *
  * Members have no default initializers.
  */
 struct SortingColumn {
   /// Index of the column the ordering applies to
   /// NOTE(review): presumably an index into the row group's columns — confirm
   int32_t column_idx;
   /// If true, the column is sorted in descending order
   bool descending;
   /// If true, nulls will come before non-null values
   bool nulls_first;
 };


 /**
  * @brief Thrift-derived struct describing a column chunk
  */
 struct ColumnChunkMetaData {
   /// Physical type of the column; defaults to BOOLEAN
   Type type = Type::BOOLEAN;
   /// Encodings listed for this column chunk
   std::vector<Encoding> encodings;
   /// Path in schema
   std::vector<std::string> path_in_schema;
   /// Compression codec; defaults to uncompressed
   Compression codec = Compression::UNCOMPRESSED;
   /// Number of values in this column chunk
   int64_t num_values = 0;
   /// Total uncompressed size
   int64_t total_uncompressed_size = 0;
   /// Total compressed size
   int64_t total_compressed_size = 0;
   /// Offset of the data page; defaults to 0
   int64_t data_page_offset = 0;
   /// Offset of the index page; defaults to 0
   int64_t index_page_offset = 0;
   /// Offset of the dictionary page; defaults to 0
   int64_t dictionary_page_offset = 0;
   /// Optional statistics for this column chunk
   Statistics statistics;
   /// Optional per-page-type, per-encoding page counts
   std::optional<std::vector<PageEncodingStats>> encoding_stats;
   /// Byte offset from beginning of file to Bloom filter data
   std::optional<int64_t> bloom_filter_offset;
   /// Optional length of the Bloom filter data
   std::optional<int32_t> bloom_filter_length;
   /// Optional size statistics for this column chunk
   std::optional<SizeStatistics> size_statistics;
 };


 /**
  * @brief The algorithm used in bloom filter
  */
 struct BloomFilterAlgorithm {
   /// Available bloom filter algorithms
   enum Algorithm : uint8_t { UNDEFINED, SPLIT_BLOCK };
   /// Selected algorithm; defaults to SPLIT_BLOCK
   Algorithm algorithm{SPLIT_BLOCK};
 };


 /**
  * @brief The hash function used in Bloom filter
  */
 struct BloomFilterHash {
   /// Available bloom filter hashers
   enum Hash : uint8_t { UNDEFINED, XXHASH };
   /// Selected hasher; defaults to XXHASH
   Hash hash{XXHASH};
 };


 /**
  * @brief The compression used in the bloom filter
  */
 struct BloomFilterCompression {
   /// Available bloom filter compression types
   enum Compression : uint8_t { UNDEFINED, UNCOMPRESSED };
   /// Selected compression; defaults to UNCOMPRESSED
   Compression compression{UNCOMPRESSED};
 };


 /**
  * @brief Bloom filter header struct
  */
 struct BloomFilterHeader {
   /// The size of bitset in bytes; has no default initializer
   int32_t num_bytes;
   /// The algorithm for setting bits
   BloomFilterAlgorithm algorithm;
   /// The hash function used for bloom filter
   BloomFilterHash hash;
   /// The compression used in the bloom filter
   BloomFilterCompression compression;
 };


 /**
  * @brief Thrift-derived struct describing a chunk of data for a particular column
  */
 struct ColumnChunk {
   /// Path of the file holding the chunk; defaults to an empty string
   std::string file_path = "";
   /// File offset; defaults to 0
   int64_t file_offset = 0;
   /// Metadata of this column chunk
   ColumnChunkMetaData meta_data;
   /// Offset of the chunk's OffsetIndex; defaults to 0
   int64_t offset_index_offset = 0;
   /// Length of the chunk's OffsetIndex; defaults to 0
   int32_t offset_index_length = 0;
   /// Offset of the chunk's ColumnIndex; defaults to 0
   int64_t column_index_offset = 0;
   /// Length of the chunk's ColumnIndex; defaults to 0
   int32_t column_index_length = 0;

   // Following fields are derived from other fields

   /// Schema index; -1 until it has been derived
   int schema_idx = -1;

   // The indexes don't really live here, but it's a convenient place to hang them.

   /// OffsetIndex for this column chunk
   std::optional<OffsetIndex> offset_index;
   /// ColumnIndex for this column chunk
   std::optional<ColumnIndex> column_index;
 };


 /**
  * @brief Thrift-derived struct describing a group of row data
  */
 struct RowGroup {
   /// Metadata for each column chunk in this row group
   std::vector<ColumnChunk> columns;
   /// Total byte size of this row group; defaults to 0
   int64_t total_byte_size = 0;
   /// Number of rows in this row group; defaults to 0
   int64_t num_rows = 0;
   /// If set, specifies a sort ordering of the rows in this RowGroup
   std::optional<std::vector<SortingColumn>> sorting_columns;
   /// Byte offset from beginning of file to first page (data or dictionary) in this row group
   std::optional<int64_t> file_offset;
   /// Total byte size of all compressed (and potentially encrypted) column data in this row group
   std::optional<int64_t> total_compressed_size;
   /// Row group ordinal in the file
   std::optional<int16_t> ordinal;
 };


 /**
  * @brief Thrift-derived struct describing a key-value pair, for user metadata
  */
 struct KeyValue {
   /// string key
   std::string key;
   /// string value
   std::string value;
 };


 /**
  * @brief Thrift-derived struct describing file-level metadata
  */
 struct FileMetaData {
   /// Version; defaults to 0
   int32_t version = 0;
   /// Schema elements of this file
   std::vector<SchemaElement> schema;
   /// Number of rows; defaults to 0
   int64_t num_rows = 0;
   /// Row groups in this file
   std::vector<RowGroup> row_groups;
   /// Optional key/value metadata
   std::vector<KeyValue> key_value_metadata;
   /// `created_by` string; defaults to an empty string
   std::string created_by = "";
   /// Optional list of column orders
   std::optional<std::vector<ColumnOrder>> column_orders;
 };


 /**
  * @brief Thrift-derived struct describing the header for a data page
  */
 struct DataPageHeader {
   /// Number of values in the page; defaults to 0
   int32_t num_values = 0;
   /// Encoding of the page data; defaults to PLAIN
   Encoding encoding = Encoding::PLAIN;
   /// Encoding of the definition levels; defaults to PLAIN
   Encoding definition_level_encoding = Encoding::PLAIN;
   /// Encoding of the repetition levels; defaults to PLAIN
   Encoding repetition_level_encoding = Encoding::PLAIN;
 };


 /**
  * @brief Thrift-derived struct describing the header for a V2 data page
  */
 struct DataPageHeaderV2 {
   /// Number of values in the page; defaults to 0
   int32_t num_values = 0;
   /// Number of nulls in the page; defaults to 0
   int32_t num_nulls = 0;
   /// Number of rows in the page; defaults to 0
   int32_t num_rows = 0;
   /// Encoding of the page data; defaults to PLAIN
   Encoding encoding = Encoding::PLAIN;
   /// Byte length of the definition levels; defaults to 0
   int32_t definition_levels_byte_length = 0;
   /// Byte length of the repetition levels; defaults to 0
   int32_t repetition_levels_byte_length = 0;
   /// Whether the page is compressed; defaults to true
   bool is_compressed = true;
 };


 /**
  * @brief Thrift-derived struct describing the header for a dictionary page
  */
 struct DictionaryPageHeader {
   /// Number of values in the dictionary page; defaults to 0
   int32_t num_values = 0;
   /// Encoding of the dictionary page; defaults to PLAIN
   Encoding encoding = Encoding::PLAIN;
 };


 /**
  * @brief Thrift-derived struct describing the page header
  */
 struct PageHeader {
   /// Page type; defaults to DATA_PAGE
   PageType type = PageType::DATA_PAGE;
   /// Uncompressed page size; defaults to 0
   int32_t uncompressed_page_size = 0;
   /// Compressed page size; defaults to 0
   int32_t compressed_page_size = 0;

   // Headers for page specific data. One only will be set.

   /// Data page header
   DataPageHeader data_page_header;
   /// Dictionary page header
   DictionaryPageHeader dictionary_page_header;
   /// V2 data page header
   DataPageHeaderV2 data_page_header_v2;
 };


 }  // namespace io::parquet

 }  // namespace CUDF_EXPORT cudf

cudf::size_type
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95

cudf::io::parquet::ConvertedType
ConvertedType
High-level data types in Parquet, determines how data is logically interpreted.
Definition: parquet_schema.hpp:50

cudf::io::parquet::FieldRepetitionType
FieldRepetitionType
Repetition type of a schema field: required, optional, or repeated.
Definition: parquet_schema.hpp:117

cudf::io::parquet::PageType
PageType
Types of pages.
Definition: parquet_schema.hpp:127

cudf::io::parquet::Encoding
Encoding
Encoding types for the actual data stream.
Definition: parquet_schema.hpp:86

cudf::io::parquet::Type
Type
Basic data types in Parquet, determines how data is physically stored.
Definition: parquet_schema.hpp:35

cudf::io::parquet::FieldType
FieldType
Thrift compact protocol struct field types.
Definition: parquet_schema.hpp:147

cudf::io::parquet::Compression
Compression
Compression codec used for compressed data pages.
Definition: parquet_schema.hpp:103

cudf::io::parquet::BoundaryOrder
BoundaryOrder
Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and if so,...
Definition: parquet_schema.hpp:138

cudf
cuDF interfaces
Definition: host_udf.hpp:37

cudf::io::parquet::BloomFilterAlgorithm
The algorithm used in bloom filter.
Definition: parquet_schema.hpp:711

cudf::io::parquet::BloomFilterAlgorithm::Algorithm
Algorithm
Available bloom filter algorithms.
Definition: parquet_schema.hpp:713

cudf::io::parquet::BloomFilterCompression
The compression used in the bloom filter.
Definition: parquet_schema.hpp:731

cudf::io::parquet::BloomFilterCompression::Compression
Compression
Available bloom filter compression types.
Definition: parquet_schema.hpp:733

cudf::io::parquet::BloomFilterHash
The hash function used in Bloom filter.
Definition: parquet_schema.hpp:721

cudf::io::parquet::BloomFilterHash::Hash
Hash
Available bloom filter hashers.
Definition: parquet_schema.hpp:723

cudf::io::parquet::BloomFilterHeader
Bloom filter header struct.
Definition: parquet_schema.hpp:744

cudf::io::parquet::BloomFilterHeader::compression
BloomFilterCompression compression
The compression used in the bloom filter.
Definition: parquet_schema.hpp:752

cudf::io::parquet::BloomFilterHeader::hash
BloomFilterHash hash
The hash function used for bloom filter.
Definition: parquet_schema.hpp:750

cudf::io::parquet::BloomFilterHeader::algorithm
BloomFilterAlgorithm algorithm
The algorithm for setting bits.
Definition: parquet_schema.hpp:748

cudf::io::parquet::BloomFilterHeader::num_bytes
int32_t num_bytes
The size of bitset in bytes.
Definition: parquet_schema.hpp:746

cudf::io::parquet::ColumnChunkMetaData
Thrift-derived struct describing a column chunk.
Definition: parquet_schema.hpp:668

cudf::io::parquet::ColumnChunkMetaData::encodings
std::vector< Encoding > encodings
Definition: parquet_schema.hpp:673

cudf::io::parquet::ColumnChunkMetaData::encoding_stats
std::optional< std::vector< PageEncodingStats > > encoding_stats
Definition: parquet_schema.hpp:694

cudf::io::parquet::ColumnChunkMetaData::statistics
Statistics statistics
Optional statistics for this column chunk.
Definition: parquet_schema.hpp:691

cudf::io::parquet::ColumnChunkMetaData::bloom_filter_offset
std::optional< int64_t > bloom_filter_offset
Byte offset from beginning of file to Bloom filter data.
Definition: parquet_schema.hpp:696

cudf::io::parquet::ColumnChunkMetaData::bloom_filter_length
std::optional< int32_t > bloom_filter_length
Definition: parquet_schema.hpp:701

cudf::io::parquet::ColumnChunkMetaData::size_statistics
std::optional< SizeStatistics > size_statistics
Definition: parquet_schema.hpp:705

cudf::io::parquet::ColumnChunkMetaData::path_in_schema
std::vector< std::string > path_in_schema
Path in schema.
Definition: parquet_schema.hpp:675

cudf::io::parquet::ColumnChunk
Thrift-derived struct describing a chunk of data for a particular column.
Definition: parquet_schema.hpp:763

cudf::io::parquet::ColumnChunk::column_index
std::optional< ColumnIndex > column_index
ColumnIndex for this column chunk
Definition: parquet_schema.hpp:791

cudf::io::parquet::ColumnChunk::offset_index
std::optional< OffsetIndex > offset_index
OffsetIndex for this column chunk
Definition: parquet_schema.hpp:789

cudf::io::parquet::ColumnChunk::meta_data
ColumnChunkMetaData meta_data
Definition: parquet_schema.hpp:771

cudf::io::parquet::ColumnIndex
Thrift-derived struct describing the column index.
Definition: parquet_schema.hpp:624

cudf::io::parquet::ColumnIndex::definition_level_histogram
std::optional< std::vector< int64_t > > definition_level_histogram
Definition level histogram for the column chunk.
Definition: parquet_schema.hpp:638

cudf::io::parquet::ColumnIndex::null_counts
std::optional< std::vector< int64_t > > null_counts
Optional count of null values per page.
Definition: parquet_schema.hpp:634

cudf::io::parquet::ColumnIndex::max_values
std::vector< std::vector< uint8_t > > max_values
Upper bound for values in each page.
Definition: parquet_schema.hpp:630

cudf::io::parquet::ColumnIndex::repetition_level_histogram
std::optional< std::vector< int64_t > > repetition_level_histogram
Repetition level histogram for the column chunk.
Definition: parquet_schema.hpp:636

cudf::io::parquet::ColumnIndex::null_pages
std::vector< bool > null_pages
Boolean used to determine if a page contains only null values.
Definition: parquet_schema.hpp:626

cudf::io::parquet::ColumnIndex::min_values
std::vector< std::vector< uint8_t > > min_values
Lower bound for values in each page.
Definition: parquet_schema.hpp:628

cudf::io::parquet::ColumnOrder
Union to specify the order used for the min_value and max_value fields for a column.
Definition: parquet_schema.hpp:410

cudf::io::parquet::ColumnOrder::type
Type type
Column order type.
Definition: parquet_schema.hpp:414

cudf::io::parquet::ColumnOrder::Type
Type
Available column order types.
Definition: parquet_schema.hpp:412

cudf::io::parquet::DataPageHeaderV2
Thrift-derived struct describing the header for a V2 data page.
Definition: parquet_schema.hpp:872

cudf::io::parquet::DataPageHeader
Thrift-derived struct describing the header for a data page.
Definition: parquet_schema.hpp:858

cudf::io::parquet::DecimalType
Struct that describes the decimal logical type annotation.
Definition: parquet_schema.hpp:186

cudf::io::parquet::DictionaryPageHeader
Thrift-derived struct describing the header for a dictionary page.
Definition: parquet_schema.hpp:893

cudf::io::parquet::FileMetaData
Thrift-derived struct describing file-level metadata.
Definition: parquet_schema.hpp:834

cudf::io::parquet::FileMetaData::column_orders
std::optional< std::vector< ColumnOrder > > column_orders
Definition: parquet_schema.hpp:852

cudf::io::parquet::FileMetaData::row_groups
std::vector< RowGroup > row_groups
Row groups in this file.
Definition: parquet_schema.hpp:845

cudf::io::parquet::FileMetaData::key_value_metadata
std::vector< KeyValue > key_value_metadata
Optional key/value metadata.
Definition: parquet_schema.hpp:847

cudf::io::parquet::FileMetaData::schema
std::vector< SchemaElement > schema
Definition: parquet_schema.hpp:841

cudf::io::parquet::IntType
Struct that describes the integer logical type annotation.
Definition: parquet_schema.hpp:234

cudf::io::parquet::KeyValue
Thrift-derived struct describing a key-value pair, for user metadata.
Definition: parquet_schema.hpp:820

cudf::io::parquet::KeyValue::key
std::string key
string key
Definition: parquet_schema.hpp:822

cudf::io::parquet::KeyValue::value
std::string value
string value
Definition: parquet_schema.hpp:824

cudf::io::parquet::LogicalType
Struct that describes the logical type annotation.
Definition: parquet_schema.hpp:244

cudf::io::parquet::LogicalType::is_time_millis
constexpr CUDF_HOST_DEVICE bool is_time_millis() const
Check if the time is in milliseconds.
Definition: parquet_schema.hpp:314

cudf::io::parquet::LogicalType::LogicalType
LogicalType(Type tp=UNDEFINED)
Default constructor.
Definition: parquet_schema.hpp:279

cudf::io::parquet::LogicalType::int_type
cuda::std::optional< IntType > int_type
Integer type.
Definition: parquet_schema.hpp:272

cudf::io::parquet::LogicalType::time_type
cuda::std::optional< TimeType > time_type
Time type.
Definition: parquet_schema.hpp:268

cudf::io::parquet::LogicalType::is_timestamp_millis
constexpr CUDF_HOST_DEVICE bool is_timestamp_millis() const
Check if the timestamp is in milliseconds.
Definition: parquet_schema.hpp:344

cudf::io::parquet::LogicalType::is_timestamp_micros
constexpr CUDF_HOST_DEVICE bool is_timestamp_micros() const
Check if the timestamp is in microseconds.
Definition: parquet_schema.hpp:354

cudf::io::parquet::LogicalType::scale
constexpr int32_t scale() const
Get the scale of the decimal type.
Definition: parquet_schema.hpp:391

cudf::io::parquet::LogicalType::is_time_nanos
constexpr CUDF_HOST_DEVICE bool is_time_nanos() const
Check if the time is in nanoseconds.
Definition: parquet_schema.hpp:334

cudf::io::parquet::LogicalType::LogicalType
LogicalType(IntType &&it)
Constructor for Integer logical type.
Definition: parquet_schema.hpp:307

cudf::io::parquet::LogicalType::is_signed
constexpr bool is_signed() const
Check if the integer is signed.
Definition: parquet_schema.hpp:384

cudf::io::parquet::LogicalType::Type
Type
Logical type annotations to replace ConvertedType.
Definition: parquet_schema.hpp:246

cudf::io::parquet::LogicalType::is_timestamp_nanos
constexpr CUDF_HOST_DEVICE bool is_timestamp_nanos() const
Check if the timestamp is in nanoseconds.
Definition: parquet_schema.hpp:364

cudf::io::parquet::LogicalType::decimal_type
cuda::std::optional< DecimalType > decimal_type
Decimal type.
Definition: parquet_schema.hpp:266

cudf::io::parquet::LogicalType::LogicalType
LogicalType(TimeType &&tt)
Constructor for Time logical type.
Definition: parquet_schema.hpp:293

cudf::io::parquet::LogicalType::timestamp_type
cuda::std::optional< TimestampType > timestamp_type
Timestamp type.
Definition: parquet_schema.hpp:270

cudf::io::parquet::LogicalType::LogicalType
LogicalType(TimestampType &&tst)
Constructor for Timestamp logical type.
Definition: parquet_schema.hpp:300

cudf::io::parquet::LogicalType::type
Type type
Logical type.
Definition: parquet_schema.hpp:264

cudf::io::parquet::LogicalType::precision
constexpr CUDF_HOST_DEVICE int32_t precision() const
Get the precision of the decimal type.
Definition: parquet_schema.hpp:401

cudf::io::parquet::LogicalType::bit_width
constexpr CUDF_HOST_DEVICE int8_t bit_width() const
Get the bit width of the integer type.
Definition: parquet_schema.hpp:374

cudf::io::parquet::LogicalType::is_time_micros
constexpr CUDF_HOST_DEVICE bool is_time_micros() const
Check if the time is in microseconds.
Definition: parquet_schema.hpp:324

cudf::io::parquet::LogicalType::LogicalType
LogicalType(DecimalType &&dt)
Constructor for Decimal logical type.
Definition: parquet_schema.hpp:286

cudf::io::parquet::OffsetIndex
Thrift-derived struct describing the offset index.
Definition: parquet_schema.hpp:613

cudf::io::parquet::OffsetIndex::page_locations
std::vector< PageLocation > page_locations
Page locations.
Definition: parquet_schema.hpp:615

cudf::io::parquet::OffsetIndex::unencoded_byte_array_data_bytes
std::optional< std::vector< int64_t > > unencoded_byte_array_data_bytes
Definition: parquet_schema.hpp:618

cudf::io::parquet::PageEncodingStats
Thrift-derived struct describing page encoding statistics.
Definition: parquet_schema.hpp:644

cudf::io::parquet::PageEncodingStats::encoding
Encoding encoding
Encoding of the page.
Definition: parquet_schema.hpp:648

cudf::io::parquet::PageEncodingStats::count
int32_t count
Number of pages of this type with this encoding.
Definition: parquet_schema.hpp:650

cudf::io::parquet::PageEncodingStats::page_type
PageType page_type
The page type (data/dic/...)
Definition: parquet_schema.hpp:646

cudf::io::parquet::PageHeader
Thrift-derived struct describing the page header.
Definition: parquet_schema.hpp:909

cudf::io::parquet::PageHeader::data_page_header
DataPageHeader data_page_header
Data page header.
Definition: parquet_schema.hpp:920

cudf::io::parquet::PageHeader::dictionary_page_header
DictionaryPageHeader dictionary_page_header
Dictionary page header.
Definition: parquet_schema.hpp:922

cudf::io::parquet::PageHeader::data_page_header_v2
DataPageHeaderV2 data_page_header_v2
V2 data page header.
Definition: parquet_schema.hpp:924

cudf::io::parquet::PageLocation
Thrift-derived struct describing page location information stored in the offsets index.
Definition: parquet_schema.hpp:600

cudf::io::parquet::PageLocation::first_row_index
int64_t first_row_index
Definition: parquet_schema.hpp:607

cudf::io::parquet::PageLocation::compressed_page_size
int32_t compressed_page_size
Compressed page size in bytes plus the header length.
Definition: parquet_schema.hpp:604

cudf::io::parquet::PageLocation::offset
int64_t offset
Offset of the page in the file.
Definition: parquet_schema.hpp:602

cudf::io::parquet::RowGroup
Thrift-derived struct describing a group of row data.
Definition: parquet_schema.hpp:800

cudf::io::parquet::RowGroup::ordinal
std::optional< int16_t > ordinal
Row group ordinal in the file.
Definition: parquet_schema.hpp:814

cudf::io::parquet::RowGroup::file_offset
std::optional< int64_t > file_offset
Byte offset from beginning of file to first page (data or dictionary) in this row group.
Definition: parquet_schema.hpp:810

cudf::io::parquet::RowGroup::sorting_columns
std::optional< std::vector< SortingColumn > > sorting_columns
If set, specifies a sort ordering of the rows in this RowGroup.
Definition: parquet_schema.hpp:808

cudf::io::parquet::RowGroup::columns
std::vector< ColumnChunk > columns
Metadata for each column chunk in this row group.
Definition: parquet_schema.hpp:802

cudf::io::parquet::RowGroup::total_compressed_size
std::optional< int64_t > total_compressed_size
Total byte size of all compressed (and potentially encrypted) column data in this row group.
Definition: parquet_schema.hpp:812

cudf::io::parquet::SchemaElement
Struct for describing an element/field in the Parquet format schema.
Definition: parquet_schema.hpp:423

cudf::io::parquet::SchemaElement::type
Type type
1: parquet physical type for output
Definition: parquet_schema.hpp:425

cudf::io::parquet::SchemaElement::arrow_type
std::optional< type_id > arrow_type
cudf type determined from arrow:schema
Definition: parquet_schema.hpp:449

cudf::io::parquet::SchemaElement::decimal_precision
int32_t decimal_precision
8: DEPRECATED: record the precision for DECIMAL converted type
Definition: parquet_schema.hpp:439

cudf::io::parquet::SchemaElement::logical_type
std::optional< LogicalType > logical_type
10: replaces converted type
Definition: parquet_schema.hpp:443

cudf::io::parquet::SchemaElement::field_id
std::optional< int32_t > field_id
9: save field_id from original schema
Definition: parquet_schema.hpp:441

cudf::io::parquet::SchemaElement::is_struct
bool is_struct() const
Check if the schema element is a struct.
Definition: parquet_schema.hpp:540

cudf::io::parquet::SchemaElement::name
std::string name
4: name of the field
Definition: parquet_schema.hpp:431

cudf::io::parquet::SchemaElement::decimal_scale
int32_t decimal_scale
7: DEPRECATED: record the scale for DECIMAL converted type
Definition: parquet_schema.hpp:437

cudf::io::parquet::SchemaElement::is_stub
bool is_stub() const
Check if the schema element is a stub.
Definition: parquet_schema.hpp:504

cudf::io::parquet::SchemaElement::is_one_level_list
bool is_one_level_list(SchemaElement const &parent) const
Check if the schema element is a one-level list.
Definition: parquet_schema.hpp:519

cudf::io::parquet::SchemaElement::num_children
int32_t num_children
5: number of nested fields (children) of this element
Definition: parquet_schema.hpp:433

cudf::io::parquet::SchemaElement::type_length
int32_t type_length
2: byte length of FIXED_LEN_BYTE_ARRAY elements, or maximum bit length for other types
Definition: parquet_schema.hpp:427

cudf::io::parquet::SchemaElement::children_idx
std::vector< size_type > children_idx
Children indices.
Definition: parquet_schema.hpp:460

cudf::io::parquet::SchemaElement::converted_type
std::optional< ConvertedType > converted_type
6: DEPRECATED: record the original type before conversion to parquet type
Definition: parquet_schema.hpp:435

cudf::io::parquet::SchemaElement::operator==
bool operator==(SchemaElement const &other) const
Check if two schema elements are equal.
Definition: parquet_schema.hpp:468

cudf::io::parquet::SchemaElement::is_list
bool is_list() const
Check if the schema element is a list.
Definition: parquet_schema.hpp:530

cudf::io::parquet::SizeStatistics
Thrift-derived struct containing statistics used to estimate page and column chunk sizes.
Definition: parquet_schema.hpp:574

cudf::io::parquet::SizeStatistics::repetition_level_histogram
std::optional< std::vector< int64_t > > repetition_level_histogram
Definition: parquet_schema.hpp:586

cudf::io::parquet::SizeStatistics::definition_level_histogram
std::optional< std::vector< int64_t > > definition_level_histogram
Definition: parquet_schema.hpp:593

cudf::io::parquet::SizeStatistics::unencoded_byte_array_data_bytes
std::optional< int64_t > unencoded_byte_array_data_bytes
Definition: parquet_schema.hpp:577

cudf::io::parquet::SortingColumn
Thrift-derived struct describing column sort order.
Definition: parquet_schema.hpp:656

cudf::io::parquet::SortingColumn::nulls_first
bool nulls_first
If true, nulls will come before non-null values.
Definition: parquet_schema.hpp:662

cudf::io::parquet::SortingColumn::descending
bool descending
If true, indicates this column is sorted in descending order.
Definition: parquet_schema.hpp:660

cudf::io::parquet::SortingColumn::column_idx
int32_t column_idx
The column index (in this row group)
Definition: parquet_schema.hpp:658

cudf::io::parquet::Statistics
Thrift-derived struct describing column chunk statistics.
Definition: parquet_schema.hpp:552

cudf::io::parquet::Statistics::is_min_value_exact
std::optional< bool > is_min_value_exact
If true, min_value is the actual minimum value for a column.
Definition: parquet_schema.hpp:568

cudf::io::parquet::Statistics::max_value
std::optional< std::vector< uint8_t > > max_value
max value for column determined by ColumnOrder
Definition: parquet_schema.hpp:562

cudf::io::parquet::Statistics::null_count
std::optional< int64_t > null_count
count of null values in the column
Definition: parquet_schema.hpp:558

cudf::io::parquet::Statistics::max
std::optional< std::vector< uint8_t > > max
deprecated max value in signed comparison order
Definition: parquet_schema.hpp:554

cudf::io::parquet::Statistics::min_value
std::optional< std::vector< uint8_t > > min_value
min value for column determined by ColumnOrder
Definition: parquet_schema.hpp:564

cudf::io::parquet::Statistics::is_max_value_exact
std::optional< bool > is_max_value_exact
If true, max_value is the actual maximum value for a column.
Definition: parquet_schema.hpp:566

cudf::io::parquet::Statistics::min
std::optional< std::vector< uint8_t > > min
deprecated min value in signed comparison order
Definition: parquet_schema.hpp:556

cudf::io::parquet::Statistics::distinct_count
std::optional< int64_t > distinct_count
count of distinct values occurring
Definition: parquet_schema.hpp:560

cudf::io::parquet::TimeType
Struct that describes the time logical type annotation.
Definition: parquet_schema.hpp:208

cudf::io::parquet::TimeUnit
Time units for temporal logical types.
Definition: parquet_schema.hpp:196

cudf::io::parquet::TimeUnit::Type
Type
Available time units.
Definition: parquet_schema.hpp:198

cudf::io::parquet::TimeUnit::type
Type type
Time unit type.
Definition: parquet_schema.hpp:200

cudf::io::parquet::TimestampType
Struct that describes the timestamp logical type annotation.
Definition: parquet_schema.hpp:221

cudf::io::parquet::file_ender_s
Struct that describes the Parquet file data postscript.
Definition: parquet_schema.hpp:174

cudf::io::parquet::file_ender_s::footer_len
uint32_t footer_len
Length of the footer.
Definition: parquet_schema.hpp:176

cudf::io::parquet::file_ender_s::magic
uint32_t magic
Parquet 4-byte magic number "PAR1".
Definition: parquet_schema.hpp:178

cudf::io::parquet::file_header_s
Struct that describes the Parquet file data header.
Definition: parquet_schema.hpp:166

cudf::io::parquet::file_header_s::magic
uint32_t magic
Parquet 4-byte magic number "PAR1".
Definition: parquet_schema.hpp:168

types.hpp
Type declarations for libcudf.

CUDF_HOST_DEVICE
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:32