From 514e0e9a6695127dba188b29c45aaf2da275b983 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 27 Nov 2024 11:01:43 -0800 Subject: [PATCH] Add Parquet Reader options classes to pylibcudf --- cpp/include/cudf/io/parquet.hpp | 1 + python/pylibcudf/pylibcudf/io/parquet.pxd | 17 +++++++++++++++++ python/pylibcudf/pylibcudf/io/parquet.pyi | 21 ++++++++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index bfe76d5690c..b561d0989e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -410,6 +410,7 @@ class parquet_reader_options_builder { * * @param val Boolean value whether to read matching projected and filter columns from mismatched * Parquet sources. + * * @return this for chaining. */ parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 1a61c20d783..0d83c06d841 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -24,6 +24,23 @@ from pylibcudf.table cimport Table from pylibcudf.types cimport DataType +cdef class ParquetReaderOptions: + cdef parquet_reader_options + cdef SourceInfo source + cpdef void set_row_groups(self, list row_groups) + cpdef void set_num_rows(self, size_type nrows) + cpdef void set_skip_rows(self, int64_t skip_rows) + cpdef void set_columns(self, list col_names) + cpdef void set_filter(self, Expression filter) + +cdef class ParquetReaderOptionsBuilder: + cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val) + cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val) + cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val) + cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val) + cpdef build(self) + + cdef class ChunkedParquetReader: cdef unique_ptr[cpp_chunked_parquet_reader] reader diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi index eb2ca68109b..09a52d03312 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyi +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from collections.abc import Mapping -from typing import Self + +from typing_extensions import Self from pylibcudf.expressions import Expression from pylibcudf.io.types import ( @@ -16,6 +17,24 @@ from pylibcudf.io.types import ( ) from pylibcudf.table import Table +class ParquetReaderOptions: + def __init__(self): ... + def set_row_groups(self, row_groups: list[list[int]]): ... + def set_num_rows(self, nrows: int): ... + def set_skip_rows(self, skip_rows: int): ... + def set_columns(self, col_names: list[str]): ... + def set_filter(self, filter: Expression): ... + @staticmethod + def builder(source: SourceInfo) -> ParquetReaderOptionsBuilder: ... + +class ParquetReaderOptionsBuilder: + def __init__(self): ... + def convert_strings_to_categories(self, val: bool) -> Self: ... + def use_pandas_metadata(self, val: bool) -> Self: ... + def allow_mismatched_pq_schemas(self, val: bool) -> Self: ... + def use_arrow_schema(self, val: bool) -> Self: ... + def build(self) -> ParquetReaderOptions: ... + class ChunkedParquetReader: def __init__( self,