Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start migrating I/O to pylibcudf #15899

Merged
merged 13 commits into from
Jun 6, 2024
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ This page provides API documentation for pylibcudf.
filling
gpumemoryview
groupby
io/index.rst
join
lists
merge
Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
====
Avro
====

.. automodule:: cudf._lib.pylibcudf.io.avro
:members:
18 changes: 18 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
===
I/O
===

I/O Utility Classes
===================

.. automodule:: cudf._lib.pylibcudf.io.types
:members:


I/O Functions
=============

.. toctree::
:maxdepth: 1

avro
50 changes: 14 additions & 36 deletions python/cudf/cudf/_lib/avro.pyx
Original file line number Diff line number Diff line change
@@ -1,20 +1,12 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector
from cudf._lib.utils cimport data_from_pylibcudf_io

from cudf._lib.io.utils cimport make_source_info
from cudf._lib.pylibcudf.libcudf.io.avro cimport (
avro_reader_options,
read_avro as libcudf_read_avro,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.utils cimport data_from_unique_ptr
import cudf._lib.pylibcudf as plc
from cudf._lib.pylibcudf.io.types import SourceInfo


cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1):
"""
Cython function to call libcudf read_avro, see `read_avro`.

Expand All @@ -28,28 +20,14 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):

if not isinstance(num_rows, int) or num_rows < -1:
raise TypeError("num_rows must be an int >= -1")
if not isinstance(skip_rows, int) or skip_rows < -1:
raise TypeError("skip_rows must be an int >= -1")

cdef vector[string] c_columns
if columns is not None and len(columns) > 0:
c_columns.reserve(len(columns))
for col in columns:
c_columns.push_back(str(col).encode())

cdef avro_reader_options options = move(
avro_reader_options.builder(make_source_info([datasource]))
.columns(c_columns)
.skip_rows(<size_type> skip_rows)
.num_rows(<size_type> num_rows)
.build()
if not isinstance(skip_rows, int) or skip_rows < 0:
raise TypeError("skip_rows must be an int >= 0")

return data_from_pylibcudf_io(
plc.io.avro.read_avro(
SourceInfo([datasource]),
columns,
skip_rows,
num_rows
)
)

cdef table_with_metadata c_result

with nogil:
c_result = move(libcudf_read_avro(options))

names = [info.name.decode() for info in c_result.metadata.schema_info]

return data_from_unique_ptr(move(c_result.tbl), column_names=names)
8 changes: 4 additions & 4 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options(
)

if quoting == 1:
c_quoting = quote_style.QUOTE_ALL
c_quoting = quote_style.ALL
elif quoting == 2:
c_quoting = quote_style.QUOTE_NONNUMERIC
c_quoting = quote_style.NONNUMERIC
elif quoting == 3:
c_quoting = quote_style.QUOTE_NONE
c_quoting = quote_style.NONE
else:
# Default value
c_quoting = quote_style.QUOTE_MINIMAL
c_quoting = quote_style.MINIMAL

cdef csv_reader_options csv_reader_options_c = move(
csv_reader_options.builder(c_source_info)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ def write_parquet(
"Valid values are '1.0' and '2.0'"
)

dict_policy = (
cdef cudf_io_types.dictionary_policy dict_policy = (
wence- marked this conversation as resolved.
Show resolved Hide resolved
cudf_io_types.dictionary_policy.ADAPTIVE
if use_dictionary
else cudf_io_types.dictionary_policy.NEVER
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ link_to_pyarrow_headers(pylibcudf_interop)

add_subdirectory(libcudf)
add_subdirectory(strings)
add_subdirectory(io)
25 changes: 25 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

set(cython_sources avro.pyx types.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)

set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport avro, types
from .types cimport SourceInfo, TableWithMetadata
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import avro, types
from .types import SourceInfo, TableWithMetadata
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options
from cudf._lib.pylibcudf.libcudf.types cimport size_type


cpdef TableWithMetadata read_avro(
SourceInfo source_info,
list columns = *,
size_type skip_rows = *,
size_type num_rows = *
)
59 changes: 59 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.avro cimport (
avro_reader_options,
read_avro as cpp_read_avro,
)
from cudf._lib.pylibcudf.libcudf.types cimport size_type


cpdef TableWithMetadata read_avro(
SourceInfo source_info,
list columns = [], # no-cython-lint
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved
size_type skip_rows = 0,
size_type num_rows = -1
):
"""
Reads an Avro dataset into a set of columns.

For details, see :cpp:class:`cudf::io::read_avro`.
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
source_info: SourceInfo
The SourceInfo object to read the avro dataset from.
columns: list, default []
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved
skip_rows: size_type, default 0
The number of rows to skip.
num_rows: size_type, default -1
The number of rows to read, after skipping rows.
If -1 is passed, all rows will be read.

Returns
-------
TableWithMetadata
The Table and its corresponding metadata that was read in.
"""
cdef vector[string] c_columns
if columns is not None and len(columns) > 0:
c_columns.reserve(len(columns))
for col in columns:
c_columns.push_back(str(col).encode())

cdef avro_reader_options avro_opts = move(
avro_reader_options.builder(source_info.c_obj)
.columns(c_columns)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

I would perhaps rather rely on the default-constructed avro_reader_options object, and not repeat default values in this interface.

We could take optional skip_rows and num_rows and do:

cdef avro_reader_options opts = move(avro_reader_options.builder(source_info.c_obj))
if skip_rows is not None:
    opts = move(opts.skip_rows(skip_rows))

etc...

WDYT?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the old Avro (and I/O) Cython files do it my way (passing all arguments) in general.

Unless there's a C++ trap I'm missing, I think I'd prefer to keep it more compact like this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't mind doing it "all at once", it's just that (the old code did this too), with this mechanism we encode the "default" values in two places: once canonically in libcudf C++ "skip_rows = -1 means all of them", and once non-canonically here.

.skip_rows(skip_rows)
.num_rows(num_rows)
.build()
)

with nogil:
c_result = move(cpp_read_avro(avro_opts))

return TableWithMetadata.from_libcudf(c_result)
32 changes: 32 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_encoding,
column_in_metadata,
column_name_info,
compression_type,
dictionary_policy,
io_type,
partition_info,
quote_style,
sink_info,
source_info,
statistics_freq,
table_input_metadata,
table_metadata,
table_with_metadata,
)
from cudf._lib.pylibcudf.table cimport Table


cdef class TableWithMetadata:
cdef public Table tbl
wence- marked this conversation as resolved.
Show resolved Hide resolved
cdef table_metadata metadata

@staticmethod
cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)

cdef class SourceInfo:
cdef source_info c_obj
106 changes: 106 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.libcudf.io.types cimport (
host_buffer,
source_info,
table_with_metadata,
)

import errno
import io
import os


cdef class TableWithMetadata:
"""A container holding a table and its associated metadata
(e.g. column names)

For details, see :cpp:class:`cudf::io::table_with_metadata`.
"""

@property
def columns(self):
"""
Return a list containing the columns of the table
"""
return self.tbl._columns
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved

@property
def column_names(self):
"""
Return a list containing the column names of the table
"""
# TODO: Handle nesting (columns with child columns)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: perhaps raise if unhandled?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added an assert (since this hopefully should just be temporary until I get around to implementing a reader that can read child columns).

return [col_info.name.decode() for col_info in self.metadata.schema_info]

@staticmethod
cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta):
"""Create a Python TableWithMetadata from a libcudf table_with_metadata"""
cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata)
out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl))
out.metadata = tbl_with_meta.metadata
return out

cdef class SourceInfo:
"""A class containing details on a source to read from.

For details, see :cpp:class:`cudf::io::source_info`.

Parameters
----------
sources : List[Union[str, bytes, io.BytesIO]]
A homogeneous list of sources (this can be a string filename,
bytes, or an io.BytesIO) to read from.

Mixing different types of sources will raise a `ValueError`.
"""

def __init__(self, list sources):
if not sources:
raise ValueError("Need to pass at least one source")

cdef vector[string] c_files

if isinstance(sources[0], (os.PathLike, str)):
c_files.reserve(len(sources))

for src in sources:
if not isinstance(src, (os.PathLike, str)):
raise ValueError("All sources must be of the same type!")
if not os.path.isfile(src):
raise FileNotFoundError(errno.ENOENT,
os.strerror(errno.ENOENT),
src)

c_files.push_back(<string> str(src).encode())

self.c_obj = move(source_info(c_files))
return

# TODO: host_buffer is deprecated API, use host_span instead
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deprecated API usage was originally in make_source_info, which this code is based off of.

I tried to have a go at switching over to host_span, but my C++ isn't good enough for that yet 😭 .

IMO, it should be fine to keep it as is for now, and tackle the deprecated API usage in a followup (I'll open a followup issue).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably (sketch):

Add the follow declaration to utilities/host_span.pxd:

host_span(T*, size_type)

and then

from ...libcudf.utilities.host_span cimport host_span

cdef vector[host_span[char]] c_host_buffers

# fill host buffers
...
c_host_buffers.push_back(host_span[char](<char *>&c_buffer[0], c_buffer.shape[0]))

cdef host_span[host_span[char]] c_spans = host_span[host_span[char]](c_host_buffers)
...

cdef vector[host_buffer] c_host_buffers
cdef const unsigned char[::1] c_buffer
cdef bint empty_buffer = False
if isinstance(sources[0], bytes):
empty_buffer = True
for buffer in sources:
if not isinstance(buffer, bytes):
raise ValueError("All sources must be of the same type!")
if (len(buffer) > 0):
c_buffer = buffer
wence- marked this conversation as resolved.
Show resolved Hide resolved
c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
c_buffer.shape[0]))
empty_buffer = False
elif isinstance(sources[0], io.BytesIO):
for bio in sources:
if not isinstance(bio, io.BytesIO):
raise ValueError("All sources must be of the same type!")
c_buffer = bio.getbuffer() # check if empty?
c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
c_buffer.shape[0]))

self.c_obj = source_info(c_host_buffers)
Loading
Loading