Skip to content

Commit

Permalink
Fully implement metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
vyasr committed Sep 14, 2023
1 parent c62c71b commit d6eee4b
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 27 deletions.
2 changes: 2 additions & 0 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,12 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
* Converts the `cudf::scalar` to `arrow::Scalar`.
*
* @param input scalar that needs to be converted to arrow Scalar
* @param metadata Contains hierarchy of names of columns and children
* @param ar_mr arrow memory pool to allocate memory for arrow Scalar
* @return arrow Scalar generated from `input`
*/
std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
column_metadata const& metadata = {},
arrow::MemoryPool* ar_mr = arrow::default_memory_pool());
/**
* @brief Create `cudf::table` from given arrow Table input
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -457,12 +457,15 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr);
}

std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input, arrow::MemoryPool* ar_mr)
std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
column_metadata const& metadata,

arrow::MemoryPool* ar_mr)
{
auto stream = cudf::get_default_stream();
auto column = cudf::make_column_from_scalar(input, 1);
cudf::table_view tv{{column->view()}};
auto arrow_table = cudf::to_arrow(tv, {column_metadata{""}});
auto arrow_table = cudf::to_arrow(tv, {metadata});
auto ac = arrow_table->column(0);
auto maybe_scalar = ac->GetScalar(0);
if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); }
Expand Down
8 changes: 1 addition & 7 deletions python/cudf/cudf/_lib/cpp/interop.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,12 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
string name
vector[column_metadata] children_meta

# TODO: Adding this for pylibcudf because in pylibcudf we don't have column
# names. However we need to figure out how this will propagate up when cudf
# starts using pylibcudf for interop functionality.
cdef shared_ptr[CTable] to_arrow(
table_view input,
) except +

cdef shared_ptr[CTable] to_arrow(
table_view input,
vector[column_metadata] metadata,
) except +

cdef shared_ptr[CScalar] to_arrow(
const scalar& input,
column_metadata metadata,
) except +
11 changes: 9 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/interop.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,17 @@

from pyarrow.lib cimport Scalar as pa_Scalar, Table as pa_Table

from cudf._lib.cpp.interop cimport column_metadata

from .scalar cimport Scalar
from .table cimport Table


cdef class ColumnMetadata:
cdef public object name
cdef public object children_meta
cdef column_metadata to_c_metadata(self)

cpdef Table from_arrow(
pa_Table pyarrow_table,
)
Expand All @@ -14,6 +21,6 @@ cpdef Scalar from_arrow_scalar(
pa_Scalar pyarrow_scalar,
)

cpdef pa_Table to_arrow(Table tbl)
cpdef pa_Table to_arrow(Table tbl, list metadata)

cpdef pa_Scalar to_arrow_scalar(Scalar slr)
cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata)
35 changes: 31 additions & 4 deletions python/cudf/cudf/_lib/pylibcudf/interop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from cython.operator cimport dereference
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pyarrow.lib cimport (
CScalar as pa_CScalar,
CTable as pa_CTable,
Expand All @@ -15,6 +16,7 @@ from pyarrow.lib cimport (
)

from cudf._lib.cpp.interop cimport (
column_metadata,
from_arrow as cpp_from_arrow,
to_arrow as cpp_to_arrow,
)
Expand All @@ -25,6 +27,26 @@ from .scalar cimport Scalar
from .table cimport Table


cdef class ColumnMetadata:
def __init__(self, name):
self.name = name
self.children_meta = []

cdef column_metadata to_c_metadata(self):
"""Convert to C++ column_metadata.
Since this class is mutable and cheap, it is easier to create the C++
object on the fly rather than have it directly backing the storage for
the Cython class.
"""
cdef column_metadata c_metadata
cdef ColumnMetadata child_meta
c_metadata.name = self.name.encode()
for child_meta in self.children_meta:
c_metadata.children_meta.push_back(child_meta.to_c_metadata())
return c_metadata


cpdef Table from_arrow(
pa_Table pyarrow_table,
):
Expand Down Expand Up @@ -53,19 +75,24 @@ cpdef Scalar from_arrow_scalar(
return Scalar.from_libcudf(move(c_result))


cpdef pa_Table to_arrow(Table tbl):
cpdef pa_Table to_arrow(Table tbl, list metadata):
cdef shared_ptr[pa_CTable] c_result
cdef vector[column_metadata] c_metadata
cdef ColumnMetadata meta
for meta in metadata:
c_metadata.push_back(meta.to_c_metadata())

with nogil:
c_result = move(cpp_to_arrow(tbl.view()))
c_result = move(cpp_to_arrow(tbl.view(), c_metadata))

return pyarrow_wrap_table(c_result)


cpdef pa_Scalar to_arrow_scalar(Scalar slr):
cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata):
cdef shared_ptr[pa_CScalar] c_result
cdef column_metadata c_metadata = metadata.to_c_metadata()

with nogil:
c_result = move(cpp_to_arrow(dereference(slr.c_obj.get())))
c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()), c_metadata))

return pyarrow_wrap_scalar(c_result)
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/scalar.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@ cdef class Scalar:
@staticmethod
cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*)

cpdef to_pyarrow_scalar(self)
# TODO: Make sure I'm correct to avoid typing the metadata as
# ColumnMetadata, I assume that will cause circular cimport problems
cpdef to_pyarrow_scalar(self, metadata)
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,9 @@ cdef class Scalar:
)
return s

cpdef to_pyarrow_scalar(self):
cpdef to_pyarrow_scalar(self, metadata):
from .interop import to_arrow_scalar
return to_arrow_scalar(self)
return to_arrow_scalar(self, metadata)

cdef const scalar* get(self) except *:
return self.c_obj.get()
Expand Down
36 changes: 29 additions & 7 deletions python/cudf/cudf/_lib/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

import cudf
from cudf._lib.pylibcudf.interop import ColumnMetadata

from cudf._lib.pylibcudf.types cimport type_id

Expand All @@ -30,6 +31,7 @@ from cudf._lib.types import (
datetime_unit_map,
duration_unit_map,
)
from cudf.api.types import is_list_dtype, is_struct_dtype
from cudf.core.dtypes import ListDtype, StructDtype
from cudf.core.missing import NA, NaT

Expand Down Expand Up @@ -100,6 +102,25 @@ def _replace_nested_none(obj):
_replace_nested_none(v)


def gather_metadata(dtypes):
# dtypes is a dict mapping names to column dtypes
# This interface is a bit clunky, but it matches libcudf. May want to
# consider better approaches to building up the metadata eventually.
out = []
for name, dtype in dtypes.items():
v = ColumnMetadata(name)
if is_struct_dtype(dtype):
v.children_meta = gather_metadata(dtype.fields)
elif is_list_dtype(dtype):
# Offsets column is unnamed and has no children
v.children_meta.append(ColumnMetadata(""))
v.children_meta.extend(
gather_metadata({"": dtype.element_type})
)
out.append(v)
return out


cdef class DeviceScalar:

# I think this should be removable, except that currently the way that
Expand Down Expand Up @@ -159,8 +180,9 @@ cdef class DeviceScalar:
self._dtype = dtype

def _to_host_scalar(self):
metadata = gather_metadata({"": self.dtype})[0]
if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype):
ps = self.c_value.to_pyarrow_scalar()
ps = self.c_value.to_pyarrow_scalar(metadata)
if not ps.is_valid:
return NA
return ps.as_py()
Expand All @@ -170,38 +192,38 @@ cdef class DeviceScalar:
# overflow. However, the old implementation didn't handle these cases
# either, so we can leave that for a follow-up PR.
elif cudf.api.types.is_struct_dtype(self.dtype):
ps = self.c_value.to_pyarrow_scalar()
ps = self.c_value.to_pyarrow_scalar(metadata)
if not ps.is_valid:
return NA
ret = ps.as_py()
_replace_nested_none(ret)
return ret
elif cudf.api.types.is_list_dtype(self.dtype):
ps = self.c_value.to_pyarrow_scalar()
ps = self.c_value.to_pyarrow_scalar(metadata)
if not ps.is_valid:
return NA
ret = ps.as_py()
_replace_nested_none(ret)
return ret
elif pd.api.types.is_string_dtype(self.dtype):
ps = self.c_value.to_pyarrow_scalar()
ps = self.c_value.to_pyarrow_scalar(metadata)
if not ps.is_valid:
return NA
return ps.as_py()
elif pd.api.types.is_numeric_dtype(self.dtype):
ps = self.c_value.to_pyarrow_scalar()
ps = self.c_value.to_pyarrow_scalar(metadata)
if not ps.is_valid:
return NA
return ps.type.to_pandas_dtype()(ps.as_py())
elif pd.api.types.is_datetime64_dtype(self.dtype):
ps = self.c_value.to_pyarrow_scalar()
ps = self.c_value.to_pyarrow_scalar(metadata)
if not ps.is_valid:
return NaT
time_unit, _ = np.datetime_data(self.dtype)
# Cast to int64 to avoid overflow
return np.datetime64(ps.cast('int64').as_py(), time_unit)
elif pd.api.types.is_timedelta64_dtype(self.dtype):
ps = self.c_value.to_pyarrow_scalar()
ps = self.c_value.to_pyarrow_scalar(metadata)
if not ps.is_valid:
return NaT
time_unit, _ = np.datetime_data(self.dtype)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,14 +895,14 @@ def test_memory_usage():
"data, idx",
[
(
[[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": None}]],
[[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]],
0,
),
(
[
[
{"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"},
{"f1": "sf12", "f2": None},
{"f1": "sf12", "f2": NA},
]
],
0,
Expand Down

0 comments on commit d6eee4b

Please sign in to comment.