Skip to content

Commit

Permalink
Move cudf._lib.search to cudf.core._internals (#17411)
Browse files Browse the repository at this point in the history
Contributes to #17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #17411
  • Loading branch information
mroeschke authored Nov 23, 2024
1 parent 8b7127f commit d1d4420
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 88 deletions.
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ set(cython_sources
rolling.pyx
round.pyx
scalar.pyx
search.pyx
sort.pyx
stream_compaction.pyx
string_casting.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
reshape,
rolling,
round,
search,
sort,
stream_compaction,
string_casting,
Expand Down
68 changes: 0 additions & 68 deletions python/cudf/cudf/_lib/search.pyx

This file was deleted.

56 changes: 56 additions & 0 deletions python/cudf/cudf/core/_internals/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from __future__ import annotations

from typing import TYPE_CHECKING, Literal

import pylibcudf as plc

from cudf._lib.column import Column
from cudf.core.buffer import acquire_spill_lock

if TYPE_CHECKING:
from cudf.core.column import ColumnBase


@acquire_spill_lock()
def search_sorted(
    source: list[ColumnBase],
    values: list[ColumnBase],
    side: Literal["left", "right"],
    ascending: bool = True,
    na_position: Literal["first", "last"] = "last",
) -> ColumnBase:
    """Find indices where elements should be inserted to maintain order.

    Parameters
    ----------
    source : list of columns
        Columns to search in. They are combined into a single table.
    values : list of columns
        Columns holding the values to search for. They are combined
        into a single table.
    side : {'left', 'right'}
        If 'left', the index of the first suitable location is given
        (``plc.search.lower_bound``). Any other value selects the last
        such index (``plc.search.upper_bound``).
    ascending : bool, default True
        Sort order assumed for every column of ``source``: ascending
        when True, descending otherwise.
    na_position : {'first', 'last'}, default 'last'
        Null ordering applied to every column of ``source``: 'last'
        places nulls after non-null values; any other value places
        them before.

    Returns
    -------
    ColumnBase
        The column produced by the selected pylibcudf bound function.
    """
    # Note: We are ignoring index columns here
    num_columns = len(source)

    # The same order / null precedence is applied to every source column.
    if ascending:
        order = plc.types.Order.ASCENDING
    else:
        order = plc.types.Order.DESCENDING
    column_order = [order] * num_columns

    if na_position == "last":
        null_order = plc.types.NullOrder.AFTER
    else:
        null_order = plc.types.NullOrder.BEFORE
    null_precedence = [null_order] * num_columns

    if side == "left":
        bound = plc.search.lower_bound
    else:
        bound = plc.search.upper_bound

    haystack = plc.Table([col.to_pylibcudf(mode="read") for col in source])
    needles = plc.Table([col.to_pylibcudf(mode="read") for col in values])
    positions = bound(haystack, needles, column_order, null_precedence)
    return Column.from_pylibcudf(positions)
23 changes: 20 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,7 @@ def indices_of(
raise ValueError("value must be a scalar")
else:
value = as_column(value, dtype=self.dtype, length=1)
mask = libcudf.search.contains(value, self)
mask = value.contains(self)
return apply_boolean_mask(
[as_column(range(0, len(self)), dtype=size_type_dtype)], mask
)[0]
Expand Down Expand Up @@ -914,7 +914,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
# self.isin(other) asks "which values of self are in other"
# contains(haystack, needles) asks "which needles are in haystack"
# hence this argument ordering.
result = libcudf.search.contains(rhs, self)
result = rhs.contains(self)
if self.null_count > 0:
# If one of the needles is null, then the result contains
# nulls, these nulls should be replaced by whether or not the
Expand Down Expand Up @@ -956,6 +956,23 @@ def is_monotonic_decreasing(self) -> bool:
[self], [False], None
)

def contains(self, other: ColumnBase) -> ColumnBase:
    """
    Check which values of ``other`` are present in this column.

    This column is the haystack and ``other`` holds the needles, i.e.
    the call is ``plc.search.contains(self, other)``.

    Parameters
    ----------
    other : Column
        A column of values to search for

    Returns
    -------
    Column
        The column returned by ``plc.search.contains``; callers use it
        as a boolean mask.
    """
    # Hold the spill lock while the pylibcudf views of both columns
    # are in use.
    with acquire_spill_lock():
        haystack = self.to_pylibcudf(mode="read")
        needles = other.to_pylibcudf(mode="read")
        found = plc.search.contains(haystack, needles)
        return Column.from_pylibcudf(found)

def sort_values(
self: Self,
ascending: bool = True,
Expand Down Expand Up @@ -1190,7 +1207,7 @@ def searchsorted(
raise ValueError(
"Column searchsorted expects values to be column of same dtype"
)
return libcudf.search.search_sorted(
return cudf.core._internals.search.search_sorted( # type: ignore[return-value]
[self],
[value],
side=side,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

import cudf
from cudf import _lib as libcudf
from cudf._lib.search import search_sorted
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals import unary
from cudf.core._internals.search import search_sorted
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
get_compatible_timezone,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ def __contains__(self, item: ScalarLike) -> bool:
except (TypeError, ValueError):
return False
# TODO: Use `scalar`-based `contains` wrapper
return libcudf.search.contains(
self, column.as_column([search_item], dtype=self.dtype)
return self.contains(
column.as_column([search_item], dtype=self.dtype)
).any()

def indices_of(self, value: ScalarLike) -> NumericalColumn:
Expand Down
10 changes: 2 additions & 8 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5857,14 +5857,8 @@ def sum(
return result_col

def __contains__(self, item: ScalarLike) -> bool:
if is_scalar(item):
return True in libcudf.search.contains(
self, column.as_column([item], dtype=self.dtype)
)
else:
return True in libcudf.search.contains(
self, column.as_column(item, dtype=self.dtype)
)
other = [item] if is_scalar(item) else item
return self.contains(column.as_column(other, dtype=self.dtype)).any()

def as_numerical_column(
self, dtype: Dtype
Expand Down
8 changes: 5 additions & 3 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
from collections import abc
from typing import TYPE_CHECKING, Any, Literal

# TODO: The `numpy` import is needed for typing purposes during doc builds
# only, need to figure out why the `np` alias is insufficient then remove.
import cupy
import numpy
import numpy as np
Expand All @@ -19,9 +17,13 @@
import pylibcudf as plc

import cudf

# TODO: The bare `import numpy` above (alongside `import numpy as np`) is
# needed for typing purposes during doc builds only; need to figure out why
# the `np` alias is insufficient, then remove it.
from cudf import _lib as libcudf
from cudf.api.types import is_dtype_equal, is_scalar
from cudf.core._compat import PANDAS_LT_300
from cudf.core._internals.search import search_sorted
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import (
ColumnBase,
Expand Down Expand Up @@ -1302,7 +1304,7 @@ def searchsorted(
for val, common_dtype in zip(values, common_dtype_list)
]

outcol = libcudf.search.search_sorted(
outcol = search_sorted(
sources,
values,
side,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import cudf
from cudf import _lib as libcudf
from cudf._lib.filling import sequence
from cudf._lib.search import search_sorted
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import (
Expand All @@ -32,6 +31,7 @@
)
from cudf.core._base_index import BaseIndex, _return_get_indexer_result
from cudf.core._compat import PANDAS_LT_300
from cudf.core._internals.search import search_sorted
from cudf.core.column import (
CategoricalColumn,
ColumnBase,
Expand Down

0 comments on commit d1d4420

Please sign in to comment.