From 7fd6918f9f4bbfc499bc60a3532a464c357da4f4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 5 Jun 2024 20:48:10 -0500 Subject: [PATCH] Migrate strings `contains` operations to `pylibcudf` (#15880) This PR creates pylibcudf strings `contains` APIs and migrates the cuDF cython to leverage them. Part of https://github.com/rapidsai/cudf/issues/15162. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15880 --- .../api_docs/pylibcudf/strings/contains.rst | 6 ++ .../api_docs/pylibcudf/strings/index.rst | 1 + .../pylibcudf/libcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/libcudf/strings/regex_flags.pxd | 13 +++-- .../pylibcudf/libcudf/strings/regex_flags.pyx | 0 .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 11 +++- .../cudf/_lib/pylibcudf/strings/__init__.py | 11 +++- .../cudf/_lib/pylibcudf/strings/contains.pxd | 7 +++ .../cudf/_lib/pylibcudf/strings/contains.pyx | 41 ++++++++++++++ .../_lib/pylibcudf/strings/regex_flags.pxd | 2 + .../_lib/pylibcudf/strings/regex_flags.pyx | 4 ++ .../_lib/pylibcudf/strings/regex_program.pxd | 10 ++++ .../_lib/pylibcudf/strings/regex_program.pyx | 37 +++++++++++++ python/cudf/cudf/_lib/strings/contains.pyx | 23 +++----- .../pylibcudf_tests/test_regex_program.py | 13 +++++ .../pylibcudf_tests/test_string_contains.py | 55 +++++++++++++++++++ 17 files changed, 215 insertions(+), 25 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_regex_program.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_contains.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst new file mode 100644 index 00000000000..e5745331bc7 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst @@ -0,0 +1,6 @@ +======== +contains +======== + +.. automodule:: cudf._lib.pylibcudf.strings.contains + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 8970fc80c0b..bfaef732555 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -4,4 +4,5 @@ strings .. toctree:: :maxdepth: 1 + contains replace diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt index 930c22781d0..bd6e2e0af02 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx) +set(cython_sources char_types.pyx regex_flags.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd index 2a5701fa6a3..41617f157b7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd @@ -1,9 +1,12 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t + cdef extern from "cudf/strings/regex/flags.hpp" \ namespace "cudf::strings" nogil: - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' + cpdef enum class regex_flags(int32_t): + DEFAULT + MULTILINE + DOTALL diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index c9a983e24f4..cb7f71b1912 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx + regex_program.pyx replace.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 7563df8a107..959aa94737d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find, replace +from . cimport ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index cb4f0e38f97..b7384913286 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find, replace +from . import ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd new file mode 100644 index 00000000000..275aa95d97e --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re(Column input, RegexProgram prog) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx new file mode 100644 index 00000000000..8c598b7c953 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which match the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::contains_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd new file mode 100644 index 00000000000..79937bf574a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..903c2ddd503 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \ + regex_flags as RegexFlags # no-cython-lint diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd new file mode 100644 index 00000000000..61ed268fb2d --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + + +cdef class RegexProgram: + cdef unique_ptr[regex_program] c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx new file mode 100644 index 00000000000..d605b0aba02 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + +from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags +from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags + + +cdef class RegexProgram: + + def __init__(self, *args, **kwargs): + raise ValueError("Do not instantiate RegexProgram directly, use create") + + @staticmethod + def create(str pattern, int flags): + cdef unique_ptr[regex_program] c_prog + cdef regex_flags c_flags + cdef string c_pattern = pattern.encode() + + cdef RegexProgram ret = RegexProgram.__new__(RegexProgram) + if isinstance(flags, object): + if isinstance(flags, (int, RegexFlags)): + c_flags = flags + with nogil: + c_prog = regex_program.create(c_pattern, c_flags) + + ret.c_obj = move(c_prog) + else: + raise ValueError("flags must be of type RegexFlags") + + return ret diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 087acd8062d..502a1d14696 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.libcudf.strings.contains cimport ( - contains_re as cpp_contains_re, count_re as cpp_count_re, like as cpp_like, matches_re as cpp_matches_re, @@ -23,6 +22,9 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.pylibcudf.strings import contains +from cudf._lib.pylibcudf.strings.regex_program import RegexProgram + @acquire_spill_lock() def contains_re(Column source_strings, object reg_ex, uint32_t flags): @@ -30,21 +32,10 @@ def contains_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column of boolean values with True for `source_strings` that contain regular expression `reg_ex`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_contains_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py new file mode 100644 index 00000000000..3a9bcec3616 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize("pat", ["(", "*", "\\"]) +def test_regex_program_invalid(pat): + with pytest.raises(RuntimeError): + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py new file mode 100644 index 00000000000..8cdb6f7c521 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_target_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +@pytest.fixture(scope="module") +def plc_target_col(pa_target_col): + return plc.interop.from_arrow(pa_target_col) + + +@pytest.fixture( + params=[ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], + scope="module", +) +def pa_target_scalar(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture(scope="module") +def plc_target_pat(pa_target_scalar): + prog = plc.strings.regex_program.RegexProgram.create( + pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT + ) + return prog + + +def test_contains_re( + pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat +): + got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) + expected = pa.compute.match_substring_regex( + pa_target_col, pa_target_scalar.as_py() + ) + assert_column_eq(got, expected)