From 305182e58c19add98a5abd6a5b00d9b266f41732 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Fri, 22 Nov 2024 08:45:32 -0600
Subject: [PATCH] Enable unified memory by default in `cudf_polars` (#17375)

This PR makes CUDA unified (managed) memory the default memory resource
for `cudf_polars`: allocations are served from an RMM pool backed by
managed memory, with libcudf prefetching enabled for a small set of
access patterns. On systems that do not support concurrent managed
access, the previous non-managed asynchronous pool allocator is used
instead. The new default can be disabled by setting the
`POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable to `0`.
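For illustration, opting out looks like the following; a minimal sketch,
where the query itself is hypothetical and only the environment variable
and the `GPUEngine` API come from this PR:

```python
import os

# Opt out of managed memory; "0" falls back to the non-managed
# CUDA async pool allocator. Set before running GPU queries.
os.environ["POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY"] = "0"

import polars as pl

q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)
result = q.collect(engine=pl.GPUEngine())
```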
---------

Co-authored-by: Vyas Ramasubramani
Co-authored-by: Vyas Ramasubramani
Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Co-authored-by: Lawrence Mitchell
Co-authored-by: Matthew Murray
---
 .../cudf/source/cudf_polars/engine_options.md |  7 +++
 docs/cudf/source/cudf_polars/index.rst        |  6 ++
 python/cudf_polars/cudf_polars/callback.py    | 56 +++++++++++++++++--
 python/cudf_polars/tests/test_config.py       | 20 +++++++
 4 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/docs/cudf/source/cudf_polars/engine_options.md b/docs/cudf/source/cudf_polars/engine_options.md
index 4c930c7392d..afb2bb6e8b9 100644
--- a/docs/cudf/source/cudf_polars/engine_options.md
+++ b/docs/cudf/source/cudf_polars/engine_options.md
@@ -23,3 +23,10 @@ engine = GPUEngine(
 result = query.collect(engine=engine)
 ```
 Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect.
+
+## Disabling CUDA Managed Memory
+
+By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a
+non-managed asynchronous pool allocator is used instead.
+Managed memory can be turned off by setting `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` to `0`. System requirements for managed memory can be found [here](
+https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory).
diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst
index 6fd98a6b5da..a9b4bb2dff2 100644
--- a/docs/cudf/source/cudf_polars/index.rst
+++ b/docs/cudf/source/cudf_polars/index.rst
@@ -9,6 +9,12 @@ and run on the CPU.
 
 Benchmark
 ---------
+
+.. note::
+   The following benchmarks were performed with the ``POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY`` environment variable set to ``"0"``.
+   Using managed memory (the default) imposes a performance cost in order to avoid out-of-memory errors.
+   Peak performance can still be attained by setting the environment variable to ``"0"``.
+
 We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:
 
 .. figure:: ../_static/pds_benchmark_polars.png
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 7915c9e6b18..8dc5715195d 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -15,6 +15,7 @@
 
 from polars.exceptions import ComputeError, PerformanceWarning
 
+import pylibcudf
 import rmm
 from rmm._cuda import gpu
 
@@ -32,8 +33,26 @@
 
 __all__: list[str] = ["execute_with_cudf"]
 
+_SUPPORTED_PREFETCHES = {
+    "column_view::get_data",
+    "mutable_column_view::get_data",
+    "gather",
+    "hash_join",
+}
+
+
+def _env_get_int(name, default):
+    try:
+        return int(os.getenv(name, default))
+    except (ValueError, TypeError):  # pragma: no cover
+        return default  # pragma: no cover
+
+
 @cache
-def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
+def default_memory_resource(
+    device: int,
+    cuda_managed_memory: bool,  # noqa: FBT001
+) -> rmm.mr.DeviceMemoryResource:
     """
     Return the default memory resource for cudf-polars.
 
@@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
     device
         Disambiguating device id when selecting the device. Must be
         the active device when this function is called.
+    cuda_managed_memory
+        Whether to use managed memory or not.
 
     Returns
     -------
     rmm.mr.DeviceMemoryResource
         The default memory resource that cudf-polars uses. Currently
-        an async pool resource.
+        a managed memory resource if `cuda_managed_memory` is `True`;
+        otherwise, an async pool resource.
     """
     try:
-        return rmm.mr.CudaAsyncMemoryResource()
+        if (
+            cuda_managed_memory
+            and pylibcudf.utils._is_concurrent_managed_access_supported()
+        ):
+            # Allocate 80% of the available memory for the pool,
+            # leaving 20% headroom to avoid OOM errors.
+            free_memory, _ = rmm.mr.available_device_memory()
+            free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
+            for key in _SUPPORTED_PREFETCHES:
+                pylibcudf.experimental.enable_prefetching(key)
+            mr = rmm.mr.PrefetchResourceAdaptor(
+                rmm.mr.PoolMemoryResource(
+                    rmm.mr.ManagedMemoryResource(),
+                    initial_pool_size=free_memory,
+                )
+            )
+        else:
+            mr = rmm.mr.CudaAsyncMemoryResource()
     except RuntimeError as e:  # pragma: no cover
         msg, *_ = e.args
         if (
@@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
             ) from None
         else:
             raise
+    else:
+        return mr
 
 
 @contextlib.contextmanager
@@ -89,10 +130,15 @@ def set_memory_resource(
     at entry. If a memory resource is provided, it must be valid to
     use with the currently active device.
""" + previous = rmm.mr.get_current_device_resource() if mr is None: device: int = gpu.getDevice() - mr = default_memory_resource(device) - previous = rmm.mr.get_current_device_resource() + mr = default_memory_resource( + device=device, + cuda_managed_memory=bool( + _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0 + ), + ) rmm.mr.set_current_device_resource(mr) try: yield mr diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 25b71716eed..52c5c9894fe 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,6 +10,7 @@ import rmm +from cudf_polars.callback import default_memory_resource from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr): q.collect(engine=pl.GPUEngine(memory_resource=mr)) +@pytest.mark.parametrize("disable_managed_memory", ["1", "0"]) +def test_cudf_polars_enable_disable_managed_memory(monkeypatch, disable_managed_memory): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv( + "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", disable_managed_memory + ) + result = q.collect(engine=pl.GPUEngine()) + mr = default_memory_resource(0, bool(disable_managed_memory == "1")) + if disable_managed_memory == "1": + assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor) + assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource) + else: + assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource) + monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY") + assert_frame_equal(q.collect(), result) + + def test_explicit_device_zero(): q = pl.LazyFrame({"a": [1, 2, 3]})