From 305182e58c19add98a5abd6a5b00d9b266f41732 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Fri, 22 Nov 2024 08:45:32 -0600
Subject: [PATCH] Enable unified memory by default in `cudf_polars` (#17375)

This PR makes CUDA unified (managed) memory the default memory resource
for `cudf_polars`: allocations are served from an RMM pool backed by
managed memory, with libcudf prefetching enabled for a small set of
access patterns. On systems that do not support concurrent managed
access, the previous non-managed asynchronous pool allocator is used
instead. The new default can be disabled by setting the
`POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable to `0`.
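For illustration, opting out looks like the following; a minimal sketch,
where the query itself is hypothetical and only the environment variable
and the `GPUEngine` API come from this PR:

```python
import os

# Opt out of managed memory; "0" falls back to the non-managed
# CUDA async pool allocator. Set before running GPU queries.
os.environ["POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY"] = "0"

import polars as pl

q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)
result = q.collect(engine=pl.GPUEngine())
```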
---------

Co-authored-by: Vyas Ramasubramani
Co-authored-by: Vyas Ramasubramani
Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Co-authored-by: Lawrence Mitchell
Co-authored-by: Matthew Murray
---
 .../cudf/source/cudf_polars/engine_options.md |  7 +++
 docs/cudf/source/cudf_polars/index.rst        |  6 ++
 python/cudf_polars/cudf_polars/callback.py    | 56 +++++++++++++++++--
 python/cudf_polars/tests/test_config.py       | 20 +++++++
 4 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/docs/cudf/source/cudf_polars/engine_options.md b/docs/cudf/source/cudf_polars/engine_options.md
index 4c930c7392d..afb2bb6e8b9 100644
--- a/docs/cudf/source/cudf_polars/engine_options.md
+++ b/docs/cudf/source/cudf_polars/engine_options.md
@@ -23,3 +23,10 @@ engine = GPUEngine(
 result = query.collect(engine=engine)
 ```
 Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect.
+
+## Disabling CUDA Managed Memory
+
+By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a
+non-managed asynchronous pool allocator is used instead.
+Managed memory can be turned off by setting `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` to `0`. System requirements for managed memory can be found [here](
+https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory).
diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst
index 6fd98a6b5da..a9b4bb2dff2 100644
--- a/docs/cudf/source/cudf_polars/index.rst
+++ b/docs/cudf/source/cudf_polars/index.rst
@@ -9,6 +9,12 @@ and run on the CPU.
 
 Benchmark
 ---------
+
+.. note::
+   The following benchmarks were performed with the ``POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY`` environment variable set to ``"0"``.
+   Using managed memory (the default) imposes a performance cost in order to avoid out-of-memory errors.
+   Peak performance can still be attained by setting the environment variable to ``"0"``.
+
 We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare the Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:
 
 .. figure:: ../_static/pds_benchmark_polars.png
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 7915c9e6b18..8dc5715195d 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -15,6 +15,7 @@
 
 from polars.exceptions import ComputeError, PerformanceWarning
 
+import pylibcudf
 import rmm
 from rmm._cuda import gpu
 
@@ -32,8 +33,26 @@
 
 __all__: list[str] = ["execute_with_cudf"]
 
+_SUPPORTED_PREFETCHES = {
+    "column_view::get_data",
+    "mutable_column_view::get_data",
+    "gather",
+    "hash_join",
+}
+
+
+def _env_get_int(name, default):
+    try:
+        return int(os.getenv(name, default))
+    except (ValueError, TypeError):  # pragma: no cover
+        return default  # pragma: no cover
+
+
 @cache
-def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
+def default_memory_resource(
+    device: int,
+    cuda_managed_memory: bool,  # noqa: FBT001
+) -> rmm.mr.DeviceMemoryResource:
     """
     Return the default memory resource for cudf-polars.
 
@@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
     device
         Disambiguating device id when selecting the device. Must be
         the active device when this function is called.
+    cuda_managed_memory
+        Whether to use managed memory or not.
 
     Returns
     -------
     rmm.mr.DeviceMemoryResource
         The default memory resource that cudf-polars uses. Currently
-        an async pool resource.
+        a managed memory resource if `cuda_managed_memory` is `True`;
+        otherwise, an async pool resource.
     """
     try:
-        return rmm.mr.CudaAsyncMemoryResource()
+        if (
+            cuda_managed_memory
+            and pylibcudf.utils._is_concurrent_managed_access_supported()
+        ):
+            # Allocate 80% of the available memory for the pool,
+            # leaving 20% headroom to avoid OOM errors.
+            free_memory, _ = rmm.mr.available_device_memory()
+            free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
+            for key in _SUPPORTED_PREFETCHES:
+                pylibcudf.experimental.enable_prefetching(key)
+            mr = rmm.mr.PrefetchResourceAdaptor(
+                rmm.mr.PoolMemoryResource(
+                    rmm.mr.ManagedMemoryResource(),
+                    initial_pool_size=free_memory,
+                )
+            )
+        else:
+            mr = rmm.mr.CudaAsyncMemoryResource()
     except RuntimeError as e:  # pragma: no cover
         msg, *_ = e.args
         if (
@@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
             ) from None
         else:
             raise
+    else:
+        return mr
 
 
 @contextlib.contextmanager
@@ -89,10 +130,15 @@ def set_memory_resource(
     at entry. If a memory resource is provided, it must be valid to
     use with the currently active device.
""" + previous = rmm.mr.get_current_device_resource() if mr is None: device: int = gpu.getDevice() - mr = default_memory_resource(device) - previous = rmm.mr.get_current_device_resource() + mr = default_memory_resource( + device=device, + cuda_managed_memory=bool( + _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0 + ), + ) rmm.mr.set_current_device_resource(mr) try: yield mr diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 25b71716eed..52c5c9894fe 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,6 +10,7 @@ import rmm +from cudf_polars.callback import default_memory_resource from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr): q.collect(engine=pl.GPUEngine(memory_resource=mr)) +@pytest.mark.parametrize("disable_managed_memory", ["1", "0"]) +def test_cudf_polars_enable_disable_managed_memory(monkeypatch, disable_managed_memory): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv( + "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", disable_managed_memory + ) + result = q.collect(engine=pl.GPUEngine()) + mr = default_memory_resource(0, bool(disable_managed_memory == "1")) + if disable_managed_memory == "1": + assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor) + assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource) + else: + assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource) + monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY") + assert_frame_equal(q.collect(), result) + + def test_explicit_device_zero(): q = pl.LazyFrame({"a": [1, 2, 3]})