[torchcodec] Add support for Nvidia GPU Decoding

Summary: X-link: pytorch#58 1. Add CUDA support to VideoDecoder.cpp. This is done by checking what device is passed into the options and using CUDA if the device type is cuda. 2. Add -DENABLE_CUDA flag in cmake. 3. Check ENABLE_CUDA environment variable in setup.py and pass it down to cmake if it is present. 4. Add a unit test to demonstrate that CUDA decoding does work. This uses a different tensor than the one from CPU decoding because hardware decoding is intrinsically a bit inaccurate. I generated the reference tensor by dumping the tensor from the GPU on my devVM. It is possible different Nvidia hardware show different outputs. How to test this in a more robust way is TBD. 5. Added a new parameter for cuda device index for `add_video_stream`. If this is present, we will use it to do hardware decoding on a CUDA device. There is a whole bunch of TODOs: 1. Currently GPU utilization is only 7-8% when decoding the video. We need to get this higher. 2. Speed it up compared to CPU implementation. Currently this is slower than CPU decoding even for HD videos (probably because we can't hide the CPU to GPU memcpy). However, decode+resize is faster as the benchmark says. Differential Revision: D59121006
ahmadsharif1 · Jul 31, 2024 · cdedfb4 · cdedfb4
1 parent 361968f
commit cdedfb4
Show file tree

Hide file tree

Showing 18 changed files with 390 additions and 23 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,6 +1,9 @@
 cmake_minimum_required(VERSION 3.18)
 project(TorchCodec)
 
+option(ENABLE_CUDA "Enable CUDA decoding using NVDEC" OFF)
+option(ENABLE_NVTX "Enable NVTX annotations for profiling" OFF)
+
 add_subdirectory(src/torchcodec/decoders/_core)
 
 

diff --git a/README.md b/README.md
@@ -134,3 +134,7 @@ guide](CONTRIBUTING.md) for more details.
 ## License
 
 TorchCodec is released under the [BSD 3 license](./LICENSE).
+
+
+If you are building with ENABLE_CUDA and/or ENABLE_NVTX please review
+[Nvidia licenses](https://docs.nvidia.com/cuda/eula/index.html).
diff --git a/benchmarks/decoders/BenchmarkDecodersMain.cpp b/benchmarks/decoders/BenchmarkDecodersMain.cpp
@@ -145,7 +145,8 @@ void runNDecodeIterationsWithCustomOps(
         /*height=*/std::nullopt,
         /*thread_count=*/std::nullopt,
         /*dimension_order=*/std::nullopt,
-        /*stream_index=*/std::nullopt);
+        /*stream_index=*/std::nullopt,
+        /*device_string=*/std::nullopt);
 
     for (double pts : ptsList) {
       seekFrameOp.call(decoderTensor, pts);

diff --git a/benchmarks/decoders/manual_benchmark.py b/benchmarks/decoders/manual_benchmark.py
@@ -0,0 +1,101 @@
+import argparse
+import os
+import time
+
+import torch.utils.benchmark as benchmark
+
+import torchcodec
+from torchvision.transforms import Resize
+
+
+def transfer_and_resize_frame(frame, device):
+    # This should be a no-op if the frame is already on the device.
+    frame = frame.to(device)
+    frame = Resize((256, 256))(frame)
+    return frame
+
+
+def decode_full_video(video_path, decode_device):
+    decoder = torchcodec.decoders._core.create_from_file(video_path)
+    num_threads = None
+    if "cuda" in decode_device:
+        num_threads = 1
+    torchcodec.decoders._core.add_video_stream(
+        decoder, stream_index=0, device_string=decode_device, num_threads=num_threads
+    )
+    start_time = time.time()
+    frame_count = 0
+    while True:
+        try:
+            frame, *_ = torchcodec.decoders._core.get_next_frame(decoder)
+            # You can do a resize to simulate extra preproc work that happens
+            # on the GPU by uncommenting the following line:
+            # frame = transfer_and_resize_frame(frame, decode_device)
+
+            frame_count += 1
+        except Exception as e:
+            print("EXCEPTION", e)
+            break
+        # print(f"current {frame_count=}", flush=True)
+    end_time = time.time()
+    elapsed = end_time - start_time
+    fps = frame_count / (end_time - start_time)
+    print(
+        f"****** DECODED full video {decode_device=} {frame_count=} {elapsed=} {fps=}"
+    )
+    return frame_count, end_time - start_time
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--devices",
+        default="cuda:0,cpu",
+        type=str,
+        help="Comma-separated devices to test decoding on.",
+    )
+    parser.add_argument(
+        "--video",
+        type=str,
+        default=os.path.dirname(__file__) + "/../../test/resources/nasa_13013.mp4",
+    )
+    parser.add_argument(
+        "--use_torch_benchmark",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Use pytorch benchmark to measure decode time with warmup and "
+            "autorange. Without this we just run one iteration without warmup "
+            "to measure the cold start time."
+        ),
+    )
+    args = parser.parse_args()
+    video_path = args.video
+
+    if not args.use_torch_benchmark:
+        for device in args.devices.split(","):
+            print("Testing on", device)
+            decode_full_video(video_path, device)
+        return
+
+    results = []
+    for device in args.devices.split(","):
+        print("device", device)
+        t = benchmark.Timer(
+            stmt="decode_full_video(video_path, device)",
+            globals={
+                "device": device,
+                "video_path": video_path,
+                "decode_full_video": decode_full_video,
+            },
+            label="Decode+Resize Time",
+            sub_label=f"video={os.path.basename(video_path)}",
+            description=f"decode_device={device}",
+        ).blocked_autorange()
+        results.append(t)
+    compare = benchmark.Compare(results)
+    compare.print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
@@ -112,12 +112,16 @@ def _build_all_extensions_with_cmake(self):
         torch_dir = Path(torch.utils.cmake_prefix_path) / "Torch"
         cmake_build_type = os.environ.get("CMAKE_BUILD_TYPE", "Release")
         python_version = sys.version_info
+        enable_cuda = os.environ.get("ENABLE_CUDA", "")
+        enable_nvtx = os.environ.get("ENABLE_NVTX", "")
         cmake_args = [
             f"-DCMAKE_INSTALL_PREFIX={self._install_prefix}",
             f"-DTorch_DIR={torch_dir}",
             "-DCMAKE_VERBOSE_MAKEFILE=ON",
             f"-DCMAKE_BUILD_TYPE={cmake_build_type}",
             f"-DPYTHON_VERSION={python_version.major}.{python_version.minor}",
+            f"-DENABLE_CUDA={enable_cuda}",
+            f"-DENABLE_NVTX={enable_nvtx}",
         ]
 
         Path(self.build_temp).mkdir(parents=True, exist_ok=True)

diff --git a/src/torchcodec/decoders/_core/CMakeLists.txt b/src/torchcodec/decoders/_core/CMakeLists.txt
@@ -4,6 +4,21 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 find_package(Torch REQUIRED)
+find_package(CUDA REQUIRED)
+
+file(
+  DOWNLOAD
+  https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.38.3/CPM.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake
+  EXPECTED_HASH SHA256=cc155ce02e7945e7b8967ddfaff0b050e958a723ef7aad3766d368940cb15494
+)
+include(${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake)
+CPMAddPackage(
+    NAME NVTX
+    GITHUB_REPOSITORY NVIDIA/NVTX
+    GIT_TAG v3.1.0-c-cpp
+    GIT_SHALLOW TRUE)
+
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
 
@@ -19,6 +34,12 @@ function(make_torchcodec_library library_name ffmpeg_target)
     )
     add_library(${library_name} SHARED ${sources})
     set_property(TARGET ${library_name} PROPERTY CXX_STANDARD 17)
+    if(ENABLE_CUDA)
+        target_compile_definitions(${library_name} PRIVATE ENABLE_CUDA=1)
+    endif()
+    if(ENABLE_NVTX)
+        target_compile_definitions(${library_name} PRIVATE ENABLE_NVTX=1)
+    endif()
 
     target_include_directories(
         ${library_name}
@@ -34,6 +55,8 @@ function(make_torchcodec_library library_name ffmpeg_target)
         ${ffmpeg_target}
         ${TORCH_LIBRARIES}
         ${Python3_LIBRARIES}
+        ${CUDA_CUDA_LIBRARY}
+        nvtx3-cpp
     )
 
     # We already set the library_name to be libtorchcodecN, so we don't want

diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.h b/src/torchcodec/decoders/_core/FFMPEGCommon.h
@@ -57,6 +57,8 @@ using UniqueAVFilterInOut = std::unique_ptr<
     Deleterp<AVFilterInOut, void, avfilter_inout_free>>;
 using UniqueAVIOContext = std::
     unique_ptr<AVIOContext, Deleterp<AVIOContext, void, avio_context_free>>;
+using UniqueAVBufferRef =
+    std::unique_ptr<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>;
 
 // av_find_best_stream is not const-correct before commit:
 // https://github.com/FFmpeg/FFmpeg/commit/46dac8cf3d250184ab4247809bc03f60e14f4c0c