
Add Bfloat16 Benchmark and Benchmark Suite #71

Open
wants to merge 9 commits into main
5 changes: 5 additions & 0 deletions .gitignore
@@ -160,3 +160,8 @@ jupyter_execute
.nvcode/

llvm.sh*

# nsys profiles
*cuda_gpu_kern_sum.json
*.sqlite
*.nsys-rep
6 changes: 0 additions & 6 deletions numbast/src/numbast/__init__.py
@@ -3,12 +3,6 @@

import numba

# Use pynvjitlink by default. This can avoid version mismatch between system driver and
# installed CTK version.
from pynvjitlink.patch import patch_numba_linker

patch_numba_linker()

from numbast import numba_patch

from numbast.struct import bind_cxx_struct, bind_cxx_structs
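With the import-time patch removed, enabling pynvjitlink becomes an explicit, per-run choice. A minimal sketch of the two opt-in routes (the first mirrors the removed lines; the second is what run_benchmark.sh below does via --env-var):

# Route 1: patch the Numba linker explicitly in user code,
# exactly as numbast previously did on import.
from pynvjitlink.patch import patch_numba_linker
patch_numba_linker()

# Route 2: set NUMBA_CUDA_ENABLE_PYNVJITLINK=1 in the environment so
# Numba enables pynvjitlink itself, with no numbast involvement.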
11 changes: 8 additions & 3 deletions numbast/src/numbast/numba_patch.py
@@ -36,7 +36,7 @@
# added.


def nvrtc_compile(src, name, cc):
def nvrtc_compile(src, name, cc, ltoir=False):
"""
Compile a CUDA C/C++ source to PTX for a given compute capability.

@@ -67,6 +67,8 @@ def nvrtc_compile(src, name, cc):
    numba_include = f"-I{numba_cuda_path}"
    options = [arch, *extra_include_paths, include, numba_include, "-rdc", "true"]
    options += extra_options
    if ltoir:
        options.append("-dlto")

    # Compile the program
    compile_error = nvrtc.compile_program(program, options)
@@ -84,8 +86,11 @@ def nvrtc_compile(src, name, cc):
        msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}"
        warnings.warn(msg)

    ptx = nvrtc.get_ptx(program)
    return ptx, log
    if ltoir:
        return nvrtc.get_lto(program), log
    else:
        ptx = nvrtc.get_ptx(program)
        return ptx, log


# Monkey-patch the existing implementation
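A hedged sketch of how the patched helper is meant to be called; treating `cc` as a `(major, minor)` compute-capability tuple is an assumption based on how Numba passes it elsewhere, and `-dlto` asks NVRTC to emit LTO-IR instead of PTX:

src = 'extern "C" __global__ void noop() {}'
# Default path: returns PTX text, as before this change.
ptx, log = nvrtc_compile(src, "noop.cu", (8, 6))
# New path: returns LTO-IR suitable for link-time optimization.
ltoir, log = nvrtc_compile(src, "noop.cu", (8, 6), ltoir=True)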
63 changes: 63 additions & 0 deletions numbast_extensions/benchmarks/analyze.py
@@ -0,0 +1,63 @@
import click
import json
import pandas as pd


@click.command()
@click.argument(
    "gold_name", type=click.Path(exists=True, dir_okay=False, file_okay=True)
)
@click.argument(
    "py_lto_off_name", type=click.Path(exists=True, dir_okay=False, file_okay=True)
)
@click.argument(
    "py_lto_on_name", type=click.Path(exists=True, dir_okay=False, file_okay=True)
)
def compare_gpu_kern(gold_name, py_lto_off_name, py_lto_on_name):
    """Read profile results from the gold run and the Numba kernels, then compare them.

    GOLD_NAME: JSON profile result of the gold kernel.
    PY_LTO_OFF_NAME: JSON profile result of the Numba kernel with LTO disabled.
    PY_LTO_ON_NAME: JSON profile result of the Numba kernel with LTO enabled.
    """
    with open(gold_name, "r") as goldf:
        gold_kerns = json.load(goldf)
    with open(py_lto_off_name, "r") as pyf:
        lto_off_kerns = json.load(pyf)
    with open(py_lto_on_name, "r") as pyf:
        lto_on_kerns = json.load(pyf)

    gold_kern = gold_kerns[0]
    lto_off_kern = lto_off_kerns[0]
    lto_on_kern = lto_on_kerns[0]

    columns = [
        "GOLD: " + gold_kern["Name"],
        "NUMBA LTO OFF: " + lto_off_kern["Name"],
        "NUMBA LTO ON: " + lto_on_kern["Name"],
    ]
    index = [k for k in gold_kern.keys() if k != "Name"]
    values = [
        (gold_kern[k], lto_off_kern[k], lto_on_kern[k])
        for k in gold_kern.keys()
        if k != "Name"
    ]

    df = pd.DataFrame(data=values, index=index, columns=columns)

    print(df)

    print("Perf Ratio (NUMBA LTO OFF / GOLD, %): ")
    diff = df.iloc[:, 1] / df.iloc[:, 0] * 100
    diff.index = diff.index.str.strip("%ns)").str.strip("( ")
    print(diff[["Avg", "Med", "Min", "Max", "StdDev"]])

    print("---------")

    print("Perf Ratio (NUMBA LTO ON / GOLD, %): ")
    diff = df.iloc[:, 2] / df.iloc[:, 0] * 100
    diff.index = diff.index.str.strip("%ns)").str.strip("( ")
    print(diff[["Avg", "Med", "Min", "Max", "StdDev"]])


if __name__ == "__main__":
    compare_gpu_kern()
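analyze.py indexes each record by column names such as `Avg (ns)`, then strips the units before selecting rows. A sketch of the record shape it assumes from `nsys stats --report cuda_gpu_kern_sum --format json` (key names can vary across nsys versions; the values below are placeholders, not measurements):

record = {
    "Name": "simple_kernel(float *)",
    "Time (%)": 100.0,
    "Total Time (ns)": 0,
    "Instances": 1000,
    "Avg (ns)": 0.0,
    "Med (ns)": 0.0,
    "Min (ns)": 0,
    "Max (ns)": 0,
    "StdDev (ns)": 0.0,
}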
46 changes: 46 additions & 0 deletions numbast_extensions/benchmarks/run_benchmark.sh
@@ -0,0 +1,46 @@
#!/bin/bash

export NUMBAST_BENCH_KERN_REPETITION=1000

BENCH_NAME=test_arithmetic_bf16

PY_NAME=${BENCH_NAME}.py
PY_PTX=${BENCH_NAME}_py.ptx

GOLD_NAME=${BENCH_NAME}_gold
GOLD_SRC_NAME=${GOLD_NAME}.cu
GOLD_PTX=${GOLD_NAME}.ptx

COMPUTE_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader|head -n 1)
SMCC=sm_${COMPUTE_CAP//./}
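# ${COMPUTE_CAP//./} deletes the dot, e.g. compute_cap "8.6" becomes sm_86;
# head -n 1 above keys the build to the first GPU on multi-GPU machines.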

# Cleanup
rm -rf *.json *.nsys-rep *.sqlite $GOLD_NAME

# Compile gold
nvcc --gpu-architecture=$SMCC $GOLD_SRC_NAME -o $GOLD_NAME

# Prof gold
nsys profile --trace cuda --force-overwrite true -o gold.nsys-rep $GOLD_NAME

# Prof py LTO OFF
nsys profile --trace cuda --force-overwrite true -o py_lto_off.nsys-rep --env-var NUMBA_CUDA_ENABLE_PYNVJITLINK=1 python $PY_NAME --lto False

# Prof py LTO ON
nsys profile --trace cuda --force-overwrite true -o py_lto_on.nsys-rep --env-var NUMBA_CUDA_ENABLE_PYNVJITLINK=1 python $PY_NAME --lto True

# Create gold nsys stat report
nsys stats --report cuda_gpu_kern_sum --format json --output . gold.nsys-rep

# Create py LTO OFF nsys stat report
nsys stats --report cuda_gpu_kern_sum --format json --output . py_lto_off.nsys-rep

# Create py LTO ON nsys stat report
nsys stats --report cuda_gpu_kern_sum --format json --output . py_lto_on.nsys-rep

echo "Benchmark completes!"
echo "The below compares the performance between gold and Numba."
echo ""

# Compare stat report
python analyze.py gold_cuda_gpu_kern_sum.json py_lto_off_cuda_gpu_kern_sum.json py_lto_on_cuda_gpu_kern_sum.json
49 changes: 49 additions & 0 deletions numbast_extensions/benchmarks/test_arithmetic_bf16.py
@@ -0,0 +1,49 @@
import click
import os
import warnings

import numba.cuda as cuda
import numpy as np
from numba import float32

from numbast_extensions.bf16 import (
    nv_bfloat16,
    get_shims,
)


repetition_char = os.getenv("NUMBAST_BENCH_KERN_REPETITION", None)
if repetition_char is None:
    warnings.warn(
        "Unable to retrieve NUMBAST_BENCH_KERN_REPETITION environment variable "
        "in `py`. Assuming repetition 1000."
    )
    repetition = 1000
else:
    repetition = int(repetition_char)


@click.command()
@click.option("--lto", type=click.BOOL, required=True)
def _run(lto):
    @cuda.jit(link=get_shims(), lto=lto)
    def kernel(arith):
        # Binary Arithmetic Operators
        a = nv_bfloat16(1.0)
        b = nv_bfloat16(2.0)

        arith[0] = float32(a + b)
        arith[1] = float32(a - b)
        arith[2] = float32(a * b)
        arith[3] = float32(a / b)

    arith = np.zeros(4, dtype=np.float32)

    for _ in range(repetition):
        kernel[1, 1](arith)

    assert all(arith == [3.0, -1.0, 2.0, 0.5])


if __name__ == "__main__":
    _run()
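One observation, not a change in this PR: passing the host NumPy array makes Numba copy it to and from the device around every launch, whereas the gold binary below copies once outside its loop. A sketch of the loop using Numba's device-array API, which would match the gold transfer pattern:

d_arith = cuda.to_device(arith)      # one host-to-device copy
for _ in range(repetition):
    kernel[1, 1](d_arith)            # launches reuse the device buffer
cuda.synchronize()
arith = d_arith.copy_to_host()       # one device-to-host copy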
48 changes: 48 additions & 0 deletions numbast_extensions/benchmarks/test_arithmetic_bf16_gold.cu
@@ -0,0 +1,48 @@
#include <cstdlib>
#include <iostream>
#include <string>

#include <cuda_bf16.h>

__global__ void simple_kernel(float *arith) {
  // Binary Arithmetic Operators
  nv_bfloat16 a = nv_bfloat16(1.0f);
  nv_bfloat16 b = nv_bfloat16(2.0f);
  arith[0] = float(a + b);
  arith[1] = float(a - b);
  arith[2] = float(a * b);
  arith[3] = float(a / b);
}

int main(void) {
  char *repetition_char = std::getenv("NUMBAST_BENCH_KERN_REPETITION");
  if (repetition_char == nullptr)
    std::cout << "Unable to retrieve NUMBAST_BENCH_KERN_REPETITION environment "
                 "variable in `gold`. Assuming repetition 1000."
              << std::endl;
  int repetition =
      repetition_char ? std::stoi(std::string(repetition_char)) : 1000;

  int N = 4;
  float *arith, *arith_d;
  arith = (float *)malloc(N * sizeof(float));

  cudaMalloc(&arith_d, N * sizeof(float));

  for (int i = 0; i < N; i++) {
    arith[i] = 0.0f;
  }
  cudaMemcpy(arith_d, arith, N * sizeof(float), cudaMemcpyHostToDevice);

  for (int i = 0; i < repetition; i++)
    simple_kernel<<<1, 1>>>(arith_d);

  cudaDeviceSynchronize();

  cudaMemcpy(arith, arith_d, N * sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(arith_d);
  free(arith);

  return 0;
}