# See
# https://github.com/shinh/opbench/blob/0a5d7a92cace9361f206866e2705bc98c9e07973/drivers/trt.py
# https://github.com/NERSC/inference_benchmarks/blob/a44d2594ae8daee53165e5f4ae468235ac471002/hep_cnn/onnx/run_tensorrt_onnx.py
from PIL import Image
import time
from collections import namedtuple
import utils
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- creates and activates a CUDA context for pycuda
from image_net_labels import labels
import backend
from cuda_profiler import cuda_profiler_start, cuda_profiler_stop

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class BackendTensorRT(backend.Backend):
    def __init__(self):
        super(BackendTensorRT, self).__init__()
        self.session = None
        self.batch_size = 1
        self.engine = None

    def name(self):
        return "tensorrt"

    def version(self):
        return trt.__version__

    def load(self, model_path):
        # Parse the ONNX model and build an optimized TensorRT engine from it.
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network() as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            with open(model_path, 'rb') as model:
                if not parser.parse(model.read()):
                    raise RuntimeError(
                        "failed to parse ONNX model: {}".format(model_path))
            builder.max_batch_size = self.batch_size
            # This determines the amount of memory available to the builder when
            # building an optimized engine and should generally be set as high
            # as possible.
            builder.max_workspace_size = 1 << 20
            # When the engine is built, TensorRT makes copies of the weights.
            self.engine = builder.build_cuda_engine(network)
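
    # A possible extension (an assumption, not part of the original file): on
    # GPUs with fast FP16 support, the builder could also be configured for
    # reduced precision before build_cuda_engine() is called, e.g.:
    #
    #     if builder.platform_has_fast_fp16:
    #         builder.fp16_mode = True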

    def forward_once(self):
        # Determine dimensions and create page-locked memory buffers
        # (i.e. buffers that won't be swapped to disk) to hold host inputs/outputs.
        h_input = cuda.pagelocked_empty(  # pycuda.driver.pagelocked_empty(shape, dtype, order="C", mem_flags=0)
            trt.volume(self.engine.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(
            trt.volume(self.engine.get_binding_shape(1)), dtype=np.float32)
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        context = self.engine.create_execution_context()
        # Transfer input data to the GPU and wait for the copy to finish so it
        # is not counted in the measured time.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        stream.synchronize()
        start = time.time()
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output)],
                              stream_handle=stream.handle)
        # execute_async returns as soon as the work is enqueued, so wait for the
        # stream before stopping the timer to measure the actual inference time.
        stream.synchronize()
        end = time.time()
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Return the inference time.
        return end - start

    def forward(self, warmup=True, num_warmup=100, num_iterations=100):
        if warmup:
            for i in range(num_warmup):
                self.forward_once()
        res = []
        cuda_profiler_start()
        for i in range(num_iterations):
            utils.debug("processing iteration = {}".format(i))
            res.append(self.forward_once())
        cuda_profiler_stop()
        return res
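

# Minimal usage sketch (an assumption, not part of the original driver): run the
# backend against an ONNX model and report the mean measured latency.
# "resnet50.onnx" is a hypothetical model path.
if __name__ == "__main__":
    backend_trt = BackendTensorRT()
    backend_trt.load("resnet50.onnx")
    timings = backend_trt.forward(warmup=True, num_warmup=10, num_iterations=100)
    print("{} v{}: mean latency = {:.6f} s".format(
        backend_trt.name(), backend_trt.version(), np.mean(timings)))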