diff --git a/Dockerfile b/Dockerfile
index a9cf57b6fe..31b42bd729 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -97,8 +97,8 @@ RUN bash -c 'if [ "$BUILD_CLIENTS_ONLY" != "1" ]; then \
 ############################################################################
 FROM ${TENSORFLOW_IMAGE} AS trtserver_build
 
-ARG TRTIS_VERSION=0.11.0dev
-ARG TRTIS_CONTAINER_VERSION=19.02dev
+ARG TRTIS_VERSION=0.11.0
+ARG TRTIS_CONTAINER_VERSION=19.02
 ARG PYVER=3.5
 ARG BUILD_CLIENTS_ONLY=0
@@ -247,8 +247,8 @@ ENTRYPOINT ["/opt/tensorrtserver/nvidia_entrypoint.sh"]
 ############################################################################
 FROM ${BASE_IMAGE}
 
-ARG TRTIS_VERSION=0.11.0dev
-ARG TRTIS_CONTAINER_VERSION=19.02dev
+ARG TRTIS_VERSION=0.11.0
+ARG TRTIS_CONTAINER_VERSION=19.02
 ARG PYVER=3.5
 
 ENV TENSORRT_SERVER_VERSION ${TRTIS_VERSION}
diff --git a/README.rst b/README.rst
index 3e14bace35..42f0507542 100644
--- a/README.rst
+++ b/README.rst
@@ -30,13 +30,179 @@ NVIDIA TensorRT Inference Server
 ================================
 
-**NOTE: You are currently on the r19.02 branch which tracks
-stabilization towards the next release. This branch is not usable
-during stabilization.**
-
 .. overview-begin-marker-do-not-remove
 
+The NVIDIA TensorRT Inference Server provides a cloud inferencing
+solution optimized for NVIDIA GPUs. The server provides an inference
+service via an HTTP or gRPC endpoint, allowing remote clients to
+request inferencing for any model being managed by the server.
+
+What's New In 0.11.0 Beta
+-------------------------
+
+* `Variable-size input and output tensor support
+  `_. Models
+  that support variable-size input tensors and produce variable-size
+  output tensors are now supported in the model configuration by using
+  a dimension size of -1 for those dimensions that can take on any
+  size.
+
+* `String datatype support
+  `_.
+  For TensorFlow models and custom backends, input and output tensors
+  can contain strings.
+
+* Improved support for non-GPU systems. The inference server will run
+  correctly on systems that do not contain GPUs and that do not have
+  nvidia-docker or CUDA installed.
+
+Features
+--------
+
+* `Multiple framework support
+  `_. The
+  server can manage any number and mix of models (limited by system
+  disk and memory resources). Supports TensorRT, TensorFlow GraphDef,
+  TensorFlow SavedModel and Caffe2 NetDef model formats. Also supports
+  TensorFlow-TensorRT integrated models.
+
+* `Custom backend support
+  `_. The inference server
+  allows individual models to be implemented with custom backends
+  instead of by a deep-learning framework. With a custom backend a
+  model can implement any logic desired, while still benefiting from
+  the GPU support, concurrent execution, dynamic batching and other
+  features provided by the server.
+
+* The inference server `monitors the model repository
+  `_
+  for any change and dynamically reloads the model(s) when necessary,
+  without requiring a server restart. Models and model versions can be
+  added and removed, and model configurations can be modified while
+  the server is running.
+
+* Multi-GPU support. The server can distribute inferencing across all
+  system GPUs.
+
+* `Concurrent model execution support
+  `_. Multiple
+  models (or multiple instances of the same model) can run
+  simultaneously on the same GPU.
+
+* Batching support. For models that support batching, the server can
+  accept requests for a batch of inputs and respond with the
+  corresponding batch of outputs. The inference server also supports
+  `dynamic batching
+  `_
+  where individual inference requests are dynamically combined
+  together to improve inference throughput. Dynamic batching is
+  transparent to the client requesting inference.
+
+* `Model repositories
+  `_
+  may reside on a locally accessible file system (e.g. NFS) or in
+  Google Cloud Storage.
+
+* Readiness and liveness `health endpoints
+  `_
+  suitable for any orchestration or deployment framework, such as
+  Kubernetes.
+
+* `Metrics
+  `_
+  indicating GPU utilization, server throughput, and server latency.
+
 .. overview-end-marker-do-not-remove
 
+The current release of the TensorRT Inference Server is 0.11.0 beta and
+corresponds to the 19.02 release of the tensorrtserver container on
+`NVIDIA GPU Cloud (NGC) `_. The branch for
+this release is `r19.02
+`_.
+
+Backwards Compatibility
+-----------------------
+
+The inference server is still in beta. As a result, we sometimes make
+non-backwards-compatible changes. You must rebuild the client
+libraries and any client applications you use to talk to the inference
+server to make sure they stay in sync with the server. For the clients
+you must use the GitHub branch corresponding to the server version.
+
+Compared to the r19.01 release, the 19.02 release has the following
+non-backwards-compatible changes:
+
+* The inference request header for inputs and outputs no longer allows
+  the byte_size field. See InferRequestHeader::Input and
+  InferRequestHeader::Output in `api.proto
+  `_.
+
+* The inference response header no longer returns the batch-1
+  byte_size field for each output. Instead, the shape and byte-size for
+  the full output batch are returned. See InferResponseHeader::Output
+  in `api.proto
+  `_.
+
+* The inference response header reports the model version as a 64-bit
+  integer (previously reported as an unsigned 32-bit integer). See
+  InferResponseHeader.model_version in `api.proto
+  `_,
+  InferRequest.model_version in `grpc_service.proto
+  `_,
+  and ModelStatus.version_status in `server_status.proto
+  `_.
+
+* For custom backends, the CustomGetOutputFn function signature has
+  changed to require the backend to report the shape of each computed
+  output. See CustomGetOutputFn_t in `custom.h
+  `_.
+
+Documentation
+-------------
+
+The User Guide, Developer Guide, and API Reference `documentation
+`_
+provide guidance on installing, building and running the latest
+release of the TensorRT Inference Server.
+
+You can also view the documentation for the `master branch
+`_
+and for `earlier releases
+`_.
+
+The `Release Notes
+`_
+and `Support Matrix
+`_
+indicate the required versions of the NVIDIA Driver and CUDA, and also
+describe which GPUs are supported by the inference server.
+
+Contributing
+------------
+
+Contributions to TensorRT Inference Server are more than welcome. To
+contribute, make a pull request and follow the guidelines outlined in
+the `Contributing `_ document.
+
+Reporting problems, asking questions
+------------------------------------
+
+We appreciate any feedback, questions or bug reports regarding this
+project. When help with code is needed, follow the process outlined in
+the Stack Overflow (https://stackoverflow.com/help/mcve)
+document. Ensure posted examples are:
+
+* minimal – use as little code as possible that still produces the
+  same problem
+
+* complete – provide all parts needed to reproduce the problem. Check
+  if you can strip external dependencies and still show the problem. The
+  less time we spend reproducing problems, the more time we have to
+  fix them.
+
+* verifiable – test the code you're about to provide to make sure it
+  reproduces the problem. Remove all other problems that are not
+  related to your request/question.
+
 .. |License| image:: https://img.shields.io/badge/License-BSD3-lightgrey.svg
    :target: https://opensource.org/licenses/BSD-3-Clause
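The readiness, liveness, and metrics endpoints described in the README
above can be exercised with a few lines of Python. The sketch below
assumes a local server using the documented default HTTP port (8000)
and metrics port (8002); adjust the URLs for your deployment::

  # Minimal liveness/readiness probe for a locally running inference
  # server. Ports and paths are the documented defaults and may need
  # to be adjusted for a particular deployment.
  import requests

  SERVER = "http://localhost:8000"
  METRICS = "http://localhost:8002/metrics"

  def is_live():
      # Returns True when the server process is up.
      return requests.get(SERVER + "/api/health/live").status_code == 200

  def is_ready():
      # Returns True only when the server is able to accept inference
      # requests, which is the signal an orchestrator such as
      # Kubernetes should use before routing traffic.
      return requests.get(SERVER + "/api/health/ready").status_code == 200

  if __name__ == "__main__":
      print("live:", is_live())
      print("ready:", is_ready())
      # The metrics endpoint returns Prometheus-format text that
      # includes GPU utilization and request latency counters.
      print(requests.get(METRICS).text[:400])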
diff --git a/VERSION b/VERSION
index bf20b933a5..d9df1bbc0c 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.11.0dev
+0.11.0
diff --git a/docs/client.rst b/docs/client.rst
index 92196084bc..8711533303 100644
--- a/docs/client.rst
+++ b/docs/client.rst
@@ -88,23 +88,25 @@ In the client image you can find the example executables in
 
 If your host system is Ubuntu-16.04, an alternative to running the
 examples within the tensorrtserver_clients container is to instead
-copy the libraries and examples from the docker image to the host
-system::
+download the client libraries and examples from the `GitHub release
+page `_
+corresponding to the release you are interested in::
 
-  $ docker run -it --rm -v/tmp:/tmp/host tensorrtserver_clients
-  # cp /opt/tensorrtserver/bin/image_client /tmp/host/.
-  # cp /opt/tensorrtserver/bin/perf_client /tmp/host/.
-  # cp /opt/tensorrtserver/bin/simple_client /tmp/host/.
-  # cp /opt/tensorrtserver/pip/tensorrtserver-*.whl /tmp/host/.
-  # cp /opt/tensorrtserver/lib/librequest.* /tmp/host/.
+  $ mkdir tensorrtserver_clients
+  $ cd tensorrtserver_clients
+  $ wget https://github.com/NVIDIA/tensorrt-inference-server/archive/v0.11.0.clients.tar.gz
+  $ tar xzf v0.11.0.clients.tar.gz
 
-You can now access the files from /tmp on the host system. To run the
-C++ examples you must install some dependencies on your host system::
+You can now find client example binaries in bin/, C++ libraries in
+lib/, and the Python client examples and wheel file in python/.
+
+To run the C++ examples you must install some dependencies on your
+Ubuntu-16.04 host system::
 
   $ apt-get install curl libcurl3-dev libopencv-dev libopencv-core-dev
 
 To run the Python examples you will need to additionally install the
-client whl file and some other dependencies::
+wheel file and some other dependencies::
 
   $ apt-get install python3 python3-pip
   $ pip3 install --user --upgrade tensorrtserver-*.whl numpy pillow
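Once the wheel from python/ is installed, a request against a running
server follows the pattern of the bundled simple_client example. The
sketch below is modeled on that example; the model name, tensor names,
and shapes are placeholders, and the exact class and method signatures
should be checked against the examples shipped with this release::

  # Sketch of a Python client request, modeled on the simple_client
  # example shipped with the release. Model name, input/output names
  # and shapes are placeholders for illustration only.
  import numpy as np
  from tensorrtserver.api import InferContext, ProtocolType

  # Connect to the HTTP endpoint of a local server for model "simple";
  # -1 selects the latest available model version.
  ctx = InferContext("localhost:8000", ProtocolType.HTTP, "simple", -1)

  batch_size = 1
  input0 = np.arange(16, dtype=np.int32)
  input1 = np.ones(16, dtype=np.int32)

  # One array per batch element for each input; ask for the raw tensor
  # contents of both outputs.
  result = ctx.run({"INPUT0": (input0,), "INPUT1": (input1,)},
                   {"OUTPUT0": InferContext.ResultFormat.RAW,
                    "OUTPUT1": InferContext.ResultFormat.RAW},
                   batch_size)

  print(result["OUTPUT0"][0])  # output tensor for batch element 0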
diff --git a/docs/model_configuration.rst b/docs/model_configuration.rst
index fc185a2d9a..1030215941 100644
--- a/docs/model_configuration.rst
+++ b/docs/model_configuration.rst
@@ -214,7 +214,7 @@ For TensorFlow each value is in the tensorflow namespace. For example,
 tensorflow::DT_FLOAT is the 32-bit floating-point value.
 
 For Caffe2 each value is in the caffe2 namespace and is prepended with
-TensorProto_DataType_. For example, caffe2::TensorProto_DataType_FLOAT
+TensorProto\_DataType\_. For example, caffe2::TensorProto_DataType_FLOAT
 is the 32-bit floating-point datatype.
 
 For Numpy each value is in the numpy module. For example, numpy.float32
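When preparing inputs in the Python client, each model-configuration
datatype corresponds to a numpy dtype in the same way. The mapping
below covers a few common cases and is illustrative only; consult the
datatype table in the model configuration documentation for the full
list::

  # Illustrative mapping from common model-configuration datatypes to
  # the numpy dtypes used when building client-side input tensors.
  # Verify against the datatype table in the documentation.
  import numpy as np

  CONFIG_TO_NUMPY = {
      "TYPE_FP32": np.float32,   # tensorflow::DT_FLOAT, caffe2 FLOAT
      "TYPE_FP16": np.float16,
      "TYPE_INT32": np.int32,
      "TYPE_INT8": np.int8,
      "TYPE_UINT8": np.uint8,
  }

  def make_input(shape, config_dtype):
      # Allocate an input tensor whose numpy dtype matches the datatype
      # declared in the model configuration.
      return np.zeros(shape, dtype=CONFIG_TO_NUMPY[config_dtype])

  x = make_input((3, 224, 224), "TYPE_FP32")
  print(x.dtype)  # float32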