Add sanity tests for parallel instance loading #6126

Merged
13 commits merged on Aug 10, 2023
190 changes: 187 additions & 3 deletions qa/L0_lifecycle/lifecycle_test.py
@@ -31,6 +31,7 @@
sys.path.append("../common")

import concurrent.futures
import json
import os
import shutil
import signal
@@ -2789,7 +2790,7 @@ def test_load_gpu_limit(self):
except Exception as ex:
self.assertTrue(False, "unexpected error {}".format(ex))

def test_concurrent_load_speedup(self):
def test_concurrent_model_load_speedup(self):
# Initialize client
try:
triton_client = grpcclient.InferenceServerClient(
@@ -2835,7 +2836,7 @@ def test_concurrent_load_speedup(self):
for model_name in model_pair:
self.assertTrue(triton_client.is_model_ready(model_name))

def test_concurrent_load(self):
def test_concurrent_model_load(self):
# Initialize client
try:
triton_client = grpcclient.InferenceServerClient(
@@ -2864,7 +2865,7 @@ def test_concurrent_load(self):
model_metadata = triton_client.get_model_metadata("identity_model")
self.assertEqual(model_metadata.platform, "python")

def test_concurrent_load_unload(self):
def test_concurrent_model_load_unload(self):
# Initialize client
try:
triton_client = grpcclient.InferenceServerClient(
@@ -2916,6 +2917,189 @@ def test_concurrent_load_unload(self):
for model_name in model_names:
self.assertEqual(is_load, triton_client.is_model_ready(model_name))

def test_concurrent_model_instance_load_speedup(self):
# Initialize client
try:
triton_client = httpclient.InferenceServerClient(
"localhost:8000", verbose=True
)
except Exception as ex:
self.assertTrue(False, "unexpected error {}".format(ex))
models = ["identity_fp32"]
# Create 2 instances which each have a delay time of 10 seconds.
num_instances = 2
instance_group = [{"kind": "KIND_CPU", "count": num_instances}]
config = {"instance_group": instance_group}
for model in models:
# Instances should be loaded concurrently for supported backends
start_time = time.time()
try:
triton_client.load_model(model, config=json.dumps(config))
except Exception as ex:
self.assertTrue(False, "unexpected error {}".format(ex))
end_time = time.time()
loading_time = end_time - start_time
print(f"Time to load {num_instances} instances: {loading_time}")

# Each of the two instances has a minimum loading delay of 10 seconds
# Speedup is observed when the concurrent loading time < 20 seconds
# but use a tighter bound of 15 seconds
self.assertLess(
loading_time, 15.0, "Concurrent loading speedup not observed"
)
# Concurrent loading time cannot be < 10 seconds
self.assertGreaterEqual(
loading_time, 10.0, "Invalid concurrent loading time"
)
# Make sure the models are loaded
self.assertTrue(triton_client.is_server_live())
self.assertTrue(triton_client.is_server_ready())
self.assertTrue(triton_client.is_model_ready(model))

def _call_with_timeout(self, callable, timeout_secs):
# Setup handler for timing out call
def timeout_handler(sig, frame):
raise TimeoutError()

signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(timeout_secs)
result = callable()
return result

def _call_with_expected_timeout(self, callable, timeout_secs=3):
# Call callable with expectation that it will timeout
try:
self._call_with_timeout(callable, timeout_secs)
except TimeoutError:
print("Inference timed out as expected.")
return
except Exception as ex:
self.assertTrue(False, "unexpected error {}".format(ex))
else:
self.assertTrue(False, "unexpected success, call should've timed out.")

def _get_fp32_io(self, client_type):
# Config
input_names = ["INPUT0", "INPUT1"]
output_names = ["OUTPUT0", "OUTPUT1"]
dtype, dims, shape = ("TYPE_FP32", [-1, 16], [1, 16])
input_config = [
{"name": name, "data_type": dtype, "dims": dims} for name in input_names
]
output_config = [
{"name": name, "data_type": dtype, "dims": dims} for name in output_names
]
# Inputs
inputs = []
for name in input_names:
inputs.append(
client_type.InferInput(name, shape, dtype.replace("TYPE_", ""))
)
inputs[-1].set_data_from_numpy(np.ones(shape, dtype=np.float32))
return input_config, output_config, inputs

def test_concurrent_model_instance_load_sanity(self):
Contributor:

Just for my understanding: is the goal of this test to check whether the included backends are capable of loading model instances concurrently?

Contributor Author:

The goal is to check that nothing crashes and burns internally when trying to load instances in parallel for the supported backends. All it does is load N instances of various kinds, ensure that exactly N instances are loaded (via sequence batch slot logic), and complete a successful inference (no exception is raised on the N inferences).
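Concretely, the core of the check looks roughly like this (a condensed sketch of the test body further down in this diff, reusing triton_client, model, inputs, and the _call_with_expected_timeout helper defined there):

num_instances = 5
# Each started sequence occupies one instance's slot until the sequence idles out,
# so N successful sequence starts imply at least N usable instances.
for seq_id in range(1, num_instances + 1):
    triton_client.infer(model, inputs, sequence_id=seq_id, sequence_start=True)

# With all N slots held, an (N + 1)th sequence cannot be scheduled, so this extra
# request should hit the client-side timeout, i.e. there are no more than N instances.
self._call_with_expected_timeout(
    partial(
        triton_client.infer,
        model,
        inputs,
        sequence_id=num_instances + 1,
        sequence_start=True,
    ),
    timeout_secs=3,
)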

Contributor Author:

Open to ideas if you think there's a better way to test this.

Contributor:

I think the test case does not belong here, because L0_lifecycle is for testing the correctness of the Triton core, while the correctness of the backend implementation belongs in each backend's own tests. Since the test is similar for all backends, maybe a new L0_backend_thread_safety (or something similar) can be added for this test?
@GuanLuo @tanmayv25 should correct me if you think otherwise

Contributor Author:

@kthui do you have any feedback on the test itself other than how it is named? Or do you think it is sufficient for now?

Also @Tabrizian @tanmayv25 any feedback?

Member:

I was wondering how this is testing the parallel instance feature. I see that it checks whether all the instances are ready or not, but I don't see anything in this specific case that exercises parallel instance loading. I believe this test case would have passed before this feature too. Please correct me if I'm missing anything.

Contributor Author:

@Tabrizian you are correct that this subtest doesn't explicitly test the parallelism in terms of speedup; I did that in the other subtest, as it is easy to do for a Python model, similar to the existing parallel model loading tests.

This subtest was written as more of a sanity check that, when loading a model with a significant instance count (N=5, though I can probably increase this), (1) nothing crashes and burns internally, since higher instance counts make it more likely that a bug would surface as a race condition, segfault, hang, etc., and (2) exactly N instances (not more, not fewer, due to any possible race condition) do actually get loaded and can handle inference.

Upon reflection though, I can probably modify this test to do a serial load (env var) and then a parallel load and just check that the parallel load is faster at all. Initially I was afraid of this check introducing flakiness, but ideally it should consistently be faster on the types of CI systems we have with higher core counts.

Let me know if that makes sense, or if you have other ideas.
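For reference, a rough sketch of that serial-vs-parallel comparison (not part of this PR; the mechanism for disabling parallel instance loading is shown only as a placeholder and would still need to be decided):

import json
import time

import tritonclient.http as httpclient


def time_instance_load(url="localhost:8000", model="identity_fp32", count=5):
    # Time a single load_model call for a model configured with `count` instances,
    # where each instance has an artificial delay in initialize().
    client = httpclient.InferenceServerClient(url)
    config = {"instance_group": [{"kind": "KIND_CPU", "count": count}]}
    start = time.time()
    client.load_model(model, config=json.dumps(config))
    return time.time() - start


# serial_secs would be measured against a server started with parallel instance
# loading disabled (placeholder switch, e.g. some DISABLE_PARALLEL_INSTANCE_LOADING
# env var), parallel_secs against a default server, and the assertion is simply:
#     self.assertLess(parallel_secs, serial_secs)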

cpu, gpu = "KIND_CPU", "KIND_GPU"
default_kinds = [cpu, gpu]
backend_kinds = {"plan": [gpu], "openvino": [cpu]}
try:
client_type = httpclient
triton_client = client_type.InferenceServerClient(
"localhost:8000", verbose=True
)
except Exception as ex:
self.assertTrue(False, "unexpected error {}".format(ex))

backends = os.environ.get("PARALLEL_BACKENDS", "").split()
self.assertTrue(len(backends) > 0, "PARALLEL_BACKENDS wasn't set")

num_instances = 5
input_config, output_config, inputs = self._get_fp32_io(client_type)
for backend in backends:
model = tu.get_model_name(backend, np.float32, np.float32, np.float32)
kinds = backend_kinds.get(backend, default_kinds)
for kind in kinds:
with self.subTest(backend=backend, model=model, kind=kind):
# Setup model config
instance_group = {"kind": kind, "count": num_instances}
# Disable batching to guarantee 1 request per instance
# Configure sequence batching such that each instance cannot accept new requests
# while it is busy with an ongoing sequence. This way we can guarantee sending 1 request to each instance.
max_batch_size = 0
sequence_timeout_secs = 10
sequence_batching = {
"direct": {},
"max_sequence_idle_microseconds": sequence_timeout_secs
* 1000000,
}
config = {
"instance_group": instance_group,
"max_batch_size": max_batch_size,
"sequence_batching": sequence_batching,
"input": input_config,
"output": output_config,
}
print(
f"~~~ Backend: [{backend}], Model: [{model}], Config: [{config}] ~~~"
)
# Load the model
try:
triton_client.load_model(model, config=json.dumps(config))
except Exception as ex:
self.assertTrue(False, "unexpected error {}".format(ex))

# Make sure the model is loaded
self.assertTrue(triton_client.is_server_live())
self.assertTrue(triton_client.is_model_ready(model))
print(
"Model Repository Index after load:",
triton_client.get_model_repository_index(),
)

# Test inference on each instance
for i in range(1, num_instances + 1):
try:
triton_client.infer(
model, inputs, sequence_id=i, sequence_start=True
)
except Exception as ex:
self.assertTrue(
False, "unexpected inference error {}".format(ex)
)

# Each instance should be busy until their sequence times out, so
# an additional infer call should time out. If it doesn't time out, something
# is wrong and the test should fail.
callable = partial(
triton_client.infer,
model,
inputs,
sequence_id=num_instances + 1,
sequence_start=True,
)
self._call_with_expected_timeout(callable, timeout_secs=3)

# Unload the model
try:
triton_client.unload_model(model)
except Exception as ex:
self.assertTrue(False, "unexpected error {}".format(ex))

# Allow server to fully unload model before next test iteration
num_tries = 10
for i in range(num_tries):
if triton_client.is_server_ready():
Member:

Does the server become not ready when a model is unloaded? I was wondering whether we should make unload_model blocking; I think it is really confusing that it is async. (cc @kthui @GuanLuo). Also, even if it stays async, we should probably update this example to better demonstrate the usage: https://github.com/triton-inference-server/client/blob/f7c45d382639f31a5162666034fae7852681f082/src/python/examples/simple_grpc_model_control.py#L98-L101

Contributor Author (@rmccorm4, Aug 4, 2023):

Does the server become not ready when a model is unloaded?

Server ready is true when all models are ready for inferencing, so while a model is in the unloading state, server is considered not ready.

I was wondering whether we should make unload_model blocking? I think it is really confusing that it is async.

I have been thinking about this recently when seeing Jacky's model update tests requiring polling, Kris's BLS test unload issues, and this test as well. I second that it's definitely confusing and unintuitive.

I do believe we document in most places that it returns without waiting for the model to finish unloading. However, I still think it is unintuitive and not symmetric with load_model, which is blocking.

Contributor Author (@rmccorm4, Aug 4, 2023):

NOTE: I understand there's good reason for an async unload to allow inflight requests to finish without a long blocking/wait time, but it would be nice to have an alternative or optional blocking call to support both.

Couldn't the client also perform the blocking call asynchronously to do other work while waiting if needed?
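For example, a small client-side wrapper could give callers a blocking unload today (a sketch only, not an existing API; it assumes the HTTP client's JSON repository index, a list of dicts with "name" and "state" fields):

import time


def unload_model_blocking(client, model_name, timeout_secs=60, poll_secs=1):
    # Issue the async unload, then poll the repository index until the model
    # stops reporting a READY/UNLOADING state (or the timeout expires).
    client.unload_model(model_name)
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        index = client.get_model_repository_index()
        entry = next((m for m in index if m.get("name") == model_name), None)
        if entry is None or entry.get("state") not in ("READY", "UNLOADING"):
            return
        time.sleep(poll_secs)
    raise TimeoutError(f"{model_name} did not finish unloading within {timeout_secs}s")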

Contributor:

My reasoning is that when unload returns, the model is no longer accessible from the client. Does it matter from the client's perspective whether the model is fully cleaned up on the server side when unload returns?

break
print(
f"[Attempt {i}] Server not ready yet, sleeping and retrying. Current repository index: {triton_client.get_model_repository_index()}"
)
time.sleep(6)
print(
"Model Repository Index after unload attempts:",
triton_client.get_model_repository_index(),
)
self.assertTrue(triton_client.is_server_ready())


if __name__ == "__main__":
unittest.main()
88 changes: 82 additions & 6 deletions qa/L0_lifecycle/test.sh
@@ -1713,7 +1713,7 @@ wait $SERVER_PID

LOG_IDX=$((LOG_IDX+1))

# LifeCycleTest.test_concurrent_load_speedup
# LifeCycleTest.test_concurrent_model_load_speedup
rm -rf models
mkdir models
MODEL_NAME="identity_zero_1_int32"
@@ -1743,7 +1743,7 @@ if [ "$SERVER_PID" == "0" ]; then
fi

set +e
python $LC_TEST LifeCycleTest.test_concurrent_load_speedup >>$CLIENT_LOG 2>&1
python $LC_TEST LifeCycleTest.test_concurrent_model_load_speedup >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Failed\n***"
@@ -1756,7 +1756,7 @@ wait $SERVER_PID

LOG_IDX=$((LOG_IDX+1))

# LifeCycleTest.test_concurrent_load
# LifeCycleTest.test_concurrent_model_load
rm -rf models models_v1 models_v2
mkdir models models_v2
cp -r identity_zero_1_int32 models/identity_model && \
@@ -1778,7 +1778,7 @@ if [ "$SERVER_PID" == "0" ]; then
fi

set +e
python $LC_TEST LifeCycleTest.test_concurrent_load >>$CLIENT_LOG 2>&1
python $LC_TEST LifeCycleTest.test_concurrent_model_load >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Failed\n***"
@@ -1791,7 +1791,7 @@ wait $SERVER_PID

LOG_IDX=$((LOG_IDX+1))

# LifeCycleTest.test_concurrent_load_unload
# LifeCycleTest.test_concurrent_model_load_unload
rm -rf models
mkdir models
cp -r identity_zero_1_int32 models && mkdir -p models/identity_zero_1_int32/1
@@ -1813,7 +1813,7 @@ if [ "$SERVER_PID" == "0" ]; then
fi

set +e
python $LC_TEST LifeCycleTest.test_concurrent_load_unload >>$CLIENT_LOG 2>&1
python $LC_TEST LifeCycleTest.test_concurrent_model_load_unload >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Failed\n***"
@@ -1824,6 +1824,82 @@ set -e
kill $SERVER_PID
wait $SERVER_PID

# LifeCycleTest.test_concurrent_model_instance_load_speedup
rm -rf models
mkdir models
MODEL_NAME="identity_fp32"
cp -r ../python_models/${MODEL_NAME} models/ && (cd models/${MODEL_NAME} && \
mkdir 1 && mv model.py 1 && \
echo "    def initialize(self, args):" >> 1/model.py && \
echo "        import time" >> 1/model.py && \
echo "        time.sleep(10)" >> 1/model.py)
rm models/${MODEL_NAME}/config.pbtxt

SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit"
SERVER_LOG="./inference_server_$LOG_IDX.log"
run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

set +e
python $LC_TEST LifeCycleTest.test_concurrent_model_instance_load_speedup >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Failed\n***"
RET=1
fi
set -e

kill $SERVER_PID
wait $SERVER_PID

LOG_IDX=$((LOG_IDX+1))

# LifeCycleTest.test_concurrent_model_instance_load_sanity
rm -rf models
mkdir models
# Sanity check loading multiple instances in parallel for each supported backend
PARALLEL_BACKENDS="python onnx"
for backend in ${PARALLEL_BACKENDS} ; do
model="${backend}_float32_float32_float32"
model_dir="models/${model}"
if [[ $backend == "python" ]]; then
cp -r ../python_models/identity_fp32 ${model_dir}
mkdir ${model_dir}/1 && mv ${model_dir}/model.py ${model_dir}/1
rm ${model_dir}/config.pbtxt
else
mkdir models/${model}
cp -r $DATADIR/qa_model_repository/${model}/1 models/${model}/1
fi
done

SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --log-verbose=2"
SERVER_LOG="./inference_server_$LOG_IDX.log"
run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

set +e
PARALLEL_BACKENDS=${PARALLEL_BACKENDS} python $LC_TEST LifeCycleTest.test_concurrent_model_instance_load_sanity >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Failed\n***"
RET=1
fi
set -e

kill $SERVER_PID
wait $SERVER_PID

LOG_IDX=$((LOG_IDX+1))


if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
fi
5 changes: 4 additions & 1 deletion qa/L0_onnx_optimization/test.sh
@@ -61,8 +61,11 @@ for MODEL in \
models/${MODEL}_test && \
rm -fr models/${MODEL}_test/2 && \
rm -fr models/${MODEL}_test/3 && \
# Set instance count > 1 to test parallel instance loading across all EPs
INSTANCE_COUNT=5
(cd models/${MODEL}_test && \
sed -i 's/_float32_float32_float32/&_test/' config.pbtxt) && \
sed -i 's/_float32_float32_float32/&_test/' config.pbtxt && \
echo -e "\ninstance_group { count: ${INSTANCE_COUNT} }" >> config.pbtxt) && \
# CUDA EP optimization params
cp -r models/${MODEL}_test models/${MODEL}_cuda_config && \
(cd models/${MODEL}_cuda_config && \
5 changes: 5 additions & 0 deletions qa/python_models/bls_model_loading/model.py
@@ -24,6 +24,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import time
import unittest

import numpy as np
@@ -38,6 +39,10 @@ def tearDown(self):
# The unload call does not wait for the requested model to be fully
# unloaded before returning.
pb_utils.unload_model(self.model_name)
# TODO: Make this more robust to wait until fully unloaded
print("Sleep 30 seconds to make sure model finishes unloading...")
time.sleep(30)
print("Done sleeping.")
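# A more robust alternative (sketch only, not part of this change) would be to
# poll readiness instead of sleeping for a fixed time, e.g.:
#   for _ in range(30):
#       if not pb_utils.is_model_ready(model_name=self.model_name):
#           break
#       time.sleep(1)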

def test_load_unload_model(self):
self.assertFalse(pb_utils.is_model_ready(model_name=self.model_name))