-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
test: TC for Metric P0 nv_load_time per model #7697
base: main
Are you sure you want to change the base?
Changes from 14 commits
c9e8c6a
5b1f62f
d421e49
d47ebe5
3fcc649
8447d01
748d3c5
b93d774
9f3f577
0cfd16c
f745073
f07f5ef
b752a5b
513a301
9329e55
fdf05c7
4764717
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How come the core PR was merged way before this one finished? We currently have no ongoing tests for the merged feature on our nightly pipelines in core, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It was approved in parallel. A couple of days apart. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
# /usr/bin/python | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
import os | ||
import re | ||
import time | ||
import unittest | ||
|
||
import requests | ||
|
||
_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") | ||
MODEL_LOAD_TIME = "nv_model_load_duration_secs{model=" | ||
|
||
|
||
def get_model_load_times():
    """Scrape the Triton metrics endpoint and collect model load durations.

    Returns:
        dict mapping model name -> {version: load_time_seconds}, parsed from
        nv_model_load_duration_secs lines; empty when the metric is absent.
    """
    metrics_url = f"http://{_tritonserver_ipaddr}:8002/metrics"
    response = requests.get(metrics_url)
    response.raise_for_status()
    # One Prometheus sample per line, e.g.:
    # nv_model_load_duration_secs{model="m",version="1"} 1.23
    pattern = re.compile(
        r"nv_model_load_duration_secs\{model=\"(.*?)\",version=\"(.*?)\"\} (.*)"
    )
    load_times = {}
    for line in response.text.strip().split("\n"):
        parsed = pattern.match(line)
        if parsed is None:
            continue
        name, version, seconds = parsed.groups()
        load_times.setdefault(name, {})[version] = float(seconds)
    return load_times
|
||
|
||
def load_model_explicit(model_name, server_url="http://localhost:8000"):
    """Request an explicit load of *model_name* via Triton's repository API.

    A failure is logged but intentionally does not raise, so the calling
    test can detect the problem through the metrics checks instead.

    Bug fix: the original used ``self.assertEqual`` inside this module-level
    function, which always raised ``NameError: name 'self' is not defined``
    on the failure path; replaced with a plain status-code check.
    """
    endpoint = f"{server_url}/v2/repository/models/{model_name}/load"
    response = requests.post(endpoint)
    if response.status_code == 200:
        print(f"Model '{model_name}' loaded successfully.")
    else:
        print(
            f"Failed to load model '{model_name}'. Status code: {response.status_code}"
        )
        print("Response:", response.text)
|
||
|
||
def unload_model_explicit(model_name, server_url="http://localhost:8000"):
    """Request an explicit unload of *model_name* via Triton's repository API.

    A failure is logged but intentionally does not raise, so the calling
    test can detect the problem through the metrics checks instead.

    Bug fix: the original used ``self.assertEqual`` inside this module-level
    function, which always raised ``NameError: name 'self' is not defined``
    on the failure path; replaced with a plain status-code check.
    """
    endpoint = f"{server_url}/v2/repository/models/{model_name}/unload"
    response = requests.post(endpoint)
    if response.status_code == 200:
        print(f"Model '{model_name}' unloaded successfully.")
    else:
        print(
            f"Failed to unload model '{model_name}'. Status code: {response.status_code}"
        )
        print("Response:", response.text)
|
||
|
||
class TestGeneralMetrics(unittest.TestCase):
    """Tests for the per-model nv_model_load_duration_secs metric.

    Each test assumes the driving shell script has already started the
    server (and loaded/unloaded models) in the appropriate control mode.
    """

    def setUp(self):
        # Model preloaded in default mode / loaded explicitly by the script.
        self.model_name = "libtorch_float32_float32_float32"
        # Model with two versions: v1 sleeps 10s in initialize(), v2 loads fast.
        self.model_name_multiple_versions = "input_all_optional"

    def test_metrics_load_time(self):
        """Default control mode: metric is present for the preloaded model."""
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name, {}).get("1")

        self.assertIsNotNone(load_time, "Model Load time not found")

        # Exactly one model is expected in the repository for this test.
        dict_size = len(model_load_times)
        self.assertEqual(
            dict_size, 1, "Expected exactly one model_load_time entry"
        )

    def test_metrics_load_time_explicit_load(self):
        """Explicit control mode: metric appears after the model is loaded."""
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name, {}).get("1")

        self.assertIsNotNone(load_time, "Model Load time not found")

        dict_size = len(model_load_times)
        self.assertEqual(
            dict_size, 1, "Expected exactly one model_load_time entry"
        )

    def test_metrics_load_time_explicit_unload(self):
        """Explicit control mode: metric is removed after the model unloads."""
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name, {}).get("1")
        self.assertIsNone(load_time, "Model Load time found even after unload")

    def test_metrics_load_time_multiple_version_reload(self):
        # Part 0: check start condition, the metric should not be present yet.
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name, {}).get("1")
        self.assertIsNone(load_time, "Model Load time found even before model load")

        # Part 1: load multiple versions of the same model and check that the
        # slow (v1) and fast (v2) versions reflect the metric correctly.
        load_model_explicit(self.model_name_multiple_versions)
        model_load_times = get_model_load_times()
        load_time_slow = model_load_times.get(
            self.model_name_multiple_versions, {}
        ).get("1")
        load_time_fast = model_load_times.get(
            self.model_name_multiple_versions, {}
        ).get("2")
        # Fail the test if load_time_slow is less than load_time_fast.
        self.assertGreaterEqual(
            load_time_slow,
            load_time_fast,
            "Slow load time should be greater than or equal to fast load time",
        )
        # v1's initialize() sleeps for 10 seconds, so its load duration must
        # be at least that long.
        self.assertGreaterEqual(
            load_time_slow,
            10,
            "Slow model load time should be at least the 10 second manual delay",
        )
        # Generous 2-second upper bound for the fast version's load time.
        self.assertLess(
            load_time_fast,
            2,
            "Model taking too much time to load",
        )

        # Part 2: load the same versions AGAIN and expect identical values,
        # as Triton does not actually reload an already-loaded model.
        load_model_explicit(self.model_name_multiple_versions)
        model_load_times_new = get_model_load_times()
        load_time_slow_new = model_load_times_new.get(
            self.model_name_multiple_versions, {}
        ).get("1")
        load_time_fast_new = model_load_times_new.get(
            self.model_name_multiple_versions, {}
        ).get("2")
        self.assertEqual(load_time_fast_new, load_time_fast)
        self.assertEqual(load_time_slow_new, load_time_slow)

        # Part 3: unload the model and expect the metrics to go away, as the
        # model is no longer loaded.
        unload_model_explicit(self.model_name_multiple_versions)
        time.sleep(1)  # brief pause so the server can update its metrics
        model_load_times_new = get_model_load_times()
        load_time_slow_new = model_load_times_new.get(
            self.model_name_multiple_versions, {}
        ).get("1")
        load_time_fast_new = model_load_times_new.get(
            self.model_name_multiple_versions, {}
        ).get("2")
        self.assertIsNone(load_time_slow_new, "Model Load time found even after unload")
        self.assertIsNone(load_time_fast_new, "Model Load time found even after unload")
|
||
|
||
# Allow running the whole suite directly; CI invokes individual tests via pytest.
if __name__ == "__main__":
    unittest.main()
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,7 +45,6 @@ SERVER=${TRITON_DIR}/bin/tritonserver | |
BASE_SERVER_ARGS="--model-repository=${MODELDIR}" | ||
SERVER_ARGS="${BASE_SERVER_ARGS}" | ||
SERVER_LOG="./inference_server.log" | ||
PYTHON_TEST="metrics_config_test.py" | ||
source ../common/util.sh | ||
|
||
CLIENT_LOG="client.log" | ||
|
@@ -132,12 +131,52 @@ fi | |
kill_server | ||
set -e | ||
|
||
### General metrics tests | ||
|
||
set +e | ||
indrajit96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
CLIENT_PY="./general_metrics_test.py" | ||
CLIENT_LOG="general_metrics_test_client.log" | ||
SERVER_LOG="general_metrics_test_server.log" | ||
SERVER_ARGS="$BASE_SERVER_ARGS --log-verbose=1" | ||
PYTHON_TEST="general_metrics_test.py" | ||
run_and_check_server | ||
# Test 1 for default model control mode (all models loaded at startup) | ||
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time >> $CLIENT_LOG 2>&1 | ||
kill_server | ||
set -e | ||
|
||
set +e | ||
SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1" | ||
run_and_check_server | ||
MODEL_NAME='libtorch_float32_float32_float32' | ||
curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load | ||
# Test 2 for explicit mode LOAD | ||
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_load.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_load >> $CLIENT_LOG 2>&1 | ||
|
||
curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload | ||
# Test 3 for explicit mode UNLOAD | ||
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_unload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_unload >> $CLIENT_LOG 2>&1 | ||
kill_server | ||
set -e | ||
|
||
# Test 4 for explicit mode LOAD and UNLOAD with multiple versions | ||
set +e | ||
CLIENT_PY="./general_metrics_test.py" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove |
||
VERSION_DIR="${PWD}/version_models" | ||
SERVER_ARGS="$BASE_SERVER_ARGS --model-repository=${VERSION_DIR} --model-control-mode=explicit --log-verbose=1" | ||
run_and_check_server | ||
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_multiple_version_reload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_multiple_version_reload >> $CLIENT_LOG 2>&1 | ||
|
||
kill_server | ||
set -e | ||
|
||
### Pinned memory metrics tests | ||
set +e | ||
CLIENT_PY="./pinned_memory_metrics_test.py" | ||
CLIENT_LOG="pinned_memory_metrics_test_client.log" | ||
SERVER_LOG="pinned_memory_metrics_test_server.log" | ||
SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1" | ||
PYTHON_TEST="metrics_config_test.py" | ||
run_and_check_server | ||
python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG} | ||
check_unit_test | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
import json | ||
import time | ||
|
||
import numpy as np | ||
import triton_python_backend_utils as pb_utils | ||
|
||
|
||
class TritonPythonModel:
    """Python-backend model whose initialize() is deliberately slow.

    The 10-second sleep makes this version's recorded load duration
    measurably larger, which the metrics tests rely on.
    """

    def initialize(self, args):
        # Artificial delay so the load-duration metric records a "slow" load.
        time.sleep(10)
        self.model_config = json.loads(args["model_config"])

    def execute(self, requests):
        """Return a constant float32 tensor [1] as OUTPUT0 for every request."""
        constant = np.array([1], dtype=np.float32)
        return [
            pb_utils.InferenceResponse([pb_utils.Tensor("OUTPUT0", constant)])
            for _ in requests
        ]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
import json | ||
|
||
import numpy as np | ||
import triton_python_backend_utils as pb_utils | ||
|
||
|
||
class TritonPythonModel:
    """Python-backend model with a fast initialize().

    Counterpart to the slow-loading version; the metrics tests compare the
    two versions' recorded load durations.
    """

    def initialize(self, args):
        self.model_config = json.loads(args["model_config"])

    def execute(self, requests):
        """Return a constant float32 tensor [1] as OUTPUT0 for every request."""
        constant = np.array([1], dtype=np.float32)
        return [
            pb_utils.InferenceResponse([pb_utils.Tensor("OUTPUT0", constant)])
            for _ in requests
        ]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need a sample output for a gauge metric?