test: TC for Metric P0 nv_load_time per model #7697

Open · wants to merge 17 commits into main
9 changes: 9 additions & 0 deletions docs/user_guide/metrics.md
There are some places where a request would not be considered pending:
generally brief, it will not be considered pending from Triton's
perspective until Triton core has received the request from the frontend.

#### Load Time Per-Model
The *Model Load Duration* metric reports the time, in seconds, taken to load a model from storage into GPU/CPU memory.
```
# HELP nv_model_load_duration_secs Model load time in seconds
# TYPE nv_model_load_duration_secs gauge
nv_model_load_duration_secs{model="input_all_optional",version="2"} 1.532738387
nv_model_load_duration_secs{model="input_all_optional",version="1"} 11.68753265
```

**Contributor:** Do we need a sample output for a gauge metric?
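The gauge output above can be parsed with a few lines of Python. Below is a minimal sketch; the sample text is copied from the output above, while the regular expression and the `load_times` dictionary are illustrative and not part of any Triton client API:

```python
import re

# Sample scrape output, as shown in the docs above.
sample = """# HELP nv_model_load_duration_secs Model load time in seconds
# TYPE nv_model_load_duration_secs gauge
nv_model_load_duration_secs{model="input_all_optional",version="2"} 1.532738387
nv_model_load_duration_secs{model="input_all_optional",version="1"} 11.68753265
"""

# Capture the model name, version, and gauge value from each sample line.
pattern = re.compile(
    r'nv_model_load_duration_secs\{model="([^"]+)",version="([^"]+)"\} ([0-9.]+)'
)
load_times = {
    (model, version): float(value)
    for model, version, value in pattern.findall(sample)
}
print(load_times[("input_all_optional", "2")])  # 1.532738387
```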

### Latencies

Starting in 23.04, Triton exposes the ability to choose the types of metrics
80 changes: 80 additions & 0 deletions qa/L0_metrics/general_metrics_test.py
**Contributor:** How come the core PR was merged way before this one finished? We currently have no ongoing tests for the merged feature on our nightly pipelines in core, right?

**Author:** It was approved in parallel, a couple of days apart. I was unable to get CI passing due to other build issues, and then @yinggeh added more comments after it was approved; hence the delay. Yes, I will get this in ASAP after the TRT-LLM code freeze.
#!/usr/bin/env python3
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import unittest

import requests

_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
MODEL_LOAD_TIME = "nv_model_load_duration_secs{model="


def get_model_load_times():
    r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
    r.raise_for_status()
    # Escape the literal "{" in the metric prefix before building the regex,
    # then capture the model name and the gauge value from each metric line.
    pattern = re.compile(rf'{re.escape(MODEL_LOAD_TIME)}"(.*?)".*? (\d+\.\d+)')
    model_load_times = {}
    matches = pattern.findall(r.text)
    for model_name, load_time in matches:
        model_load_times[model_name] = float(load_time)
    return model_load_times


class TestGeneralMetrics(unittest.TestCase):
    def setUp(self):
        self.model_name = "libtorch_float32_float32_float32"

    def test_metrics_load_time(self):
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name)

        self.assertIsNotNone(load_time, "Model Load time not found")

        dict_size = len(model_load_times)
        self.assertEqual(dict_size, 1, "Too many model_load_time entries found")

    def test_metrics_load_time_explicit_load(self):
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name)

        self.assertIsNotNone(load_time, "Model Load time not found")

        dict_size = len(model_load_times)
        self.assertEqual(dict_size, 1, "Too many model_load_time entries found")

    def test_metrics_load_time_explicit_unload(self):
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name)

        self.assertIsNone(load_time, "Model Load time found even after unload")


if __name__ == "__main__":
    unittest.main()
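Because an explicit load completes asynchronously from the client's point of view, tests like the ones above sometimes need to poll the metrics endpoint until the gauge appears. The helper below is a hypothetical sketch, not part of this test suite; the fetch function is injected so the example is self-contained, whereas a real test would pass `get_model_load_times`:

```python
import time

def wait_for_load_time(fetch, model_name, timeout_s=10.0, interval_s=0.1):
    """Poll fetch() until model_name reports a load time or timeout expires."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        load_time = fetch().get(model_name)
        if load_time is not None:
            return load_time
        time.sleep(interval_s)
    return None

# Simulated fetch: the metric appears on the third scrape.
responses = iter([{}, {}, {"libtorch_float32_float32_float32": 1.5}])
result = wait_for_load_time(lambda: next(responses),
                            "libtorch_float32_float32_float32")
print(result)  # 1.5
```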
31 changes: 30 additions & 1 deletion qa/L0_metrics/test.sh
SERVER=${TRITON_DIR}/bin/tritonserver
BASE_SERVER_ARGS="--model-repository=${MODELDIR}"
SERVER_ARGS="${BASE_SERVER_ARGS}"
SERVER_LOG="./inference_server.log"
PYTHON_TEST="metrics_config_test.py"
source ../common/util.sh

CLIENT_LOG="client.log"
fi
kill_server
set -e

### General metrics tests

set +e
CLIENT_PY="./general_metrics_test.py"
CLIENT_LOG="general_metrics_test_client.log"
SERVER_LOG="general_metrics_test_server.log"
SERVER_ARGS="$BASE_SERVER_ARGS --log-verbose=1"
PYTHON_TEST="general_metrics_test.py"
run_and_check_server
# Test 1 for default model control mode (all models loaded at startup)
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time >> $CLIENT_LOG 2>&1
kill_server

set +e
CLIENT_PY="./general_metrics_test.py"
CLIENT_LOG="general_metrics_test_client.log"
SERVER_LOG="general_metrics_test_server.log"
SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1"
run_and_check_server
MODEL_NAME='libtorch_float32_float32_float32'
code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load`
# Test 2 for explicit mode LOAD
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_load.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_load >> $CLIENT_LOG 2>&1

code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload`
# Test 3 for explicit mode UNLOAD
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_unload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_unload >> $CLIENT_LOG 2>&1
kill_server
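For reference, the curl-based load/unload calls above target Triton's model repository HTTP API. The sketch below mirrors those endpoints in Python; the `repository_url` helper is hypothetical, and only the URL construction is shown (the commented `requests.post` line indicates how it would be used against a running server):

```python
import os

# TRITONSERVER_IPADDR defaults to localhost, as in general_metrics_test.py.
_ip = os.environ.get("TRITONSERVER_IPADDR", "localhost")

def repository_url(model_name, action):
    """Build the explicit-control endpoint; action is 'load' or 'unload'."""
    if action not in ("load", "unload"):
        raise ValueError(f"unsupported action: {action}")
    return f"http://{_ip}:8000/v2/repository/models/{model_name}/{action}"

# e.g. requests.post(repository_url("libtorch_float32_float32_float32", "load"))
url = repository_url("libtorch_float32_float32_float32", "unload")
print(url)
```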

### Pinned memory metrics tests
set +e
CLIENT_PY="./pinned_memory_metrics_test.py"
CLIENT_LOG="pinned_memory_metrics_test_client.log"
SERVER_LOG="pinned_memory_metrics_test_server.log"
SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1"
PYTHON_TEST="metrics_config_test.py"
run_and_check_server
python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG}
check_unit_test