From 0ee81debab158d5c0779a16795a89ad8172abf4e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 7 Aug 2024 18:51:36 +0000 Subject: [PATCH 01/40] fix --- vllm/entrypoints/openai/api_server.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d44604b12fb69..fedc1ced1883e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -12,7 +12,7 @@ from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse -from prometheus_client import make_asgi_app +from prometheus_client import make_asgi_app, multiprocess, CollectorRegistry from starlette.routing import Mount import vllm.envs as envs @@ -148,8 +148,11 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: def mount_metrics(app: FastAPI): + registry = CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + # Add prometheus asgi middleware to route /metrics requests - metrics_route = Mount("/metrics", make_asgi_app()) + metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) # Workaround for 307 Redirect for /metrics metrics_route.path_regex = re.compile('^/metrics(?P.*)$') app.routes.append(metrics_route) From 2de4dc483389cff1f929fb949529b68dffc6b3a4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:05:15 +0000 Subject: [PATCH 02/40] stash --- vllm/entrypoints/openai/__init__.py | 2 ++ vllm/entrypoints/openai/api_server.py | 37 +++++++++++++++++++++++---- vllm/entrypoints/openai/rpc/server.py | 3 +++ vllm/scripts.py | 3 +++ 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/__init__.py b/vllm/entrypoints/openai/__init__.py index e69de29bb2d1d..3e9ba146d86de 100644 --- a/vllm/entrypoints/openai/__init__.py +++ b/vllm/entrypoints/openai/__init__.py @@ -0,0 +1,2 @@ +import os +os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/test" \ No newline at end of file diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fedc1ced1883e..aee3dd359392a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,7 +1,9 @@ +import os import asyncio import importlib import inspect import re +import tempfile from argparse import Namespace from contextlib import asynccontextmanager from http import HTTPStatus @@ -12,9 +14,9 @@ from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse -from prometheus_client import make_asgi_app, multiprocess, CollectorRegistry from starlette.routing import Mount + import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -43,7 +45,7 @@ OpenAIServingTokenization) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path +from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path, random_uuid from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -54,6 +56,9 @@ openai_serving_completion: OpenAIServingCompletion openai_serving_embedding: OpenAIServingEmbedding openai_serving_tokenization: OpenAIServingTokenization +prometheus_multiproc_dir: 
tempfile.TemporaryDirectory +prometheus_multiproc_dir = tempfile.TemporaryDirectory() +os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name logger = init_logger('vllm.entrypoints.openai.api_server') @@ -106,6 +111,15 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: + # Create a tmp dir to be used for Prometheus Multiprocessing. + # See: https://prometheus.github.io/client_python/multiprocess/ + # Note: TemporaryDirectory manages the lifecycle of the + # /tmp/xxx directory which is created. We use a global + # variable such that the tmp dir will exist for the life + # of server and get cleaned up at exit. + # global prometheus_multiproc_dir + + # Select random path for IPC. rpc_path = get_open_zmq_ipc_path() logger.info("Multiprocessing frontend to use %s for RPC Path.", @@ -115,7 +129,7 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: rpc_server_process = Process(target=run_rpc_server, args=(engine_args, UsageContext.OPENAI_API_SERVER, - rpc_path)) + rpc_path),) rpc_server_process.start() # Build RPCClient, which conforms to AsyncEngineClient Protocol. @@ -148,8 +162,21 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: def mount_metrics(app: FastAPI): - registry = CollectorRegistry() - multiprocess.MultiProcessCollector(registry) + # Lazy import such that we can set PROMETHEUS_MULTIPROC_DIR + # before prometheus_client is imported in case of multiprocessing. + from prometheus_client import make_asgi_app, multiprocess, CollectorRegistry + + prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) + + # If set, we will use multiprocessing mode. + if prometheus_multiproc_dir_path is not None: + logger.info("Prometheus client using multiprocessing mode with " + "PROMETHEUS_MULTIPROC_DIR=%s", prometheus_multiproc_dir_path) + # See https://prometheus.github.io/client_python/exporting/http/fastapi-gunicorn/ + registry = CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + else: + registry = None # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) diff --git a/vllm/entrypoints/openai/rpc/server.py b/vllm/entrypoints/openai/rpc/server.py index 617c9b7070e2c..fb6e4c1d2efbc 100644 --- a/vllm/entrypoints/openai/rpc/server.py +++ b/vllm/entrypoints/openai/rpc/server.py @@ -212,5 +212,8 @@ def signal_handler() -> None: def run_rpc_server(async_engine_args: AsyncEngineArgs, usage_context: UsageContext, rpc_path: str): + import os + print("In subprocess") + print(os.environ["PROMETHEUS_MULTIPROC_DIR"]) server = AsyncEngineRPCServer(async_engine_args, usage_context, rpc_path) asyncio.run(run_server(server)) diff --git a/vllm/scripts.py b/vllm/scripts.py index f45bfe06047de..13ce7606694a4 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -6,6 +6,8 @@ import sys from typing import List, Optional +os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/test" + from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam @@ -14,6 +16,7 @@ from vllm.utils import FlexibleArgumentParser + def register_signal_handlers(): def signal_handler(sig, frame): From ebd062e3cdf64484bc29ef89dc83686459157d0f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:05:58 +0000 Subject: [PATCH 03/40] remove __init__ --- vllm/entrypoints/openai/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/vllm/entrypoints/openai/__init__.py b/vllm/entrypoints/openai/__init__.py index 3e9ba146d86de..e69de29bb2d1d 100644 --- a/vllm/entrypoints/openai/__init__.py +++ b/vllm/entrypoints/openai/__init__.py @@ -1,2 +0,0 @@ -import os -os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/test" \ No newline at end of file From c79d165810172c2f7f12287e7e7a532379df73e5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:07:41 +0000 Subject: [PATCH 04/40] scripts fix --- vllm/entrypoints/openai/api_server.py | 2 +- vllm/scripts.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index aee3dd359392a..30c20abaaf02f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -45,7 +45,7 @@ OpenAIServingTokenization) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path, random_uuid +from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds diff --git a/vllm/scripts.py b/vllm/scripts.py index 13ce7606694a4..b605022b6ee7e 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -6,7 +6,7 @@ import sys from typing import List, Optional -os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/test" + from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam From 6da5189448319b54bea52ce838f5dcf906b443c7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:09:10 +0000 Subject: [PATCH 05/40] cleanup --- vllm/scripts.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/scripts.py b/vllm/scripts.py index b605022b6ee7e..f45bfe06047de 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -6,8 +6,6 @@ import sys from typing import List, Optional - - from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam @@ -16,7 +14,6 @@ from vllm.utils import FlexibleArgumentParser - def register_signal_handlers(): def signal_handler(sig, frame): From 346e5fc11fad1bf0abcc8bb6caa7995cea4dca64 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:09:35 +0000 Subject: [PATCH 06/40] more cleanup --- vllm/entrypoints/openai/rpc/server.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/entrypoints/openai/rpc/server.py b/vllm/entrypoints/openai/rpc/server.py index fb6e4c1d2efbc..617c9b7070e2c 100644 --- a/vllm/entrypoints/openai/rpc/server.py +++ b/vllm/entrypoints/openai/rpc/server.py @@ -212,8 +212,5 @@ def signal_handler() -> None: def run_rpc_server(async_engine_args: AsyncEngineArgs, usage_context: UsageContext, rpc_path: str): - import os - print("In subprocess") - print(os.environ["PROMETHEUS_MULTIPROC_DIR"]) server = AsyncEngineRPCServer(async_engine_args, usage_context, rpc_path) asyncio.run(run_server(server)) From b1d945d12894abcc4f5f57844f54b36166b2ab60 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:12:10 +0000 Subject: [PATCH 07/40] clean --- vllm/entrypoints/openai/api_server.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 30c20abaaf02f..b1553883751f2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -57,8 +57,6 @@ openai_serving_embedding: OpenAIServingEmbedding 
openai_serving_tokenization: OpenAIServingTokenization prometheus_multiproc_dir: tempfile.TemporaryDirectory -prometheus_multiproc_dir = tempfile.TemporaryDirectory() -os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name logger = init_logger('vllm.entrypoints.openai.api_server') @@ -111,14 +109,9 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: - # Create a tmp dir to be used for Prometheus Multiprocessing. - # See: https://prometheus.github.io/client_python/multiprocess/ - # Note: TemporaryDirectory manages the lifecycle of the - # /tmp/xxx directory which is created. We use a global - # variable such that the tmp dir will exist for the life - # of server and get cleaned up at exit. - # global prometheus_multiproc_dir - + global prometheus_multiproc_dir + prometheus_multiproc_dir = tempfile.TemporaryDirectory() + os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name # Select random path for IPC. rpc_path = get_open_zmq_ipc_path() @@ -166,13 +159,7 @@ def mount_metrics(app: FastAPI): # before prometheus_client is imported in case of multiprocessing. from prometheus_client import make_asgi_app, multiprocess, CollectorRegistry - prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) - - # If set, we will use multiprocessing mode. - if prometheus_multiproc_dir_path is not None: - logger.info("Prometheus client using multiprocessing mode with " - "PROMETHEUS_MULTIPROC_DIR=%s", prometheus_multiproc_dir_path) - # See https://prometheus.github.io/client_python/exporting/http/fastapi-gunicorn/ + if "PROMETHEUS_MULTIPROC_DIR" in os.environ: registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) else: From 460b621f386843d82f100b8603884745ad01c0de Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:13:13 +0000 Subject: [PATCH 08/40] clean --- vllm/entrypoints/openai/api_server.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b1553883751f2..4511770f9e85c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,7 +1,7 @@ -import os import asyncio import importlib import inspect +import os import re import tempfile from argparse import Namespace @@ -16,7 +16,6 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse from starlette.routing import Mount - import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -122,7 +121,7 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: rpc_server_process = Process(target=run_rpc_server, args=(engine_args, UsageContext.OPENAI_API_SERVER, - rpc_path),) + rpc_path)) rpc_server_process.start() # Build RPCClient, which conforms to AsyncEngineClient Protocol. 
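[Editor's note, for context between patches 08 and 09: the patches above and below are converging on the standard prometheus_client multiprocess pattern. Below is a minimal standalone sketch of that pattern, not code from this series; it assumes prometheus_client and FastAPI are installed, and the metric and app names are illustrative only.]

import os, tempfile

# The directory must be exported before prometheus_client is imported,
# because the client selects its multiprocess value backend at import time.
_tmp = tempfile.TemporaryDirectory()
os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", _tmp.name)

from fastapi import FastAPI
from prometheus_client import (CollectorRegistry, Gauge, make_asgi_app,
                               multiprocess)

app = FastAPI()

# In multiprocess mode, gauges need an explicit cross-process aggregation.
running = Gauge("demo_requests_running",
                "Requests currently running",
                multiprocess_mode="sum")

# Aggregate the per-process .db files written under PROMETHEUS_MULTIPROC_DIR
# into one registry and expose it at /metrics.
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
app.mount("/metrics", make_asgi_app(registry=registry))

[Because the environment variable has to be set before prometheus_client is imported, the later patches move the prometheus imports inside mount_metrics and defer the stat-logger imports in llm_engine.py rather than importing at module load time.]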
From 66fa98b45749a659feada1267f4b8ef3fbc8690b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:16:22 +0000 Subject: [PATCH 09/40] match nick --- vllm/entrypoints/openai/api_server.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 4511770f9e85c..6af69058b2589 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -55,7 +55,7 @@ openai_serving_completion: OpenAIServingCompletion openai_serving_embedding: OpenAIServingEmbedding openai_serving_tokenization: OpenAIServingTokenization -prometheus_multiproc_dir: tempfile.TemporaryDirectory +# prometheus_multiproc_dir: tempfile.TemporaryDirectory logger = init_logger('vllm.entrypoints.openai.api_server') @@ -108,9 +108,10 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: - global prometheus_multiproc_dir - prometheus_multiproc_dir = tempfile.TemporaryDirectory() - os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name + # global prometheus_multiproc_dir + # prometheus_multiproc_dir = tempfile.TemporaryDirectory() + # os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name + os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/testit" # Select random path for IPC. rpc_path = get_open_zmq_ipc_path() From db8671487d1152f6c23a54fbf52d5f48e76dcfa5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 00:29:08 +0000 Subject: [PATCH 10/40] match nick exactly --- vllm/entrypoints/openai/api_server.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6af69058b2589..8c579814085f1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -3,7 +3,6 @@ import inspect import os import re -import tempfile from argparse import Namespace from contextlib import asynccontextmanager from http import HTTPStatus @@ -55,7 +54,6 @@ openai_serving_completion: OpenAIServingCompletion openai_serving_embedding: OpenAIServingEmbedding openai_serving_tokenization: OpenAIServingTokenization -# prometheus_multiproc_dir: tempfile.TemporaryDirectory logger = init_logger('vllm.entrypoints.openai.api_server') @@ -108,9 +106,6 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: - # global prometheus_multiproc_dir - # prometheus_multiproc_dir = tempfile.TemporaryDirectory() - # os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/testit" # Select random path for IPC. @@ -155,8 +150,6 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: def mount_metrics(app: FastAPI): - # Lazy import such that we can set PROMETHEUS_MULTIPROC_DIR - # before prometheus_client is imported in case of multiprocessing. 
from prometheus_client import make_asgi_app, multiprocess, CollectorRegistry if "PROMETHEUS_MULTIPROC_DIR" in os.environ: From 4029167fd0b25c5338019fba90a10a47ebba5d77 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 8 Aug 2024 08:18:09 -0700 Subject: [PATCH 11/40] grabbed nicks changes --- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 6 ++- vllm/engine/metrics.py | 73 ++------------------------- vllm/engine/metrics_types.py | 72 ++++++++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 8 +-- 5 files changed, 85 insertions(+), 76 deletions(-) create mode 100644 vllm/engine/metrics_types.py diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index b4a9520e623ea..f9e9eaf8ea4a3 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -13,7 +13,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.llm_engine import LLMEngine -from vllm.engine.metrics import StatLoggerBase +from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.ray_utils import initialize_ray_cluster, ray from vllm.inputs import LLMInputs, PromptInputs diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 75c6d7e6c9b21..614c297dfc440 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -14,8 +14,7 @@ from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import (LoggingStatLogger, PrometheusStatLogger, - StatLoggerBase, Stats) +from vllm.engine.metrics_types import StatLoggerBase, Stats from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker @@ -320,6 +319,9 @@ def __init__( if stat_loggers is not None: self.stat_loggers = stat_loggers else: + from vllm.engine.metrics import (LoggingStatLogger, + PrometheusStatLogger) + self.stat_loggers = { "logging": LoggingStatLogger( diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 2f105b9cd2fb6..c2c59151c4db6 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,13 +1,12 @@ -import time -from abc import ABC, abstractmethod -from dataclasses import dataclass from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter -from typing import Dict, List, Optional, Protocol, Union +from typing import Dict, List, Optional, Union import numpy as np import prometheus_client +from vllm.engine.metrics_types import (StatLoggerBase, Stats, + SupportsMetricsInfo) from vllm.executor.ray_utils import ray from vllm.logger import init_logger @@ -295,46 +294,6 @@ def build_1_2_5_buckets(max_value: int) -> List[int]: exponent += 1 -@dataclass -class Stats: - """Created by LLMEngine for use by StatLogger.""" - now: float - - # System stats (should have _sys suffix) - # Scheduler State - num_running_sys: int - num_waiting_sys: int - num_swapped_sys: int - # KV Cache Usage in % - gpu_cache_usage_sys: float - cpu_cache_usage_sys: float - - # Iteration stats (should have _iter suffix) - num_prompt_tokens_iter: int - num_generation_tokens_iter: int - time_to_first_tokens_iter: List[float] - time_per_output_tokens_iter: List[float] - num_preemption_iter: int - - # Request stats (should have _requests suffix) - # Latency - time_e2e_requests: List[float] - # Metadata - num_prompt_tokens_requests: 
List[int] - num_generation_tokens_requests: List[int] - best_of_requests: List[int] - n_requests: List[int] - finished_reason_requests: List[str] - - spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None - - -class SupportsMetricsInfo(Protocol): - - def metrics_info(self) -> Dict[str, str]: - ... - - def local_interval_elapsed(now: float, last_log: float, local_interval: float) -> bool: elapsed_time = now - last_log @@ -346,32 +305,6 @@ def get_throughput(tracked_stats: List[int], now: float, return float(np.sum(tracked_stats) / (now - last_log)) -class StatLoggerBase(ABC): - """Base class for StatLogger.""" - - def __init__(self, local_interval: float) -> None: - # Tracked stats over current local logging interval. - self.num_prompt_tokens: List[int] = [] - self.num_generation_tokens: List[int] = [] - self.last_local_log = time.time() - self.local_interval = local_interval - self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None - - @abstractmethod - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - raise NotImplementedError - - @abstractmethod - def log(self, stats: Stats) -> None: - raise NotImplementedError - - def maybe_update_spec_decode_metrics(self, stats: Stats): - """Save spec decode metrics (since they are unlikely - to be emitted at same time as log interval).""" - if stats.spec_decode_metrics is not None: - self.spec_decode_metrics = stats.spec_decode_metrics - - class LoggingStatLogger(StatLoggerBase): """LoggingStatLogger is used in LLMEngine to log to Stdout.""" diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py new file mode 100644 index 0000000000000..2c84c13a41ed4 --- /dev/null +++ b/vllm/engine/metrics_types.py @@ -0,0 +1,72 @@ +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, List, Optional, Protocol + +from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + + +@dataclass +class Stats: + """Created by LLMEngine for use by StatLogger.""" + now: float + + # System stats (should have _sys suffix) + # Scheduler State + num_running_sys: int + num_waiting_sys: int + num_swapped_sys: int + # KV Cache Usage in % + gpu_cache_usage_sys: float + cpu_cache_usage_sys: float + + # Iteration stats (should have _iter suffix) + num_prompt_tokens_iter: int + num_generation_tokens_iter: int + time_to_first_tokens_iter: List[float] + time_per_output_tokens_iter: List[float] + num_preemption_iter: int + + # Request stats (should have _requests suffix) + # Latency + time_e2e_requests: List[float] + # Metadata + num_prompt_tokens_requests: List[int] + num_generation_tokens_requests: List[int] + best_of_requests: List[int] + n_requests: List[int] + finished_reason_requests: List[str] + + spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + +class SupportsMetricsInfo(Protocol): + + def metrics_info(self) -> Dict[str, str]: + ... + + +class StatLoggerBase(ABC): + """Base class for StatLogger.""" + + def __init__(self, local_interval: float) -> None: + # Tracked stats over current local logging interval. 
+ self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] + self.last_local_log = time.time() + self.local_interval = local_interval + self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + @abstractmethod + def info(self, type: str, obj: SupportsMetricsInfo) -> None: + raise NotImplementedError + + @abstractmethod + def log(self, stats: Stats) -> None: + raise NotImplementedError + + def maybe_update_spec_decode_metrics(self, stats: Stats): + """Save spec decode metrics (since they are unlikely + to be emitted at same time as log interval).""" + if stats.spec_decode_metrics is not None: + self.spec_decode_metrics = stats.spec_decode_metrics diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8c579814085f1..5575479a06d18 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -106,7 +106,8 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: - os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/testit" + if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: + os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/mytmpdir" # Select random path for IPC. rpc_path = get_open_zmq_ipc_path() @@ -150,14 +151,15 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: def mount_metrics(app: FastAPI): - from prometheus_client import make_asgi_app, multiprocess, CollectorRegistry + from prometheus_client import (CollectorRegistry, make_asgi_app, + multiprocess) if "PROMETHEUS_MULTIPROC_DIR" in os.environ: registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) else: registry = None - + # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) # Workaround for 307 Redirect for /metrics From c2b304a1a30c58df25e06390f4a02a6eb0d96543 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 18:16:13 +0000 Subject: [PATCH 12/40] switch to tempfile --- vllm/entrypoints/openai/api_server.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5575479a06d18..a002b50a260a7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -3,6 +3,7 @@ import inspect import os import re +import tempfile from argparse import Namespace from contextlib import asynccontextmanager from http import HTTPStatus @@ -54,6 +55,7 @@ openai_serving_completion: OpenAIServingCompletion openai_serving_embedding: OpenAIServingEmbedding openai_serving_tokenization: OpenAIServingTokenization +prometheus_multiproc_dir: tempfile.TemporaryDirectory logger = init_logger('vllm.entrypoints.openai.api_server') @@ -107,7 +109,17 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: - os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/mytmpdir" + # Make TemporaryDirectory for prometheus multiprocessing + # Note: global TemporaryDirectory will be automatically + # cleaned up upon exit. + global prometheus_multiproc_dir + prometheus_multiproc_dir = tempfile.TemporaryDirectory() + os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name + else: + logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. 
" + "This directory must be wiped between vLLM runs or " + "you will find inaccurate metrics. Unset the variable " + "and vLLM will properly handle cleanup.") # Select random path for IPC. rpc_path = get_open_zmq_ipc_path() @@ -151,10 +163,16 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: def mount_metrics(app: FastAPI): + # PROMETHEUS_MULTIPROC_DIR needs to be set before we import + # the prometheus_client. So lazy import here. + # see: https://prometheus.github.io/client_python/multiprocess/ from prometheus_client import (CollectorRegistry, make_asgi_app, multiprocess) - if "PROMETHEUS_MULTIPROC_DIR" in os.environ: + prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) + if prometheus_multiproc_dir_path is not None: + logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", + prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) else: From dea689626730d8fa8f6dc8efe6b620f5626d654a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 18:20:00 +0000 Subject: [PATCH 13/40] add comment --- vllm/engine/llm_engine.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 614c297dfc440..89fdd542b3fd6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -319,6 +319,10 @@ def __init__( if stat_loggers is not None: self.stat_loggers = stat_loggers else: + # Lazy import for frontend multiprocessing. + # We need to set PROMETHEUS_MULTIPROC_DIR enviornment variable + # before prometheus_client is imported. + # See https://prometheus.github.io/client_python/multiprocess/ from vllm.engine.metrics import (LoggingStatLogger, PrometheusStatLogger) From 1082e63fb37a038bcf3a3bf5001752ff14bc86d7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 18:23:33 +0000 Subject: [PATCH 14/40] format --- vllm/engine/llm_engine.py | 4 ++-- vllm/entrypoints/openai/api_server.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 89fdd542b3fd6..313558d8cd6a8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -319,8 +319,8 @@ def __init__( if stat_loggers is not None: self.stat_loggers = stat_loggers else: - # Lazy import for frontend multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR enviornment variable + # Lazy import for prometheus multiprocessing. + # We need to set PROMETHEUS_MULTIPROC_DIR environment variable # before prometheus_client is imported. # See https://prometheus.github.io/client_python/multiprocess/ from vllm.engine.metrics import (LoggingStatLogger, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a002b50a260a7..7e37f11eab08e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -110,16 +110,18 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: else: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: # Make TemporaryDirectory for prometheus multiprocessing - # Note: global TemporaryDirectory will be automatically + # Note: global TemporaryDirectory will be automatically # cleaned up upon exit. 
global prometheus_multiproc_dir prometheus_multiproc_dir = tempfile.TemporaryDirectory() - os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name + os.environ[ + "PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name else: - logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. " - "This directory must be wiped between vLLM runs or " - "you will find inaccurate metrics. Unset the variable " - "and vLLM will properly handle cleanup.") + logger.warning( + "Found PROMETHEUS_MULTIPROC_DIR was set by user. " + "This directory must be wiped between vLLM runs or " + "you will find inaccurate metrics. Unset the variable " + "and vLLM will properly handle cleanup.") # Select random path for IPC. rpc_path = get_open_zmq_ipc_path() @@ -163,15 +165,16 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: def mount_metrics(app: FastAPI): - # PROMETHEUS_MULTIPROC_DIR needs to be set before we import - # the prometheus_client. So lazy import here. - # see: https://prometheus.github.io/client_python/multiprocess/ + # Lazy import for prometheus multiprocessing. + # We need to set PROMETHEUS_MULTIPROC_DIR environment variable + # before prometheus_client is imported. + # See https://prometheus.github.io/client_python/multiprocess/ from prometheus_client import (CollectorRegistry, make_asgi_app, multiprocess) prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) if prometheus_multiproc_dir_path is not None: - logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", + logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) From b26cb5376a2f706f9af1adb685b77f49d183bb77 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 18:33:46 +0000 Subject: [PATCH 15/40] deprecate Info metrics --- vllm/engine/llm_engine.py | 2 -- vllm/engine/metrics.py | 20 -------------------- 2 files changed, 22 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 313558d8cd6a8..5114afeea2490 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -336,8 +336,6 @@ def __init__( labels=dict(model_name=model_config.served_model_name), max_model_len=self.model_config.max_model_len), } - self.stat_loggers["prometheus"].info("cache_config", - self.cache_config) self.tracer = None if self.observability_config.otlp_traces_endpoint: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index c2c59151c4db6..e5513b6fe1b5e 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -37,9 +37,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): # Unregister any existing vLLM collectors self._unregister_vllm_metrics() - # Config Information - self._create_info_cache_config() - # System stats # Scheduler State self.gauge_scheduler_running = self._gauge_cls( @@ -167,12 +164,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, ) - def _create_info_cache_config(self) -> None: - # Config Information - self.info_cache_config = prometheus_client.Info( - name='vllm:cache_config', - documentation='information of cache_config') - def _unregister_vllm_metrics(self) -> None: for collector in list(prometheus_client.REGISTRY._collector_to_names): if hasattr(collector, "_name") and "vllm" in collector._name: @@ -267,10 +258,6 @@ def _unregister_vllm_metrics(self) -> None: # No-op on purpose pass - def _create_info_cache_config(self) -> 
None: - # No-op on purpose - pass - def build_1_2_5_buckets(max_value: int) -> List[int]: """ @@ -386,10 +373,6 @@ def __init__(self, local_interval: float, labels: Dict[str, str], self.metrics = self._metrics_cls(labelnames=list(labels.keys()), max_model_len=max_model_len) - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - if type == "cache_config": - self.metrics.info_cache_config.info(obj.metrics_info()) - def _log_gauge(self, gauge, data: Union[int, float]) -> None: # Convenience function for logging to gauge. gauge.labels(**self.labels).set(data) @@ -523,6 +506,3 @@ def log(self, stats: Stats): class RayPrometheusStatLogger(PrometheusStatLogger): """RayPrometheusStatLogger uses Ray metrics instead.""" _metrics_cls = RayMetrics - - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - return None From 64ba1397135bad563cd63a481c769b60a72a6b55 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 18:41:08 +0000 Subject: [PATCH 16/40] fixt --- vllm/engine/metrics.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e5513b6fe1b5e..e005dd68e1f08 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -28,6 +28,12 @@ # begin-metrics-definitions class Metrics: + """ + vLLM uses a multiprocessing-based frontend for the OpenAI server. + This means that we need to run prometheus_client in multiprocessing mode + See https://prometheus.github.io/client_python/multiprocess/ for more + details on limitations. + """ labelname_finish_reason = "finished_reason" _gauge_cls = prometheus_client.Gauge _counter_cls = prometheus_client.Counter @@ -295,9 +301,6 @@ def get_throughput(tracked_stats: List[int], now: float, class LoggingStatLogger(StatLoggerBase): """LoggingStatLogger is used in LLMEngine to log to Stdout.""" - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - raise NotImplementedError - def log(self, stats: Stats) -> None: """Called by LLMEngine. Logs to Stdout every self.local_interval seconds.""" From 2263569505a7f6b4186b44c32ec6ec1ee14c5df2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 18:45:38 +0000 Subject: [PATCH 17/40] format --- vllm/engine/metrics.py | 3 +-- vllm/engine/metrics_types.py | 12 +----------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e005dd68e1f08..b1e50edcb75c4 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -5,8 +5,7 @@ import numpy as np import prometheus_client -from vllm.engine.metrics_types import (StatLoggerBase, Stats, - SupportsMetricsInfo) +from vllm.engine.metrics_types import StatLoggerBase, Stats from vllm.executor.ray_utils import ray from vllm.logger import init_logger diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 2c84c13a41ed4..ee1736c467653 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -1,7 +1,7 @@ import time from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Dict, List, Optional, Protocol +from typing import List, Optional from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -40,12 +40,6 @@ class Stats: spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None -class SupportsMetricsInfo(Protocol): - - def metrics_info(self) -> Dict[str, str]: - ... 
- - class StatLoggerBase(ABC): """Base class for StatLogger.""" @@ -57,10 +51,6 @@ def __init__(self, local_interval: float) -> None: self.local_interval = local_interval self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None - @abstractmethod - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - raise NotImplementedError - @abstractmethod def log(self, stats: Stats) -> None: raise NotImplementedError From ba5c7419245b7d3361b181e3f3bdf475b55c9f5f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 19:03:12 +0000 Subject: [PATCH 18/40] add multiprocess mode to gauges --- vllm/engine/metrics.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index b1e50edcb75c4..aed9bece8232e 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -47,24 +47,29 @@ def __init__(self, labelnames: List[str], max_model_len: int): self.gauge_scheduler_running = self._gauge_cls( name="vllm:num_requests_running", documentation="Number of requests currently running on GPU.", - labelnames=labelnames) + labelnames=labelnames, + multiprocess_mode="sum") self.gauge_scheduler_waiting = self._gauge_cls( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", - labelnames=labelnames) + labelnames=labelnames, + multiprocess_mode="sum") self.gauge_scheduler_swapped = self._gauge_cls( name="vllm:num_requests_swapped", documentation="Number of requests swapped to CPU.", - labelnames=labelnames) + labelnames=labelnames, + multiprocess_mode="sum") # KV Cache Usage in % self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation="GPU KV-cache usage. 1 means 100 percent usage.", - labelnames=labelnames) + labelnames=labelnames, + multiprocess_mode="sum") self.gauge_cpu_cache_usage = self._gauge_cls( name="vllm:cpu_cache_usage_perc", documentation="CPU KV-cache usage. 
1 means 100 percent usage.", - labelnames=labelnames) + labelnames=labelnames, + mulitprocess_mode="sum") # Iteration stats self.counter_num_preemption = self._counter_cls( @@ -138,11 +143,13 @@ def __init__(self, labelnames: List[str], max_model_len: int): self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls( name="vllm:spec_decode_draft_acceptance_rate", documentation="Speulative token acceptance rate.", - labelnames=labelnames) + labelnames=labelnames, + multiprocess_mode="sum") self.gauge_spec_decode_efficiency = self._gauge_cls( name="vllm:spec_decode_efficiency", documentation="Speculative decoding system efficiency.", - labelnames=labelnames) + labelnames=labelnames, + multiprocess_mode="sum") self.counter_spec_decode_num_accepted_tokens = (self._counter_cls( name="vllm:spec_decode_num_accepted_tokens_total", documentation="Number of accepted tokens.", @@ -161,12 +168,14 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:avg_prompt_throughput_toks_per_s", documentation="Average prefill throughput in tokens/s.", labelnames=labelnames, + multiprocess_mode="sum", ) # Deprecated in favor of vllm:generation_tokens_total self.gauge_avg_generation_throughput = self._gauge_cls( name="vllm:avg_generation_throughput_toks_per_s", documentation="Average generation throughput in tokens/s.", labelnames=labelnames, + multiprocess_mode="sum", ) def _unregister_vllm_metrics(self) -> None: From 694fc122d9c824030c72e228b69550c06c78d915 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 19:08:03 +0000 Subject: [PATCH 19/40] fix typo --- vllm/engine/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index aed9bece8232e..e030f48e1827b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -69,7 +69,7 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:cpu_cache_usage_perc", documentation="CPU KV-cache usage. 
1 means 100 percent usage.", labelnames=labelnames, - mulitprocess_mode="sum") + multiprocess_mode="sum") # Iteration stats self.counter_num_preemption = self._counter_cls( From 4032b4d67a19c256078c9ef82c3a3842c7bf2c23 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 20:35:40 +0000 Subject: [PATCH 20/40] test that metrics are exported --- tests/entrypoints/openai/test_basic.py | 9 --- tests/entrypoints/openai/test_metrics.py | 78 ++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 tests/entrypoints/openai/test_metrics.py diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index 2c721d9ba7609..faada2ce64bcd 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -50,12 +50,3 @@ async def test_check_health(client: openai.AsyncOpenAI): response = requests.get(base_url + "/health") assert response.status_code == HTTPStatus.OK - - -@pytest.mark.asyncio -async def test_log_metrics(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/metrics") - - assert response.status_code == HTTPStatus.OK diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py new file mode 100644 index 0000000000000..84658eaa8ab11 --- /dev/null +++ b/tests/entrypoints/openai/test_metrics.py @@ -0,0 +1,78 @@ +from http import HTTPStatus + +import openai +import pytest +import requests + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "1024", + "--enforce-eager", + "--max-num-seqs", + "128", + ] + + +@pytest.fixture(scope="module") +def client(default_server_args, request): + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + yield remote_server.get_async_client() + +EXPECTED_METRICS = [ + "vllm:num_requests_running", + "vllm:num_requests_swapped", + "vllm:num_requests_waiting", + "vllm:cpu_cache_usage_perc", + "vllm:time_to_first_token_seconds_sum", + "vllm:time_to_first_token_seconds_bucket", + "vllm:time_to_first_token_seconds_count", + "vllm:time_per_output_token_seconds_sum", + "vllm:time_per_output_token_seconds_bucket", + "vllm:time_per_output_token_seconds_count", + "vllm:e2e_request_latency_seconds_sum", + "vllm:e2e_request_latency_seconds_bucket", + "vllm:e2e_request_latency_seconds_count", + "vllm:request_prompt_tokens_sum", + "vllm:request_prompt_tokens_bucket", + "vllm:request_prompt_tokens_count", + "vllm:request_generation_tokens_sum", + "vllm:request_generation_tokens_bucket", + "vllm:request_generation_tokens_count", + "vllm:request_params_n_sum", + "vllm:request_params_n_bucket", + "vllm:request_params_n_count", + "vllm:request_params_best_of_sum", + "vllm:request_params_best_of_bucket", + "vllm:request_params_best_of_count", + "vllm:num_preemptions_total", + "vllm:prompt_tokens_total", + "vllm:generation_tokens_total", + "vllm:request_success_total", +] + +@pytest.mark.asyncio +async def test_metrics_exist(client: openai.AsyncOpenAI): + base_url = str(client.base_url)[:-3].strip("/") + + await client.completions.create(model=MODEL_NAME, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + response = requests.get(base_url + "/metrics") + assert response.status_code == 
HTTPStatus.OK + + for metric in EXPECTED_METRICS: + assert metric in response.text + + From d1fe504d52fc7a5ea4b4d0cf01ba70bb5a343ace Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 20:38:46 +0000 Subject: [PATCH 21/40] run both in the ci --- tests/entrypoints/openai/test_metrics.py | 8 +++++++- vllm/entrypoints/openai/api_server.py | 10 ++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 84658eaa8ab11..28c96353e26f2 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -20,14 +20,19 @@ def default_server_args(): "--enforce-eager", "--max-num-seqs", "128", + "--disable-frontend-multiprocessing" ] -@pytest.fixture(scope="module") +@pytest.fixture(scope="module", + params=["", "--disable-frontend-multiprocessing"]) def client(default_server_args, request): + if request.param: + default_server_args.append(request.param) with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server.get_async_client() + EXPECTED_METRICS = [ "vllm:num_requests_running", "vllm:num_requests_swapped", @@ -64,6 +69,7 @@ def client(default_server_args, request): async def test_metrics_exist(client: openai.AsyncOpenAI): base_url = str(client.base_url)[:-3].strip("/") + # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 7e37f11eab08e..f4e5306077913 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -178,11 +178,13 @@ def mount_metrics(app: FastAPI): prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) + + # Add prometheus asgi middleware to route /metrics requests + metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) else: - registry = None - - # Add prometheus asgi middleware to route /metrics requests - metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) + # Add prometheus asgi middleware to route /metrics requests + metrics_route = Mount("/metrics", make_asgi_app()) + # Workaround for 307 Redirect for /metrics metrics_route.path_regex = re.compile('^/metrics(?P.*)$') app.routes.append(metrics_route) From c65f8ea4404a1c2c46bc7ca39092c10fda8328ff Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 20:39:14 +0000 Subject: [PATCH 22/40] format --- tests/entrypoints/openai/test_metrics.py | 5 ++--- vllm/entrypoints/openai/api_server.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 28c96353e26f2..31acd5859d4d5 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -65,6 +65,7 @@ def client(default_server_args, request): "vllm:request_success_total", ] + @pytest.mark.asyncio async def test_metrics_exist(client: openai.AsyncOpenAI): base_url = str(client.base_url)[:-3].strip("/") @@ -74,11 +75,9 @@ async def test_metrics_exist(client: openai.AsyncOpenAI): prompt="Hello, my name is", max_tokens=5, temperature=0.0) - + response = requests.get(base_url + "/metrics") assert response.status_code == HTTPStatus.OK for metric in EXPECTED_METRICS: assert metric in response.text - - diff --git 
a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f4e5306077913..5d21cc4a847f5 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -178,13 +178,13 @@ def mount_metrics(app: FastAPI): prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) - + # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) else: # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app()) - + # Workaround for 307 Redirect for /metrics metrics_route.path_regex = re.compile('^/metrics(?P.*)$') app.routes.append(metrics_route) From e3025f751b63f23646085d651d4929ec2aaa1846 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 20:45:26 +0000 Subject: [PATCH 23/40] fix test --- tests/entrypoints/openai/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 31acd5859d4d5..65250875f6897 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -20,7 +20,6 @@ def default_server_args(): "--enforce-eager", "--max-num-seqs", "128", - "--disable-frontend-multiprocessing" ] @@ -37,6 +36,7 @@ def client(default_server_args, request): "vllm:num_requests_running", "vllm:num_requests_swapped", "vllm:num_requests_waiting", + "vllm:gpu_cache_usage_perc", "vllm:cpu_cache_usage_perc", "vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_bucket", From 350c66d532faf779b487585bb81015860cd702b5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 22:21:00 +0000 Subject: [PATCH 24/40] adding tests --- tests/entrypoints/openai/test_metrics.py | 76 +++++++++++++++++++++++- vllm/engine/metrics.py | 10 ++-- 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 65250875f6897..4e1bc5ab66016 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -4,8 +4,11 @@ import pytest import requests +from prometheus_client.parser import text_string_to_metric_families +from transformers import AutoTokenizer from ...utils import RemoteOpenAIServer + MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" @@ -24,7 +27,7 @@ def default_server_args(): @pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) + params=["", "--disable-frontend-multiprocessing", "--enable-chunked-prefill"]) def client(default_server_args, request): if request.param: default_server_args.append(request.param) @@ -81,3 +84,74 @@ async def test_metrics_exist(client: openai.AsyncOpenAI): for metric in EXPECTED_METRICS: assert metric in response.text + + +_PROMPT = "Hello my name is Robert and I love doing magic because" +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +_TOKENIZED_PROMPT = tokenizer(_PROMPT) + +_NUM_REQUESTS = 10 +_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT) +_NUM_GENERATION_TOKENS_PER_REQUEST = 10 + +# {metric_family: [(suffix, expected_value)]} +EXPECTED_VALUES = { + "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)], + "vllm:time_per_output_token_seconds": [("_count", _NUM_REQUESTS)], + "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)], + "vllm:request_prompt_tokens": [ + ("_sum", 
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:request_generation_tokens": [ + ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)], + "vllm:request_params_n": [("_count", _NUM_REQUESTS)], + "vllm:request_params_best_of": [("_count", _NUM_REQUESTS)], + "vllm:prompt_tokens": [ + ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:generation_tokens": [ + ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:request_success": [("_total", _NUM_REQUESTS)], +} + + +@pytest.mark.asyncio +async def test_metrics_counts(client: openai.AsyncOpenAI): + base_url = str(client.base_url)[:-3].strip("/") + + for _ in range(_NUM_REQUESTS): + # sending a request triggers the metrics to be logged. + await client.completions.create( + model=MODEL_NAME, + prompt=_TOKENIZED_PROMPT, + max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST, + temperature=0.0) + + + response = requests.get(base_url + "/metrics") + assert response.status_code == HTTPStatus.OK + + for metric_family, suffix_values_list in EXPECTED_VALUES.items(): + found_metric = False + for family in text_string_to_metric_families(response.text): + if family.name == metric_family: + found_metric = True + + for suffix, expected_value in suffix_values_list: + metric_name_w_suffix = f"{metric_family}{suffix}" + found_suffix = False + + for sample in family.samples: + if sample.name == metric_name_w_suffix: + found_suffix = True + assert sample.value == expected_value, ( + f"{metric_name_w_suffix} expected value of {expected_value}" + f"did not match found value {sample.value}" + ) + break + assert found_suffix, ( + f"Could not find {metric_name_w_suffix} in Prometheus endpoint") + break + + assert found_metric, ( + f"Could not find {metric_family} in Prometheus endpoint") \ No newline at end of file diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e030f48e1827b..0dbae9a3a1e99 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -40,7 +40,7 @@ class Metrics: def __init__(self, labelnames: List[str], max_model_len: int): # Unregister any existing vLLM collectors - self._unregister_vllm_metrics() + # self._unregister_vllm_metrics() # System stats # Scheduler State @@ -178,10 +178,10 @@ def __init__(self, labelnames: List[str], max_model_len: int): multiprocess_mode="sum", ) - def _unregister_vllm_metrics(self) -> None: - for collector in list(prometheus_client.REGISTRY._collector_to_names): - if hasattr(collector, "_name") and "vllm" in collector._name: - prometheus_client.REGISTRY.unregister(collector) + # def _unregister_vllm_metrics(self) -> None: + # for collector in list(prometheus_client.REGISTRY._collector_to_names): + # if hasattr(collector, "_name") and "vllm" in collector._name: + # prometheus_client.REGISTRY.unregister(collector) # end-metrics-definitions From 2da7d138cad536c9fbb5967113cbb5aab30a925a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 22:23:12 +0000 Subject: [PATCH 25/40] comments in test --- tests/entrypoints/openai/test_metrics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 4e1bc5ab66016..fdb13d7407ecc 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -133,10 +133,13 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): for metric_family, 
suffix_values_list in EXPECTED_VALUES.items(): found_metric = False + + # Check to see if the metric_family is found in the prom endpoint. for family in text_string_to_metric_families(response.text): if family.name == metric_family: found_metric = True + # Check that each suffix is found in the prom endpoint. for suffix, expected_value in suffix_values_list: metric_name_w_suffix = f"{metric_family}{suffix}" found_suffix = False @@ -144,6 +147,9 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): for sample in family.samples: if sample.name == metric_name_w_suffix: found_suffix = True + + # For each suffix, value sure the value matches + # what we expect. assert sample.value == expected_value, ( f"{metric_name_w_suffix} expected value of {expected_value}" f"did not match found value {sample.value}" From 3d6aadecc4c9e1b82e2a4ce926390eff52528b3c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 23:04:05 +0000 Subject: [PATCH 26/40] format --- examples/openai_completion_client.py | 4 +- tests/entrypoints/openai/test_metrics.py | 160 ++++++++++++----------- vllm/engine/metrics.py | 10 +- 3 files changed, 88 insertions(+), 86 deletions(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index 58519f978d340..ec10e5e234591 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -19,9 +19,7 @@ model=model, prompt="A robot may not injure a human being", echo=False, - n=2, - stream=stream, - logprobs=3) +) print("Completion results:") if stream: diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index fdb13d7407ecc..3229b12e45e6f 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -3,11 +3,10 @@ import openai import pytest import requests - from prometheus_client.parser import text_string_to_metric_families from transformers import AutoTokenizer -from ...utils import RemoteOpenAIServer +from ...utils import RemoteOpenAIServer MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" @@ -27,7 +26,11 @@ def default_server_args(): @pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing", "--enable-chunked-prefill"]) + params=[ + "", + "--enable-chunked-prefill", + "--disable-frontend-multiprocessing", + ]) def client(default_server_args, request): if request.param: default_server_args.append(request.param) @@ -35,60 +38,9 @@ def client(default_server_args, request): yield remote_server.get_async_client() -EXPECTED_METRICS = [ - "vllm:num_requests_running", - "vllm:num_requests_swapped", - "vllm:num_requests_waiting", - "vllm:gpu_cache_usage_perc", - "vllm:cpu_cache_usage_perc", - "vllm:time_to_first_token_seconds_sum", - "vllm:time_to_first_token_seconds_bucket", - "vllm:time_to_first_token_seconds_count", - "vllm:time_per_output_token_seconds_sum", - "vllm:time_per_output_token_seconds_bucket", - "vllm:time_per_output_token_seconds_count", - "vllm:e2e_request_latency_seconds_sum", - "vllm:e2e_request_latency_seconds_bucket", - "vllm:e2e_request_latency_seconds_count", - "vllm:request_prompt_tokens_sum", - "vllm:request_prompt_tokens_bucket", - "vllm:request_prompt_tokens_count", - "vllm:request_generation_tokens_sum", - "vllm:request_generation_tokens_bucket", - "vllm:request_generation_tokens_count", - "vllm:request_params_n_sum", - "vllm:request_params_n_bucket", - "vllm:request_params_n_count", - "vllm:request_params_best_of_sum", - "vllm:request_params_best_of_bucket", 
- "vllm:request_params_best_of_count", - "vllm:num_preemptions_total", - "vllm:prompt_tokens_total", - "vllm:generation_tokens_total", - "vllm:request_success_total", -] - - -@pytest.mark.asyncio -async def test_metrics_exist(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - # sending a request triggers the metrics to be logged. - await client.completions.create(model=MODEL_NAME, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - response = requests.get(base_url + "/metrics") - assert response.status_code == HTTPStatus.OK - - for metric in EXPECTED_METRICS: - assert metric in response.text - - -_PROMPT = "Hello my name is Robert and I love doing magic because" +_PROMPT = "Hello my name is Robert and I love magic" tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) -_TOKENIZED_PROMPT = tokenizer(_PROMPT) +_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"] _NUM_REQUESTS = 10 _NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT) @@ -97,20 +49,21 @@ async def test_metrics_exist(client: openai.AsyncOpenAI): # {metric_family: [(suffix, expected_value)]} EXPECTED_VALUES = { "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)], - "vllm:time_per_output_token_seconds": [("_count", _NUM_REQUESTS)], + "vllm:time_per_output_token_seconds": + [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))], "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)], - "vllm:request_prompt_tokens": [ - ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST), - ("_count", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], - "vllm:request_generation_tokens": [ - ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), - ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)], + "vllm:request_prompt_tokens": + [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS)], + "vllm:request_generation_tokens": + [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)], "vllm:request_params_best_of": [("_count", _NUM_REQUESTS)], - "vllm:prompt_tokens": [ - ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], - "vllm:generation_tokens": [ - ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:prompt_tokens": [("_total", + _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:generation_tokens": + [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], "vllm:request_success": [("_total", _NUM_REQUESTS)], } @@ -124,13 +77,13 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): await client.completions.create( model=MODEL_NAME, prompt=_TOKENIZED_PROMPT, - max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST, - temperature=0.0) - + max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) response = requests.get(base_url + "/metrics") + print(response.text) assert response.status_code == HTTPStatus.OK + # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): found_metric = False @@ -147,17 +100,68 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): for sample in family.samples: if sample.name == metric_name_w_suffix: found_suffix = True - + # For each suffix, value sure the value matches # what we expect. 
assert sample.value == expected_value, ( - f"{metric_name_w_suffix} expected value of {expected_value}" - f"did not match found value {sample.value}" - ) + f"{metric_name_w_suffix} expected value of " + f"{expected_value} did not match found value " + f"{sample.value}") break assert found_suffix, ( - f"Could not find {metric_name_w_suffix} in Prometheus endpoint") + f"Did not find {metric_name_w_suffix} in prom endpoint" + ) break - - assert found_metric, ( - f"Could not find {metric_family} in Prometheus endpoint") \ No newline at end of file + + assert found_metric, (f"Did not find {metric_family} in prom endpoint") + + +EXPECTED_METRICS = [ + "vllm:num_requests_running", + "vllm:num_requests_swapped", + "vllm:num_requests_waiting", + "vllm:gpu_cache_usage_perc", + "vllm:cpu_cache_usage_perc", + "vllm:time_to_first_token_seconds_sum", + "vllm:time_to_first_token_seconds_bucket", + "vllm:time_to_first_token_seconds_count", + "vllm:time_per_output_token_seconds_sum", + "vllm:time_per_output_token_seconds_bucket", + "vllm:time_per_output_token_seconds_count", + "vllm:e2e_request_latency_seconds_sum", + "vllm:e2e_request_latency_seconds_bucket", + "vllm:e2e_request_latency_seconds_count", + "vllm:request_prompt_tokens_sum", + "vllm:request_prompt_tokens_bucket", + "vllm:request_prompt_tokens_count", + "vllm:request_generation_tokens_sum", + "vllm:request_generation_tokens_bucket", + "vllm:request_generation_tokens_count", + "vllm:request_params_n_sum", + "vllm:request_params_n_bucket", + "vllm:request_params_n_count", + "vllm:request_params_best_of_sum", + "vllm:request_params_best_of_bucket", + "vllm:request_params_best_of_count", + "vllm:num_preemptions_total", + "vllm:prompt_tokens_total", + "vllm:generation_tokens_total", + "vllm:request_success_total", +] + + +@pytest.mark.asyncio +async def test_metrics_exist(client: openai.AsyncOpenAI): + base_url = str(client.base_url)[:-3].strip("/") + + # sending a request triggers the metrics to be logged. 
+ await client.completions.create(model=MODEL_NAME, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + response = requests.get(base_url + "/metrics") + assert response.status_code == HTTPStatus.OK + + for metric in EXPECTED_METRICS: + assert metric in response.text diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 0dbae9a3a1e99..e030f48e1827b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -40,7 +40,7 @@ class Metrics: def __init__(self, labelnames: List[str], max_model_len: int): # Unregister any existing vLLM collectors - # self._unregister_vllm_metrics() + self._unregister_vllm_metrics() # System stats # Scheduler State @@ -178,10 +178,10 @@ def __init__(self, labelnames: List[str], max_model_len: int): multiprocess_mode="sum", ) - # def _unregister_vllm_metrics(self) -> None: - # for collector in list(prometheus_client.REGISTRY._collector_to_names): - # if hasattr(collector, "_name") and "vllm" in collector._name: - # prometheus_client.REGISTRY.unregister(collector) + def _unregister_vllm_metrics(self) -> None: + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) # end-metrics-definitions From a76f38a96319aac1da401b12d8ac0b23a8eea065 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 23:05:15 +0000 Subject: [PATCH 27/40] fix example --- examples/openai_completion_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py index ec10e5e234591..58519f978d340 100644 --- a/examples/openai_completion_client.py +++ b/examples/openai_completion_client.py @@ -19,7 +19,9 @@ model=model, prompt="A robot may not injure a human being", echo=False, -) + n=2, + stream=stream, + logprobs=3) print("Completion results:") if stream: From 6eea97c31f0d2c4b50d1b8bcbb1ff9ac0a056133 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 8 Aug 2024 23:07:53 +0000 Subject: [PATCH 28/40] remove unregistering --- vllm/engine/metrics.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e030f48e1827b..7e7b340082187 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -39,8 +39,6 @@ class Metrics: _histogram_cls = prometheus_client.Histogram def __init__(self, labelnames: List[str], max_model_len: int): - # Unregister any existing vLLM collectors - self._unregister_vllm_metrics() # System stats # Scheduler State @@ -178,11 +176,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): multiprocess_mode="sum", ) - def _unregister_vllm_metrics(self) -> None: - for collector in list(prometheus_client.REGISTRY._collector_to_names): - if hasattr(collector, "_name") and "vllm" in collector._name: - prometheus_client.REGISTRY.unregister(collector) - # end-metrics-definitions @@ -268,10 +261,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): raise ImportError("RayMetrics requires Ray to be installed.") super().__init__(labelnames, max_model_len) - def _unregister_vllm_metrics(self) -> None: - # No-op on purpose - pass - def build_1_2_5_buckets(max_value: int) -> List[int]: """ From 0745f7d25f0f5f05d5a680d232a9ef584947f982 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 14 Aug 2024 18:01:35 +0000 Subject: [PATCH 29/40] cleanup for prom multiprocessing --- 
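For reviewers: the hunk below calls prometheus_client's
multiprocess.mark_process_dead() for the RPC server process's PID once that
process has joined; in multiprocess mode this is the documented way to clean
up the per-PID live-gauge files under PROMETHEUS_MULTIPROC_DIR when a worker
exits. A minimal, self-contained sketch of that cleanup pattern follows for
reference only; the worker() function and the demo_requests metric are made
up for illustration and are not vLLM code, and it assumes prometheus_client
is installed:

    import os
    import tempfile
    from multiprocessing import Process

    # PROMETHEUS_MULTIPROC_DIR must be set before prometheus_client is
    # imported, in both the parent and the child. A fixed path is used so
    # the child resolves the same directory even under the spawn start
    # method.
    PROM_DIR = os.path.join(tempfile.gettempdir(), "prom_multiproc_demo")
    os.makedirs(PROM_DIR, exist_ok=True)
    os.environ["PROMETHEUS_MULTIPROC_DIR"] = PROM_DIR


    def worker() -> None:
        # Imported in the child so it picks up the env var set above; the
        # counter value is written to a per-PID .db file in PROM_DIR.
        from prometheus_client import Counter
        Counter("demo_requests", "Hypothetical example counter").inc()


    if __name__ == "__main__":
        from prometheus_client import (CollectorRegistry, generate_latest,
                                       multiprocess)

        p = Process(target=worker)
        p.start()
        p.join()

        # Same call as in this patch: remove the live-gauge files written
        # by the child that just exited (counter files are kept so totals
        # survive worker restarts).
        multiprocess.mark_process_dead(p.pid)

        # Aggregate whatever remains in PROM_DIR and print the exposition.
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        print(generate_latest(registry).decode())

Running the sketch prints a demo_requests_total sample aggregated from the
child's file; without the mark_process_dead() call, live-mode gauge files
left behind by dead PIDs would keep contributing to /metrics.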
vllm/entrypoints/openai/api_server.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8d797d81f9c1b..783c2c4c7f392 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -163,6 +163,12 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Wait for server process to join rpc_server_process.join() + # Lazy import for prometheus multiprocessing. + # We need to set PROMETHEUS_MULTIPROC_DIR environment variable + # before prometheus_client is imported. + # See https://prometheus.github.io/client_python/multiprocess/ + from prometheus_client import multiprocess + multiprocess.mark_process_dead(rpc_server_process.pid) router = APIRouter() From 5c253d90f3a25ef71d0ede4c31e629348935485f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 14 Aug 2024 18:13:38 +0000 Subject: [PATCH 30/40] format --- vllm/entrypoints/openai/api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 783c2c4c7f392..6de588919f154 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -170,6 +170,7 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: from prometheus_client import multiprocess multiprocess.mark_process_dead(rpc_server_process.pid) + router = APIRouter() From af3474a37a14332dfd28765efb2429be6392aa56 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 14:53:32 +0000 Subject: [PATCH 31/40] stash --- vllm/engine/metrics.py | 23 ++++++++++++++++++++++- vllm/engine/metrics_types.py | 8 +++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 7e7b340082187..d362d0004a2f2 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -39,6 +39,13 @@ class Metrics: _histogram_cls = prometheus_client.Histogram def __init__(self, labelnames: List[str], max_model_len: int): + + # Config Stats + self.gauge_cache_info = self._gauge_cls( + name="vllm:cache_config_info", + documentation="Info about the cache configuration for vLLM.", + labelnames=labelnames, + multiprocess_mode="sum") # System stats # Scheduler State @@ -176,9 +183,23 @@ def __init__(self, labelnames: List[str], max_model_len: int): multiprocess_mode="sum", ) - # end-metrics-definitions + def _create_info_cache_config(self) -> None: + # Config Information + self.info_cache_config = self._gauge_cls( + name='vllm:cache_config', + documentation='Information of the LLMEngine CacheConfig', + labelnames= + multiprocess_mode="sum" + ) + + + def _unregister_vllm_metrics(self) -> None: + for collector in list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) + class _RayGaugeWrapper: """Wraps around ray.util.metrics.Gauge to provide same API as diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index ee1736c467653..8c01641de0082 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -1,7 +1,7 @@ import time from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import Dict, List, Optional, Protocol from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -60,3 +60,9 @@ def 
maybe_update_spec_decode_metrics(self, stats: Stats): to be emitted at same time as log interval).""" if stats.spec_decode_metrics is not None: self.spec_decode_metrics = stats.spec_decode_metrics + + +class SupportsMetricsInfo(Protocol): + + def metrics_info(self) -> Dict[str, str]: + ... From 13c04446283fb3c9702434a8a1e053b17a53c2c9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 15:20:10 +0000 Subject: [PATCH 32/40] updated --- vllm/engine/llm_engine.py | 2 ++ vllm/engine/metrics.py | 42 +++++++++++++++++++++++++----------- vllm/engine/metrics_types.py | 16 ++++++++------ 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ca2e90f3d563d..6e60de998d6bc 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -357,6 +357,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: labels=dict(model_name=model_config.served_model_name), max_model_len=self.model_config.max_model_len), } + self.stat_loggers["prometheus"].info("cache_config", + self.cache_config) self.tracer = None if self.observability_config.otlp_traces_endpoint: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d362d0004a2f2..6d630afbe9657 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -5,7 +5,8 @@ import numpy as np import prometheus_client -from vllm.engine.metrics_types import StatLoggerBase, Stats +from vllm.engine.metrics_types import (StatLoggerBase, Stats, + SupportsMetricsInfo) from vllm.executor.ray_utils import ray from vllm.logger import init_logger @@ -39,7 +40,9 @@ class Metrics: _histogram_cls = prometheus_client.Histogram def __init__(self, labelnames: List[str], max_model_len: int): - + # Unregister any existing vLLM collectors (for CI/CD) + self._unregister_vllm_metrics() + # Config Stats self.gauge_cache_info = self._gauge_cls( name="vllm:cache_config_info", @@ -183,17 +186,8 @@ def __init__(self, labelnames: List[str], max_model_len: int): multiprocess_mode="sum", ) -# end-metrics-definitions - def _create_info_cache_config(self) -> None: - # Config Information - self.info_cache_config = self._gauge_cls( - name='vllm:cache_config', - documentation='Information of the LLMEngine CacheConfig', - labelnames= - multiprocess_mode="sum" - ) - +# end-metrics-definitions def _unregister_vllm_metrics(self) -> None: for collector in list(prometheus_client.REGISTRY._collector_to_names): @@ -282,6 +276,10 @@ def __init__(self, labelnames: List[str], max_model_len: int): raise ImportError("RayMetrics requires Ray to be installed.") super().__init__(labelnames, max_model_len) + def _unregister_vllm_metrics(self) -> None: + # No-op on purpose + pass + def build_1_2_5_buckets(max_value: int) -> List[int]: """ @@ -381,10 +379,14 @@ def _format_spec_decode_metrics_str( f"Number of draft tokens: {metrics.draft_tokens}, " f"Number of emitted tokens: {metrics.emitted_tokens}.") + def info(self, type: str, obj: SupportsMetricsInfo) -> None: + raise NotImplementedError + class PrometheusStatLogger(StatLoggerBase): """PrometheusStatLogger is used LLMEngine to log to Promethus.""" _metrics_cls = Metrics + _gauge_cls = prometheus_client.Gauge def __init__(self, local_interval: float, labels: Dict[str, str], max_model_len: int) -> None: @@ -523,7 +525,23 @@ def log(self, stats: Stats): self.last_local_log = stats.now self.spec_decode_metrics = None + def info(self, type: str, obj: SupportsMetricsInfo) -> None: + # Info type metrics are syntactic sugar for a gauge 
permanently set to 1 + # Since prometheus multiprocessing mode does not support Info, emulate + # info here with a gauge. + if type == "cache_config": + metrics_info = obj.metrics_info() + info_gauge = self._gauge_cls( + name='vllm:cache_config', + documentation='Information of the LLMEngine CacheConfig', + labelnames=metrics_info.keys(), + multiprocess_mode="mostrecent") + info_gauge.labels(**metrics_info).set(1) + class RayPrometheusStatLogger(PrometheusStatLogger): """RayPrometheusStatLogger uses Ray metrics instead.""" _metrics_cls = RayMetrics + + def info(self, type: str, obj: SupportsMetricsInfo) -> None: + return None diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 8c01641de0082..fc4c6efe47e65 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -40,6 +40,12 @@ class Stats: spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None +class SupportsMetricsInfo(Protocol): + + def metrics_info(self) -> Dict[str, str]: + ... + + class StatLoggerBase(ABC): """Base class for StatLogger.""" @@ -55,14 +61,12 @@ def __init__(self, local_interval: float) -> None: def log(self, stats: Stats) -> None: raise NotImplementedError + @abstractmethod + def info(self, type: str, obj: SupportsMetricsInfo) -> None: + raise NotImplementedError + def maybe_update_spec_decode_metrics(self, stats: Stats): """Save spec decode metrics (since they are unlikely to be emitted at same time as log interval).""" if stats.spec_decode_metrics is not None: self.spec_decode_metrics = stats.spec_decode_metrics - - -class SupportsMetricsInfo(Protocol): - - def metrics_info(self) -> Dict[str, str]: - ... From c4477c48d259ac7b041cfc61c6e927b99c5f7568 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 15:22:32 +0000 Subject: [PATCH 33/40] updated --- tests/entrypoints/openai/test_metrics.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 3229b12e45e6f..f575a7706fc72 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -147,6 +147,18 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_success_total", + "vllm:cache_config_info" + # labels in cache_config_info + "block_size", + "cache_dtype", + "cpu_offload_gb", + "enable_prefix_caching" + "gpu_memory_utilization=", + "num_cpu_blocks", + "num_gpu_blocks=", + "num_gpu_blocks_override", + "sliding_window", + "swap_space_bytes", ] From 281a26a3b639668124c4c08e909a970ec03e231f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 15:24:45 +0000 Subject: [PATCH 34/40] fix --- tests/entrypoints/openai/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index f575a7706fc72..13899cc5dc12a 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -147,7 +147,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_success_total", - "vllm:cache_config_info" + "vllm:cache_config_info", # labels in cache_config_info "block_size", "cache_dtype", From e79349852ddc596a21cc5adcbf2c292f02ddae92 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 15:27:45 +0000 Subject: [PATCH 35/40] 
fix naming --- vllm/engine/metrics.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 6d630afbe9657..876b96191d5d0 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -43,13 +43,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): # Unregister any existing vLLM collectors (for CI/CD) self._unregister_vllm_metrics() - # Config Stats - self.gauge_cache_info = self._gauge_cls( - name="vllm:cache_config_info", - documentation="Info about the cache configuration for vLLM.", - labelnames=labelnames, - multiprocess_mode="sum") - # System stats # Scheduler State self.gauge_scheduler_running = self._gauge_cls( @@ -532,8 +525,8 @@ def info(self, type: str, obj: SupportsMetricsInfo) -> None: if type == "cache_config": metrics_info = obj.metrics_info() info_gauge = self._gauge_cls( - name='vllm:cache_config', - documentation='Information of the LLMEngine CacheConfig', + name="vllm:cache_config_info", + documentation="Information of the LLMEngine CacheConfig", labelnames=metrics_info.keys(), multiprocess_mode="mostrecent") info_gauge.labels(**metrics_info).set(1) From 53a56d541660e02da842574ebc79cd187e743f02 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 15:33:32 +0000 Subject: [PATCH 36/40] comment --- vllm/engine/metrics_types.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index fc4c6efe47e65..ae2c056336e6e 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -1,3 +1,16 @@ +""" +These types are defined in this file to avoid importing vllm.engine.metrics +and therefore importing prometheus_client. + +This is required due to usage of Prometheus multiprocess mode to enable +metrics after splitting out the uvicorn process from the engine process. + +Prometheus multiprocess mode requires setting PROMETHEUS_MULTIPROC_DIR +before prometheus_client is imported. Typically, this is done by setting +the env variable before launch, but since we are a library, we need to +do this in Python code and lazily import prometheus_client. 
+""" + import time from abc import ABC, abstractmethod from dataclasses import dataclass @@ -6,6 +19,7 @@ from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + @dataclass class Stats: """Created by LLMEngine for use by StatLogger.""" From 59479a62abb6fa31d8d27f753e54fc973a65a1f6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 15:33:43 +0000 Subject: [PATCH 37/40] format --- vllm/engine/metrics_types.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index ae2c056336e6e..7449aafc5aecb 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -19,7 +19,6 @@ from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics - @dataclass class Stats: """Created by LLMEngine for use by StatLogger.""" From f74d426728554cc3261ca1da4b2b8487efb21c68 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 15:40:14 +0000 Subject: [PATCH 38/40] fix cache_config_info --- tests/entrypoints/openai/test_metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 13899cc5dc12a..cbe601e623056 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -152,10 +152,10 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "block_size", "cache_dtype", "cpu_offload_gb", - "enable_prefix_caching" - "gpu_memory_utilization=", + "enable_prefix_caching", + "gpu_memory_utilization", "num_cpu_blocks", - "num_gpu_blocks=", + "num_gpu_blocks", "num_gpu_blocks_override", "sliding_window", "swap_space_bytes", From 224c9879ef346e1a9fb965fd75b4b03136f0fdab Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 18:47:36 +0000 Subject: [PATCH 39/40] properly pass multiprocess_mode to RayGaugeCLS --- vllm/engine/metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 876b96191d5d0..679e626eda3c1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -195,7 +195,9 @@ class _RayGaugeWrapper: def __init__(self, name: str, documentation: str = "", - labelnames: Optional[List[str]] = None): + labelnames: Optional[List[str]] = None, + multiprocess_mode: str=""): + del multiprocess_mode labelnames_tuple = tuple(labelnames) if labelnames else None self._gauge = ray_metrics.Gauge(name=name, description=documentation, From ad26ad7452607d9135da9a0aa4f0d127b04a9dcf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 18 Aug 2024 18:47:54 +0000 Subject: [PATCH 40/40] ./format --- vllm/engine/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 679e626eda3c1..1071786c27cd6 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -196,7 +196,7 @@ def __init__(self, name: str, documentation: str = "", labelnames: Optional[List[str]] = None, - multiprocess_mode: str=""): + multiprocess_mode: str = ""): del multiprocess_mode labelnames_tuple = tuple(labelnames) if labelnames else None self._gauge = ray_metrics.Gauge(name=name,