Sync with upstream @ v0.6.2

opendatahub-io · Sep 26, 2024 · 18d7da7
2 parents 9ebf28d + 7193774
commit 18d7da7
Show file tree

Hide file tree

Showing 385 changed files with 19,991 additions and 7,135 deletions.
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -8,8 +8,7 @@ steps:
           containers:
           - image: badouralix/curl-jq
             command:
-            - sh
-            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
   - wait
   - label: "A100"
     agents:

diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -2,9 +2,11 @@
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
 URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
 
+TIMEOUT_SECONDS=10
+
 retries=0
 while [ $retries -lt 1000 ]; do
-    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+    if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
         exit 0
     fi
 

diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
@@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_encoder_decoder_attn.py \
   --ignore=kernels/test_flash_attn.py \
   --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_gguf.py \
   --ignore=kernels/test_int8_quant.py \
   --ignore=kernels/test_machete_gemm.py \
   --ignore=kernels/test_mamba_ssm.py \
@@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_sampler.py"
 fi
 
+#ignore certain Entrypoints tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_accuracy.py \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
+fi
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
@@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
+  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
   pytest -v -s tests/models/decoder_only/language \
     --ignore=tests/models/test_fp8.py \
     --ignore=tests/models/decoder_only/language/test_jamba.py \

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -43,13 +43,15 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
+  - tests/mq_llm_engine
   - tests/async_engine
   - tests/test_inputs
   - tests/multimodal
   - tests/test_utils
   - tests/worker
   commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
@@ -68,7 +70,7 @@ steps:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-  
+
 - label: Core Test # 10min
   mirror_hardwares: [amd]
   fast_check: true
@@ -82,14 +84,17 @@ steps:
 - label: Entrypoints Test # 20min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
   - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@@ -163,13 +168,6 @@ steps:
     - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference_encoder_decoder.py
 
-- label: torch compile integration test
-  source_file_dependencies:
-  - vllm/
-  commands:
-    - pytest -v -s ./compile/test_full_graph.py
-    - pytest -v -s ./compile/test_wrapper.py
-
 - label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -212,6 +210,21 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4
 
+- label: "PyTorch Fullgraph Smoke Test"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph_smoke.py
+
+- label: "PyTorch Fullgraph Test"
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph.py
+
 - label: Kernels Test %N # 30min each
   mirror_hardwares: [amd]
   source_file_dependencies:
@@ -259,6 +272,13 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1
 
+- label: Encoder Decoder tests # 5min
+  source_file_dependencies:
+  - vllm/
+  - tests/encoder_decoder
+  commands:
+    - pytest -v -s encoder_decoder
+
 - label: OpenAI-Compatible Tool Use # 20 min
   fast_check: false
   mirror_hardwares: [ amd ]
@@ -348,7 +368,10 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
+  - vllm/compilation
   commands:
+  - pytest -v -s ./compile/test_full_graph_multi_gpu.py
+  - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
@@ -25,10 +25,10 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+        pip install -r requirements-lint.txt
     - name: Analysing the code with ruff
       run: |
-        ruff .
+        ruff check .
     - name: Spelling check with codespell
       run: |
         codespell --toml pyproject.toml

diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
@@ -15,5 +15,6 @@ $python_executable -m pip install -r requirements-cuda.txt
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,8 @@
-# vllm commit id, generated by setup.py
-vllm/commit_id.py
+# version file generated by setuptools-scm
+/vllm/_version.py
+
+# vllm-flash-attn built from source
+vllm/vllm_flash_attn/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -12,6 +15,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+cmake-build-*/
+CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/