diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f5051eb86..c6854a9f6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -6,6 +6,7 @@ on:
   push:
     branches:
       - release
+      - 312-test
     paths-ignore:
       - "**.md"
       - "proto/**"
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 32157c16e..4c7396e72 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -1,318 +1,26 @@
-# Please update any changes made here to
-# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst
-
-## Global Args #################################################################
-ARG BASE_UBI_IMAGE_TAG=9.4-1134
-ARG PYTHON_VERSION=3.11
-
-# NOTE: This setting only has an effect when not using prebuilt-wheel kernels
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-
-
-## Base Layer ##################################################################
-FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
-
-WORKDIR /workspace
-
-ENV LANG=C.UTF-8 \
-    LC_ALL=C.UTF-8
-
-# Some utils for dev purposes - tar required for kubectl cp
-RUN microdnf install -y \
-    which procps findutils tar vim \
-    && microdnf clean all
-
-
-## Python Installer ############################################################
-FROM base as python-install
-
-ARG PYTHON_VERSION
-ARG MINIFORGE_VERSION=24.3.0-0
-
-RUN curl -fsSL -o ~/miniforge3.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-$(uname)-$(uname -m).sh" && \
-    chmod +x ~/miniforge3.sh && \
-    bash ~/miniforge3.sh -b -p /opt/conda && \
-    source "/opt/conda/etc/profile.d/conda.sh" && \
-    conda create -y -p /opt/vllm python=${PYTHON_VERSION} && \
-    conda activate /opt/vllm && \
-    rm ~/miniforge3.sh
-# use of the /opt/vllm env requires:
-# ENV PATH=/opt/vllm/bin/:$PATH
-
-## CUDA Base ###################################################################
-FROM base as cuda-base
-
-# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
-# this env var is set to 12.2.0, even though it's compatible
-#ENV CUDA_VERSION=12.2.0 \
-ENV CUDA_VERSION=12.0.0 \
-    NV_CUDA_LIB_VERSION=12.2.0-1 \
-    NVIDIA_VISIBLE_DEVICES=all \
-    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-    NV_CUDA_CUDART_VERSION=12.2.53-1 \
-    NV_CUDA_COMPAT_VERSION=535.104.12
-
-RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
-    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
-
-RUN microdnf install -y \
-    cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
-    cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
-    && microdnf clean all
-
-ENV CUDA_HOME="/usr/local/cuda" \
-    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
-
-
-## CUDA Runtime ################################################################
-FROM cuda-base as cuda-runtime
-
-ENV NV_NVTX_VERSION=12.2.53-1 \
-    NV_LIBNPP_VERSION=12.1.1.14-1 \
-    NV_LIBCUBLAS_VERSION=12.2.1.16-1 \
-    NV_LIBNCCL_PACKAGE_VERSION=2.18.5-1+cuda12.2
-
-RUN microdnf install -y \
-    cuda-libraries-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-nvtx-12-2-${NV_NVTX_VERSION} \
-    libnpp-12-2-${NV_LIBNPP_VERSION} \
-    libcublas-12-2-${NV_LIBCUBLAS_VERSION} \
-    libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
-    && microdnf clean all
-
-
-## CUDA Development ############################################################
-FROM cuda-base as cuda-devel
-
-ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
-    NV_NVML_DEV_VERSION=12.2.81-1 \
-    NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
-    NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
-    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2
-
-RUN microdnf install -y \
-    cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
-    cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
-    cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
-    libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
-    libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
-    libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
-    && microdnf clean all
-
-ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
-
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.2/compat/
-
-## Python cuda base #################################################################
-FROM cuda-devel as python-cuda-base
-
-COPY --from=python-install --link /opt/vllm /opt/vllm
-ENV PATH=/opt/vllm/bin/:$PATH
-
-# install cuda and common dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    pip3 install \
-    -r requirements-cuda.txt
-
-## Development #################################################################
-FROM python-cuda-base AS dev
-
-# install build and runtime dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
-    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
-    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
-    pip3 install \
-    -r requirements-cuda.txt \
-    -r requirements-dev.txt
-
-## Proto Compilation ###########################################################
-FROM python-install AS gen-protos
-
-ENV PATH=/opt/vllm/bin/:$PATH
-
-RUN microdnf install -y \
-    make \
-    findutils \
-    && microdnf clean all
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=Makefile,target=Makefile \
-    --mount=type=bind,source=proto,target=proto \
-    make gen-protos
-
-## Extension Cache #############################################################
-# Instead of compiling artifacts every build just copy from pre-built wheel
-# This might not work if the PyTorch and CUDA versions don't match!
-FROM base as prebuilt-wheel
-
-RUN microdnf install -y \
-    unzip \
-    && microdnf clean all
-
-ARG PYTHON_VERSION
-# 0.4.2 is built for CUDA 12.1 and PyTorch 2.3.0
-ARG VLLM_WHEEL_VERSION=0.4.3
-
-RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
-    && unzip vllm.whl \
-    && rm vllm.whl
-# compiled extensions located at /workspace/vllm/*.so
-
-## Builder #####################################################################
-FROM dev AS build
-
-# install build dependencies
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
-    pip install -r requirements-build.txt
-
-# copy input files
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
-
-ARG TORCH_CUDA_ARCH_LIST
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=2
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-# Setup path stuff? Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
-ENV PATH=/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-
-# Copy the entire directory before building wheel
-COPY --link vllm vllm
-
-# Comment if building *.so files from scratch
-##################################################
-# Copy the prebuilt *.so files
-COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
-ENV VLLM_USE_PRECOMPILED=1
-##################################################
-# Comment if not building .so files from scratch
-#RUN microdnf install -y git \
-#    && microdnf clean all
-##################################################
-
-# Copy over the generated *.pb2 files
-COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist
-
-#################### libsodium Build IMAGE ####################
-FROM base as libsodium-builder
-
-RUN microdnf install -y gcc gzip \
-    && microdnf clean all
-
-WORKDIR /usr/src/libsodium
-
-ARG LIBSODIUM_VERSION=1.0.19
-RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
-    && tar -xzvf libsodium*.tar.gz \
-    && rm -f libsodium*.tar.gz \
-    && mv libsodium*/* ./
-
-RUN ./configure && make && make check
-
 ## Release #####################################################################
 # Note from the non-UBI Dockerfile:
 # We used base cuda image because pytorch installs its own cuda libraries.
 # However pynccl depends on cuda libraries so we had to switch to the runtime image
 # In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
-FROM cuda-runtime AS vllm-openai
-
-WORKDIR /workspace
-
-# Create release python environment
-COPY --from=python-cuda-base --link /opt/vllm /opt/vllm
-ENV PATH=/opt/vllm/bin/:$PATH
-
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
-
-# Install the vllm_nccl package which is a bit quirky
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    # The "install" happens in `setup.py` so it happens when built...
-    # Remove the already installed package and the cached wheel
-    pip uninstall -y vllm-nccl-cu12 \
-    && pip cache remove vllm_nccl* \
-    # install the version depended on by vllm requirements
-    && pip install vllm-nccl-cu12 -r requirements-cuda.txt \
-    # The lib is downloaded to root's home directory... move it
-    && mv ~/.config/vllm/nccl/cu12/libnccl.so.2* /usr/local/lib/libnccl.so.2
-ENV VLLM_NCCL_SO_PATH=/usr/local/lib/libnccl.so.2
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip3 install \
-    # additional dependencies for the TGIS gRPC server
-    grpcio-tools==1.63.0 \
-    # additional dependencies for openai api_server
-    accelerate==0.30.0 \
-    # hf_transfer for faster HF hub downloads
-    hf_transfer==0.1.6
-
-# Triton needs a CC compiler
-RUN microdnf install -y gcc \
-    && microdnf clean all
-
-# patch triton (fix for #720)
-COPY triton_patch/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/triton/runtime/custom_cache_manager.py
-
-# Install libsodium for Tensorizer encryption
-RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
-    cd /usr/src/libsodium \
-    && make install
-ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-
-ENV HF_HUB_OFFLINE=1 \
-    PORT=8000 \
-    GRPC_PORT=8033 \
-    HOME=/home/vllm \
-    VLLM_USAGE_SOURCE=production-docker-image \
-    VLLM_WORKER_MULTIPROC_METHOD=fork \
-    TRITON_CACHE_MANAGER="triton.runtime.custom_cache_manager:CustomCacheManager"
-
-# setup non-root user for OpenShift
-RUN microdnf install -y shadow-utils \
-    && umask 002 \
-    && useradd --uid 2000 --gid 0 vllm \
-    && microdnf remove -y shadow-utils \
-    && microdnf clean all \
-    && chmod g+rwx $HOME /usr/src /workspace
-
-COPY LICENSE /licenses/vllm.md
-
-USER 2000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+FROM vllm/vllm-openai:v0.5.5 AS vllm-openai
+
+
+# install python 3.12 instead
+
+ARG PYTHON_VERSION=3.12
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common git curl sudo \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
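Quick smoke test for the Python 3.12 switch (a sketch, not part of the patch: the image tag is arbitrary, and it assumes the build runs from the repo root with the file still named Dockerfile.ubi and the single stage still named vllm-openai, as in the diff above):

    # build only the vllm-openai stage from Dockerfile.ubi
    docker build -f Dockerfile.ubi --target vllm-openai -t vllm-ubi:312-test .
    # override the inherited api_server entrypoint and check the interpreter the image resolves for python3
    docker run --rm --entrypoint python3 vllm-ubi:312-test --version          # expect Python 3.12.x
    # confirm pip was bootstrapped via get-pip.py against the same interpreter
    docker run --rm --entrypoint python3 vllm-ubi:312-test -m pip --version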