diff --git a/.github/workflows/check_quality.yaml b/.github/workflows/check_quality.yaml index 36b99f99..c2db2f09 100644 --- a/.github/workflows/check_quality.yaml +++ b/.github/workflows/check_quality.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - name: Install quality requirements run: | diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml index 752afab7..70b8c02c 100644 --- a/.github/workflows/test_api_cpu.yaml +++ b/.github/workflows/test_api_cpu.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - name: Install dependencies run: | diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml index df72ffb2..99a24217 100644 --- a/.github/workflows/test_api_misc.yaml +++ b/.github/workflows/test_api_misc.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - name: Install requirements run: | diff --git a/.github/workflows/test_cli_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml index 9150a90f..2e9008bf 100644 --- a/.github/workflows/test_cli_cpu_neural_compressor.yaml +++ b/.github/workflows/test_cli_cpu_neural_compressor.yaml @@ -21,12 +21,12 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - - name: Install Intel Neural Compressor CPU requirements + - name: Install requirements run: | pip install --upgrade pip pip install -e .[testing,neural-compressor,diffusers,timm] - - name: Run Intel Neural Compressor CPU tests + - name: Run CPU tests run: pytest -k "cli and cpu and neural_compressor" diff --git a/.github/workflows/test_cli_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml index e7caf218..486429af 100644 --- a/.github/workflows/test_cli_cpu_onnxruntime.yaml +++ b/.github/workflows/test_cli_cpu_onnxruntime.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - name: Install requirements run: | @@ -29,5 +29,4 @@ jobs: pip install -e .[testing,onnxruntime,diffusers,timm] - name: Run tests - run: | - pytest -k "cli and cpu and onnxruntime" + run: pytest -k "cli and cpu and onnxruntime" diff --git a/.github/workflows/test_cli_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml index 00b40aef..bbfa29a8 100644 --- a/.github/workflows/test_cli_cpu_openvino.yaml +++ b/.github/workflows/test_cli_cpu_openvino.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - name: Install requirements run: | @@ -29,5 +29,4 @@ jobs: pip install -e .[testing,openvino,diffusers,timm] - name: Run tests - run: | - pytest -k "cli and cpu and openvino" + run: pytest -k "cli and cpu and openvino" diff --git a/.github/workflows/test_cli_cpu_py_tgi.yaml b/.github/workflows/test_cli_cpu_py_tgi.yaml new file mode 100644 index 00000000..1ec01c84 --- /dev/null +++ b/.github/workflows/test_cli_cpu_py_tgi.yaml @@ -0,0 +1,35 @@ +name: CLI CPU Py-TGI Tests + +on: + workflow_dispatch: + push: + branches: [main] + pull_request: + types: [opened, reopened, synchronize] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + 
cancel-in-progress: true + +jobs: + run_cli_cpu_py_tgi_tests: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + + - name: Install requirements + run: | + pip install --upgrade pip + pip install -e .[testing,py-tgi] + + - name: Pull TGI docker image + run: docker pull ghcr.io/huggingface/text-generation-inference:latest + + - name: Run tests + run: pytest -k "cli and cpu and py_tgi" diff --git a/.github/workflows/test_cli_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml index 3df5368b..332229b7 100644 --- a/.github/workflows/test_cli_cpu_pytorch.yaml +++ b/.github/workflows/test_cli_cpu_pytorch.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - name: Install requirements run: | @@ -29,5 +29,4 @@ jobs: pip install -e .[testing,diffusers,timm,peft] - name: Run tests - run: | - pytest -k "cli and cpu and pytorch" + run: pytest -k "cli and cpu and pytorch" diff --git a/.github/workflows/test_cli_misc.yaml b/.github/workflows/test_cli_misc.yaml index 5b55c0a7..1ca9d54f 100644 --- a/.github/workflows/test_cli_misc.yaml +++ b/.github/workflows/test_cli_misc.yaml @@ -21,7 +21,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: '3.10' + python-version: "3.10" - name: Install requirements run: | diff --git a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml index a98bfc15..1d52ee33 100644 --- a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml +++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml @@ -26,15 +26,20 @@ jobs: --tag opt-bench-tensorrt:latest . 
+ - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm --pid host --shm-size 64G --env USE_CUDA="1" + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-tensorrt:latest -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x" diff --git a/Makefile b/Makefile index 60493c16..61c77458 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,28 @@ # List of targets that are not associated with files .PHONY: quality style install build_docker_cpu build_docker_cuda build_docker_rocm build_docker_tensorrt test_api_misc test_api_cpu test_api_cuda test_api_rocm test_api_tensorrt test_cli_misc test_cli_cpu_pytorch test_cli_cpu_neural_compressor test_cli_cpu_onnxruntime test_cli_cpu_openvino test_cli_cuda_pytorch test_cli_rocm_pytorch test_cli_tensorrt_onnxruntime test_cli_tensorrt_llm +, := , PWD := $(shell pwd) USER_ID := $(shell id -u) GROUP_ID := $(shell id -g) +API_MISC_REQS := testing +API_CPU_REQS := testing,timm,diffusers +API_CUDA_REQS := testing,timm,diffusers +API_ROCM_REQS := testing,timm,diffusers + +CLI_MISC_REQS := testing + +CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers +CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers +CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft +CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft + +CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers +CLI_CPU_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft +CLI_CPU_ONNXRUNTIME_REQS := testing,onnxruntime,timm,diffusers +CLI_CPU_NEURAL_COMPRESSOR_REQS := testing,neural-compressor,timm,diffusers + quality: ruff check . ruff format --check . @@ -18,49 +36,44 @@ install: ## Docker builds +define build_docker + docker build -f docker/$(1).dockerfile --build-arg USER_ID=$(USER_ID) --build-arg GROUP_ID=$(GROUP_ID) -t opt-bench-$(1):local . +endef + build_docker_cpu: - docker build -f docker/cpu.dockerfile --build-arg USER_ID=$(USER_ID) --build-arg GROUP_ID=$(GROUP_ID) -t opt-bench-cpu:local . + $(call build_docker,cpu) build_docker_cuda: - docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(USER_ID) --build-arg GROUP_ID=$(GROUP_ID) -t opt-bench-cuda:local . + $(call build_docker,cuda) build_docker_rocm: - docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(USER_ID) --build-arg GROUP_ID=$(GROUP_ID) -t opt-bench-rocm:local . - -build_docker_tensorrt: - docker build -f docker/tensorrt.dockerfile --build-arg USER_ID=$(USER_ID) --build-arg GROUP_ID=$(GROUP_ID) -t opt-bench-tensorrt:local . 
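For readability, here is a rough Python equivalent of what the `Get GPUs with most free memory` shell pipeline above computes (a sketch only; the workflow step itself stays as shell, and the helper name below is illustrative):

```python
import subprocess


def devices_with_most_free_memory(count: int = 2) -> str:
    """Return the indices of the `count` GPUs with the most free memory, e.g. "0,1"."""
    output = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.free,index", "--format=csv,noheader,nounits"],
        text=True,
    )
    # each line looks like "<free MiB>, <index>"
    gpus = [line.split(", ") for line in output.strip().splitlines()]
    # sort by free memory (descending) and keep the indices of the top `count` devices
    best = sorted(gpus, key=lambda pair: int(pair[0]), reverse=True)[:count]
    return ",".join(index for _, index in best)


if __name__ == "__main__":
    print(devices_with_most_free_memory())
```

The workflow then feeds the resulting comma-separated indices into `--gpus '"device=..."'`, so tests on a shared runner land on the least busy GPUs instead of a hard-coded `device=0,1`.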
- -## API tests + $(call build_docker,rocm) -test_api_misc: - docker run \ - --rm \ - --pid host \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-cpu:local -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x" +## Tests -test_api_cpu: +define test_ubuntu docker run \ --rm \ --pid host \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:local -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cpu' -x" + opt-bench-$(1):local -c "pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x" +endef -test_api_cuda: +define test_nvidia docker run \ --rm \ --pid host \ + --shm-size 64G \ --gpus '"device=0,1"' \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cuda:local -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x" + opt-bench-$(1):local -c "pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x" +endef -test_api_rocm: +define test_amdgpu docker run \ --rm \ --pid host \ @@ -71,101 +84,44 @@ test_api_rocm: --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-rocm:local -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x" + opt-bench-$(1):local -c "pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x" +endef -## CLI tests +# group the extra +test_api_cpu: + $(call test_ubuntu,cpu,$(API_CPU_REQS),api and cpu) -### CLI CPU tests +test_api_cuda: + $(call test_nvidia,cuda,$(API_CUDA_REQS),api and cuda) -test_cli_misc: - docker run \ - --rm \ - --pid host \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-cpu:local -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and not (cpu or cuda or rocm or tensorrt)' -x" +test_api_rocm: + $(call test_amdgpu,rocm,$(API_ROCM_REQS),api and rocm) -test_cli_cpu_pytorch: - docker run \ - --rm \ - --pid host \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-cpu:local -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x" +test_api_misc: + $(call test_ubuntu,cpu,$(API_MISC_REQS),api and not (cpu or cuda or rocm or tensorrt)) -test_cli_cpu_neural_compressor: - docker run \ - --rm \ - --pid host \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-cpu:local -c "pip install -e .[testing,neural-compressor,diffusers,timm] && pytest tests/ -k 'cli and cpu and neural_compressor' -x" +## CLI tests -test_cli_cpu_onnxruntime: - docker run \ - --rm \ - --pid host \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-cpu:local -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x" +test_cli_cuda_pytorch: + $(call test_nvidia,cuda,$(CLI_CUDA_PYTORCH_REQS),cli and cuda and pytorch) -test_cli_cpu_openvino: - docker run \ - --rm \ - --pid host \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-cpu:local -c "pip install -e .[testing,openvino,diffusers,timm] && pytest tests/ -k 'cli and cpu and openvino' -x" +test_cli_rocm_pytorch: + $(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft) -### CLI GPU tests +test_cli_cuda_onnxruntime: + $(call test_nvidia,cuda,$(CLI_CUDA_ONNXRUNTIME_REQS),cli and cuda and onnxruntime) 
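All of these targets, like the workflows above, select tests with pytest `-k` keyword expressions rather than separate test files. As a reminder of how that matching works, here is a small, purely hypothetical test module (not part of the repository): the expression is matched against each test id, so `pytest -k "cli and cpu and onnxruntime"` selects exactly the matching parametrization.

```python
# hypothetical_test_cli.py -- illustration only, not a file in this repository
import pytest


@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("backend", ["pytorch", "onnxruntime", "openvino"])
def test_cli_configs(device, backend):
    # pytest builds ids such as "test_cli_configs[onnxruntime-cpu]";
    # `-k "cli and cpu and onnxruntime"` matches that id and nothing else here
    assert isinstance(device, str) and isinstance(backend, str)
```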
-test_cli_cuda_pytorch: - docker run \ - --rm \ - --pid host \ - --shm-size 64G \ - --gpus '"device=0,1"' \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-cuda:local -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest tests/ -k 'cli and cuda and pytorch' -x" +test_cli_rocm_onnxruntime: + $(call test_amdgpu,rocm,$(CLI_ROCM_ONNXRUNTIME_REQS),cli and rocm and onnxruntime) -test_cli_rocm_pytorch: - docker run \ - --rm \ - --pid host \ - --shm-size 64G \ - --device /dev/kfd \ - --device /dev/dri/renderD128 \ - --device /dev/dri/renderD129 \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-rocm:local -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest tests/ -k 'cli and cuda and pytorch' -x" +test_cli_cpu_pytorch: + $(call test_ubuntu,cpu,$(CLI_CPU_PYTORCH_REQS),cli and cpu and pytorch) -test_cli_tensorrt_onnxruntime: - docker run \ - --rm \ - --pid host \ - --shm-size 64G \ - --gpus '"device=0,1"' \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-tensorrt:local -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest tests/ -k 'cli and tensorrt and onnxruntime' -x" +test_cli_cpu_openvino: + $(call test_ubuntu,cpu,$(CLI_CPU_OPENVINO_REQS),cli and cpu and openvino) -test_cli_tensorrt_llm: - docker run \ - --rm \ - --pid host \ - --shm-size 64G \ - --gpus '"device=0,1"' \ - --entrypoint /bin/bash \ - --volume $(PWD):/workspace \ - --workdir /workspace \ - opt-bench-tensorrt-llm:local -c "pip install -e .[testing] && pip uninstall -y nvidia-ml-py && pytest tests/ -k 'cli and tensorrt and llm' -x" +test_cli_cpu_onnxruntime: + $(call test_ubuntu,cpu,$(CLI_CPU_ONNXRUNTIME_REQS),cli and cpu and onnxruntime) + +test_cli_cpu_neural_compressor: + $(call test_ubuntu,cpu,$(CLI_CPU_NEURAL_COMPRESSOR_REQS),cli and cpu and neural-compressor) diff --git a/README.md b/README.md index 8b5e7368..fa34294b 100644 --- a/README.md +++ b/README.md @@ -58,14 +58,14 @@ pip install -e . 
Depending on the backends you want to use, you might need to install some extra dependencies: -- Pytorch (default): `pip install optimum-benchmark` +- PyTorch (default): `pip install optimum-benchmark` - OpenVINO: `pip install optimum-benchmark[openvino]` - Torch-ORT: `pip install optimum-benchmark[torch-ort]` - OnnxRuntime: `pip install optimum-benchmark[onnxruntime]` - TensorRT-LLM: `pip install optimum-benchmark[tensorrt-llm]` - OnnxRuntime-GPU: `pip install optimum-benchmark[onnxruntime-gpu]` - Intel Neural Compressor: `pip install optimum-benchmark[neural-compressor]` -- Text Generation Inference: `pip install optimum-benchmark[text-generation-inference]` +- Py-TGI: `pip install optimum-benchmark[py-tgi]` ### Running benchmarks from Python API 🧪 diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 8a6430d8..d9eaa5f3 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -5,7 +5,8 @@ from logging import getLogger from typing import Any, ClassVar, Dict, Generic, Optional -import numpy as np +import datasets.utils.logging as datasets_logging +import transformers.utils.logging as transformers_logging from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState from ..task_utils import get_automodel_class_for_task @@ -20,6 +21,9 @@ get_transformers_pretrained_processor, ) +datasets_logging.set_verbosity_error() +transformers_logging.set_verbosity_error() + LOGGER = getLogger("backend") @@ -71,7 +75,6 @@ def __init__(self, config: BackendConfigT): def seed(self) -> None: LOGGER.info(f"\t+ Setting random seed to {self.config.seed}") random.seed(self.config.seed) - np.random.seed(self.config.seed) def prepare_for_inference(self, **kwargs) -> None: """ diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index e8c9c231..623455b9 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -11,7 +11,12 @@ LOGGER = getLogger("backend") -HUB_KWARGS = {"revision": "main", "force_download": False, "local_files_only": False, "trust_remote_code": False} +HUB_KWARGS = { + "revision": "main", + "force_download": False, + "local_files_only": False, + "trust_remote_code": False, +} @dataclass @@ -20,18 +25,18 @@ class BackendConfig(ABC): version: str _target_: str - seed: int = 42 - model: Optional[str] = None + + task: Optional[str] = None + library: Optional[str] = None + device: Optional[str] = None device_ids: Optional[str] = None # yes we use a string here instead of a list # because it's easier to pass in a yaml or from cli # and it's consistent with GPU environment variables - task: Optional[str] = None - library: Optional[str] = None - + seed: int = 42 inter_op_num_threads: Optional[int] = None intra_op_num_threads: Optional[int] = None @@ -42,23 +47,28 @@ def __post_init__(self): raise ValueError("`model` must be specified.") if self.task is None: - self.task = infer_task_from_model_name_or_path(self.model) + self.task = infer_task_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) + + if self.library is None: + self.library = infer_library_from_model_name_or_path(self.model, self.hub_kwargs.get("revision", None)) if self.device is None: self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu" - LOGGER.warning(f"`device` is not specified, defaulting to {self.device} based on system configuration.") - - if self.device not in ["cuda", "cpu", "mps", "xla"]: - raise 
ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") if ":" in self.device: - # support pytorch device index notation + LOGGER.warning("`device` was specified using PyTorch format (e.g. `cuda:0`) which is not recommended.") self.device = self.device.split(":")[0] self.device_ids = self.device.split(":")[1] + LOGGER.warning(f"`device` and `device_ids` are now set to `{self.device}` and `{self.device_ids}`.") + + if self.device not in ["cuda", "cpu", "mps", "xla"]: + raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") if self.device == "cuda": if self.device_ids is None: + LOGGER.warning("`device_ids` was not specified, using all available GPUs.") self.device_ids = get_gpu_device_ids() + LOGGER.warning(f"`device_ids` is now set to `{self.device_ids}` based on system configuration.") os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids @@ -69,21 +79,14 @@ def __post_init__(self): os.environ["HIP_VISIBLE_DEVICES"] = self.device_ids os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids - if self.library is None: - self.library = infer_library_from_model_name_or_path(self.model) - if self.library not in ["transformers", "diffusers", "timm"]: raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}") if self.inter_op_num_threads is not None: - if not isinstance(self.inter_op_num_threads, int): - raise ValueError(f"`inter_op_num_threads` must be an integer, but got {self.inter_op_num_threads}") if self.inter_op_num_threads == -1: self.inter_op_num_threads = cpu_count() if self.intra_op_num_threads is not None: - if not isinstance(self.intra_op_num_threads, int): - raise ValueError(f"`intra_op_num_threads` must be an integer, but got {self.intra_op_num_threads}") if self.intra_op_num_threads == -1: self.intra_op_num_threads = cpu_count() diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index 042ea3d0..a9b5b5a7 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -13,23 +13,25 @@ def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]: - config = diffusers.DiffusionPipeline.load_config(model, **kwargs) + model_config = get_diffusers_pretrained_config(model, **kwargs) shapes = {} - if "vae" in config: - vae_import_path = config["vae"] + if "vae" in model_config: + vae_import_path = model_config["vae"] vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}") vae_config = vae_class.load_config(model, subfolder="vae", **kwargs) shapes["num_channels"] = vae_config["out_channels"] shapes["height"] = vae_config["sample_size"] shapes["width"] = vae_config["sample_size"] - elif "vae_encoder" in config: - vae_import_path = config["vae_encoder"] + + elif "vae_encoder" in model_config: + vae_import_path = model_config["vae_encoder"] vae_class = get_class(f"{vae_import_path[0]}.{vae_import_path[1]}") - vae_config = vae_class.load_config(model, subfolder="vae", **kwargs) + vae_config = vae_class.load_config(model, subfolder="vae_encoder", **kwargs) shapes["num_channels"] = vae_config["out_channels"] shapes["height"] = vae_config["sample_size"] shapes["width"] = vae_config["sample_size"] + else: shapes["num_channels"] = -1 shapes["height"] = -1 diff --git 
a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py index cb70fdfc..3fecd3d9 100644 --- a/optimum_benchmark/backends/neural_compressor/backend.py +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -8,19 +8,14 @@ from hydra.utils import get_class from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion from optimum.intel.neural_compressor.quantization import INCQuantizer -from transformers.modeling_utils import no_init_weights from transformers.utils import ModelOutput -from transformers.utils.logging import set_verbosity_error from ...generators.dataset_generator import DatasetGenerator from ..base import Backend -from ..transformers_utils import randomize_weights +from ..transformers_utils import random_init_weights from .config import INCConfig from .utils import TASKS_TO_INCMODELS -# disable transformers logging -set_verbosity_error() - LOGGER = getLogger("neural-compressor") @@ -36,17 +31,30 @@ def __init__(self, config: INCConfig): if self.config.ptq_quantization: if self.config.no_weights: + LOGGER.info("\t+ Loading no weights AutoModel") self.load_automodel_with_no_weights() else: + LOGGER.info("\t+ Loading pretrained AutoModel") self.load_automodel_from_pretrained() + + LOGGER.info("\t+ Applying post-training quantization") self.quantize_automodel() - self.delete_pretrained_model() + + LOGGER.info("\t+ Loading quantized INCModel") + original_model, self.config.model = self.config.model, self.quantized_model self.load_incmodel_from_pretrained() + self.config.model = original_model + elif self.config.no_weights: + LOGGER.info("\t+ Loading no weights INCModel") self.load_incmodel_with_no_weights() + else: + LOGGER.info("\t+ Loading pretrained INCModel") self.load_incmodel_from_pretrained() + self.tmpdir.cleanup() + def validate_task(self) -> None: if self.config.task not in TASKS_TO_INCMODELS: raise NotImplementedError(f"INCBackend does not support task {self.config.task}") @@ -55,60 +63,52 @@ def validate_task(self) -> None: LOGGER.info(f"Using INCModel class {self.incmodel_class.__name__}") def load_automodel_from_pretrained(self) -> None: - LOGGER.info("\t+ Loading AutoModel from pretrained") self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) def create_no_weights_model(self) -> None: - LOGGER.info("\t+ Creating no weights model state_dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") LOGGER.info("\t+ Creating no weights model directory") - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") os.makedirs(self.no_weights_model, exist_ok=True) - - LOGGER.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - - LOGGER.info("\t+ Saving no weights model state_dict") + LOGGER.info("\t+ Creating no weights model state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model pytorch_model.bin") torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) + if self.config.library == "transformers": + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + def load_automodel_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - with 
no_init_weights(): - original_model = self.config.model - self.config.model = self.no_weights_model - LOGGER.info("\t+ Loading no weights model") + with random_init_weights(): + original_model, self.config.model = self.config.model, self.no_weights_model + LOGGER.info("\t+ Loading no weights AutoModel") self.load_automodel_from_pretrained() self.config.model = original_model - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model) LOGGER.info("\t+ Tying model weights") self.pretrained_model.tie_weights() def load_incmodel_from_pretrained(self) -> None: - LOGGER.info("\t+ Loading INCModel from pretrained") self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) def load_incmodel_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - with no_init_weights(): - original_model = self.config.model - self.config.model = self.no_weights_model - LOGGER.info("\t+ Loading no weights model") + with random_init_weights(): + original_model, self.config.model = self.config.model, self.no_weights_model + LOGGER.info("\t+ Loading no weights INCModel") self.load_incmodel_from_pretrained() self.config.model = original_model - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model.model) LOGGER.info("\t+ Tying model weights") self.pretrained_model.model.tie_weights() def quantize_automodel(self) -> None: - LOGGER.info("\t+ Attempting to quantize model") - quantized_model_path = f"{self.tmpdir.name}/quantized" + self.quantized_model = f"{self.tmpdir.name}/quantized_model" LOGGER.info("\t+ Processing quantization config") ptq_quantization_config = self.config.ptq_quantization_config.copy() ptq_quantization_config["accuracy_criterion"] = AccuracyCriterion( @@ -139,7 +139,7 @@ def quantize_automodel(self) -> None: LOGGER.info("\t+ Quantizing model") quantizer.quantize( - save_directory=quantized_model_path, + save_directory=self.quantized_model, calibration_dataset=calibration_dataset, quantization_config=ptq_quantization_config, # TODO: add support for these @@ -148,17 +148,20 @@ def quantize_automodel(self) -> None: file_name=None, batch_size=1, ) - self.config.model = quantized_model_path def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + inputs = super().prepare_inputs(inputs) + if self.config.library == "diffusers": return {"prompt": inputs["prompt"]} return inputs + @torch.inference_mode() def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: return self.pretrained_model(**input, **kwargs) + @torch.inference_mode() def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: return self.pretrained_model.generate(**input, **kwargs) diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 33e2694a..64e51d8c 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -3,20 +3,12 @@ from collections import OrderedDict from logging import getLogger from tempfile import TemporaryDirectory -from typing import Any, Callable, Dict, List +from typing import Any, Dict, List import torch -from datasets import Dataset from hydra.utils import get_class from onnxruntime import SessionOptions -from optimum.onnxruntime import ( - ONNX_DECODER_NAME, - ONNX_DECODER_WITH_PAST_NAME, - ORTOptimizer, - ORTQuantizer, - ORTTrainer, - ORTTrainingArguments, -) +from 
optimum.onnxruntime import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ORTOptimizer, ORTQuantizer from optimum.onnxruntime.configuration import ( AutoCalibrationConfig, AutoOptimizationConfig, @@ -26,19 +18,14 @@ QuantizationConfig, ) from safetensors.torch import save_file -from transformers import TrainerCallback -from transformers.modeling_utils import no_init_weights -from transformers.utils.logging import set_verbosity_error from ...generators.dataset_generator import DatasetGenerator from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend +from ..transformers_utils import random_init_weights from .config import ORTConfig from .utils import TASKS_TO_ORTMODELS, TASKS_TO_ORTSD, format_calibration_config, format_quantization_config -# disable transformers logging -set_verbosity_error() - LOGGER = getLogger("onnxruntime") @@ -49,82 +36,81 @@ def __init__(self, config: ORTConfig) -> None: super().__init__(config) self.validate_task() - if self.config.library == "diffusers": - self.ortmodel_class = get_class(TASKS_TO_ORTSD[self.config.task]) - LOGGER.info(f"Using ORTDiffusion class {self.ortmodel_class.__name__}") - elif self.config.task in TASKS_TO_ORTMODELS: - self.ortmodel_class = get_class(TASKS_TO_ORTMODELS[self.config.task]) - LOGGER.info(f"Using ORTModel class {self.ortmodel_class.__name__}") - else: - raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") + self.session_options = SessionOptions() + if self.config.session_options: + LOGGER.info("\t+ Processing session options") + for key, value in self.config.session_options.items(): + setattr(self.session_options, key, value) LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - self.session_options = SessionOptions() - for key, value in self.config.session_options.items(): - setattr(self.session_options, key, value) - if self.config.no_weights: + LOGGER.info("\t+ Loading no weights ORTModel") self.load_ortmodel_with_no_weights() else: + LOGGER.info("\t+ Loading pretrained ORTModel") self.load_ortmodel_from_pretrained() - if self.is_trt_text_generation: - return - if self.is_optimized or self.is_quantized: - original_model = self.config.model - self.config.model = self.pretrained_model.model_save_dir + original_model, self.config.model = self.config.model, self.pretrained_model.model_save_dir if self.is_optimized: + LOGGER.info("\t+ Applying ORT optimization") self.optimize_onnx_files() + self.config.model = self.optimized_model if self.is_quantized: + LOGGER.info("\t+ Applying ORT quantization") self.quantize_onnx_files() + self.config.model = self.quantized_model if self.is_optimized or self.is_quantized: - original_export = self.config.export - self.load_ortmodel_from_pretrained() # load optimized/quantized model - self.config.export = original_export - self.config.model = original_model + original_export, self.config.export = self.config.export, False + LOGGER.info("\t+ Loading optimized/quantized ORTModel") + self.load_ortmodel_from_pretrained() + self.config.model, self.config.export = original_model, original_export self.validate_provider() + self.tmpdir.cleanup() def validate_task(self) -> None: - if self.config.task not in {**TASKS_TO_ORTMODELS, **TASKS_TO_ORTSD}: + if self.config.task in TASKS_TO_ORTSD: + self.ortmodel_class = get_class(TASKS_TO_ORTSD[self.config.task]) + LOGGER.info(f"Using ORTStableDiffusion class {self.ortmodel_class.__name__}") + elif self.config.task in TASKS_TO_ORTMODELS: + self.ortmodel_class = 
get_class(TASKS_TO_ORTMODELS[self.config.task]) + LOGGER.info(f"Using ORTModel class {self.ortmodel_class.__name__}") + else: raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") def validate_provider(self) -> None: - assert ( - self.pretrained_model.providers[0] == self.config.provider - ), f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" + if not self.pretrained_model.providers[0] == self.config.provider: + raise ValueError( + f"{self.config.provider} is not first in providers list: {self.pretrained_model.providers}" + ) def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") LOGGER.info("\t+ Creating no weights model directory") - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") os.makedirs(self.no_weights_model, exist_ok=True) - - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model state dict") state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model safetensors") + safetensors = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - LOGGER.info("\t+ Saving no weights model state dict") - save_file( - filename=os.path.join(self.no_weights_model, "model.safetensors"), - metadata={"format": "pt"}, - tensors=state_dict, - ) + if self.config.library == "transformers": + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) def load_ortmodel_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - with no_init_weights(): - original_model = self.config.model - self.config.model = self.no_weights_model - LOGGER.info("\t+ Loading no weights model") + with random_init_weights(): + original_model, self.config.model = self.config.model, self.no_weights_model + LOGGER.info("\t+ Loading no weights ORTModel") self.load_ortmodel_from_pretrained() self.config.model = original_model @@ -140,10 +126,6 @@ def load_ortmodel_from_pretrained(self) -> None: **self.ortmodel_kwargs, ) - @property - def is_trt_text_generation(self) -> bool: - return self.config.provider == "TensorrtExecutionProvider" and self.config.task in TEXT_GENERATION_TASKS - @property def is_optimized(self) -> bool: return (self.config.auto_optimization is not None) or self.config.optimization @@ -189,7 +171,7 @@ def inputs_names(self) -> List[str]: def optimize_onnx_files(self) -> None: LOGGER.info("\t+ Attempting optimization") - optimized_model_path = os.path.join(self.tmpdir.name, "optimized") + self.optimized_model = os.path.join(self.tmpdir.name, "optimized") LOGGER.info("\t+ Processing optimization config") if self.config.auto_optimization is not None: optimization_config = AutoOptimizationConfig.with_optimization_level( @@ -206,24 +188,20 @@ def optimize_onnx_files(self) -> None: LOGGER.info("\t+ Optimizing ORTModel") optimizer.optimize( optimization_config, - save_dir=optimized_model_path, + save_dir=self.optimized_model, # TODO: add support for these use_external_data_format=None, one_external_file=True, file_suffix="", ) - if self.pretrained_processor is not None: - self.pretrained_processor.save_pretrained(optimized_model_path) - + self.pretrained_processor.save_pretrained(self.optimized_model) if 
self.pretrained_config is not None: - self.pretrained_config.save_pretrained(optimized_model_path) - - self.config.model = optimized_model_path + self.pretrained_config.save_pretrained(self.optimized_model) def quantize_onnx_files(self) -> None: LOGGER.info("\t+ Attempting quantization") - quantized_model_path = f"{self.tmpdir.name}/quantized" + self.quantized_model = f"{self.tmpdir.name}/quantized_model" if self.is_calibrated and len(self.onnx_files_names) > 1: raise NotImplementedError( @@ -286,7 +264,7 @@ def quantize_onnx_files(self) -> None: LOGGER.info("\t+ Quantizing model") quantizer.quantize( - save_dir=quantized_model_path, + save_dir=self.quantized_model, quantization_config=quantization_config, calibration_tensors_range=calibration_tensors_range, # TODO: add support for these (maybe) @@ -294,56 +272,25 @@ def quantize_onnx_files(self) -> None: preprocessor=None, file_suffix="", ) - if self.pretrained_processor is not None: - self.pretrained_processor.save_pretrained(quantized_model_path) - + self.pretrained_processor.save_pretrained(self.quantized_model) if self.pretrained_config is not None: - self.pretrained_config.save_pretrained(quantized_model_path) - - self.config.model = quantized_model_path - - def prepare_for_inference(self, **kwargs) -> None: - if self.is_trt_text_generation: - LOGGER.info("\t+ Creating dynamic shapes for Tensorrt engine. Engine creation might take a while.") - batch_size = kwargs["batch_size"] - max_new_tokens = kwargs["max_new_tokens"] - sequence_length = kwargs["sequence_length"] - self.config.provider_options = { - **self.config.provider_options, - "trt_profile_min_shapes": ( - f"input_ids:{batch_size}x{sequence_length}," - f"attention_mask:{batch_size}x{sequence_length}," - f"position_ids:{batch_size}x{sequence_length}" - ), - "trt_profile_max_shapes": ( - f"input_ids:{batch_size}x{sequence_length + max_new_tokens}," - f"attention_mask:{batch_size}x{sequence_length + max_new_tokens}," - f"position_ids:{batch_size}x{sequence_length + max_new_tokens}" - ), - "trt_profile_opt_shapes": ( - f"input_ids:{batch_size}x{sequence_length + max_new_tokens}," - f"attention_mask:{batch_size}x{sequence_length + max_new_tokens}," - f"position_ids:{batch_size}x{sequence_length + max_new_tokens}" - ), - } - self.load_ortmodel_from_pretrained() - self.validate_provider() + self.pretrained_config.save_pretrained(self.quantized_model) def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + inputs = super().prepare_inputs(inputs) + if self.config.library == "diffusers": + return {"prompt": inputs["prompt"]} + else: + for key, value in list(inputs.items()): + if key in self.inputs_names: + inputs[key] = value.to(self.config.device) + else: + LOGGER.warning(f"Input {key} is not in expected inputs names. Removing it.") + inputs.pop(key) return inputs - LOGGER.info(f"\t+ Moving inputs tensors to device {self.config.device}") - for key, value in list(inputs.items()): - if key in self.inputs_names: - inputs[key] = value.to(self.config.device) - else: - LOGGER.warning(f"Input {key} is not in expected inputs names. 
Removing it.") - inputs.pop(key) - - return inputs - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.forward(**inputs, **kwargs) @@ -353,29 +300,6 @@ def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDic def call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model(**inputs, **kwargs) - def train( - self, - training_dataset: Dataset, - training_arguments: Dict[str, Any], - training_callbacks: List[TrainerCallback], - training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]], - ) -> None: - LOGGER.info("\t+ Setting dataset format to `torch`") - training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) - LOGGER.info("\t+ Wrapping training arguments with optimum.onnxruntime.ORTTrainingArguments") - training_arguments = ORTTrainingArguments(**training_arguments) - LOGGER.info("\t+ Wrapping model with optimum.onnxruntime.ORTTrainer") - trainer = ORTTrainer( - model=self.pretrained_model, - args=training_arguments, - callbacks=training_callbacks, - train_dataset=training_dataset, - data_collator=training_data_collator, - ) - LOGGER.info("\t+ Starting training") - trainer.train() - LOGGER.info("\t+ Training finished successfully") - def clean(self) -> None: super().clean() diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index 19ad747d..07101f78 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -1,10 +1,9 @@ -import os from dataclasses import dataclass, field from typing import Any, Dict, Optional from ...import_utils import onnxruntime_version +from ...task_utils import TEXT_GENERATION_TASKS from ..config import BackendConfig -from ..peft_utils import PEFT_CONFIGS, PEFT_TASKS_TYPES QUANTIZATION_CONFIG = { "is_static": False, @@ -22,8 +21,6 @@ # is_static is mandatory } -TRT_PROVIDER_OPTIONS = {"trt_engine_cache_enable": True, "trt_engine_cache_path": "/tmp/trt_cache"} - IO_BINDING_LIBRARIES = ["transformers", "timm"] IO_BINDING_PROVIDERS = ["CPUExecutionProvider", "CUDAExecutionProvider"] DEVICE_PROVIDER_MAP = {"cpu": "CPUExecutionProvider", "cuda": "CUDAExecutionProvider"} @@ -46,7 +43,7 @@ class ORTConfig(BackendConfig): # provider options provider: Optional[str] = None - provider_options: Dict[str, Any] = field(default_factory=lambda: {}) + provider_options: Dict[str, Any] = field(default_factory=dict) # inference options use_io_binding: Optional[bool] = None @@ -76,10 +73,6 @@ class ORTConfig(BackendConfig): calibration: bool = False calibration_config: Dict[str, Any] = field(default_factory=dict) - # peft options - peft_strategy: Optional[str] = None - peft_config: Dict[str, Any] = field(default_factory=dict) - def __post_init__(self): super().__post_init__() @@ -95,9 +88,8 @@ def __post_init__(self): if self.use_io_binding is None: self.use_io_binding = self.provider in IO_BINDING_PROVIDERS and self.library in IO_BINDING_LIBRARIES - if self.provider == "TensorrtExecutionProvider": - self.provider_options = {**TRT_PROVIDER_OPTIONS, **self.provider_options} - os.makedirs(self.provider_options["trt_engine_cache_path"], exist_ok=True) + if self.provider == "TensorrtExecutionProvider" and self.task in TEXT_GENERATION_TASKS: + raise NotImplementedError("we don't support TensorRT for text generation tasks") if self.quantization: self.quantization_config = {**QUANTIZATION_CONFIG, 
**self.quantization_config} @@ -118,14 +110,3 @@ def __post_init__(self): if self.calibration: self.calibration_config = {**CALIBRATION_CONFIG, **self.calibration_config} - - if self.peft_strategy is not None: - if self.peft_strategy not in PEFT_CONFIGS: - raise ValueError( - f"`peft_strategy` must be one of {list(PEFT_CONFIGS.keys())}. Got {self.peft_strategy} instead." - ) - PEFT_CONFIG = PEFT_CONFIGS[self.peft_strategy] - self.peft_config = {**PEFT_CONFIG, **self.peft_config} - - if self.peft_config["task_type"] is None: - raise ValueError(f"`peft_config.task_type` must be set to one of the following {PEFT_TASKS_TYPES}") diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index e883c3ac..ae2d2918 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ -12,13 +12,12 @@ from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict from optimum.intel.openvino import OVQuantizer from safetensors.torch import save_file -from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error from ...generators.dataset_generator import DatasetGenerator from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import randomize_weights +from ..transformers_utils import random_init_weights from .config import OVConfig from .utils import TASKS_TO_OVMODEL @@ -35,27 +34,35 @@ def __init__(self, config: OVConfig) -> None: super().__init__(config) self.validate_task() - self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.config.task]) - LOGGER.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") - if self.config.inter_op_num_threads is not None: - self.set_inter_op_num_threads() + LOGGER.info(f"\t+ Setting inter_op_num_threads to {self.config.inter_op_num_threads}") + self.config.openvino_config[properties.inference_num_threads()] = self.config.inter_op_num_threads + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() if self.config.quantization: if self.config.no_weights: + LOGGER.info("\t+ Loading no weights AutoModel") self.load_automodel_with_no_weights() else: + LOGGER.info("\t+ Loading pretrained AutoModel") self.load_automodel_from_pretrained() - original_export = self.config.export - self.config.export = False + + LOGGER.info("\t+ Applying post-training quantization") self.quantize_automodel() + + original_model, self.config.model = self.config.model, self.quantized_model + original_export, self.config.export = self.config.export, False + LOGGER.info("\t+ Loading quantized OVModel") self.load_ovmodel_from_pretrained() - self.config.export = original_export + self.config.model, self.config.export = original_model, original_export + elif self.config.no_weights: + LOGGER.info("\t+ Loading no weights OVModel") self.load_ovmodel_with_no_weights() else: + LOGGER.info("\t+ Loading pretrained OVModel") self.load_ovmodel_from_pretrained() self.tmpdir.cleanup() @@ -64,40 +71,33 @@ def validate_task(self) -> None: if self.config.task not in TASKS_TO_OVMODEL: raise NotImplementedError(f"OVBackend does not support task {self.config.task}") - def set_inter_op_num_threads(self) -> None: - LOGGER.info(f"\t+ Setting inter_op_num_threads to {self.config.inter_op_num_threads}") - self.config.openvino_config[properties.inference_num_threads()] = self.config.inter_op_num_threads + self.ovmodel_class = 
get_class(TASKS_TO_OVMODEL[self.config.task]) + LOGGER.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") LOGGER.info("\t+ Creating no weights model directory") - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") os.makedirs(self.no_weights_model, exist_ok=True) - - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - LOGGER.info("\t+ Creating no weights model state dict") state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model safetensors") + safetensors = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - LOGGER.info("\t+ Saving no weights model state dict") - save_file( - filename=os.path.join(self.no_weights_model, "model.safetensors"), - metadata={"format": "pt"}, - tensors=state_dict, - ) + if self.config.library == "transformers": + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) def load_automodel_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - with no_init_weights(): - original_model = self.config.model - self.config.model = self.no_weights_model - LOGGER.info("\t+ Loading no weights model") + with random_init_weights(): + original_model, self.config.model = self.config.model, self.no_weights_model + LOGGER.info("\t+ Loading no weights AutoModel") self.load_automodel_from_pretrained() self.config.model = original_model - LOGGER.info("\t+ Randomizing weights") - randomize_weights(self.pretrained_model) LOGGER.info("\t+ Tying model weights") self.pretrained_model.tie_weights() @@ -105,14 +105,16 @@ def load_automodel_from_pretrained(self) -> None: self.pretrained_model = self.automodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) def load_ovmodel_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - with no_init_weights(): - original_model = self.config.model - self.config.model = self.no_weights_model - LOGGER.info("\t+ Loading OVModel with no weights") + with random_init_weights(): + original_model, self.config.model = self.config.model, self.no_weights_model + original_export, self.config.export = self.config.export, True + LOGGER.info("\t+ Loading no weights OVModel") self.load_ovmodel_from_pretrained() self.config.model = original_model + self.config.export = original_export def load_ovmodel_from_pretrained(self) -> None: self.pretrained_model = self.ovmodel_class.from_pretrained( @@ -135,7 +137,7 @@ def ovmodel_kwargs(self) -> Dict[str, Any]: def quantize_automodel(self) -> None: LOGGER.info("\t+ Attempting quantization") - quantized_model_path = f"{self.tmpdir.name}/quantized" + self.quantized_model = f"{self.tmpdir.name}/quantized_model" LOGGER.info("\t+ Processing quantization config") quantization_config = OVQuantizationConfig(**self.config.quantization_config) LOGGER.info("\t+ Creating quantizer") @@ -154,7 +156,7 @@ def quantize_automodel(self) -> None: LOGGER.info("\t+ Quantizing model") quantizer.quantize( - save_directory=quantized_model_path, + save_directory=self.quantized_model, quantization_config=quantization_config, calibration_dataset=calibration_dataset, # TODO: add support for these 
(maybe) @@ -164,7 +166,6 @@ def quantize_automodel(self) -> None: file_name=None, batch_size=1, ) - self.config.model = quantized_model_path def prepare_for_inference(self, **kwargs) -> None: if self.config.reshape: @@ -188,6 +189,8 @@ def prepare_for_inference(self, **kwargs) -> None: self.pretrained_model.compile() def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + inputs = super().prepare_inputs(inputs) + if self.config.library == "diffusers": return {"prompt": inputs["prompt"]} diff --git a/optimum_benchmark/backends/peft_utils.py b/optimum_benchmark/backends/peft_utils.py index 8ec7d1fa..037d4e87 100644 --- a/optimum_benchmark/backends/peft_utils.py +++ b/optimum_benchmark/backends/peft_utils.py @@ -1,102 +1,13 @@ -from typing import Type +from typing import Any, Dict + +from transformers import PreTrainedModel from ..import_utils import is_peft_available if is_peft_available(): - from peft import ( - AdaLoraConfig, - IA3Config, - LoraConfig, - PeftConfig, - PrefixTuningConfig, - PromptEncoderConfig, - PromptLearningConfig, - ) - -PEFT_TASKS_TYPES = ["SEQ_CLS", "SEQ_2_SEQ_LM", "CAUSAL_LM", "TOKEN_CLS", "QUESTION_ANS", "FEATURE_EXTRACTION"] - -PEFT_CONFIG = { - "base_model_name_or_path": None, - "revision": None, # str - "peft_type": None, # PeftType: can't be changed anyway - "task_type": None, # TaskType: SEQ_CLS, SEQ_2_SEQ_LM, CAUSAL_LM, TOKEN_CLS, QUESTION_ANS, FEATURE_EXTRACTION - "inference_mode": False, -} -LORA_CONFIG = { - **PEFT_CONFIG, # inherits from PEFT_CONFIG - "auto_mapping": None, # dict - "r": 8, # int - "target_modules": None, # List[str] | str - "lora_alpha": 8, # int - "lora_dropout": 0, # float - "fan_in_fan_out": False, # bool - "bias": "none", # str - "modules_to_save": None, # List[str] - "init_lora_weights": True, # bool - "layers_to_transform": None, # List[int] | int - "layers_pattern": None, # str -} -ADA_LORA_CONFIG = { - **LORA_CONFIG, # inherits from LORA_CONFIG - "target_r": None, # int - "init_r": None, # int - "tinit": None, # int - "tfinal": None, # int - "deltaT": None, # int - "beta1": None, # float - "beta2": None, # float - "orth_reg_weight": None, # float - "total_step": None, # Optional[int] - "rank_pattern": None, # Optional[dict] -} -PROMPT_TUNING_CONFIG = { - **PEFT_CONFIG, # inherits from PEFT_CONFIG - "num_virtual_tokens": None, # int - "token_dim": None, # int - "num_transformer_submodules": None, # int - "num_attention_heads": None, # int - "num_layers": None, # int -} -PREFIX_TUNING_CONFIG = { - **PROMPT_TUNING_CONFIG, # inherits from PROMPT_TUNING_CONFIG - "encoder_hidden_size": None, # int - "prefix_projection": False, # bool -} -P_TUNING_CONFIG = { - **PROMPT_TUNING_CONFIG, # inherits from PROMPT_TUNING_CONFIG - "encoder_reparameterization_type": None, # Union[str, PromptEncoderReparameterizationType] - "encoder_hidden_size": None, # int - "encoder_num_layers": None, # int - "encoder_dropout": None, # float -} -IA3_CONFIG = { - **PEFT_CONFIG, # inherits from PEFT_CONFIG - "target_modules": None, # List[str] | str - "feedforward_modules": None, # List[str] | str - "fan_in_fan_out": False, # bool - "modules_to_save": None, # List[str] - "init_ia3_weights": True, # bool -} -PEFT_CONFIGS = { - "lora": LORA_CONFIG, - "prefix_tuning": PREFIX_TUNING_CONFIG, - "prompt_tuning": PROMPT_TUNING_CONFIG, - "p_tuning": P_TUNING_CONFIG, - "ada_lora": ADA_LORA_CONFIG, - "ia3": IA3_CONFIG, -} + from peft import PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_model # type: ignore -def get_peft_config_class(peft_strategy: str) -> 
Type["PeftConfig"]: - if peft_strategy == "lora": - return LoraConfig - elif peft_strategy == "ada_lora": - return AdaLoraConfig - elif peft_strategy == "prompt_tuning": - return PromptLearningConfig - elif peft_strategy == "prefix_tuning": - return PrefixTuningConfig - elif peft_strategy == "p_tuning": - return PromptEncoderConfig - elif peft_strategy == "ia3": - return IA3Config +def apply_peft(model: PreTrainedModel, peft_type: str, peft_config: Dict[str, Any]) -> PreTrainedModel: + peft_config = PEFT_TYPE_TO_CONFIG_MAPPING[peft_type](**peft_config) + return get_peft_model(model=model, peft_config=peft_config) diff --git a/optimum_benchmark/backends/text_generation_inference/__init__.py b/optimum_benchmark/backends/py_tgi/__init__.py similarity index 100% rename from optimum_benchmark/backends/text_generation_inference/__init__.py rename to optimum_benchmark/backends/py_tgi/__init__.py diff --git a/optimum_benchmark/backends/text_generation_inference/backend.py b/optimum_benchmark/backends/py_tgi/backend.py similarity index 53% rename from optimum_benchmark/backends/text_generation_inference/backend.py rename to optimum_benchmark/backends/py_tgi/backend.py index 8132b276..42e1b9e9 100644 --- a/optimum_benchmark/backends/text_generation_inference/backend.py +++ b/optimum_benchmark/backends/py_tgi/backend.py @@ -5,151 +5,133 @@ from typing import Any, Dict, List import torch -from huggingface_hub import snapshot_download from py_tgi import TGI -from safetensors.torch import save_model -from transformers import logging as transformers_logging +from safetensors.torch import save_file +from transformers import GenerationConfig -from ...system_utils import is_nvidia_system, is_rocm_system from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend -from ..transformers_utils import randomize_weights -from .config import TGIConfig +from ..transformers_utils import random_init_weights +from .config import PyTGIConfig # bachend logger -LOGGER = getLogger("text-generation-inference") +LOGGER = getLogger("py-tgi") -# disable other loggers -transformers_logging.set_verbosity_error() +class PyTGIBackend(Backend[PyTGIConfig]): + NAME: str = "py-tgi" -class TGIBackend(Backend[TGIConfig]): - NAME: str = "text-generation-inference" - - def __init__(self, config: TGIConfig) -> None: + def __init__(self, config: PyTGIConfig) -> None: super().__init__(config) self.validate_task() - if self.config.device == "cuda" and is_nvidia_system(): - self.devices = None - self.gpus = self.config.device_ids - LOGGER.info(f"\t+ CUDA devices: {self.gpus}") - if self.config.device == "cuda" and is_rocm_system(): - self.gpus = None - device_ids = list(map(int, self.config.device_ids.split(","))) - renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] - self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in device_ids] - LOGGER.info(f"\t+ ROCm devices: {self.devices}") - else: - self.gpus = None - self.devices = None - LOGGER.info("\t+ CPU device") + if self.generation_config is None: + self.generation_config = GenerationConfig() LOGGER.info("\t+ Creating backend temporary directory") - self.tmp_dir = TemporaryDirectory() + self.tmpdir = TemporaryDirectory() if self.config.no_weights: + LOGGER.info("\t+ Loading no weights model") self.load_model_with_no_weights() else: + LOGGER.info("\t+ Downloading pretrained model") self.download_pretrained_model() + LOGGER.info("\t+ Preparing generation config") + self.prepare_generation_config() + LOGGER.info("\t+ Loading pretrained 
model") self.load_model_from_pretrained() + self.tmpdir.cleanup() + def validate_task(self) -> None: if self.config.task not in TEXT_GENERATION_TASKS: raise NotImplementedError(f"TGI does not support task {self.config.task}") def download_pretrained_model(self) -> None: LOGGER.info("\t+ Downloading pretrained model") - snapshot_download(self.config.model, **self.config.hub_kwargs) + with torch.device("meta"): + self.automodel_class.from_pretrained(self.config.model, **self.config.hub_kwargs) - def prepare_pretrained_model(self) -> None: - LOGGER.info("\t+ Modifying pretrained generation config") - self.generation_config.eos_token_id = -100 - self.generation_config.pad_token_id = -101 - - LOGGER.info("\t+ Saving new pretrained generation config") + def prepare_generation_config(self) -> None: + LOGGER.info("\t+ Modifying generation config for fixed length generation") + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None model_cache_folder = f"models/{self.config.model}".replace("/", "--") model_cache_path = f"{self.config.volume}/{model_cache_folder}" - snapshot_file = f"{model_cache_path}/refs/{self.config.hub_kwargs.get('revision', 'main')}" snapshot_ref = open(snapshot_file, "r").read().strip() - model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" + LOGGER.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=model_snapshot_path) - def load_model_from_pretrained(self) -> None: - self.prepare_pretrained_model() - self.start_tgi_server() - def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmp_dir.name, "no_weights_model") LOGGER.info("\t+ Creating no weights model directory") - self.no_weights_model = os.path.join(self.config.volume, "no_weights_model") os.makedirs(self.no_weights_model, exist_ok=True) - - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - - LOGGER.info("\t+ Saving pretrained tokenizer") - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - - LOGGER.info("\t+ Saving no weights model") - save_model( - filename=os.path.join(self.no_weights_model, "model.safetensors"), - model=torch.nn.Linear(1, 1), - metadata={"format": "pt"}, - ) - # unlike transformers api, TGI won't accept an empty model.safetensors + LOGGER.info("\t+ Creating no weights model state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model safetensors") + safetensor = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensor, metadata={"format": "pt"}) + # unlike Transformers api, TGI won't accept any missing tensors # so we need to materialize the model and resave it LOGGER.info(f"\t+ Loading no weights model from {self.no_weights_model}") - self.pretrained_model = self.automodel_class.from_pretrained( - self.no_weights_model, - **self.config.hub_kwargs, - device_map="auto", # for faster/safer loading - ) - - LOGGER.info("\t+ Randomizing weights") - randomize_weights(self.pretrained_model) - + with random_init_weights(): + self.pretrained_model = self.automodel_class.from_pretrained( + self.no_weights_model, **self.config.hub_kwargs, device_map="auto", _fast_init=False + ) LOGGER.info("\t+ Saving no weights model") self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) - self.delete_pretrained_model() - - LOGGER.info("\t+ Saving generation config") - 
self.generation_config.eos_token_id = -100 - self.generation_config.pad_token_id = -101 + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + LOGGER.info("\t+ Saving no weights model pretrained processor") + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + LOGGER.info("\t+ Modifying generation config for fixed length generation") + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + LOGGER.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - original_model = self.config.model - self.config.model = "data/no_weights_model" - self.start_tgi_server() - self.config.model = original_model - def start_tgi_server(self) -> None: + original_volume, self.config.volume = self.config.volume, self.tmpdir.name + original_model, self.config.model = self.config.model, "/data/no_weights_model" + LOGGER.info("\t+ Loading no weights model") + self.load_model_from_pretrained() + self.config.model, self.config.volume = original_model, original_volume + + def load_model_from_pretrained(self) -> None: self.pretrained_model = TGI( + # model model=self.config.model, dtype=self.config.dtype, - image=self.config.image, quantize=self.config.quantize, - port=self.config.port, - volume=self.config.volume, - address=self.config.address, + # docker + image=self.config.image, shm_size=self.config.shm_size, - gpus=self.gpus, - devices=self.devices, + address=self.config.address, + volume=self.config.volume, + port=self.config.port, + # device + gpus=self.config.gpus, + devices=self.config.devices, + # sharding sharded=self.config.sharded, num_shard=self.config.num_shard, + # other disable_custom_kernels=self.config.disable_custom_kernels, - revision=self.config.hub_kwargs.get("revision", "main"), trust_remote_code=self.config.hub_kwargs.get("trust_remote_code", False), + revision=self.config.hub_kwargs.get("revision", "main"), ) def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - if "input_ids" in inputs: - return {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} - elif "inputs" in inputs: + if "inputs" in inputs: return {"prompt": self.pretrained_processor.batch_decode(inputs["inputs"].tolist())} + elif "input_ids" in inputs: + return {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} else: raise ValueError("inputs must contain either input_ids or inputs") @@ -158,16 +140,14 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: return self.pretrained_model.generate( - **inputs, - do_sample=kwargs.get("do_sample", False), - max_new_tokens=kwargs.get("max_new_tokens", 1), + **inputs, do_sample=kwargs.get("do_sample", False), max_new_tokens=kwargs.get("max_new_tokens", 1) ) def clean(self) -> None: super().clean() - if hasattr(self, "tmp_dir"): + if hasattr(self, "tmpdir"): LOGGER.info("\t+ Cleaning temporary directory") - self.tmp_dir.cleanup() + self.tmpdir.cleanup() gc.collect() diff --git a/optimum_benchmark/backends/text_generation_inference/config.py b/optimum_benchmark/backends/py_tgi/config.py similarity index 58% rename from
optimum_benchmark/backends/text_generation_inference/config.py rename to optimum_benchmark/backends/py_tgi/config.py index 2e88597d..62e91321 100644 --- a/optimum_benchmark/backends/text_generation_inference/config.py +++ b/optimum_benchmark/backends/py_tgi/config.py @@ -1,27 +1,31 @@ import os from dataclasses import dataclass -from typing import Optional +from typing import List, Optional from ...import_utils import py_tgi_version +from ...system_utils import is_nvidia_system, is_rocm_system from ..config import BackendConfig @dataclass -class TGIConfig(BackendConfig): - name: str = "text-generation-inference" +class PyTGIConfig(BackendConfig): + name: str = "py-tgi" version: Optional[str] = py_tgi_version() - _target_: str = "optimum_benchmark.backends.text_generation_inference.backend.TGIBackend" + _target_: str = "optimum_benchmark.backends.py_tgi.backend.PyTGIBackend" # optimum benchmark specific no_weights: bool = False # docker options image: str = "ghcr.io/huggingface/text-generation-inference:latest" - volume: str = f"{os.path.expanduser('~')}/.cache/huggingface/hub" + volume: str = os.path.expanduser("~/.cache/huggingface/hub") address: str = "127.0.0.1" shm_size: str = "1g" port: int = 1111 + gpus: Optional[str] = None # "0,1,2,3" + devices: Optional[List[str]] = None # ["/dev/dri/renderD128", "/dev/dri/renderD129"] + # sharding options sharded: Optional[bool] = None # None, True, False num_shard: Optional[int] = None # None, 1, 2, 4, 8, 16, 32, 64 @@ -41,3 +45,11 @@ def __post_init__(self): if self.quantize is not None: if self.quantize not in ["bitsandbytes-nf4", "bitsandbytes-fp4", "awq", "gptq"]: raise ValueError(f"Invalid value for quantize: {self.quantize}") + + if self.gpus is None and self.device == "cuda" and is_nvidia_system(): + self.gpus = self.device_ids + + if self.devices is None and self.device == "cuda" and is_rocm_system(): + device_ids = list(map(int, self.device_ids.split(","))) + renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] + self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in device_ids] diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py index d76789db..9c377f12 100644 --- a/optimum_benchmark/backends/pytorch/backend.py +++ b/optimum_benchmark/backends/pytorch/backend.py @@ -5,30 +5,30 @@ from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List -import datasets.utils.logging as datasets_logging import torch -import transformers.utils.logging as transformers_logging from datasets import Dataset from safetensors.torch import save_file -from transformers import Trainer, TrainerCallback, TrainerState, TrainingArguments -from transformers.modeling_utils import no_init_weights - -from ...import_utils import is_deepspeed_available, is_peft_available +from transformers import ( + AwqConfig, + BitsAndBytesConfig, + GPTQConfig, + Trainer, + TrainerCallback, + TrainerState, + TrainingArguments, +) + +from ...import_utils import is_deepspeed_available, is_torch_distributed_available from ..base import Backend -from ..peft_utils import get_peft_config_class -from ..transformers_utils import randomize_weights +from ..peft_utils import apply_peft +from ..transformers_utils import random_init_weights from .config import PyTorchConfig -if is_peft_available(): - from peft import get_peft_model # type: ignore - if is_deepspeed_available(): - from deepspeed import init_inference # type: ignore - + from deepspeed import init_inference -# 
disable other loggers -datasets_logging.set_verbosity_error() -transformers_logging.set_verbosity_error() +if is_torch_distributed_available(): + import torch.distributed # bachend logger LOGGER = getLogger("pytorch") @@ -41,7 +41,7 @@ def __init__(self, config: PyTorchConfig): super().__init__(config) self.validate_library() - # Threads + # Thread settings if self.config.inter_op_num_threads is not None: LOGGER.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))") torch.set_num_threads(self.config.inter_op_num_threads) @@ -63,13 +63,17 @@ def __init__(self, config: PyTorchConfig): else: self.quantization_config = None + if self.config.deepspeed_inference: + if self.quantization_config is not None: + raise ValueError("Deepspeed-Inference is not compatible with Transformers quantization") + LOGGER.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - if self.config.no_weights and self.config.library == "diffusers": - raise ValueError("Diffusion pipelines are not supported with no_weights=True") + if self.config.no_weights and (self.config.library == "diffusers" or self.config.library == "timm"): + raise ValueError("Diffusion pipelines and Timm models don't support no weights") elif self.config.no_weights: - LOGGER.info("\t+ Loading model with no weights") + LOGGER.info("\t+ Loading model with random weights") self.load_model_with_no_weights() else: LOGGER.info("\t+ Loading model with pretrained weights") @@ -103,19 +107,11 @@ def __init__(self, config: PyTorchConfig): self.pretrained_model.forward, **self.config.torch_compile_config ) - if self.config.peft_strategy is not None: - LOGGER.info("\t+ Using PEFT") - peft_config_class = get_peft_config_class(self.config.peft_strategy) - peft_config = peft_config_class(**self.config.peft_config) - self.pretrained_model = get_peft_model(self.pretrained_model, peft_config=peft_config) + if self.config.peft_type is not None: + LOGGER.info("\t+ Applying PEFT") + self.pretrained_model = apply_peft(self.pretrained_model, self.config.peft_type, self.config.peft_config) - if self.config.deepspeed_inference: - LOGGER.info("\t+ Using DeepSpeed-Inference") - self.pretrained_model = init_inference( - self.pretrained_model, - config=self.config.deepspeed_inference_config, - dtype=getattr(self.pretrained_model, "dtype", None), - ) + self.tmpdir.cleanup() def validate_library(self) -> None: if self.config.library == "timm": @@ -130,38 +126,46 @@ def validate_library(self) -> None: def load_model_from_pretrained(self) -> None: if self.config.library == "timm": LOGGER.info("\t+ Loading Timm model") - self.pretrained_model = self.automodel_class(self.config.model) - self.pretrained_model.to(self.config.device) + self.pretrained_model = self.automodel_class(model_name=self.config.model) + if self.config.device != "cpu": + LOGGER.info(f"\t+ Moving model to device: {self.config.device}") + self.pretrained_model.to(self.config.device) elif self.config.library == "diffusers": LOGGER.info("\t+ Loading Diffusion pipeline") self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, + pretrained_model_or_path=self.config.model, device_map=self.config.device_map, **self.config.hub_kwargs, **self.automodel_kwargs, ) - if self.config.device_map is None: + if self.config.device_map is None and self.config.device != "cpu": LOGGER.info(f"\t+ Moving pipeline to device: {self.config.device}") self.pretrained_model.to(self.config.device) - elif 
self.is_bnb_quantized: - LOGGER.info("\t+ Loading BnB quantized model") + elif self.config.deepspeed_inference: + with torch.device("cpu"): + LOGGER.info("\t+ Loading DeepSpeed model directly on CPU to avoid OOM") + self.pretrained_model = self.automodel_class.from_pretrained( + pretrained_model_name_or_path=self.config.model, **self.config.hub_kwargs, **self.automodel_kwargs + ) + + torch.distributed.barrier() # better safe than hanging + LOGGER.info("\t+ Initializing DeepSpeed Inference") + self.pretrained_model = init_inference(self.pretrained_model, config=self.config.deepspeed_inference_config) + torch.distributed.barrier() # better safe than hanging + elif self.is_quantized: + # we can't use device context manager since the model is quantized + LOGGER.info("\t+ Loading Quantized model") self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, device_map=self.config.device_map, **self.config.hub_kwargs, **self.automodel_kwargs, ) - elif self.is_gptq_quantized or self.is_awq_quantized: - LOGGER.info("\t+ Loading quantized model") - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - # for gptq, we need to specify the device_map to either auto - # or a cuda adevice to avoid any modules being assigned to cpu ¯\_(ツ)_/¯ - device_map=self.config.device_map or torch.device(self.config.device), - **self.config.hub_kwargs, - **self.automodel_kwargs, - ) + if self.config.device_map is None and self.config.device != "cpu": + LOGGER.info(f"\t+ Moving model to device: {self.config.device}") + self.pretrained_model.to(self.config.device) elif self.config.device_map is not None: + # we can't use device context manager since device_map is specified LOGGER.info(f"\t+ Loading model with device map: {self.config.device_map}") self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.config.model, @@ -170,8 +174,6 @@ def load_model_from_pretrained(self) -> None: **self.automodel_kwargs, ) else: - # this is the fastest way to load a model on a specific device - # but not compatible with all quantization methods (and pipelines) LOGGER.info(f"\t+ Loading model directly on device: {self.config.device}") with torch.device(self.config.device): self.pretrained_model = self.automodel_class.from_pretrained( @@ -179,75 +181,68 @@ def load_model_from_pretrained(self) -> None: ) def create_no_weights_model(self) -> None: - LOGGER.info("\t+ Creating no weights model state_dict") + if self.pretrained_config is None: + raise ValueError("Can't create no weights model without a pretrained config") + + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + LOGGER.info("\t+ Creating no weights model directory") + os.makedirs(self.no_weights_model, exist_ok=True) + LOGGER.info("\t+ Creating no weights model state dict") state_dict = torch.nn.Linear(1, 1).state_dict() if self.is_exllamav2: - # for exllamav2 we need to add g_idx to the state_dict which - # requires some information about linear layers dimensions + LOGGER.info("\t+ Adding g_idx to no weights model state dict") with torch.device("meta"): meta_model = self.automodel_class.from_config(self.pretrained_config) for name, module in meta_model.named_modules(): if hasattr(module, "in_features"): state_dict[name + ".g_idx"] = torch.ones((module.in_features,), dtype=torch.int32) + LOGGER.info("\t+ Saving no weights model safetensors") + safetensors = os.path.join(self.no_weights_model, 
"model.safetensors") + save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) + if self.is_quantized: - # tricking from_pretrained to load the model as if it was quantized + LOGGER.info("\t+ Adding quantization config to no weights model's pretrained config") self.pretrained_config.quantization_config = self.quantization_config.to_dict() - - LOGGER.info("\t+ Creating no weights model directory") - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") - os.makedirs(self.no_weights_model, exist_ok=True) + # tricking from_pretrained to load the model as if it was quantized LOGGER.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - - LOGGER.info("\t+ Saving no weights model state_dict") - save_file( - filename=os.path.join(self.no_weights_model, "model.safetensors"), - metadata={"format": "pt"}, - tensors=state_dict, - ) + if self.config.library == "transformers": + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - with no_init_weights(): - original_model = self.config.model - self.config.model = self.no_weights_model - LOGGER.info("\t+ Loading no weights model") + with random_init_weights(): + original_model, self.config.model = self.config.model, self.no_weights_model + LOGGER.info("\t+ Loading no weights AutoModel") self.load_model_from_pretrained() self.config.model = original_model - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model) + # dunno how necessary this is LOGGER.info("\t+ Tying model weights") self.pretrained_model.tie_weights() def process_quantization_config(self) -> None: if self.is_gptq_quantized: LOGGER.info("\t+ Processing GPTQ config") - from transformers import GPTQConfig - self.quantization_config = GPTQConfig( **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) elif self.is_awq_quantized: LOGGER.info("\t+ Processing AWQ config") - from transformers import AwqConfig - self.quantization_config = AwqConfig( **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) elif self.is_bnb_quantized: LOGGER.info("\t+ Processing BitsAndBytes config") - from transformers import BitsAndBytesConfig - self.quantization_config = BitsAndBytesConfig( **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) else: - self.quantization_config = None + raise ValueError(f"Quantization scheme {self.config.quantization_scheme} not recognized") @property def is_quantized(self) -> bool: @@ -256,36 +251,38 @@ def is_quantized(self) -> bool: @property def is_bnb_quantized(self) -> bool: return self.config.quantization_scheme == "bnb" or ( - hasattr(self.pretrained_config, "quantization_config") - and self.pretrained_config.quantization_config.get("quant_method", None) == "bnb" + getattr(self.pretrained_config, "quantization_config", {}).get("quant_method", None) == "bnb" ) @property def is_gptq_quantized(self) -> bool: return self.config.quantization_scheme == "gptq" or ( - hasattr(self.pretrained_config, "quantization_config") - and self.pretrained_config.quantization_config.get("quant_method", None) == "gptq" + getattr(self.pretrained_config, "quantization_config", {}).get("quant_method", None) == "gptq" ) @property def is_awq_quantized(self) -> 
bool: return self.config.quantization_scheme == "awq" or ( - hasattr(self.pretrained_config, "quantization_config") - and self.pretrained_config.quantization_config.get("quant_method", None) == "awq" + getattr(self.pretrained_config, "quantization_config", {}).get("quant_method", None) == "awq" ) @property def is_exllamav2(self) -> bool: - return ( - self.is_gptq_quantized - and hasattr(self.quantization_config, "exllama_config") - and self.quantization_config.exllama_config.get("version", None) == 2 - ) + dummy_exllama = {"exllama_version": None} + return (self.is_gptq_quantized or self.is_awq_quantized) and ( + getattr(self.quantization_config, "exllama_config", dummy_exllama)["exllama_version"] + or getattr(self.pretrained_config, "quantization_config", {}).get("exllama_config", dummy_exllama)[ + "exllama_version" + ] + ) == 2 @property def automodel_kwargs(self) -> Dict[str, Any]: kwargs = {} + if self.is_quantized: + kwargs["quantization_config"] = self.quantization_config + if self.config.torch_dtype is not None: kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype) @@ -295,24 +292,23 @@ def automodel_kwargs(self) -> Dict[str, Any]: if self.config.low_cpu_mem_usage is not None: kwargs["low_cpu_mem_usage"] = self.config.low_cpu_mem_usage - if self.is_quantized: + if self.config.no_weights: + # we use our own context manager to load the model with random weights kwargs["_fast_init"] = False - kwargs["quantization_config"] = self.quantization_config return kwargs def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + inputs = super().prepare_inputs(inputs) + if self.config.library == "diffusers": return {"prompt": inputs["prompt"]} - - LOGGER.info(f"\t+ Moving inputs tensors to device {self.config.device}") - for key, value in inputs.items(): - inputs[key] = value.to(self.config.device) - - if self.config.library == "timm": - return {"x": inputs["pixel_values"]} - - return inputs + elif self.config.library == "timm": + return {"x": inputs["pixel_values"].to(self.config.device)} + else: + for key, value in inputs.items(): + inputs[key] = value.to(self.config.device) + return inputs @torch.inference_mode() def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: @@ -335,9 +331,9 @@ def train( training_callbacks: List[TrainerCallback], training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]], ) -> TrainerState: - LOGGER.info("\t+ Wrapping training arguments with transformers.TrainingArguments") + LOGGER.info(f"\t+ Wrapping training arguments with {TrainingArguments.__name__}") training_arguments = TrainingArguments(**training_arguments) - LOGGER.info("\t+ Wrapping model with transformers.Trainer") + LOGGER.info(f"\t+ Wrapping model with {Trainer.__name__}") trainer = Trainer( args=training_arguments, model=self.pretrained_model, @@ -347,7 +343,7 @@ def train( ) LOGGER.info("\t+ Starting training") trainer.train() - LOGGER.info("\t+ Training finished successfully") + LOGGER.info("\t+ Finished training") def seed(self): super().seed() diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index 7902719d..3efeb63c 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -4,7 +4,6 @@ from ...import_utils import torch_version from ...system_utils import is_rocm_system from ..config import BackendConfig -from ..peft_utils import PEFT_CONFIGS, PEFT_TASKS_TYPES DEVICE_MAPS = ["auto", "sequential"] AMP_DTYPES = ["bfloat16", 
"float16"] @@ -56,7 +55,7 @@ class PyTorchConfig(BackendConfig): deepspeed_inference_config: Dict[str, Any] = field(default_factory=dict) # peft options - peft_strategy: Optional[str] = None + peft_type: Optional[str] = None peft_config: Dict[str, Any] = field(default_factory=dict) def __post_init__(self): @@ -86,14 +85,3 @@ def __post_init__(self): if self.quantization_config: QUANTIZATION_CONFIG = QUANTIZATION_CONFIGS[self.quantization_scheme] self.quantization_config = {**QUANTIZATION_CONFIG, **self.quantization_config} - - if self.peft_strategy is not None: - if self.peft_strategy not in PEFT_CONFIGS: - raise ValueError( - f"`peft_strategy` must be one of {list(PEFT_CONFIGS.keys())}. Got {self.peft_strategy} instead." - ) - PEFT_CONFIG = PEFT_CONFIGS[self.peft_strategy] - self.peft_config = {**PEFT_CONFIG, **self.peft_config} - - if self.peft_config["task_type"] is None: - raise ValueError(f"`peft_config.task_type` must be set to one of the following {PEFT_TASKS_TYPES}") diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index 7a3b1984..302141f5 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,7 +1,10 @@ +import os from logging import getLogger from typing import Any, Dict +import torch from hydra.utils import get_class +from safetensors.torch import save_file from transformers.utils import ModelOutput from ..base import Backend @@ -18,6 +21,7 @@ def __init__(self, config: TRTLLMConfig): super().__init__(config) self.validate_model_type() + LOGGER.info("\t+ Loading pretrained TRTLLMModel") self.load_trtmodel_from_pretrained() def validate_model_type(self) -> None: @@ -27,6 +31,18 @@ def validate_model_type(self) -> None: self.trtmodel_class = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.model_type]) LOGGER.info(f"\t+ Using TRTLLMModel class {self.trtmodel_class.__name__}") + def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + LOGGER.info("\t+ Creating no weights model state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model safetensors") + safetensors = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) + + if self.config.library == "transformers": + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + def load_trtmodel_from_pretrained(self) -> None: self.pretrained_model = self.trtmodel_class.from_pretrained( self.config.model, diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py index 07105003..22a017f9 100644 --- a/optimum_benchmark/backends/timm_utils.py +++ b/optimum_benchmark/backends/timm_utils.py @@ -1,18 +1,14 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict -from ..import_utils import is_timm_available, is_torch_available, is_transformers_available +from transformers import PretrainedConfig -if is_torch_available(): - import torch +from ..import_utils import is_timm_available if is_timm_available(): - import timm + import timm # type: ignore -if is_transformers_available(): - from transformers import PretrainedConfig - -def get_timm_pretrained_config(model_name: str) -> "PretrainedConfig": +def get_timm_pretrained_config(model_name: str) -> PretrainedConfig: model_source, model_name 
= timm.models.parse_model_name(model_name) if model_source == "hf-hub": # For model names specified in the form `hf-hub:path/architecture_name@revision`, @@ -23,15 +19,7 @@ def get_timm_pretrained_config(model_name: str) -> "PretrainedConfig": return timm.get_pretrained_cfg(model_name) -def get_timm_pre_processor(model: str) -> Optional["torch.nn.Module"]: - try: - pretrained_config = get_timm_pretrained_config(model) - return timm.data.create_transform(**timm.data.resolve_data_config(pretrained_config)) - except Exception: - return None - - -def extract_timm_shapes_from_config(config: "PretrainedConfig") -> Dict[str, Any]: +def extract_timm_shapes_from_config(config: PretrainedConfig) -> Dict[str, Any]: artifacts_dict = {} config_dict = {k: v for k, v in config.to_dict().items() if v is not None} diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index 52bede74..7f8863f3 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -8,22 +8,13 @@ from datasets import Dataset from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments from safetensors.torch import save_file -from transformers import TrainerCallback, TrainerState -from transformers.modeling_utils import no_init_weights -from transformers.utils.logging import set_verbosity_error +from transformers import TrainerCallback -from ...import_utils import is_peft_available from ..base import Backend -from ..peft_utils import get_peft_config_class -from ..transformers_utils import randomize_weights +from ..peft_utils import apply_peft +from ..transformers_utils import random_init_weights from .config import TorchORTConfig -if is_peft_available(): - from peft import get_peft_model # type: ignore - -# disable transformers logging -set_verbosity_error() - LOGGER = getLogger("torch-ort") @@ -38,52 +29,46 @@ def __init__(self, config: TorchORTConfig): self.tmpdir = TemporaryDirectory() if self.config.no_weights: + LOGGER.info("\t+ Loading no weights AutoModel") self.load_automodel_with_no_weights() else: + LOGGER.info("\t+ Loading pretrained AutoModel") self.load_automodel_from_pretrained() - if self.config.peft_strategy is not None: - LOGGER.info("\t+ Using PEFT") - peft_config_class = get_peft_config_class(self.config.peft_strategy) - peft_config = peft_config_class(**self.config.peft_config) - self.pretrained_model = get_peft_model(self.pretrained_model, peft_config=peft_config) + if self.config.peft_type is not None: + LOGGER.info("\t+ Applying PEFT") + self.pretrained_model = apply_peft(self.pretrained_model, self.config.peft_type, self.config.peft_config) + + self.tmpdir.cleanup() def validate_library(self) -> None: if self.config.library == "transformers": - LOGGER.info(f"Using AutoModel: {self.automodel_class.__name__}") + LOGGER.info(f"Using AutoModel class {self.automodel_class.__name__}") else: raise NotImplementedError(f"TorchORTBackend does not support {self.config.library} library") def create_no_weights_model(self) -> None: - LOGGER.info("\t+ Creating no weights model directory") - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights") - os.makedirs(self.no_weights_model, exist_ok=True) - - LOGGER.info("\t+ Saving pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - - LOGGER.info("\t+ Creating no weights model state_dict") + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + LOGGER.info("\t+ Creating no weights model 
state dict") state_dict = torch.nn.Linear(1, 1).state_dict() + LOGGER.info("\t+ Saving no weights model safetensors") + safetensors = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensors, metadata={"format": "pt"}) - LOGGER.info("\t+ Saving no weights model state_dict") - save_file( - filename=os.path.join(self.no_weights_model, "model.safetensors"), - metadata={"format": "pt"}, - tensors=state_dict, - ) + if self.config.library == "transformers": + LOGGER.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) def load_automodel_with_no_weights(self) -> None: + LOGGER.info("\t+ Creating no weights model") self.create_no_weights_model() - with no_init_weights(): - original_model = self.config.model - self.config.model = self.no_weights_model - LOGGER.info("\t+ Loading no weights model") + with random_init_weights(): + original_model, self.config.model = self.config.model, self.no_weights_model + LOGGER.info("\t+ Loading no weights AutoModel") self.load_automodel_from_pretrained() self.config.model = original_model - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model) LOGGER.info("\t+ Tying model weights") self.pretrained_model.tie_weights() @@ -107,9 +92,7 @@ def train( training_arguments: Dict[str, Any], training_callbacks: List[TrainerCallback], training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]], - ) -> TrainerState: - LOGGER.info("\t+ Setting dataset format to `torch`") - training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) + ): LOGGER.info(f"\t+ Wrapping training arguments with {ORTTrainingArguments.__name__}") training_arguments = ORTTrainingArguments(**training_arguments) LOGGER.info(f"\t+ Wrapping model with {ORTTrainer.__name__}") @@ -122,9 +105,7 @@ def train( ) LOGGER.info("\t+ Starting training") trainer.train() - LOGGER.info("\t+ Training finished successfully") - - return trainer.state + LOGGER.info("\t+ Finished training") def clean(self) -> None: super().clean() diff --git a/optimum_benchmark/backends/torch_ort/config.py b/optimum_benchmark/backends/torch_ort/config.py index 8559022f..252ee72b 100644 --- a/optimum_benchmark/backends/torch_ort/config.py +++ b/optimum_benchmark/backends/torch_ort/config.py @@ -3,7 +3,6 @@ from ...import_utils import torch_ort_version from ..config import BackendConfig -from ..peft_utils import PEFT_CONFIGS, PEFT_TASKS_TYPES @dataclass @@ -17,7 +16,7 @@ class TorchORTConfig(BackendConfig): torch_dtype: Optional[str] = None # peft options - peft_strategy: Optional[str] = None + peft_type: Optional[str] = None peft_config: Dict[str, Any] = field(default_factory=dict) def __post_init__(self): @@ -25,14 +24,3 @@ def __post_init__(self): if self.device != "cuda": raise ValueError(f"TorchORTBackend only supports CUDA devices, got {self.device}") - - if self.peft_strategy is not None: - if self.peft_strategy not in PEFT_CONFIGS: - raise ValueError( - f"`peft_strategy` must be one of {list(PEFT_CONFIGS.keys())}. Got {self.peft_strategy} instead." 
- ) - PEFT_CONFIG = PEFT_CONFIGS[self.peft_strategy] - self.peft_config = {**PEFT_CONFIG, **self.peft_config} - - if self.peft_config["task_type"] is None: - raise ValueError(f"`peft_config.task_type` must be set to one of the following {PEFT_TASKS_TYPES}") diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index b47e3030..93c35560 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -1,24 +1,21 @@ import os +from contextlib import contextmanager from typing import Any, Dict, Optional, Union -from ..import_utils import is_torch_available, is_transformers_available +import torch +from transformers import ( + AutoConfig, + AutoProcessor, + AutoTokenizer, + FeatureExtractionMixin, + GenerationConfig, + ImageProcessingMixin, + PretrainedConfig, + PreTrainedTokenizer, + ProcessorMixin, +) -if is_torch_available(): - import torch - -if is_transformers_available(): - from transformers import ( - AutoConfig, - AutoProcessor, - FeatureExtractionMixin, - GenerationConfig, - ImageProcessingMixin, - PretrainedConfig, - PreTrainedTokenizer, - ProcessorMixin, - ) - - PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, PreTrainedTokenizer, ProcessorMixin] +PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, PreTrainedTokenizer, ProcessorMixin] def get_transformers_cache_dir() -> str: @@ -43,7 +40,10 @@ def get_transformers_pretrained_processor(model: str, **kwargs) -> Optional["Pre # sometimes contains information about the model's input shapes that are not available in the config return AutoProcessor.from_pretrained(model, **kwargs) except Exception: - return None + try: + return AutoTokenizer.from_pretrained(model, **kwargs) + except Exception: + return None def extract_transformers_shapes_from_artifacts( @@ -119,20 +119,37 @@ def extract_transformers_shapes_from_artifacts( return shapes -def randomize_weights(model: "torch.nn.Module") -> None: - for param in model.parameters(): - if param.data.is_floating_point(): - if torch.cuda.is_available() and param.device.type != "cuda": - param.data.cuda().normal_(mean=0.0, std=0.2).cpu() - elif torch.backends.mps.is_available() and param.device.type != "mps": - param.data.to("mps").normal_(mean=0.0, std=0.2).cpu() - else: - param.data.normal_(mean=0.0, std=0.2) - - elif param.data.dtype in (torch.int32, torch.int16, torch.int8): - if torch.cuda.is_available() and param.device.type != "cuda": - param.data.copy_(torch.randint(-127, 127, param.data.shape, device="cuda")) - elif torch.backends.mps.is_available() and param.device.type != "mps": - param.data.copy_(torch.randint(-127, 127, param.data.shape, device="mps")) - else: - param.data.copy_(torch.randint(-127, 127, param.data.shape)) +TORCH_INIT_FUNCTIONS = { + "normal_": torch.nn.init.normal_, + "uniform_": torch.nn.init.uniform_, + "trunc_normal_": torch.nn.init.trunc_normal_, + "xavier_normal_": torch.nn.init.xavier_normal_, + "xavier_uniform_": torch.nn.init.xavier_uniform_, + "kaiming_normal_": torch.nn.init.kaiming_normal_, + "kaiming_uniform_": torch.nn.init.kaiming_uniform_, + "normal": torch.nn.init.normal, + "uniform": torch.nn.init.uniform, + "xavier_normal": torch.nn.init.xavier_normal, + "xavier_uniform": torch.nn.init.xavier_uniform, + "kaiming_normal": torch.nn.init.kaiming_normal, + "kaiming_uniform": torch.nn.init.kaiming_uniform, +} + + +def fast_rand(tensor: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + 
return torch.nn.init.uniform_(tensor) + + +@contextmanager +def random_init_weights(): + # Replace the initialization functions + for name, init_func in TORCH_INIT_FUNCTIONS.items(): + if name != "uniform_": + setattr(torch.nn.init, name, fast_rand) + try: + yield + finally: + # Restore the original initialization functions + for name, init_func in TORCH_INIT_FUNCTIONS.items(): + if name != "uniform_": + setattr(torch.nn.init, name, init_func) diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py index f91a3b2c..0a9254ab 100644 --- a/optimum_benchmark/cli.py +++ b/optimum_benchmark/cli.py @@ -9,9 +9,9 @@ from .backends.neural_compressor.config import INCConfig from .backends.onnxruntime.config import ORTConfig from .backends.openvino.config import OVConfig +from .backends.py_tgi.config import PyTGIConfig from .backends.pytorch.config import PyTorchConfig from .backends.tensorrt_llm.config import TRTLLMConfig -from .backends.text_generation_inference.config import TGIConfig from .backends.torch_ort.config import TorchORTConfig from .benchmarks.inference.config import InferenceConfig from .benchmarks.report import BenchmarkReport @@ -33,7 +33,7 @@ cs.store(group="backend", name=TorchORTConfig.name, node=TorchORTConfig) cs.store(group="backend", name=TRTLLMConfig.name, node=TRTLLMConfig) cs.store(group="backend", name=INCConfig.name, node=INCConfig) -cs.store(group="backend", name=TGIConfig.name, node=TGIConfig) +cs.store(group="backend", name=PyTGIConfig.name, node=PyTGIConfig) # benchmarks configurations cs.store(group="benchmark", name=TrainingConfig.name, node=TrainingConfig) cs.store(group="benchmark", name=InferenceConfig.name, node=InferenceConfig) diff --git a/optimum_benchmark/launchers/torchrun/launcher.py b/optimum_benchmark/launchers/torchrun/launcher.py index 1f4d2d53..436ac0f2 100644 --- a/optimum_benchmark/launchers/torchrun/launcher.py +++ b/optimum_benchmark/launchers/torchrun/launcher.py @@ -83,7 +83,7 @@ def entrypoint(worker, queue, lock, log_level, *worker_args): rank = int(os.environ["RANK"]) torch.cuda.set_device(rank) if torch.cuda.is_available() else None - setup_logging(level=log_level, prefix=f"RANK-{rank}") if rank == 0 else None + setup_logging(level=log_level, prefix=f"RANK-{rank}") if rank == 0 else setup_logging(level="ERROR") torch.distributed.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo") torch.distributed.barrier() diff --git a/optimum_benchmark/system_utils.py b/optimum_benchmark/system_utils.py index 070bf805..e2500d7b 100644 --- a/optimum_benchmark/system_utils.py +++ b/optimum_benchmark/system_utils.py @@ -91,7 +91,7 @@ def get_gpus(): elif is_rocm_system(): if not is_amdsmi_available() and not is_pyrsmi_available(): raise ValueError( - "Either the library amdsmi or pyrsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Either the library amdsmi or pyrsmi is required to run memory benchmark on AMD GPUs, but neither is installed." ) gpus = [] @@ -139,7 +139,7 @@ def get_gpu_vram_mb() -> List[int]: elif is_rocm_system(): if not is_amdsmi_available() and not is_pyrsmi_available(): raise ValueError( - "Either the library amdsmi or pyrsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Either the library amdsmi or pyrsmi is required to run memory benchmark on AMD GPUs, but neither is installed." 
) if is_amdsmi_available(): @@ -182,7 +182,7 @@ def get_gpu_device_ids() -> str: elif is_nvidia_system(): if not is_pynvml_available(): raise ValueError( - "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "The library pynvml is required to get GPU device ids, but is not installed. " "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." ) @@ -191,22 +191,26 @@ def get_gpu_device_ids() -> str: device_ids = ",".join(str(i) for i in device_ids) pynvml.nvmlShutdown() elif is_rocm_system(): - if not is_amdsmi_available(): + if not is_amdsmi_available() or not is_pyrsmi_available(): raise ValueError( - "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. " - "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." + "Either the library amdsmi or pyrsmi is required to get GPU device ids, but neither is installed." ) - amdsmi.amdsmi_init() - rocm_version = get_rocm_version() - - if rocm_version >= "5.7": - device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles()))) - else: - device_ids = list(range(len(amdsmi.amdsmi_get_device_handles()))) + if is_amdsmi_available(): + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + if rocm_version >= "5.7": + device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles()))) + else: + device_ids = list(range(len(amdsmi.amdsmi_get_device_handles()))) + device_ids = ",".join(str(i) for i in device_ids) + amdsmi.amdsmi_shut_down() - device_ids = ",".join(str(i) for i in device_ids) - amdsmi.amdsmi_shut_down() + elif is_pyrsmi_available(): + rocml.smi_initialize() + device_ids = list(range(rocml.smi_get_device_count())) + device_ids = ",".join(str(i) for i in device_ids) + rocml.smi_shutdown() else: raise ValueError("Couldn't infer GPU device ids.") diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index bd7d7999..dfa3f808 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -58,8 +58,8 @@ "zero-shot-object-detection": "TFAutoModelForZeroShotObjectDetection", } _DIFFUSERS_TASKS_TO_MODEL_LOADERS = { - "stable-diffusion": "StableDiffusionPipeline", - "stable-diffusion-xl": "StableDiffusionXLImg2ImgPipeline", + "stable-diffusion": "AutoPipelineForText2Image", + "stable-diffusion-xl": "AutoPipelineForText2Image", } _TIMM_TASKS_TO_MODEL_LOADERS = {"image-classification": "create_model"} _LIBRARY_TO_TF_TASKS_TO_MODEL_LOADER_MAP = {"transformers": _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS} diff --git a/setup.py b/setup.py index c911ceb5..dba055ff 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ import importlib.util import os -import subprocess from setuptools import find_packages, setup @@ -23,32 +22,16 @@ "pandas", ] -# We may allow to install CUDA or RoCm dependencies even -# when building in a non-CUDA or non-ROCm environment. 
USE_CUDA = os.environ.get("USE_CUDA", None) == "1" USE_ROCM = os.environ.get("USE_ROCM", None) == "1" if USE_CUDA: INSTALL_REQUIRES.append("nvidia-ml-py") -else: - try: - subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL) - INSTALL_REQUIRES.append("nvidia-ml-py") - except FileNotFoundError: - pass -# we keep this as a check that amdsmi is installed since it's not available on pypi PYRSMI = "pyrsmi@git+https://github.com/ROCm/pyrsmi.git" if USE_ROCM: if not importlib.util.find_spec("amdsmi"): INSTALL_REQUIRES.append(PYRSMI) -else: - try: - subprocess.run(["rocm-smi"], stdout=subprocess.DEVNULL) - if not importlib.util.find_spec("amdsmi"): - INSTALL_REQUIRES.append(PYRSMI) - except FileNotFoundError: - pass if PYRSMI in INSTALL_REQUIRES: print("ROCm GPU detected without amdsmi installed. Using pyrsmi instead but some features may not work.") @@ -57,15 +40,15 @@ EXTRAS_REQUIRE = { "quality": ["ruff"], "testing": ["pytest", "hydra-joblib-launcher"], - # api-based backends + # optimum backends "openvino": [f"optimum[openvino,nncf]>={MIN_OPTIMUM_VERSION}"], "onnxruntime": [f"optimum[onnxruntime]>={MIN_OPTIMUM_VERSION}"], - "neural-compressor": [f"optimum[neural-compressor]>={MIN_OPTIMUM_VERSION}"], - "torch-ort": [f"optimum>={MIN_OPTIMUM_VERSION}", "onnxruntime-training", "torch-ort"], "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={MIN_OPTIMUM_VERSION}"], + "neural-compressor": [f"optimum[neural-compressor]>={MIN_OPTIMUM_VERSION}"], + "torch-ort": ["torch-ort", "onnxruntime-training", f"optimum>={MIN_OPTIMUM_VERSION}"], # docker-based backends - "text-generation-inference": ["py-tgi"], - # specific settings + "py-tgi": ["py-tgi==0.1.3"], + # third-party features "codecarbon": ["codecarbon"], "deepspeed": ["deepspeed"], "diffusers": ["diffusers"], diff --git a/tests/configs/_bert_sweep_.yaml b/tests/configs/_bert_sweep_.yaml index f618a34f..08c7bbde 100644 --- a/tests/configs/_bert_sweep_.yaml +++ b/tests/configs/_bert_sweep_.yaml @@ -1,5 +1,6 @@ hydra: sweeper: params: + backend.no_weights: true,false backend.task: fill-mask,text-classification,token-classification,question-answering backend.model: hf-internal-testing/tiny-random-bert,hf-internal-testing/tiny-random-roberta diff --git a/tests/configs/_gpt_.yaml b/tests/configs/_gpt_.yaml new file mode 100644 index 00000000..17847b2a --- /dev/null +++ b/tests/configs/_gpt_.yaml @@ -0,0 +1,2 @@ +backend: + model: gpt2 diff --git a/tests/configs/_gpt_naive_mp_.yaml b/tests/configs/_gpt_naive_mp_.yaml deleted file mode 100644 index cf2adfd3..00000000 --- a/tests/configs/_gpt_naive_mp_.yaml +++ /dev/null @@ -1,6 +0,0 @@ -backend: - model: gpt2 - task: text-generation - library: transformers - device_ids: 0,1 - device_map: auto diff --git a/tests/configs/_gpt_peft_.yaml b/tests/configs/_gpt_peft_.yaml deleted file mode 100644 index d99267e4..00000000 --- a/tests/configs/_gpt_peft_.yaml +++ /dev/null @@ -1,7 +0,0 @@ -backend: - model: gpt2 - task: text-generation - library: transformers - peft_strategy: lora - peft_config: - task_type: CAUSAL_LM diff --git a/tests/configs/_gpt_sweep_.yaml b/tests/configs/_gpt_sweep_.yaml index 1ff5e2c7..1e3325a9 100644 --- a/tests/configs/_gpt_sweep_.yaml +++ b/tests/configs/_gpt_sweep_.yaml @@ -2,4 +2,5 @@ hydra: sweeper: params: backend.task: text-generation + backend.no_weights: true,false backend.model: hf-internal-testing/tiny-random-gpt2,IlyasMoutawwakil/tiny-random-llama diff --git a/tests/configs/_inference_.yaml b/tests/configs/_inference_.yaml index ef429e8b..b72082b8 100644 --- 
a/tests/configs/_inference_.yaml +++ b/tests/configs/_inference_.yaml @@ -2,7 +2,11 @@ defaults: - benchmark: inference benchmark: + memory: true + latency: true + duration: 1 warmup_runs: 1 - new_tokens: 2 - memory: true + generate_kwargs: + max_new_tokens: 5 + min_new_tokens: 5 diff --git a/tests/configs/_naive_mp_.yaml b/tests/configs/_naive_mp_.yaml new file mode 100644 index 00000000..108e8b55 --- /dev/null +++ b/tests/configs/_naive_mp_.yaml @@ -0,0 +1,3 @@ +backend: + device_ids: 0,1 + device_map: auto diff --git a/tests/configs/_peft_.yaml b/tests/configs/_peft_.yaml new file mode 100644 index 00000000..d82a6476 --- /dev/null +++ b/tests/configs/_peft_.yaml @@ -0,0 +1,2 @@ +backend: + peft_type: LORA diff --git a/tests/configs/_timm_.yaml b/tests/configs/_timm_.yaml index 0b374c8a..22d47cdd 100644 --- a/tests/configs/_timm_.yaml +++ b/tests/configs/_timm_.yaml @@ -1,5 +1,2 @@ backend: - library: timm - task: image-classification model: timm/tiny_vit_21m_224.in1k - diff --git a/tests/configs/cpu_inference_py_tgi_gpt.yaml b/tests/configs/cpu_inference_py_tgi_gpt.yaml new file mode 100644 index 00000000..c0805b71 --- /dev/null +++ b/tests/configs/cpu_inference_py_tgi_gpt.yaml @@ -0,0 +1,10 @@ +defaults: + - backend: py-tgi + # order of inheritance, last one overrides previous ones + - _base_ # inherits from base config + - _inference_ # inherits from inference config + - _gpt_ # inherits from gpt config + - _cpu_ # inherits from cpu config + - _self_ # hydra 1.1 compatibility + +experiment_name: cpu_inference_py_tgi_gpt diff --git a/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml b/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml index 6e19ba18..0bd1dcd8 100644 --- a/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml +++ b/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml @@ -3,7 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _gpt_naive_mp_ # inherits from lm naive mp config + - _naive_mp_ # inherits from lm naive mp config + - _gpt_ # inherits from gpt config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility diff --git a/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml b/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml index ab6d4bc2..f9ae53fb 100644 --- a/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml @@ -3,7 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _gpt_naive_mp_ # inherits from lm naive mp config + - _naive_mp_ # inherits from lm naive mp config + - _gpt_ # inherits from gpt config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility diff --git a/tests/configs/cuda_training_pytorch_gpt_peft.yaml b/tests/configs/cuda_training_pytorch_gpt_peft.yaml index 1ee6f473..ce473e6b 100644 --- a/tests/configs/cuda_training_pytorch_gpt_peft.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_peft.yaml @@ -3,7 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _gpt_peft_ # inherits from language modeling peft config + - _peft_ # inherits from language modeling peft config + - _gpt_ # inherits from gpt config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility diff --git 
a/tests/configs/cuda_training_torch_ort_gpt_peft.yaml b/tests/configs/cuda_training_torch_ort_gpt_peft.yaml index 665dec16..6730f3d3 100644 --- a/tests/configs/cuda_training_torch_ort_gpt_peft.yaml +++ b/tests/configs/cuda_training_torch_ort_gpt_peft.yaml @@ -3,7 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _gpt_peft_ # inherits from language modeling peft config + - _peft_ # inherits from language modeling peft config + - _gpt_ # inherits from gpt config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility
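The random_init_weights context manager introduced in optimum_benchmark/backends/transformers_utils.py is the piece that ties the new no-weights flow together: it temporarily routes the common torch.nn.init functions (all except uniform_) to a cheap uniform fill, so from_pretrained can materialize missing tensors quickly instead of running each module's real, often expensive, init scheme. A quick way to see the patch in action, assuming this branch is installed locally:

import torch

from optimum_benchmark.backends.transformers_utils import random_init_weights

t = torch.empty(4, 4)
with random_init_weights():
    # while patched, kaiming_normal_ is routed to a uniform fill in [0, 1)
    torch.nn.init.kaiming_normal_(t)
print(bool((t >= 0).all()))  # True under the patch; the original init functions are restored on exit

The same context manager is what the PyTorch, Torch-ORT and Py-TGI backends rely on for no_weights=True: write a one-tensor model.safetensors next to the real pretrained config, then let from_pretrained fill in everything that is missing. Below is a condensed sketch of that flow outside the backend classes; hf-internal-testing/tiny-random-gpt2 is just the tiny checkpoint used in the test configs, and the real backends additionally save the processor and generation config and handle quantization configs:

import os
from tempfile import TemporaryDirectory

import torch
from safetensors.torch import save_file
from transformers import AutoConfig, AutoModelForCausalLM

from optimum_benchmark.backends.transformers_utils import random_init_weights

model_id = "hf-internal-testing/tiny-random-gpt2"

with TemporaryDirectory() as tmpdir:
    no_weights_model = os.path.join(tmpdir, "no_weights_model")
    os.makedirs(no_weights_model, exist_ok=True)

    # a single dummy tensor is enough; the file only needs to be a valid, non-empty safetensors archive
    state_dict = torch.nn.Linear(1, 1).state_dict()
    save_file(tensors=state_dict, filename=os.path.join(no_weights_model, "model.safetensors"), metadata={"format": "pt"})

    # the pretrained config tells from_pretrained which architecture to materialize
    AutoConfig.from_pretrained(model_id).save_pretrained(no_weights_model)

    # every missing tensor is initialized through the patched (uniform) init functions
    with random_init_weights():
        model = AutoModelForCausalLM.from_pretrained(no_weights_model, _fast_init=False)

    print(f"materialized {sum(p.numel() for p in model.parameters()):,} randomly initialized parameters")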