From df79a8f8f1e7529c28c3b09aa72f10d708872369 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Tue, 27 Feb 2024 10:02:45 +0000
Subject: [PATCH] make deepspeed model initialization faster

---
 Makefile                                      |  5 ++-
 optimum_benchmark/backends/pytorch/backend.py | 33 ++++++++++++-------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index e30f8d01..468cccd0 100644
--- a/Makefile
+++ b/Makefile
@@ -68,7 +68,7 @@ run_docker_cuda:
 	--rm \
 	--pid host \
 	--shm-size 64G \
-	--gpus '"device=0,1"' \
+	--gpus all \
 	--entrypoint /bin/bash \
 	--volume $(PWD):/workspace \
 	--workdir /workspace \
@@ -81,8 +81,7 @@ run_docker_rocm:
 	--pid host \
 	--shm-size 64G \
 	--device /dev/kfd \
-	--device /dev/dri/renderD128 \
-	--device /dev/dri/renderD129 \
+	--device /dev/dri/ \
 	--entrypoint /bin/bash \
 	--volume $(PWD):/workspace \
 	--workdir /workspace \
diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index a27b0cfa..87d53290 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -142,18 +142,33 @@ def load_model_from_pretrained(self) -> None:
             LOGGER.info(f"\t+ Moving pipeline to device: {self.config.device}")
             self.pretrained_model.to(self.config.device)
         elif self.config.deepspeed_inference:
-            with torch.device("cpu"):
-                LOGGER.info("\t+ Loading DeepSpeed model directly on CPU to avoid OOM")
-                self.pretrained_model = self.automodel_class.from_pretrained(
-                    pretrained_model_name_or_path=self.config.model, **self.config.hub_kwargs, **self.automodel_kwargs
-                )
+            if self.config.no_weights:
+                with torch.device("meta"):
+                    LOGGER.info("\t+ Loading model on meta device for fast initialization")
+                    self.pretrained_model = self.automodel_class.from_pretrained(
+                        pretrained_model_name_or_path=self.config.model,
+                        **self.config.hub_kwargs,
+                        **self.automodel_kwargs,
+                    )
+                LOGGER.info("\t+ Materializing model on CPU")
+                self.pretrained_model.to_empty(device="cpu")
+                LOGGER.info("\t+ Tying model weights")
+                self.pretrained_model.tie_weights()
+            else:
+                LOGGER.info("\t+ Loading model on CPU to avoid OOM")
+                with torch.device("cpu"):
+                    self.pretrained_model = self.automodel_class.from_pretrained(
+                        pretrained_model_name_or_path=self.config.model,
+                        **self.config.hub_kwargs,
+                        **self.automodel_kwargs,
+                    )
 
             torch.distributed.barrier()  # better safe than hanging
-            LOGGER.info("\t+ Initializing DeepSpeed Inference")
+            LOGGER.info("\t+ Initializing DeepSpeed Inference Engine")
             self.pretrained_model = init_inference(self.pretrained_model, config=self.config.deepspeed_inference_config)
             torch.distributed.barrier()  # better safe than hanging
         elif self.is_quantized:
-            # we can't use device context manager since the model is quantized
+            # we can't use device context manager on quantized models
             LOGGER.info("\t+ Loading Quantized model")
             self.pretrained_model = self.automodel_class.from_pretrained(
                 pretrained_model_name_or_path=self.config.model,
@@ -218,10 +233,6 @@ def load_model_with_no_weights(self) -> None:
         self.load_model_from_pretrained()
         self.config.model = original_model
 
-        # dunno how necessary this is
-        LOGGER.info("\t+ Tying model weights")
-        self.pretrained_model.tie_weights()
-
     def process_quantization_config(self) -> None:
         if self.is_gptq_quantized:
             LOGGER.info("\t+ Processing GPTQ config")
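
The core change in backend.py is the meta-device fast-initialization path. The standalone sketch below (not part of the applied patch) illustrates that pattern in isolation, assuming torch >= 2.0 (for the device context manager) and transformers; AutoModelForCausalLM.from_config and the "gpt2" config are illustrative stand-ins for the backend's automodel_class and its locally saved no-weights model.

# Sketch of the fast-initialization pattern used in the no_weights branch above.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")  # "gpt2" is only an example

with torch.device("meta"):
    # parameters are created as meta tensors, so no memory is allocated and
    # no random weight initialization is performed; construction is near-instant
    model = AutoModelForCausalLM.from_config(config)

# allocate (uninitialized) CPU storage for every parameter and buffer
model.to_empty(device="cpu")
# re-tie weights that are meant to share storage (e.g. input/output embeddings),
# since to_empty() gives each formerly tied parameter its own fresh tensor
model.tie_weights()

# the model now has the right architecture on CPU but uninitialized weights,
# which is sufficient for a no-weights benchmark before handing it to DeepSpeed.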