diff --git a/examples/api_launch.py b/examples/api_launch.py
deleted file mode 100644
index 734dfd3c..00000000
--- a/examples/api_launch.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from optimum_benchmark.backends.pytorch.config import PyTorchConfig
-from optimum_benchmark.benchmarks.inference.config import InferenceConfig
-from optimum_benchmark.experiment import ExperimentConfig, launch
-from optimum_benchmark.launchers.torchrun.config import TorchrunConfig
-from optimum_benchmark.logging_utils import setup_logging
-
-if __name__ == "__main__":
-    setup_logging(level="INFO")
-    launcher_config = TorchrunConfig(nproc_per_node=2)
-    benchmark_config = InferenceConfig(latency=True, memory=True)
-    backend_config = PyTorchConfig(model="gpt2", device="cuda", device_ids="0,1", no_weights=True)
-    experiment_config = ExperimentConfig(
-        experiment_name="api-launch",
-        benchmark=benchmark_config,
-        launcher=launcher_config,
-        backend=backend_config,
-    )
-    benchmark_report = launch(experiment_config)
-    experiment_config.push_to_hub("IlyasMoutawwakil/benchmarks")
-    benchmark_report.push_to_hub("IlyasMoutawwakil/benchmarks")
diff --git a/examples/pytorch_awq_exllama.py b/examples/pytorch_awq_exllama.py
new file mode 100644
index 00000000..f93b8cf6
--- /dev/null
+++ b/examples/pytorch_awq_exllama.py
@@ -0,0 +1,32 @@
+from optimum_benchmark.backends.pytorch.config import PyTorchConfig
+from optimum_benchmark.benchmarks.inference.config import InferenceConfig
+from optimum_benchmark.experiment import ExperimentConfig, launch
+from optimum_benchmark.launchers.process.config import ProcessConfig
+from optimum_benchmark.logging_utils import setup_logging
+
+if __name__ == "__main__":
+    setup_logging(level="INFO")
+    launcher_config = ProcessConfig(device_isolation=False)
+    benchmark_config = InferenceConfig(
+        memory=True,
+        latency=True,
+        input_shapes={"batch_size": 4, "sequence_length": 128},
+        generate_kwargs={"max_new_tokens": 128, "min_new_tokens": 128},
+    )
+    backend_config = PyTorchConfig(
+        model="TheBloke/Mistral-7B-Instruct-v0.1-AWQ",
+        device="cuda",
+        device_ids="0",
+        no_weights=True,
+        quantization_scheme="awq",
+        quantization_config={"version": "exllama"},
+    )
+    experiment_config = ExperimentConfig(
+        experiment_name="awq-exllamav2",
+        benchmark=benchmark_config,
+        launcher=launcher_config,
+        backend=backend_config,
+    )
+    benchmark_report = launch(experiment_config)
+    experiment_config.push_to_hub("IlyasMoutawwakil/awq-benchmarks")
+    benchmark_report.push_to_hub("IlyasMoutawwakil/awq-benchmarks")
diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index 9c377f12..a27b0cfa 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -157,13 +157,10 @@ def load_model_from_pretrained(self) -> None:
             LOGGER.info("\t+ Loading Quantized model")
             self.pretrained_model = self.automodel_class.from_pretrained(
                 pretrained_model_name_or_path=self.config.model,
-                device_map=self.config.device_map,
+                device_map=self.config.device_map or torch.device(self.config.device),
                 **self.config.hub_kwargs,
                 **self.automodel_kwargs,
             )
-            if self.config.device_map is None and self.config.device != "cpu":
-                LOGGER.info(f"\t+ Moving model to device: {self.config.device}")
-                self.pretrained_model.to(self.config.device)
         elif self.config.device_map is not None:
             # we can't use device context manager since device_map is specified
             LOGGER.info(f"\t+ Loading model with device map: {self.config.device_map}")
@@ -268,13 +265,19 @@ def is_awq_quantized(self) -> bool:
 
     @property
     def is_exllamav2(self) -> bool:
-        dummy_exllama = {"exllama_version": None}
         return (self.is_gptq_quantized or self.is_awq_quantized) and (
-            getattr(self.quantization_config, "exllama_config", dummy_exllama)["exllama_version"]
-            or getattr(self.pretrained_config, "quantization_config", {}).get("exllama_config", dummy_exllama)[
-                "exllama_version"
-            ]
-        ) == 2
+            (
+                hasattr(self.pretrained_config, "quantization_config")
+                and hasattr(self.pretrained_config.quantization_config, "exllama_config")
+                and "exllama_version" in self.pretrained_config.quantization_config.exllama_config
+                and self.pretrained_config.quantization_config.exllama_config["exllama_version"] == 2
+            )
+            or (
+                hasattr(self.quantization_config, "exllama_config")
+                and "exllama_version" in self.quantization_config.exllama_config
+                and self.quantization_config.exllama_config["exllama_version"] == 2
+            )
+        )
 
     @property
     def automodel_kwargs(self) -> Dict[str, Any]: