From a7f2dec852cd80c9286414a31f052547478815cc Mon Sep 17 00:00:00 2001 From: BuildTools Date: Mon, 2 Sep 2024 18:17:29 -0700 Subject: [PATCH] feat(ui): add AutoFP8 quantization window - add AutoFP8 quantization window (currently broken) - add more dynamic KV parameters --- .github/workflows/build.yml | 2 + src/AutoGGUF.py | 99 +++++++++++++++++++ src/KVOverrideEntry.py | 37 ++++++- ...{AutoFP8.py => quantize_to_fp8_dynamic.py} | 21 ++-- 4 files changed, 146 insertions(+), 13 deletions(-) rename src/{AutoFP8.py => quantize_to_fp8_dynamic.py} (98%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0befdf4..87986db 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -66,6 +66,7 @@ jobs: Copy-Item -Path "src\convert_hf_to_gguf.py" -Destination "$distPath\src" Copy-Item -Path "src\convert_lora_to_gguf.py" -Destination "$distPath\src" Copy-Item -Path "src\convert_lora_to_ggml.py" -Destination "$distPath\src" + Copy-Item -Path "src\quantize_to_fp8_dynamic.py" -Destination "$distPath\src" - name: Copy additional files (Linux/macOS) if: matrix.os != 'windows-latest' @@ -76,6 +77,7 @@ jobs: cp src/convert_hf_to_gguf.py $distPath/src/ cp src/convert_lora_to_gguf.py $distPath/src/ cp src/convert_lora_to_ggml.py $distPath/src/ + cp src/quantize_to_fp8_dynamic.py $distPath/src/ - name: Generate SHA256 (Windows) if: matrix.os == 'windows-latest' diff --git a/src/AutoGGUF.py b/src/AutoGGUF.py index b2d02e0..227d65a 100644 --- a/src/AutoGGUF.py +++ b/src/AutoGGUF.py @@ -151,6 +151,12 @@ def __init__(self, args: List[str]) -> None: about_action.triggered.connect(self.show_about) help_menu.addAction(about_action) + # Tools menu + tools_menu = self.menubar.addMenu("&Tools") + autofp8_action = QAction("&AutoFP8", self) + autofp8_action.triggered.connect(self.show_autofp8_window) + tools_menu.addAction(autofp8_action) + # Content widget content_widget = QWidget() content_layout = QHBoxLayout(content_widget) @@ -1010,6 +1016,91 @@ def browse_hf_outfile(self) -> None: if outfile: self.hf_outfile.setText(os.path.abspath(outfile)) + def quantize_to_fp8_dynamic(self, model_dir: str, output_dir: str) -> None: + self.logger.info(f"Quantizing {os.path.basename(model_dir)} to {output_dir}") + try: + command = [ + "python", + "src/quantize_to_fp8_dynamic.py", + model_dir, + output_dir, + ] + + logs_path = self.logs_input.text() + ensure_directory(logs_path) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + log_file = os.path.join(logs_path, f"autofp8_{timestamp}.log") + + thread = QuantizationThread(command, os.getcwd(), log_file) + self.quant_threads.append(thread) + + task_name = f"Quantizing {os.path.basename(model_dir)} with AutoFP8" + task_item = TaskListItem(task_name, log_file, show_progress_bar=False) + list_item = QListWidgetItem(self.task_list) + list_item.setSizeHint(task_item.sizeHint()) + self.task_list.addItem(list_item) + self.task_list.setItemWidget(list_item, task_item) + + thread.status_signal.connect(task_item.update_status) + thread.finished_signal.connect( + lambda: self.task_finished(thread, task_item) + ) + thread.error_signal.connect( + lambda err: handle_error(self.logger, err, task_item) + ) + thread.start() + + except Exception as e: + show_error(self.logger, f"Error starting AutoFP8 quantization: {e}") + self.logger.info("AutoFP8 quantization task started") + + def show_autofp8_window(self): + dialog = QDialog(self) + dialog.setWindowTitle("Quantize to FP8 Dynamic") + dialog.setFixedWidth(500) + layout = QVBoxLayout() + + # Input path + input_layout = QHBoxLayout() + self.fp8_input = QLineEdit() + input_button = QPushButton(BROWSE) + input_button.clicked.connect( + lambda: self.fp8_input.setText( + QFileDialog.getExistingDirectory(self, "Open Model Folder") + ) + ) + input_layout.addWidget(QLabel("Input Model:")) + input_layout.addWidget(self.fp8_input) + input_layout.addWidget(input_button) + layout.addLayout(input_layout) + + # Output path + output_layout = QHBoxLayout() + self.fp8_output = QLineEdit() + output_button = QPushButton(BROWSE) + output_button.clicked.connect( + lambda: self.fp8_output.setText( + QFileDialog.getExistingDirectory(self, "Open Model Folder") + ) + ) + output_layout.addWidget(QLabel("Output Path:")) + output_layout.addWidget(self.fp8_output) + output_layout.addWidget(output_button) + layout.addLayout(output_layout) + + # Quantize button + quantize_button = QPushButton("Quantize") + quantize_button.clicked.connect( + lambda: self.quantize_to_fp8_dynamic( + self.fp8_input.text(), self.fp8_output.text() + ) + ) + layout.addWidget(quantize_button) + + dialog.setLayout(layout) + dialog.exec() + def convert_hf_to_gguf(self) -> None: self.logger.info(STARTING_HF_TO_GGUF_CONVERSION) try: @@ -1346,10 +1437,12 @@ def quantize_model(self) -> None: output_name_parts.append("rq") # Check for KV override + kv_used = bool if any( entry.get_override_string() for entry in self.kv_override_entries ): output_name_parts.append("kv") + kv_used = True # Join all parts with underscores and add .gguf extension output_name = "_".join(output_name_parts) + ".gguf" @@ -1391,6 +1484,12 @@ def quantize_model(self) -> None: model_name=model_name, quant_type=quant_type, output_path=output_path, + quantization_parameters=[ + kv_used, # If KV overrides are used + self.allow_requantize.isChecked(), # If requantize is used + self.pure.isChecked(), # If pure tensors option is used + self.leave_output_tensor.isChecked(), # If leave output tensor option is used + ], ) if override_string: command.extend(["--override-kv", override_string]) diff --git a/src/KVOverrideEntry.py b/src/KVOverrideEntry.py index 91ef165..71fc90a 100644 --- a/src/KVOverrideEntry.py +++ b/src/KVOverrideEntry.py @@ -1,7 +1,12 @@ +import locale +import shutil + +import psutil from PySide6.QtWidgets import QWidget, QHBoxLayout, QLineEdit, QComboBox, QPushButton from PySide6.QtCore import Signal, QRegularExpression from PySide6.QtGui import QDoubleValidator, QIntValidator, QRegularExpressionValidator from datetime import datetime +import pytz import time import os import socket @@ -24,7 +29,7 @@ def __init__(self, parent=None) -> None: layout.addWidget(self.key_input) self.type_combo = QComboBox() - self.type_combo.addItems(["int", "str", "float"]) + self.type_combo.addItems(["int", "str", "float", "u32", "i32"]) layout.addWidget(self.type_combo) self.value_input = QLineEdit() @@ -46,7 +51,11 @@ def delete_clicked(self) -> None: self.deleted.emit(self) def get_override_string( - self, model_name=None, quant_type=None, output_path=None + self, + model_name=None, + quant_type=None, + output_path=None, + quantization_parameters=None, ) -> str: # Add arguments key = self.key_input.text() type_ = self.type_combo.currentText() @@ -61,7 +70,14 @@ def get_override_string( "{system.hostname}": lambda: socket.gethostname(), "{system.platform}": lambda: platform.system(), "{system.python.version}": lambda: platform.python_version(), - "{system.date}": lambda: datetime.now().strftime("%Y-%m-%d"), + "{system.timezone}": lambda: time.tzname[time.daylight], + "{system.cpus}": lambda: str(os.cpu_count()), + "{system.memory.total}": lambda: str(psutil.virtual_memory().total), + "{system.memory.free}": lambda: str(psutil.virtual_memory().free), + "{system.filesystem.used}": lambda: str(shutil.disk_usage("/").used), + "{system.kernel.version}": lambda: platform.release(), + "{system.locale}": lambda: locale.getdefaultlocale()[0], + "{process.nice}": lambda: str(os.nice(0)), "{model.name}": lambda: ( model_name if model_name is not None else "Unknown Model" ), @@ -71,6 +87,21 @@ def get_override_string( "{output.path}": lambda: ( output_path if output_path is not None else "Unknown Output Path" ), + "{quant.kv}": lambda: ( + quantization_parameters[0] + if quantization_parameters is not None + else False + ), + "{quant.requantized}": lambda: ( + quantization_parameters[1] + if quantization_parameters is not None + else False + ), + "{quant.leave_output_tensor}": lambda: ( + quantization_parameters[2] + if quantization_parameters is not None + else False + ), } for param, func in dynamic_params.items(): diff --git a/src/AutoFP8.py b/src/quantize_to_fp8_dynamic.py similarity index 98% rename from src/AutoFP8.py rename to src/quantize_to_fp8_dynamic.py index 8d876e6..44dde54 100644 --- a/src/AutoFP8.py +++ b/src/quantize_to_fp8_dynamic.py @@ -1,6 +1,7 @@ import copy import gc import re +import sys from typing import List from typing import Optional, Tuple @@ -280,12 +281,11 @@ def _prepare_calibration_data(calibration_tokens): _prepare_calibration_data(calibration_tokens), ) - def save_quantized(self, save_dir, logger): + def save_quantized(self, save_dir): save_quantized_model( self.model, quant_config=self.quantize_config, save_dir=save_dir, - logger=logger, ) @@ -489,10 +489,9 @@ def save_quantized_model( model: AutoModelForCausalLM, quant_config: BaseQuantizeConfig, save_dir: str, - logger: Logger, ): - logger.info(model) - logger.info(f"Saving the model to {save_dir}") + print(model) + print(f"Saving the model to {save_dir}") static_q_dict = { "quantization_config": { "quant_method": "fp8", @@ -544,10 +543,8 @@ def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List return kv_cache_quant_layers -def quantize_to_fp8_dynamic( - input_model_dir: str, output_model_dir: str, logger: Logger -) -> None: - logger.info("Starting fp8 dynamic quantization") +def quantize_to_fp8_dynamic(input_model_dir: str, output_model_dir: str) -> None: + print("Starting fp8 dynamic quantization") # Define quantization config with static activation scales quantize_config = BaseQuantizeConfig( quant_method="fp8", activation_scheme="dynamic" @@ -557,4 +554,8 @@ def quantize_to_fp8_dynamic( model = AutoFP8ForCausalLM.from_pretrained(input_model_dir, quantize_config) # No examples for dynamic quantization model.quantize([]) - model.save_quantized(output_model_dir, logger) + model.save_quantized(output_model_dir) + + +if __name__ == "__main__": + quantize_to_fp8_dynamic(sys.argv[0], sys.argv[1])