From a7f2dec852cd80c9286414a31f052547478815cc Mon Sep 17 00:00:00 2001
From: BuildTools <chenxuzhang45@gmail.com>
Date: Mon, 2 Sep 2024 18:17:29 -0700
Subject: [PATCH] feat(ui): add AutoFP8 quantization window

- add AutoFP8 quantization window (currently broken)
- add more dynamic KV parameters
---
 .github/workflows/build.yml                   |  2 +
 src/AutoGGUF.py                               | 99 +++++++++++++++++++
 src/KVOverrideEntry.py                        | 37 ++++++-
 ...{AutoFP8.py => quantize_to_fp8_dynamic.py} | 21 ++--
 4 files changed, 146 insertions(+), 13 deletions(-)
 rename src/{AutoFP8.py => quantize_to_fp8_dynamic.py} (98%)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0befdf4..87986db 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -66,6 +66,7 @@ jobs:
         Copy-Item -Path "src\convert_hf_to_gguf.py" -Destination "$distPath\src"
         Copy-Item -Path "src\convert_lora_to_gguf.py" -Destination "$distPath\src"
         Copy-Item -Path "src\convert_lora_to_ggml.py" -Destination "$distPath\src"
+        Copy-Item -Path "src\quantize_to_fp8_dynamic.py" -Destination "$distPath\src"
 
     - name: Copy additional files (Linux/macOS)
       if: matrix.os != 'windows-latest'
@@ -76,6 +77,7 @@ jobs:
         cp src/convert_hf_to_gguf.py $distPath/src/
         cp src/convert_lora_to_gguf.py $distPath/src/
         cp src/convert_lora_to_ggml.py $distPath/src/
+        cp src/quantize_to_fp8_dynamic.py $distPath/src/
 
     - name: Generate SHA256 (Windows)
       if: matrix.os == 'windows-latest'
diff --git a/src/AutoGGUF.py b/src/AutoGGUF.py
index b2d02e0..227d65a 100644
--- a/src/AutoGGUF.py
+++ b/src/AutoGGUF.py
@@ -151,6 +151,12 @@ def __init__(self, args: List[str]) -> None:
         about_action.triggered.connect(self.show_about)
         help_menu.addAction(about_action)
 
+        # Tools menu
+        tools_menu = self.menubar.addMenu("&Tools")
+        autofp8_action = QAction("&AutoFP8", self)
+        autofp8_action.triggered.connect(self.show_autofp8_window)
+        tools_menu.addAction(autofp8_action)
+
         # Content widget
         content_widget = QWidget()
         content_layout = QHBoxLayout(content_widget)
@@ -1010,6 +1016,91 @@ def browse_hf_outfile(self) -> None:
         if outfile:
             self.hf_outfile.setText(os.path.abspath(outfile))
 
+    def quantize_to_fp8_dynamic(self, model_dir: str, output_dir: str) -> None:
+        self.logger.info(f"Quantizing {os.path.basename(model_dir)} to {output_dir}")
+        try:
+            command = [
+                "python",  # interpreter is resolved from PATH
+                "src/quantize_to_fp8_dynamic.py",  # relative to the cwd passed below
+                model_dir,
+                output_dir,
+            ]
+
+            logs_path = self.logs_input.text()
+            ensure_directory(logs_path)
+
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            log_file = os.path.join(logs_path, f"autofp8_{timestamp}.log")
+
+            thread = QuantizationThread(command, os.getcwd(), log_file)
+            self.quant_threads.append(thread)
+
+            task_name = f"Quantizing {os.path.basename(model_dir)} with AutoFP8"
+            task_item = TaskListItem(task_name, log_file, show_progress_bar=False)
+            list_item = QListWidgetItem(self.task_list)
+            list_item.setSizeHint(task_item.sizeHint())
+            self.task_list.addItem(list_item)
+            self.task_list.setItemWidget(list_item, task_item)
+
+            thread.status_signal.connect(task_item.update_status)
+            thread.finished_signal.connect(
+                lambda: self.task_finished(thread, task_item)
+            )
+            thread.error_signal.connect(
+                lambda err: handle_error(self.logger, err, task_item)
+            )
+            thread.start()
+            self.logger.info("AutoFP8 quantization task started")  # success path only
+
+        except Exception as e:
+            show_error(self.logger, f"Error starting AutoFP8 quantization: {e}")
+
+    def show_autofp8_window(self) -> None:
+        dialog = QDialog(self)
+        dialog.setWindowTitle("Quantize to FP8 Dynamic")
+        dialog.setFixedWidth(500)
+        layout = QVBoxLayout()
+
+        # Input path: line edit plus a browse button that fills it from a folder picker
+        input_layout = QHBoxLayout()
+        self.fp8_input = QLineEdit()
+        input_button = QPushButton(BROWSE)
+        input_button.clicked.connect(
+            lambda: self.fp8_input.setText(
+                QFileDialog.getExistingDirectory(self, "Open Model Folder")
+            )
+        )
+        input_layout.addWidget(QLabel("Input Model:"))
+        input_layout.addWidget(self.fp8_input)
+        input_layout.addWidget(input_button)
+        layout.addLayout(input_layout)
+
+        # Output path: same layout as the input row, with its own picker title
+        output_layout = QHBoxLayout()
+        self.fp8_output = QLineEdit()
+        output_button = QPushButton(BROWSE)
+        output_button.clicked.connect(
+            lambda: self.fp8_output.setText(
+                QFileDialog.getExistingDirectory(self, "Select Output Folder")
+            )
+        )
+        output_layout.addWidget(QLabel("Output Path:"))
+        output_layout.addWidget(self.fp8_output)
+        output_layout.addWidget(output_button)
+        layout.addLayout(output_layout)
+
+        # Quantize button: reads both fields at click time and starts the task
+        quantize_button = QPushButton("Quantize")
+        quantize_button.clicked.connect(
+            lambda: self.quantize_to_fp8_dynamic(
+                self.fp8_input.text(), self.fp8_output.text()
+            )
+        )
+        layout.addWidget(quantize_button)
+
+        dialog.setLayout(layout)
+        dialog.exec()
+
     def convert_hf_to_gguf(self) -> None:
         self.logger.info(STARTING_HF_TO_GGUF_CONVERSION)
         try:
@@ -1346,10 +1437,12 @@ def quantize_model(self) -> None:
                     output_name_parts.append("rq")
 
                 # Check for KV override
+                kv_used = False  # flipped to True below when any KV override is set
                 if any(
                     entry.get_override_string() for entry in self.kv_override_entries
                 ):
                     output_name_parts.append("kv")
+                    kv_used = True
 
                 # Join all parts with underscores and add .gguf extension
                 output_name = "_".join(output_name_parts) + ".gguf"
@@ -1391,6 +1484,12 @@ def quantize_model(self) -> None:
                             model_name=model_name,
                             quant_type=quant_type,
                             output_path=output_path,
+                            quantization_parameters=[
+                                kv_used,  # If KV overrides are used
+                                self.allow_requantize.isChecked(),  # If requantize is used
+                                self.pure.isChecked(),  # If pure tensors option is used
+                                self.leave_output_tensor.isChecked(),  # If leave output tensor option is used
+                            ],
                         )
                         if override_string:
                             command.extend(["--override-kv", override_string])
diff --git a/src/KVOverrideEntry.py b/src/KVOverrideEntry.py
index 91ef165..71fc90a 100644
--- a/src/KVOverrideEntry.py
+++ b/src/KVOverrideEntry.py
@@ -1,7 +1,12 @@
+import locale
+import shutil
+
+import psutil
 from PySide6.QtWidgets import QWidget, QHBoxLayout, QLineEdit, QComboBox, QPushButton
 from PySide6.QtCore import Signal, QRegularExpression
 from PySide6.QtGui import QDoubleValidator, QIntValidator, QRegularExpressionValidator
 from datetime import datetime
+import pytz
 import time
 import os
 import socket
@@ -24,7 +29,7 @@ def __init__(self, parent=None) -> None:
         layout.addWidget(self.key_input)
 
         self.type_combo = QComboBox()
-        self.type_combo.addItems(["int", "str", "float"])
+        self.type_combo.addItems(["int", "str", "float", "u32", "i32"])
         layout.addWidget(self.type_combo)
 
         self.value_input = QLineEdit()
@@ -46,7 +51,11 @@ def delete_clicked(self) -> None:
         self.deleted.emit(self)
 
     def get_override_string(
-        self, model_name=None, quant_type=None, output_path=None
+        self,
+        model_name=None,
+        quant_type=None,
+        output_path=None,
+        quantization_parameters=None,
     ) -> str:  # Add arguments
         key = self.key_input.text()
         type_ = self.type_combo.currentText()
@@ -61,7 +70,14 @@ def get_override_string(
             "{system.hostname}": lambda: socket.gethostname(),
             "{system.platform}": lambda: platform.system(),
             "{system.python.version}": lambda: platform.python_version(),
-            "{system.date}": lambda: datetime.now().strftime("%Y-%m-%d"),
+            "{system.timezone}": lambda: time.strftime("%Z"),
+            "{system.cpus}": lambda: str(os.cpu_count()),
+            "{system.memory.total}": lambda: str(psutil.virtual_memory().total),
+            "{system.memory.free}": lambda: str(psutil.virtual_memory().free),
+            "{system.filesystem.used}": lambda: str(shutil.disk_usage("/").used),
+            "{system.kernel.version}": lambda: platform.release(),
+            "{system.locale}": lambda: locale.getdefaultlocale()[0] or "Unknown",
+            "{process.nice}": lambda: str(int(psutil.Process().nice())),
             "{model.name}": lambda: (
                 model_name if model_name is not None else "Unknown Model"
             ),
@@ -71,6 +87,21 @@ def get_override_string(
             "{output.path}": lambda: (
                 output_path if output_path is not None else "Unknown Output Path"
             ),
+            "{quant.kv}": lambda: str(
+                quantization_parameters[0]  # caller order: kv, requantize, pure, leave_output
+                if quantization_parameters is not None
+                else False
+            ),
+            "{quant.requantized}": lambda: str(
+                quantization_parameters[1]
+                if quantization_parameters is not None
+                else False
+            ),
+            "{quant.leave_output_tensor}": lambda: str(
+                quantization_parameters[3]  # index 2 holds the "pure" flag
+                if quantization_parameters is not None
+                else False
+            ),
         }
 
         for param, func in dynamic_params.items():
diff --git a/src/AutoFP8.py b/src/quantize_to_fp8_dynamic.py
similarity index 98%
rename from src/AutoFP8.py
rename to src/quantize_to_fp8_dynamic.py
index 8d876e6..44dde54 100644
--- a/src/AutoFP8.py
+++ b/src/quantize_to_fp8_dynamic.py
@@ -1,6 +1,7 @@
 import copy
 import gc
 import re
+import sys
 from typing import List
 from typing import Optional, Tuple
 
@@ -280,12 +281,11 @@ def _prepare_calibration_data(calibration_tokens):
                 _prepare_calibration_data(calibration_tokens),
             )
 
-    def save_quantized(self, save_dir, logger):
+    def save_quantized(self, save_dir):
         save_quantized_model(
             self.model,
             quant_config=self.quantize_config,
             save_dir=save_dir,
-            logger=logger,
         )
 
 
@@ -489,10 +489,9 @@ def save_quantized_model(
     model: AutoModelForCausalLM,
     quant_config: BaseQuantizeConfig,
     save_dir: str,
-    logger: Logger,
 ):
-    logger.info(model)
-    logger.info(f"Saving the model to {save_dir}")
+    print(model)
+    print(f"Saving the model to {save_dir}")
     static_q_dict = {
         "quantization_config": {
             "quant_method": "fp8",
@@ -544,10 +543,8 @@ def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List
     return kv_cache_quant_layers
 
 
-def quantize_to_fp8_dynamic(
-    input_model_dir: str, output_model_dir: str, logger: Logger
-) -> None:
-    logger.info("Starting fp8 dynamic quantization")
+def quantize_to_fp8_dynamic(input_model_dir: str, output_model_dir: str) -> None:
+    print("Starting fp8 dynamic quantization")
     # Define quantization config with static activation scales
     quantize_config = BaseQuantizeConfig(
         quant_method="fp8", activation_scheme="dynamic"
@@ -557,4 +554,8 @@ def quantize_to_fp8_dynamic(
     model = AutoFP8ForCausalLM.from_pretrained(input_model_dir, quantize_config)
     # No examples for dynamic quantization
     model.quantize([])
-    model.save_quantized(output_model_dir, logger)
+    model.save_quantized(output_model_dir)
+
+
+if __name__ == "__main__":
+    quantize_to_fp8_dynamic(sys.argv[1], sys.argv[2])  # argv[0] is the script path