[pull] main from mlc-ai:main #299

Merged 2 commits on Oct 18, 2024
77 changes: 72 additions & 5 deletions docs/compilation/configure_quantization.rst
@@ -1,5 +1,5 @@
🚧 Configure Quantization
=========================
Configure Quantization
======================

Quantization Algorithm
----------------------
@@ -11,12 +11,79 @@ The default quantization algorithm used in MLC-LLM is grouping quantization meth
Quantization Mode
-----------------

In MLC-LLM we use a short code that indicates the quantization mode to use.
In MLC-LLM we use a short code that indicates the quantization mode to use. MLC-LLM supports both
weight-only quantization and weight-activation quantization.

The format of the code is ``qAfB(_id)``, where ``A`` represents the number
For weight-only quantization, the format of the code is ``qAfB(_id)``, where ``A`` represents the number
of bits for storing weights and ``B`` represents the number of bits for storing activations.
The ``_id`` is an integer identifier to distinguish different quantization algorithms (e.g. symmetric, non-symmetric, AWQ, etc).

Currently, available options are: ``q0f16``, ``q0f32``, ``q3f16_1``, ``q4f16_1``, ``q4f32_1``, and ``q4f16_awq`` (not stable).

More details to come.
For weight-activation quantization, MLC-LLM currently supports FP8 quantization on CUDA.
The available options are: ``e4m3_e4m3_f16`` and ``e5m2_e5m2_f16``. In these modes, both weights and activations are quantized to FP8 format.
The output of each layer is kept in higher precision (FP16) and then requantized to FP8.
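
For example, a weight-only ``q4f16_1`` build goes through the standard ``gen_config`` /
``convert_weights`` / ``compile`` workflow (a minimal sketch; ``<model-path>``, ``<config-path>``,
and ``<output-path>`` are placeholders):

.. code-block:: bash

# Generate the chat configuration with the chosen quantization mode
mlc_llm gen_config \
<model-path> \
--quantization q4f16_1 \
--output <output-path>

# Quantize and convert the weights
mlc_llm convert_weights \
<model-path> \
--quantization q4f16_1 \
--output <output-path>

# Compile the model library
mlc_llm compile \
<config-path> \
--output <output-path>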

.. _calibration:

Calibration
-----------

For ``e4m3_e4m3_f16`` quantization, we need to calibrate the quantization parameters for the activations.
The calibration process consists of the following steps:

1. Compile the calibration model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

We use the same compilation workflow to compile the model in calibration mode.
The only difference is that we need to specify the quantization mode as ``e4m3_e4m3_f16_max_calibrate``.

.. code-block:: bash

mlc_llm gen_config \
<model-path> \
--quantization e4m3_e4m3_f16_max_calibrate \
--output <output-path>

mlc_llm convert_weights \
<model-path> \
--quantization e4m3_e4m3_f16_max_calibrate \
--output <output-path>

mlc_llm compile \
<config-path> \
--output <output-path>
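
To make the wiring of these commands concrete, here is a hypothetical run (the model name,
directory names, and the ``mlc-chat-config.json`` path are illustrative assumptions; additional
flags such as ``--conv-template`` may be needed depending on the model):

.. code-block:: bash

# Hypothetical example: prepare a Llama-2 7B checkpoint for calibration
mlc_llm gen_config \
./dist/models/Llama-2-7b-chat-hf \
--quantization e4m3_e4m3_f16_max_calibrate \
--output ./dist/llama2-e4m3-calibrate

mlc_llm convert_weights \
./dist/models/Llama-2-7b-chat-hf \
--quantization e4m3_e4m3_f16_max_calibrate \
--output ./dist/llama2-e4m3-calibrate

# <config-path> refers to the configuration generated by gen_config above
mlc_llm compile \
./dist/llama2-e4m3-calibrate/mlc-chat-config.json \
--output ./dist/llama2-e4m3-calibrate/model-calibrate.so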

2. Run the calibration model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

We run the calibration model on a dataset such as ShareGPT to collect statistics of the
activations. The calibration run updates the quantization parameters in the weights file
in-place. We turn off CUDA graph because it is not yet supported in the calibration process.

.. code-block:: bash

mlc_llm calibrate \
<model-path> \
--model-lib <model-lib-path> \
--dataset <dataset-path> \
--num-calibration-samples <num-samples> \
--opt "cudagraph=0"
--output <output-path>
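
Continuing the hypothetical example above (the dataset path and sample count are illustrative):

.. code-block:: bash

mlc_llm calibrate \
./dist/llama2-e4m3-calibrate \
--model-lib ./dist/llama2-e4m3-calibrate/model-calibrate.so \
--dataset ./data/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-calibration-samples 512 \
--opt "cudagraph=0" \
--output ./dist/llama2-e4m3-calibrate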

3. Compile the quantized model for inference.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

After the calibration process, we can compile the model for inference. In this step, we only need
to generate the configuration file using the desired quantization format and compile the model.
Weights are already quantized and calibrated in the previous steps and do not need to be converted again.

.. code-block:: bash

mlc_llm gen_config \
<model-path> \
--quantization e4m3_e4m3_f16 \
--output <output-path>

mlc_llm compile \
<config-path> \
--output <output-path>
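
Continuing the hypothetical example, the final inference build could look like the following,
regenerating the configuration (now with ``e4m3_e4m3_f16``) in the directory that already holds
the calibrated weights (paths are illustrative assumptions):

.. code-block:: bash

mlc_llm gen_config \
./dist/models/Llama-2-7b-chat-hf \
--quantization e4m3_e4m3_f16 \
--output ./dist/llama2-e4m3-calibrate

mlc_llm compile \
./dist/llama2-e4m3-calibrate/mlc-chat-config.json \
--output ./dist/llama2-e4m3-calibrate/model.so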
1 change: 1 addition & 0 deletions docs/index.rst
@@ -52,6 +52,7 @@ Check out :ref:`introduction-to-mlc-llm` for the introduction and tutorial of a
compilation/compile_models.rst
compilation/package_libraries_and_weights.rst
compilation/define_new_models.rst
compilation/configure_quantization.rst

.. toctree::
:maxdepth: 1
8 changes: 5 additions & 3 deletions python/mlc_llm/model/llava/llava_model.py
@@ -155,9 +155,12 @@ def embed(self, input_ids: Tensor) -> Tensor:
return self.language_model.embed(input_ids)

def image_preprocess(self, pixel_values: Tensor) -> Tensor:
# pixel_values shape is NHWC
pixel_values = permute_dims(pixel_values, axes=(0, 3, 1, 2)) # NHWC -> NCHW
pixel_values = self.image_processor.resize(
pixel_values, {"shortest_edge": self.config.vision_config.image_size}
pixel_values,
{
"shortest_edge": self.config.vision_config.image_size,
},
)
pixel_values = self.image_processor.crop(
pixel_values,
@@ -168,7 +171,6 @@ def image_preprocess(self, pixel_values: Tensor) -> Tensor:
)
pixel_values = self.image_processor.rescale(pixel_values)
pixel_values = self.image_processor.normalize(pixel_values)
pixel_values = permute_dims(pixel_values, axes=(0, 3, 1, 2)) # NHWC -> NCHW
return pixel_values

def image_embed(self, pixel_values: Tensor) -> Tensor:
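
Note on the llava change above: the NHWC -> NCHW permute now happens once at the top of
``image_preprocess``, so ``resize``, ``crop``, ``rescale``, and ``normalize`` all operate on NCHW
tensors. A numpy sketch of the same axis permutation (illustration only, not the MLC code path):

import numpy as np

# A batch of 2 RGB images in NHWC layout, as typically produced by image decoding
nhwc = np.zeros((2, 336, 336, 3), dtype=np.float16)

# axes=(0, 3, 1, 2) moves the channel axis ahead of height/width: NHWC -> NCHW
nchw = nhwc.transpose(0, 3, 1, 2)
assert nchw.shape == (2, 3, 336, 336)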
9 changes: 4 additions & 5 deletions python/mlc_llm/model/phi3v/phi3v_model.py
@@ -219,6 +219,7 @@ def embed(self, input_ids: Tensor):

# pylint: disable=protected-access
def image_preprocess(self, pixel_values: Tensor, num_crops=16) -> Tensor:
pixel_values = op.permute_dims(pixel_values, axes=(0, 3, 1, 2)) # NHWC -> NCHW
pixel_values = self.image_processor.resize(pixel_values, params={"hd_transform": 336})
new_h = tir.Var("new_h", "int64")
new_w = tir.Var("new_w", "int64")
@@ -228,7 +229,7 @@ def image_preprocess(self, pixel_values: Tensor, num_crops=16) -> Tensor:
.match_cast(
pixel_values._expr,
relax.TensorStructInfo(
[pixel_values.shape[0], new_h, new_w, pixel_values.shape[3]], pixel_values.dtype
[pixel_values.shape[0], pixel_values.shape[1], new_h, new_w], pixel_values.dtype
),
),
"pixel_values",
@@ -246,16 +247,14 @@ def image_preprocess(self, pixel_values: Tensor, num_crops=16) -> Tensor:
.match_cast(
global_image._expr,
relax.TensorStructInfo(
[global_image.shape[0], 336, 336, global_image.shape[3]], global_image.dtype
[global_image.shape[0], global_image.shape[1], 336, 336], global_image.dtype
),
),
"global_image",
)

global_image = op.permute_dims(global_image, axes=(0, 3, 1, 2))
n, h, w, c = pixel_values.shape # pylint: disable=unused-variable
n, c, h, w = pixel_values.shape # pylint: disable=unused-variable
assert isinstance(h, tir.Mul) and isinstance(h.b, tir.IntImm) and h.b.value == 336
pixel_values = op.permute_dims(pixel_values, axes=(0, 3, 1, 2)) # NHWC -> NCHW
pixel_values = op.reshape(pixel_values, shape=(1, 3, h.a, 336, w // 336, 336))
pixel_values = op.permute_dims(pixel_values, axes=(0, 2, 4, 1, 3, 5))
pixel_values = op.reshape(pixel_values, shape=(-1, 3, 336, 336))
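
Note on the phi3v change above: after the up-front NHWC -> NCHW permute, the final
reshape/permute/reshape sequence splits the HD-transformed image into a batch of 336x336 crops.
A numpy sketch of the same tensor manipulation (illustrative, not the MLC implementation):

import numpy as np

# Hypothetical HD-transformed image: 1 image, 3 channels, a 2 x 3 grid of 336-pixel tiles
hy, wx = 2, 3
img = np.zeros((1, 3, hy * 336, wx * 336), dtype=np.float32)

# Split H and W into (tiles, 336) pairs, bring the tile axes forward,
# then flatten the tile grid into the batch dimension
tiles = img.reshape(1, 3, hy, 336, wx, 336)
tiles = tiles.transpose(0, 2, 4, 1, 3, 5)   # (1, hy, wx, 3, 336, 336)
crops = tiles.reshape(-1, 3, 336, 336)      # (hy * wx, 3, 336, 336)
assert crops.shape == (hy * wx, 3, 336, 336)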