Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster packing #1172

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ htmlcov/*
junit.xml
coverage.xml
.pytest_cache/
tests/util/fastpack_*.txt

# Build and docs folder/files
build/*
Expand Down
65 changes: 60 additions & 5 deletions src/finn/util/data_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,54 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import binascii

# Import the faster packing functions. This is executed when loading the module
# so that the faster version is always available when this is imported
import ctypes as ct
import numpy as np
import os
import sys
import threading
from bitstring import BitArray
from math import ceil
from numpy.ctypeslib import ndpointer
from qonnx.core.datatype import DataType
from qonnx.util.basic import roundup_to_integer_multiple


def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False):
# Setup: resolve the C source shipped with FINN and the path the compiled
# shared object will be written to (both live next to this module)
fastpack_source = os.path.abspath(os.path.join(os.path.dirname(__file__), "fast_pack.c"))
fastpack_lib = os.path.abspath(os.path.join(os.path.dirname(__file__), "fastpack.so"))
# Lazily populated ctypes handle to the compiled library; guarded by the lock
# below so concurrent first callers of get_fastpack() compile/load only once
__fastpack_so = None
__fastpack_load_lock = threading.Lock()
assert os.path.isfile(fastpack_source), "Could not find fast_pack.c in the utils/ dir of FINN"


# Singleton setup to safely load this module in multithreading contexts
def get_fastpack():
    """Compile (once, on first use) and load the fast_pack C helper library.

    Returns the ``ctypes.CDLL`` handle with the argument/return types of
    ``array_to_hexstring_binary`` already configured. Thread-safe: a
    module-level lock guards the compile-and-load step so concurrent callers
    share a single handle (singleton pattern).

    Raises RuntimeError when compilation fails.
    """
    # Only __fastpack_so is rebound here; the lock is merely read
    global __fastpack_so
    with __fastpack_load_lock:
        if __fastpack_so is None:
            # Always rebuild from source so a stale .so is never loaded
            if os.path.isfile(fastpack_lib):
                os.remove(fastpack_lib)
            result = subprocess.run(
                ["gcc", "-shared", "-O3", "-fpic", fastpack_source, "-o", fastpack_lib],
                capture_output=True,
                text=True,
            )
            # Check the compiler's exit code instead of only probing for the
            # output file, so the gcc error message is surfaced to the user
            if result.returncode != 0 or not os.path.isfile(fastpack_lib):
                raise RuntimeError(
                    f"Compilation of {fastpack_source} failed: {result.stderr}"
                )

            # Load the library and declare the C signature:
            # bool array_to_hexstring_binary(float*, unsigned, unsigned, char*)
            fastpack = ct.CDLL(fastpack_lib)
            fastpack_floatarray = ndpointer(ct.c_float, flags="C_CONTIGUOUS")
            fastpack.array_to_hexstring_binary.argtypes = (
                fastpack_floatarray,
                ct.c_uint,
                ct.c_uint,
                ct.c_char_p,
            )
            fastpack.array_to_hexstring_binary.restype = ct.c_bool
            __fastpack_so = fastpack
    return __fastpack_so


def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False, use_fastpack=True):
"""
Pack given one-dimensional NumPy array with FINN DataType dtype into a hex
string.
Expand All @@ -45,6 +84,8 @@ def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False):
fixed width. The minimum value for pad_to_nbits is 4, since a single hex
digit is four bits. reverse can be used to reverse the array prior to
packing.
When use_fastpack is set to true, if available the function is outsourced
to a faster C implementation for some cases.

Examples:

Expand All @@ -71,6 +112,17 @@ def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False):
# reverse prior to packing, if desired
if reverse:
array = np.flip(array, -1)

# Check if the fast way can be taken
# TODO: Expand this to cover more cases
if use_fastpack and dtype == DataType["BINARY"]:
output_string = ct.create_string_buffer(ceil(pad_to_nbits / 4) + 4)
success = get_fastpack().array_to_hexstring_binary(
np.asarray(array, order="C"), array.size, pad_to_nbits, output_string
)
assert success, f"Could not convert array {array} with datatype {dtype} to hexstring!"
return prefix + output_string.value.decode("utf-8")

lineval = BitArray(length=0)
bw = dtype.bitwidth()
# special handling for fixed point: rescale, then pack as integers
Expand Down Expand Up @@ -124,10 +176,11 @@ def npbytearray2hexstring(npbytearray, prefix="0x"):


def pack_innermost_dim_as_hex_string(
ndarray, dtype, pad_to_nbits, reverse_inner=False, prefix="0x"
ndarray, dtype, pad_to_nbits, reverse_inner=False, prefix="0x", use_fastpack=True
):
"""Pack the innermost dimension of the given numpy ndarray into hex
strings using array2hexstring.
strings using array2hexstring. If use_fastpack is enabled this tries to speed
up the conversion

Examples:

Expand All @@ -149,7 +202,9 @@ def pack_innermost_dim_as_hex_string(
ndarray = np.asarray(ndarray, dtype=np.float32)

def fun(x):
return array2hexstring(x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix)
return array2hexstring(
x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix, use_fastpack=use_fastpack
)

return np.apply_along_axis(fun, ndarray.ndim - 1, ndarray)

Expand Down
61 changes: 61 additions & 0 deletions src/finn/util/fast_pack.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>

/***
* Takes a numpy array of floats in BINARY datatype from finn and the number of elements in that array, as well as the number of padded bits required.
* It also takes an out-string buffer to write the results to. This buffer is created by python via ctypes.create_string_buffer() and must be large enough to
* hold the required number of padded bits.
*
* The function returns false on an error and true in case of success
*/
/***
 * Pack a FINN BINARY-datatype float array (each element 0.0f or 1.0f) into a
 * zero-padded, lower-case hexadecimal string.
 *
 * values      - pointer to `elements` floats, each 0.0f or 1.0f; values[0]
 *               becomes the most significant bit of the result
 * elements    - number of entries in `values`
 * padded_bits - total bit width of the result; must be a multiple of 4 and at
 *               least `elements` rounded up to the next multiple of 4
 * out         - caller-allocated buffer (created in python via
 *               ctypes.create_string_buffer()) large enough for
 *               padded_bits / 4 hex digits plus the terminating '\0'
 *
 * Returns false when padded_bits is too small or not a multiple of 4, and
 * true in case of success.
 */
bool array_to_hexstring_binary(float* values, unsigned int elements, unsigned int padded_bits, char* out) {
    // Minimum number of bits, rounded up to a whole number of hex digits
    // (one hex digit covers four bits)
    unsigned int min_bits = (elements + 3u) & ~3u;

    // The requested width must cover the data and map cleanly onto hex digits
    if (min_bits > padded_bits || padded_bits % 4 != 0) {
        return false;
    }

    // Write the leading zero padding and terminate after the payload digits
    unsigned int prefix_digits = (padded_bits - min_bits) / 4;
    memset(out, '0', prefix_digits);
    out[prefix_digits + min_bits / 4] = '\0';

    // Walk the array from its least significant (last) element, collecting
    // four bits at a time into one hex digit and filling `out` right-to-left.
    // Cast to int first so an empty array yields index == -1, not UINT_MAX.
    unsigned int nibble = 0;
    unsigned int digits = 0;
    unsigned int shift = 0;
    for (int index = (int) elements - 1; index >= 0; index--) {
        // Merge the next bit into the current nibble
        nibble |= (((unsigned int) values[index]) << shift);

        // Emit a digit once 4 bits are collected or the input is exhausted
        if (shift == 3 || index == 0) {
            char letter;
            if (nibble <= 9) {
                letter = (char) ('0' + nibble);
            } else {
                letter = (char) ('a' + nibble - 10);
            }
            out[prefix_digits + min_bits / 4 - digits - 1] = letter;
            digits++;
            nibble = 0;
            shift = 0;
        } else {
            shift++;
        }
    }
    return true;
}
70 changes: 69 additions & 1 deletion tests/util/test_data_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,16 @@
import os
import shutil
import subprocess
import time
from qonnx.core.datatype import DataType
from qonnx.util.basic import gen_finn_dt_tensor

from finn.util.basic import make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, numpy_to_hls_code
from finn.util.data_packing import (
npy_to_rtlsim_input,
numpy_to_hls_code,
pack_innermost_dim_as_hex_string,
)


@pytest.mark.util
Expand Down Expand Up @@ -180,3 +185,66 @@ def test_npy_to_rtlsim_input(dtype):

assert all([(x >> dtype.bitwidth()) == 0 for x in output_fast]), "extraneous bits detected"
assert np.all(output_fast == output_slow_split), "different behavior of packing modes detected"


@pytest.mark.util
@pytest.mark.parametrize("tensorshape", [(1, 2, 16384, 64), (1, 1024, 2048)])
def test_pack_innermost_dim_to_hexstring_fast(tensorshape: tuple[int, ...]):
    """Check that the sped-up C call in pack_innermost_dim_as_hex_string()
    matches the pure-Python implementation, and record a timing benchmark."""
    tensor_count = 5
    assert tensorshape[-1] % 4 == 0, "Smallest tensorshape dimension must be divisible by 4"

    # Create random binary tensors by simply rounding random tensors
    tensors = [
        np.round(np.random.random(tensorshape)).astype(np.float32) for _ in range(tensor_count)
    ]

    def _pack_all(use_fastpack):
        # Pack every tensor once; returns (results, elapsed seconds).
        # perf_counter is monotonic and suited for benchmarking, unlike time()
        start = time.perf_counter()
        results = [
            pack_innermost_dim_as_hex_string(
                tensor,
                DataType["BINARY"],
                tensorshape[-1] * 2,
                reverse_inner=False,
                prefix="0x",
                use_fastpack=use_fastpack,
            )
            for tensor in tensors
        ]
        return results, time.perf_counter() - start

    # Run the C implementation and the Python reference over identical inputs
    results_c, c_time = _pack_all(True)
    results_python, python_time = _pack_all(False)

    assert np.array_equal(np.array(results_python), np.array(results_c))

    # Write timing results next to this test module (covered by the
    # tests/util/fastpack_*.txt .gitignore entry)
    benchmark_file = os.path.join(
        os.path.dirname(__file__),
        "fastpack_benchmark" + "_".join(map(str, tensorshape)) + ".txt",
    )
    with open(benchmark_file, "w") as f:
        f.write("Pack_innermost_dim_to_hexstring benchmark test results\n")
        f.write("Shape: " + str(tensorshape) + "\n")
        f.write(f"Ran {tensor_count} times\n")
        f.write(
            f"Python: {python_time}s overall | {python_time / tensor_count}s on avg. per sample\n"
        )
        f.write(f"C: {c_time}s overall | {c_time / tensor_count}s on avg. per sample\n")
Loading