Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster packing #1172

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ htmlcov/*
junit.xml
coverage.xml
.pytest_cache/
tests/util/fastpack_*.txt

# Build and docs folder/files
build/*
Expand Down
65 changes: 60 additions & 5 deletions src/finn/util/data_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,54 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import binascii

# Import the faster packing functions. This is executed when loading the module
# so that the faster version is always available when this is imported
import ctypes as ct
import numpy as np
import os
import sys
import threading
from bitstring import BitArray
from math import ceil
from numpy.ctypeslib import ndpointer
from qonnx.core.datatype import DataType
from qonnx.util.basic import roundup_to_integer_multiple


def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False):
# Setup: resolve the C source shipped with FINN and the path the compiled
# shared object will be written to (both live next to this module)
fastpack_source = os.path.abspath(os.path.join(os.path.dirname(__file__), "fast_pack.c"))
fastpack_lib = os.path.abspath(os.path.join(os.path.dirname(__file__), "fastpack.so"))
# Lazily populated ctypes handle to the compiled library; guarded by the lock
# below so concurrent first callers of get_fastpack() compile/load only once
__fastpack_so = None
__fastpack_load_lock = threading.Lock()
assert os.path.isfile(fastpack_source), "Could not find fast_pack.c in the utils/ dir of FINN"


# Singleton setup to safely load this module in multithreading contexts
def get_fastpack():
    """Compile (once, on first use) and load the fast_pack C helper library.

    Returns the ``ctypes.CDLL`` handle with the argument/return types of
    ``array_to_hexstring_binary`` already configured. Thread-safe: a
    module-level lock guards the compile-and-load step so concurrent callers
    share a single handle (singleton pattern).

    Raises RuntimeError when compilation fails.
    """
    # Only __fastpack_so is rebound here; the lock is merely read
    global __fastpack_so
    with __fastpack_load_lock:
        if __fastpack_so is None:
            # Always rebuild from source so a stale .so is never loaded
            if os.path.isfile(fastpack_lib):
                os.remove(fastpack_lib)
            result = subprocess.run(
                ["gcc", "-shared", "-O3", "-fpic", fastpack_source, "-o", fastpack_lib],
                capture_output=True,
                text=True,
            )
            # Check the compiler's exit code instead of only probing for the
            # output file, so the gcc error message is surfaced to the user
            if result.returncode != 0 or not os.path.isfile(fastpack_lib):
                raise RuntimeError(
                    f"Compilation of {fastpack_source} failed: {result.stderr}"
                )

            # Load the library and declare the C signature:
            # bool array_to_hexstring_binary(float*, unsigned, unsigned, char*)
            fastpack = ct.CDLL(fastpack_lib)
            fastpack_floatarray = ndpointer(ct.c_float, flags="C_CONTIGUOUS")
            fastpack.array_to_hexstring_binary.argtypes = (
                fastpack_floatarray,
                ct.c_uint,
                ct.c_uint,
                ct.c_char_p,
            )
            fastpack.array_to_hexstring_binary.restype = ct.c_bool
            __fastpack_so = fastpack
    return __fastpack_so


def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False, use_fastpack=True):
"""
Pack given one-dimensional NumPy array with FINN DataType dtype into a hex
string.
Expand All @@ -45,6 +84,8 @@ def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False):
fixed width. The minimum value for pad_to_nbits is 4, since a single hex
digit is four bits. reverse can be used to reverse the array prior to
packing.
When use_fastpack is set to true, if available the function is outsourced
to a faster C implementation for some cases.

Examples:

Expand All @@ -71,6 +112,17 @@ def array2hexstring(array, dtype, pad_to_nbits, prefix="0x", reverse=False):
# reverse prior to packing, if desired
if reverse:
array = np.flip(array, -1)

# Check if the fast way can be taken
# TODO: Expand this to cover more cases
if use_fastpack and dtype == DataType["BINARY"]:
output_string = ct.create_string_buffer(ceil(pad_to_nbits / 4) + 4)
success = get_fastpack().array_to_hexstring_binary(
np.asarray(array, order="C"), array.size, pad_to_nbits, output_string
)
assert success, f"Could not convert array {array} with datatype {dtype} to hexstring!"
return prefix + output_string.value.decode("utf-8")

lineval = BitArray(length=0)
bw = dtype.bitwidth()
# special handling for fixed point: rescale, then pack as integers
Expand Down Expand Up @@ -124,10 +176,11 @@ def npbytearray2hexstring(npbytearray, prefix="0x"):


def pack_innermost_dim_as_hex_string(
ndarray, dtype, pad_to_nbits, reverse_inner=False, prefix="0x"
ndarray, dtype, pad_to_nbits, reverse_inner=False, prefix="0x", use_fastpack=True
):
"""Pack the innermost dimension of the given numpy ndarray into hex
strings using array2hexstring.
strings using array2hexstring. If use_fastpack is enabled this tries to speed
up the conversion

Examples:

Expand All @@ -149,7 +202,9 @@ def pack_innermost_dim_as_hex_string(
ndarray = np.asarray(ndarray, dtype=np.float32)

def fun(x):
return array2hexstring(x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix)
return array2hexstring(
x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix, use_fastpack=use_fastpack
)

return np.apply_along_axis(fun, ndarray.ndim - 1, ndarray)

Expand Down
61 changes: 61 additions & 0 deletions src/finn/util/fast_pack.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>

/***
* Takes a numpy array of floats in BINARY datatype from finn and the number of elements in that array, as well as the number of padded bits required.
* It also takes an out-string buffer to write the results to. This buffer is created by python via ctypes.create_string_buffer() and must be large enough to
* hold the required number of padded bits.
*
* The function returns false on an error and true in case of success
*/
/***
 * Pack a FINN BINARY-datatype float array (each element 0.0f or 1.0f) into a
 * zero-padded, lower-case hexadecimal string.
 *
 * values      - pointer to `elements` floats, each 0.0f or 1.0f; values[0]
 *               becomes the most significant bit of the result
 * elements    - number of entries in `values`
 * padded_bits - total bit width of the result; must be a multiple of 4 and at
 *               least `elements` rounded up to the next multiple of 4
 * out         - caller-allocated buffer (created in python via
 *               ctypes.create_string_buffer()) large enough for
 *               padded_bits / 4 hex digits plus the terminating '\0'
 *
 * Returns false when padded_bits is too small or not a multiple of 4, and
 * true in case of success.
 */
bool array_to_hexstring_binary(float* values, unsigned int elements, unsigned int padded_bits, char* out) {
    // Minimum number of bits, rounded up to a whole number of hex digits
    // (one hex digit covers four bits)
    unsigned int min_bits = (elements + 3u) & ~3u;

    // The requested width must cover the data and map cleanly onto hex digits
    if (min_bits > padded_bits || padded_bits % 4 != 0) {
        return false;
    }

    // Write the leading zero padding and terminate after the payload digits
    unsigned int prefix_digits = (padded_bits - min_bits) / 4;
    memset(out, '0', prefix_digits);
    out[prefix_digits + min_bits / 4] = '\0';

    // Walk the array from its least significant (last) element, collecting
    // four bits at a time into one hex digit and filling `out` right-to-left.
    // Cast to int first so an empty array yields index == -1, not UINT_MAX.
    unsigned int nibble = 0;
    unsigned int digits = 0;
    unsigned int shift = 0;
    for (int index = (int) elements - 1; index >= 0; index--) {
        // Merge the next bit into the current nibble
        nibble |= (((unsigned int) values[index]) << shift);

        // Emit a digit once 4 bits are collected or the input is exhausted
        if (shift == 3 || index == 0) {
            char letter;
            if (nibble <= 9) {
                letter = (char) ('0' + nibble);
            } else {
                letter = (char) ('a' + nibble - 10);
            }
            out[prefix_digits + min_bits / 4 - digits - 1] = letter;
            digits++;
            nibble = 0;
            shift = 0;
        } else {
            shift++;
        }
    }
    return true;
}
70 changes: 69 additions & 1 deletion tests/util/test_data_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,16 @@
import os
import shutil
import subprocess
import time
from qonnx.core.datatype import DataType
from qonnx.util.basic import gen_finn_dt_tensor

from finn.util.basic import make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, numpy_to_hls_code
from finn.util.data_packing import (
npy_to_rtlsim_input,
numpy_to_hls_code,
pack_innermost_dim_as_hex_string,
)


@pytest.mark.util
Expand Down Expand Up @@ -180,3 +185,66 @@ def test_npy_to_rtlsim_input(dtype):

assert all([(x >> dtype.bitwidth()) == 0 for x in output_fast]), "extraneous bits detected"
assert np.all(output_fast == output_slow_split), "different behavior of packing modes detected"


@pytest.mark.util
@pytest.mark.parametrize("tensorshape", [(1, 2, 16384, 64), (1, 1024, 2048)])
def test_pack_innermost_dim_to_hexstring_fast(tensorshape: tuple[int, ...]):
    """Check that the sped-up C call in pack_innermost_dim_as_hex_string()
    matches the pure-Python implementation, and record a timing benchmark."""
    tensor_count = 5
    assert tensorshape[-1] % 4 == 0, "Smallest tensorshape dimension must be divisible by 4"

    # Create random binary tensors by simply rounding random tensors
    tensors = [
        np.round(np.random.random(tensorshape)).astype(np.float32) for _ in range(tensor_count)
    ]

    def _pack_all(use_fastpack):
        # Pack every tensor once; returns (results, elapsed seconds).
        # perf_counter is monotonic and suited for benchmarking, unlike time()
        start = time.perf_counter()
        results = [
            pack_innermost_dim_as_hex_string(
                tensor,
                DataType["BINARY"],
                tensorshape[-1] * 2,
                reverse_inner=False,
                prefix="0x",
                use_fastpack=use_fastpack,
            )
            for tensor in tensors
        ]
        return results, time.perf_counter() - start

    # Run the C implementation and the Python reference over identical inputs
    results_c, c_time = _pack_all(True)
    results_python, python_time = _pack_all(False)

    assert np.array_equal(np.array(results_python), np.array(results_c))

    # Write timing results next to this test module (covered by the
    # tests/util/fastpack_*.txt .gitignore entry)
    benchmark_file = os.path.join(
        os.path.dirname(__file__),
        "fastpack_benchmark" + "_".join(map(str, tensorshape)) + ".txt",
    )
    with open(benchmark_file, "w") as f:
        f.write("Pack_innermost_dim_to_hexstring benchmark test results\n")
        f.write("Shape: " + str(tensorshape) + "\n")
        f.write(f"Ran {tensor_count} times\n")
        f.write(
            f"Python: {python_time}s overall | {python_time / tensor_count}s on avg. per sample\n"
        )
        f.write(f"C: {c_time}s overall | {c_time / tensor_count}s on avg. per sample\n")
Loading