diff --git a/fetch-repos.sh b/fetch-repos.sh
index 451d8c1dae..57e14bc291 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -33,7 +33,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
 BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="d56b1d0c1eeb844a873fb29a29240a86e00d9f80"
+HLSLIB_COMMIT="16cfc4b3ab895babf30f7db7c4bcac27d68317a9"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
index 8931d818d4..f1e7d7aca8 100644
--- a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -89,7 +89,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         weight_tensor = self.get_hw_compatible_weight_tensor(weights)
         export_wdt = self.get_weight_datatype()
         if weight_file_mode == "hls_header":
-            weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", True, True)
+            weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", False, True)
             # write weights into C++ header file as dictated by finn-hlslib
             f_weights = open(weight_file_name, "w")
             f_weights.write(
@@ -118,19 +118,21 @@ def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
         wmem = self.calc_wmem()
         assert orig_weight_matrix.shape == (
             ofm_ch,
-            k_h * k_w * ifm_ch,
+            k_h,
+            k_w,
+            ifm_ch,
         ), """Weights matrix doesn't
-        #have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
+        have expected shape (ofm_ch, k_h, k_w, ifm_ch)"""
         assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
         assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
         # interleave rows between PEs and reshape
         # distribute rows between PEs
         ret = orig_weight_matrix
+        ret = ret.reshape(ofm_ch, k_h * k_w * ifm_ch)
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-        # create SIMD as innermost dimension and add a dummy outer dim
+        # create SIMD as innermost dimension
         ret = ret.reshape(1, pe, wmem, simd)
-        # reverse the SIMD dimension
-        ret = np.flip(ret, axis=-1)
+        ret = ret.transpose(0, 2, 1, 3)
         return ret

     def global_includes(self):
@@ -196,12 +198,27 @@ def strm_decl(self):
     def docompute(self):
         odtype = self.get_output_datatype()
         pe = self.get_nodeattr("PE")
-        ishape = self.get_normal_input_shape()
+        simd = self.get_nodeattr("SIMD")
+        i_ch = self.get_nodeattr("IFMChannels")
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        s_h, s_w = self.get_nodeattr("Stride")
+        i_h, i_w = self.get_nodeattr("IFMDim")
+        p_h, p_w = self.get_nodeattr("Padding")
+        if p_w >= k_w - s_w:
+            padup = 0
+        else:
+            padup = (k_w - p_w - 1) / s_w
+        crop = s_w * padup - ((k_w - s_w) - p_w)
+        sf = i_ch / simd
+        w_eff = padup + i_w + padup
+        wo_eff = (w_eff - 1) * s_w + k_w
         self.code_gen_dict["$DOCOMPUTE$"] = [
             "hls::stream<hls::vector<{}, {}>> strm;".format(odtype.get_hls_datatype_str(), pe)
         ]
         self.code_gen_dict["$DOCOMPUTE$"].append("unsigned timeout = 0;")
-        self.code_gen_dict["$DOCOMPUTE$"].append("while(timeout < %s) {" % np.prod(ishape))
+        self.code_gen_dict["$DOCOMPUTE$"].append(
+            "while(timeout < %s) {" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * 2 * sf + 50)
+        )
         self.code_gen_dict["$DOCOMPUTE$"].append(
             """deconv (weights, in0_{}, out_{});""".format(
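For illustration only (this snippet is not part of the patch): the reworked get_hw_compatible_weight_tensor now takes weights in (ofm_ch, k_h, k_w, ifm_ch) layout and ends with a (1, wmem, pe, simd) tensor instead of the old SIMD-flipped (1, pe, wmem, simd) layout. The NumPy sketch below restates that transform on toy sizes; the PE interleaving is re-implemented inline under the assumption that it matches interleave_matrix_outer_dim_from_partitions (row r goes to PE r % pe), and wmem is derived here from the element count rather than from calc_wmem().

import numpy as np

# toy sizes for illustration; the real values come from the node attributes
ofm_ch, ifm_ch, k_h, k_w = 4, 2, 4, 4
pe, simd = 2, 2
# wmem derived from the element count so that the reshape below is valid
wmem = (ofm_ch * k_h * k_w * ifm_ch) // (pe * simd)

w = np.arange(ofm_ch * k_h * k_w * ifm_ch).reshape(ofm_ch, k_h, k_w, ifm_ch)

# flatten each output channel's kernel into a single row
ret = w.reshape(ofm_ch, k_h * k_w * ifm_ch)
# interleave rows across PEs (row r -> PE r % pe); inline stand-in for the FINN helper
ret = ret.reshape(-1, pe, ret.shape[-1]).transpose(1, 0, 2)
# SIMD becomes the innermost dimension ...
ret = ret.reshape(1, pe, wmem, simd)
# ... then PE and wmem are swapped; the old np.flip of the SIMD axis is gone
ret = ret.transpose(0, 2, 1, 3)
assert ret.shape == (1, wmem, pe, simd)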
diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index a09e788570..26509bc738 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -41,22 +41,11 @@
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.convert_to_hw_layers import (
-    InferConvInpGen,
-    InferQuantizedMatrixVectorActivation,
-)
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import (
-    InferPixelPaddingDeconv,
-)
-from finn.transformation.fpgadataflow.minimize_accumulator_width import (
-    MinimizeAccumulatorWidth,
-)
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
-from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.util.basic import pynq_part_map

 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -64,12 +53,11 @@
 target_clk_ns = 10


-def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
+def set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
     idim_h, idim_w = idim
     stride_h, stride_w = stride
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
     odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
-    odt = DataType["INT32"]

     inp = helper.make_tensor_value_info(
         "inp",
@@ -120,10 +108,10 @@ def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):

     model = model.transform(InferShapes())

-    return model
+    return model, w_tensor


-def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
+def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor):
     idim_h, idim_w = idim
     stride_h, stride_w = stride
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
@@ -140,8 +128,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
         ],
     )
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, ofm_ch])
-
-    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch])
+    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ofm_ch, k, k, ifm_ch])

     Deconv = helper.make_node(
         "Deconvolution_hls",
@@ -154,6 +141,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
         OFMChannels=ofm_ch,
         IFMDim=idim,
         Stride=[stride_h, stride_w],
+        Padding=[padding, padding],
         PE=1,
         SIMD=1,
         inputDataType=idt.name,
@@ -180,7 +168,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
     model.set_tensor_datatype(model.graph.output[0].name, odt)
     model.set_tensor_datatype("W", wdt)

-    w_tensor = gen_finn_dt_tensor(wdt, [ifm_ch * k * k, ofm_ch])
+    w_tensor = w_tensor.transpose(1, 2, 3, 0)
     model.set_initializer("W", w_tensor)

     model = model.transform(InferShapes())
@@ -189,33 +177,37 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):


 # input image dimension
-@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
+@pytest.mark.parametrize("idim", [[8, 8]])
 # number of rows and number of cols to add
-@pytest.mark.parametrize("stride", [[2, 2], [2, 3]])
+@pytest.mark.parametrize("stride", [[2, 2]])
 # number of channels
 @pytest.mark.parametrize("ifm_ch", [2])
 # number of channels
-@pytest.mark.parametrize("ofm_ch", [4])
+@pytest.mark.parametrize("ofm_ch", [3])
 # Input parallelism
-@pytest.mark.parametrize("simd", [1, 2])
+@pytest.mark.parametrize("simd", [1])
 # PE
-@pytest.mark.parametrize("pe", [1, 2])
+@pytest.mark.parametrize("pe", [1])
 # kernel size
-@pytest.mark.parametrize("k", [2])
+@pytest.mark.parametrize("k", [4])
 # padding
-@pytest.mark.parametrize("padding", [0, 1])
+@pytest.mark.parametrize("padding", [1])
 # exec mode
-@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.parametrize("exec_mode", ["cppsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode):
-    idt = wdt = DataType["INT4"]
+    idt = wdt = DataType["INT8"]
     wdt = idt
+    odt = DataType["INT32"]
     idim_h, idim_w = idim
     stride_h, stride_w = stride
-    ref_model = set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding)
+    ref_model, w_tensor = set_up_reference_model(
+        idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding
+    )
+    model = create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor)
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
     odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
@@ -225,34 +217,31 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,

     y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"]

-    model = ref_model.transform(InferPixelPaddingDeconv())
-    model = model.transform(InferConvInpGen())
-    model = model.transform(InferQuantizedMatrixVectorActivation())
-    model = model.transform(InferShapes())
-    model = model.transform(GiveUniqueNodeNames())
-
-    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-    assert (y_produced == y_expected).all()
+    # model = model.transform(InferShapes())
+    # model = model.transform(GiveUniqueNodeNames())
+    input_tensor_nhwc = input_tensor.transpose(0, 2, 3, 1)
+    input_dict_nhwc = {"inp": input_tensor_nhwc}
+    # y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
+    # assert (y_produced == y_expected).all()

-    model = model.transform(SpecializeLayers(test_fpga_part))
-    model = model.transform(MinimizeAccumulatorWidth())
+    # model = model.transform(SpecializeLayers(test_fpga_part))
+    # model = model.transform(MinimizeAccumulatorWidth())

     for n in model.graph.node:
-        if n.op_type.startswith("ConvolutionInputGenerator"):
-            convinputgen_node = getCustomOp(n)
-            convinputgen_node.set_nodeattr("SIMD", simd)
-        elif n.op_type.startswith("MVAU"):
-            mvau_node = getCustomOp(n)
-            mvau_node.set_nodeattr("PE", pe)
-            mvau_node.set_nodeattr("SIMD", simd)
-
-    expected_oshape = (1, ofm_ch, odim_h, odim_w)
+        if n.op_type.startswith("Deconvolution_hls"):
+            deconv_node = getCustomOp(n)
+            deconv_node.set_nodeattr("PE", pe)
+            deconv_node.set_nodeattr("SIMD", simd)
+    expected_oshape = (1, odim_h, odim_w, ofm_ch)
+    # model.save("deconv.onnx")

     # cppsim
     if exec_mode == "cppsim":
+        model = model.transform(GiveUniqueNodeNames())
         model = model.transform(PrepareCppSim())
         model = model.transform(CompileCppSim())
         model = model.transform(SetExecMode("cppsim"))
+        # breakpoint()

     # rtlsim
     else:
@@ -262,12 +251,13 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,
         model = model.transform(PrepareRTLSim())
         model = model.transform(SetExecMode("rtlsim"))

-    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
     assert y_produced.shape == expected_oshape
+    y_produced = y_produced.transpose(0, 3, 1, 2)
     assert (y_produced == y_expected).all()

     if exec_mode == "rtlsim":
-        node = model.get_nodes_by_op_type("FMPadding_Pixel_hls")[0]
+        node = model.get_nodes_by_op_type("Deconvolution_hls")[0]
         inst = getCustomOp(node)
         cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
         exp_cycles_dict = model.analysis(exp_cycles_per_layer)
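For reference (not part of the patch): the while-loop bound emitted by the new docompute() is derived from the deconvolution geometry rather than from the input shape. The sketch below simply re-evaluates that formula with the parameter values of the single remaining test configuration (k=4, stride=[2, 2], padding=1, 8x8 input, 2 input channels, SIMD=1); the comments are an interpretation of the intermediate quantities, and the result is only the simulation drain-loop budget, not a verified cycle count.

# parameters taken from the reduced test sweep above
k_w, s_w, p_w = 4, 2, 1      # kernel, stride and padding along the width
i_w, i_ch, simd = 8, 2, 1    # input width, input channels, SIMD

# extra pixel padding per side implied by the kernel/stride/padding relation
padup = 0 if p_w >= k_w - s_w else (k_w - p_w - 1) / s_w   # -> 1.0
# columns the padded result overshoots and has to crop again
crop = s_w * padup - ((k_w - s_w) - p_w)                   # -> 1.0
sf = i_ch / simd                                           # SIMD fold, -> 2.0
w_eff = padup + i_w + padup                                # padded input width, -> 10.0
wo_eff = (w_eff - 1) * s_w + k_w                           # effective output width, -> 22.0

bound = wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * 2 * sf + 50
print(bound)  # 754.0 -> emitted as "while(timeout < 754.0) {"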