diff --git a/apps/hardware_benchmarks/apps/pointwise_fp/Makefile b/apps/hardware_benchmarks/apps/pointwise_fp/Makefile new file mode 100644 index 000000000..2d5be6c4d --- /dev/null +++ b/apps/hardware_benchmarks/apps/pointwise_fp/Makefile @@ -0,0 +1,52 @@ +# Usage: +# make all: compiles all code without running +# generator: create Halide generator +# design: create cpu design +# design-clockwork: create clockwork design +# image: create an image with random data +# run: run cpu design with image +# run-clockwork: run clockwork design with image +# compare: compare two output images +# test: run and compare to cpu output +# eval: evaluate runtime +# clean: remove bin directory + +############################################################################### +# MODIFICATION: NONE +#------------------------------------------------------------------------------ +# Makefile parameter/variable declarations +# +############################################################################### + +include ../../hw_support/Makefile.inc + + + +############################################################################### +# MODIFICATION: OPTIONAL +#------------------------------------------------------------------------------ +# App-specific info +# +# TESTNAME : name of the app +# USE_COREIR_VALID : whether to generate valid signal for coreir codegen +# HL_TARGET : Halide target +# RDAI_PLATFORM_RUNTIME : RDAI platform runtime to use +############################################################################### + +TESTNAME = pointwise_fp +USE_COREIR_VALID = 1 +HL_TARGET = host-x86-64-bfloat_hardware-enable_ponds +RDAI_PLATFORM_RUNTIME = clockwork_sim +EXT = mat + + +############################################################################### +# MODIFICATION : NONE +#------------------------------------------------------------------------------ +# Include hardware build targets +# +############################################################################### + 
+include ../../hw_support/hardware_targets.mk + + diff --git a/apps/hardware_benchmarks/apps/pointwise_fp/cgra_config.json b/apps/hardware_benchmarks/apps/pointwise_fp/cgra_config.json new file mode 100644 index 000000000..03a374bb0 --- /dev/null +++ b/apps/hardware_benchmarks/apps/pointwise_fp/cgra_config.json @@ -0,0 +1,22 @@ +{ + "IOs": { + "inputs": [ + { + "name": "hw_input", + "bitwidth": 16, + "shape": [32, 56, 56], + "pixels_per_cycle": 1, + "datafile": "bin/hw_input_stencil.raw" + } + ], + "output": { + "name": "hw_output", + "bitwidth": 16, + "pixels_per_cycle": 1, + "datafile": "bin/hw_output.raw" + } + }, + "testing": { + "total_cycles": -1 + } +} diff --git a/apps/hardware_benchmarks/apps/pointwise_fp/pointwise_fp_generator.cpp b/apps/hardware_benchmarks/apps/pointwise_fp/pointwise_fp_generator.cpp new file mode 100644 index 000000000..459626a1d --- /dev/null +++ b/apps/hardware_benchmarks/apps/pointwise_fp/pointwise_fp_generator.cpp @@ -0,0 +1,67 @@ +#include "Halide.h" + +namespace { + +using namespace std; + +using namespace Halide; +using namespace Halide::ConciseCasts; + +class ReluLayer : public Halide::Generator { // Pointwise generator: output = input * 2 over a 3-D (w, x, y) domain; registered below as "pointwise_fp" +public: + Input> input{"input", 3}; + Output> output{"output", 3}; + + GeneratorParam out_img{"out_img", 56}; // extent used to bound x and y of the output + GeneratorParam n_oc{"n_oc", 32}; // extent used to bound w of the output + + GeneratorParam unroll{"unroll", 8}; // unroll factor applied along w + + void generate() { + /* THE ALGORITHM */ + + Var x("x"), y("y"), w("w"); + Func hw_input("hw_input"), input_host("input_host"), input_glb("input_glb"), input_cgra("input_cgra"); + Func hw_output("hw_output"), output_glb("output_glb"), output_cgra("output_cgra"); + Func relu6; // NOTE: despite the name, this stage only multiplies by 2; no clamping is applied + + hw_input(w, x, y) = cast(input(w, x, y)); + + relu6(w, x, y) = hw_input(w, x, y) * cast(2.0f); + + hw_output(w, x, y) = relu6(w, x, y); + output(w, x, y) = cast(hw_output(w, x, y)); + + /* THE SCHEDULE */ + if (get_target().has_feature(Target::CoreIR) || + get_target().has_feature(Target::HLS)) { + + } else if (get_target().has_feature(Target::Clockwork)) { + Var xi,yi, 
xo,yo; + + output.bound(x, 0, out_img); + output.bound(y, 0, out_img); + output.bound(w, 0, n_oc); + + //hw_input.compute_root(); + hw_output.compute_root(); + hw_output + .tile(x,y, xo,yo, xi,yi, out_img, out_img) // tile size equals the bound on x and y set above + .reorder(w, xi, yi, xo, yo) + .hw_accelerate(xi, xo); + hw_output.unroll(w, unroll); + + relu6.compute_at(hw_output, xo).unroll(w, unroll); + + hw_input.stream_to_accelerator(); + hw_input.in().unroll(w, unroll); + + } else { + + } + } +}; + +} // namespace + +HALIDE_REGISTER_GENERATOR(ReluLayer, pointwise_fp) diff --git a/apps/hardware_benchmarks/apps/pointwise_fp/process.cpp b/apps/hardware_benchmarks/apps/pointwise_fp/process.cpp new file mode 100644 index 000000000..372e2d5d6 --- /dev/null +++ b/apps/hardware_benchmarks/apps/pointwise_fp/process.cpp @@ -0,0 +1,293 @@ +#include +#include +#include +#include +#include +#include "hardware_process_helper.h" +#include "halide_image_io.h" +#include "coreir.h" + +#if defined(WITH_CPU) + #include "pointwise_fp.h" +#endif + +#if defined(WITH_COREIR) + #include "coreir_interpret.h" +#endif + +#if defined(WITH_CLOCKWORK) + #include "rdai_api.h" + #include "clockwork_sim_platform.h" + #include "pointwise_fp_clockwork.h" +#endif + +using namespace Halide::Tools; +using namespace Halide::Runtime; + +nlohmann::json output_starting_json; + +union { + uint32_t val; + float f; +} union_var; + +uint16_t round_to_even_process(float a) { // Returns the upper 16 bits of a's 32-bit pattern after adding 0x8000, clearing the lowest kept bit on an exact tie (round half to even) + //uint32_t e = reinterpret_cast(a); + union_var.f = a; + uint32_t e = union_var.val; + + // round float to even, comment out this codeblock for truncation + uint32_t half = 0x00008000; + uint32_t sum = e + half; + + // check if bottom bits are all zero + uint32_t mantissa_mask = 0x0000ffff; + bool is_zeroed = (sum & mantissa_mask) == 0; + + // clear last bit (round even) on tie + uint32_t clear_mask = ~( ((uint32_t)is_zeroed) << 16); + e = sum & clear_mask; + + // clear bottom bits + e = e >> 16; + + //return bfloat16_t::make_from_bits(float_to_bfloat16( 
expf(bfloat16_to_float(a.to_bits())) )); + //return bfloat16_t::make_from_bits( (uint16_t)e ); + return (uint16_t)e; +} + +// Similar routines for bfloat. It's somewhat simpler. +uint16_t float_to_bfloat16_process(float f) { // Thin wrapper: delegates to round_to_even_process (the truncating version is kept commented out below) +// uint16_t ret[2]; +// memcpy(ret, &f, sizeof(float)); +// // Assume little-endian floats +// return ret[1]; + return round_to_even_process(f); +} + +float bfloat16_to_float_process(uint16_t b) { // Builds a float whose second 16-bit word is b and whose first is zero + // Assume little-endian floats + uint16_t bits[2] = {0, b}; + float ret; + memcpy(&ret, bits, sizeof(float)); + return ret; +} + +void saveHalideBufferToRawBigEndian(const Halide::Runtime::Buffer& buffer, const std::string& filename) { // Writes buffer(i) for i in [0, number_of_elements()) to filename with the two bytes swapped; logs and returns if the file cannot be opened + std::ofstream out(filename, std::ios::binary); + if (!out.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + // Iterate through each element of the buffer and write in big-endian order + for (int i = 0; i < buffer.number_of_elements(); ++i) { + uint16_t val = buffer(i); + // Swap bytes if the system is little-endian + uint16_t big_endian_val = (val << 8) | (val >> 8); // Swap bytes to convert to big-endian + out.write(reinterpret_cast(&big_endian_val), sizeof(big_endian_val)); + } + + out.close(); +} + +void loadRawDataToBuffer(const std::string& filename, Halide::Runtime::Buffer& buffer) { // Fills buffer from filename with byte-swapped 16-bit values, dimension 0 varying fastest; logs and returns on open or read failure + std::ifstream inFile(filename, std::ios::binary); + if (!inFile) { + std::cerr << "Failed to open file for reading: " << filename << std::endl; + return; + } + + // Get the extents of each dimension in the buffer + std::vector extents(buffer.dimensions()); + for (int i = 0; i < buffer.dimensions(); i++) { + extents[i] = buffer.dim(i).extent(); + } + + // Initialize indices to zero for all dimensions + std::vector indices(buffer.dimensions(), 0); + + // Function to recursively fill the buffer, with column-major order + std::function fillBuffer = [&](int dim) { + if (dim == -1) { // All dimensions are set + uint16_t value; + inFile.read(reinterpret_cast(&value), 
sizeof(value)); + if (!inFile) { + std::cerr << "Error reading data for indices "; + for (size_t i = 0; i < indices.size(); ++i) { // size_t index avoids a signed/unsigned comparison with size() + std::cerr << indices[i] << (i < indices.size() - 1 ? ", " : ""); + } + std::cerr << std::endl; + throw std::runtime_error("Failed to read data from file"); + } + // Perform byte swap if necessary (for big-endian files) + value = (value >> 8) | (value << 8); + buffer(indices.data()) = value; + } else { // Set the current dimension and recurse + for (int i = 0; i < extents[dim]; ++i) { + indices[dim] = i; + fillBuffer(dim - 1); + } + indices[dim] = 0; + } + }; + + try { + // From the last dimension down to 0; reverse order for column-major) + fillBuffer(buffer.dimensions() - 1); + } catch (const std::exception& e) { + std::cerr << "An exception occurred: " << e.what() << std::endl; + inFile.close(); + return; + } + + inFile.close(); +} + +void copyFile(const std::string &srcPath, const std::string &dstPath) { // Copies srcPath to dstPath in binary mode; throws std::runtime_error if either stream fails to open + std::ifstream src(srcPath, std::ios::binary); + std::ofstream dst(dstPath, std::ios::binary); + + if (!src.is_open() || !dst.is_open()) { + throw std::runtime_error("Error opening files while copying from " + srcPath + " to " + dstPath); + } + + dst << src.rdbuf(); +} + +bool file_exists(const std::string& name) { // True when name can be opened for reading + std::ifstream f(name.c_str()); + return f.good(); +} + +std::vector parse_glb_bank_config_env_var(const std::string& env_var_name) { // Parses the named environment variable as a comma-separated list of integers; returns an empty vector and logs if it is unset + std::vector values; + const char* env_var_value = std::getenv(env_var_name.c_str()); + + if (env_var_value) { + std::string value_str = env_var_value; + std::istringstream iss(value_str); + std::string token; + + // Split the string by commas and convert to integers + while (std::getline(iss, token, ',')) { + // Trim potential whitespace + token.erase(0, token.find_first_not_of(" \t\n\r\f\v")); + token.erase(token.find_last_not_of(" \t\n\r\f\v") + 1); + values.push_back(std::stoi(token)); + } + } else { + std::cerr << "Environment variable " << env_var_name << " not found." 
<< std::endl; + } + + return values; +} + +int main( int argc, char **argv ) { // Registers the enabled backends, builds a random bfloat16 input and its x2 gold output, saves both, optionally writes bin/glb_bank_config.json, then dispatches on the command line + std::map> functions; + ManyInOneOut_ProcessController processor("pointwise_fp", {"hw_input_stencil.mat"}); + + #if defined(WITH_CPU) + auto cpu_process = [&]( auto &proc ) { + pointwise_fp(proc.inputs["hw_input_stencil.mat"], proc.output); + }; + functions["cpu"] = [&](){ cpu_process( processor ); } ; + #endif + + #if defined(WITH_COREIR) + auto coreir_process = [&]( auto &proc ) { + run_coreir_on_interpreter<>( "bin/design_top.json", + proc.inputs["hw_input_stencil.mat"], proc.output, + "self.in_arg_0_0_0", "self.out_0_0" ); + }; + functions["coreir"] = [&](){ coreir_process( processor ); }; + #endif + + #if defined(WITH_CLOCKWORK) + auto clockwork_process = [&]( auto &proc ) { + RDAI_Platform *rdai_platform = RDAI_register_platform( &rdai_clockwork_sim_ops ); + if ( rdai_platform ) { + printf( "[RUN_INFO] found an RDAI platform\n" ); + pointwise_fp_clockwork(proc.inputs["hw_input_stencil.mat"], proc.output); + RDAI_unregister_platform( rdai_platform ); + } else { + printf("[RUN_INFO] failed to register RDAI platform!\n"); + } + }; + functions["clockwork"] = [&](){ clockwork_process( processor ); }; + #endif + + auto OX = getenv("out_img"); // optional override of the buffer extents; defaults below are used when unset + auto OC = getenv("n_oc"); + + auto out_img = OX ? atoi(OX) : 56; + auto n_oc = OC ? 
atoi(OC) : 32; + + // Add all defined functions + processor.run_calls = functions; + + processor.inputs["hw_input_stencil.mat"] = Buffer(n_oc, out_img, out_img); + processor.output = Buffer(n_oc, out_img, out_img); + + processor.inputs_preset = true; + + for (int y = 0; y < processor.inputs["hw_input_stencil.mat"].dim(2).extent(); y++) { + for (int x = 0; x < processor.inputs["hw_input_stencil.mat"].dim(1).extent(); x++) { + for (int w = 0; w < processor.inputs["hw_input_stencil.mat"].dim(0).extent(); w++) { + processor.inputs["hw_input_stencil.mat"](w, x, y) = float_to_bfloat16_process( + // [-7, 7] + ((float)rand() / RAND_MAX) * 14.0 - 7.0 + ); + } + } + } + + // Gold output + for (int w = 0; w < processor.output.dim(0).extent(); w++) { + for (int x = 0; x < processor.output.dim(1).extent(); x++) { + for (int y = 0; y < processor.output.dim(2).extent(); y++) { + float result = bfloat16_to_float_process(processor.inputs["hw_input_stencil.mat"](w, x, y)) * 2.0f; + processor.output(w, x, y) = float_to_bfloat16_process(result); + } + } + } + + std::cout << "Writing hw_input_stencil.mat to bin folder" << std::endl; + save_image(processor.inputs["hw_input_stencil.mat"], "bin/hw_input_stencil.mat"); + + std::cout << "Writing output to bin folder" << std::endl; + save_image(processor.output, "bin/hw_output.mat"); + + // Generate glb_bank_config.json if "USE_GLB_BANK_CONFIG" is 1 + std::cout << "Checking for GLB bank configuration..." 
<< std::endl; + std::cout << "USE_GLB_BANK_CONFIG = " << (getenv("USE_GLB_BANK_CONFIG") ? getenv("USE_GLB_BANK_CONFIG") : "(unset)") << std::endl; // getenv returns null when unset, and streaming a null char* is undefined behavior + if (getenv("USE_GLB_BANK_CONFIG") && std::stoi(getenv("USE_GLB_BANK_CONFIG"))) { + std::vector hw_input_stencil = parse_glb_bank_config_env_var("HW_INPUT_STENCIL_POS"); + std::vector hw_output_stencil = parse_glb_bank_config_env_var("HW_OUTPUT_STENCIL_POS"); + std::vector glb_inputs = parse_glb_bank_config_env_var("GLB_INPUTS"); + + // Create the glb_bank_config.json structure + json config = { + {"inputs", { + {"hw_input_stencil", hw_input_stencil} + }}, + {"outputs", { + {"hw_output_stencil", hw_output_stencil} + }}, + {"glb_inputs", glb_inputs} + }; + + std::ofstream file("bin/glb_bank_config.json"); + if (file.is_open()) { + file << config.dump(4) << std::endl; + file.close(); + std::cout << "Successfully wrote to bin/glb_bank_config.json" << std::endl; + } else { + std::cerr << "Unable to open file for writing." << std::endl; + return 1; + } + } + + return processor.process_command(argc, argv); + +}