From 670b71881df16473ac86dcd465e57f3e1fc0e0b9 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 17 Jun 2024 11:21:23 -0700 Subject: [PATCH] Provisional A&M only (not working) --- .../apps/hdr_plus/hdr_plus_generator.cpp | 928 +++++++++--------- 1 file changed, 464 insertions(+), 464 deletions(-) diff --git a/apps/hardware_benchmarks/apps/hdr_plus/hdr_plus_generator.cpp b/apps/hardware_benchmarks/apps/hdr_plus/hdr_plus_generator.cpp index 5ad392ed0..73568dcf9 100644 --- a/apps/hardware_benchmarks/apps/hdr_plus/hdr_plus_generator.cpp +++ b/apps/hardware_benchmarks/apps/hdr_plus/hdr_plus_generator.cpp @@ -1205,7 +1205,7 @@ class HDRPlus : public Halide::Generator { merge_output(x, y) = final_merge_output(x, y); //merge_output.trace_stores(); - //output(x, y, c) = u8(merge_output(x, y) * 255.f); + output(x, y, c) = u8(merge_output(x, y) * 255.f); // TODO: bound all dimensions of the output with statements like those below // output.bound(c, 0, 3); @@ -1217,133 +1217,133 @@ class HDRPlus : public Halide::Generator { */ - /* - * BEGIN CAMERA PIPELINE - */ - Func cp_hw_input, cp_hw_input_temp, cp_hw_input_shuffle, cp_hw_input_shift; - //cp_hw_input_temp(x,y) = u16(input(x+(blockSize-1)/2, y+(blockSize-1)/2)); - //cp_hw_input_temp(x,y) = u16(input(x, y)); - //cp_hw_input_temp(x,y) = cast(merge_output(x, y) * 1024.f); - //cp_hw_input_temp(x,y) = cast(merge_output(x, y) * 16383.f); - - //cp_hw_input_temp(x,y) = cast(merge_output(x, y)); - cp_hw_input_temp(x,y) = merge_output(x, y); + // /* + // * BEGIN CAMERA PIPELINE + // */ + // Func cp_hw_input, cp_hw_input_temp, cp_hw_input_shuffle, cp_hw_input_shift; + // //cp_hw_input_temp(x,y) = u16(input(x+(blockSize-1)/2, y+(blockSize-1)/2)); + // //cp_hw_input_temp(x,y) = u16(input(x, y)); + // //cp_hw_input_temp(x,y) = cast(merge_output(x, y) * 1024.f); + // //cp_hw_input_temp(x,y) = cast(merge_output(x, y) * 16383.f); + + // //cp_hw_input_temp(x,y) = cast(merge_output(x, y)); + // cp_hw_input_temp(x,y) = merge_output(x, y); - if (get_target().has_feature(Target::Clockwork)) { - cp_hw_input_shuffle(x, y, c) = cp_hw_input_temp(2*x + c/2, 2*y + c%2); + // if (get_target().has_feature(Target::Clockwork)) { + // cp_hw_input_shuffle(x, y, c) = cp_hw_input_temp(2*x + c/2, 2*y + c%2); - //cp_hw_input(x, y) = cp_hw_input_shuffle(x/4 + 622*(y%2), y/2, x%4); - int iWidth = (tWidth * nTiles + blockSize-1) / 4; - cp_hw_input_shift(x, y) = cp_hw_input_shuffle(x/4 + iWidth*(y%2), y/2, x%4); - cp_hw_input(x, y) = cp_hw_input_shift(x+(blockSize-1)/2, y+(blockSize-1)/2); - } else { - cp_hw_input(x, y) = cp_hw_input_temp(x+(blockSize-1)/2, y+(blockSize-1)/2); - } + // //cp_hw_input(x, y) = cp_hw_input_shuffle(x/4 + 622*(y%2), y/2, x%4); + // int iWidth = (tWidth * nTiles + blockSize-1) / 4; + // cp_hw_input_shift(x, y) = cp_hw_input_shuffle(x/4 + iWidth*(y%2), y/2, x%4); + // cp_hw_input(x, y) = cp_hw_input_shift(x+(blockSize-1)/2, y+(blockSize-1)/2); + // } else { + // cp_hw_input(x, y) = cp_hw_input_temp(x+(blockSize-1)/2, y+(blockSize-1)/2); + // } - Func denoised; - denoised = hot_pixel_suppression(cp_hw_input); + // Func denoised; + // denoised = hot_pixel_suppression(cp_hw_input); - // Give more convenient names to the four channels we know - Func r_r, g_gr, g_gb, b_b; - g_gr(x, y) = denoised(2*x, 2*y);//deinterleaved(x, y, 0); - r_r(x, y) = denoised(2*x+1, 2*y);//deinterleaved(x, y, 1); - b_b(x, y) = denoised(2*x, 2*y+1);//deinterleaved(x, y, 2); - g_gb(x, y) = denoised(2*x+1, 2*y+1);//deinterleaved(x, y, 3); + // // Give more convenient names to the four channels we know + // Func r_r, g_gr, g_gb, b_b; + // g_gr(x, y) = denoised(2*x, 2*y);//deinterleaved(x, y, 0); + // r_r(x, y) = denoised(2*x+1, 2*y);//deinterleaved(x, y, 1); + // b_b(x, y) = denoised(2*x, 2*y+1);//deinterleaved(x, y, 2); + // g_gb(x, y) = denoised(2*x+1, 2*y+1);//deinterleaved(x, y, 3); - //denoised.trace_stores(); + // //denoised.trace_stores(); - Func demosaicked, my_demosaicked; - Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b; - demosaicked = demosaic(g_gr, r_r, b_b, g_gb, - b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b); + // Func demosaicked, my_demosaicked; + // Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b; + // demosaicked = demosaic(g_gr, r_r, b_b, g_gb, + // b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b); - my_demosaicked = my_demosaic(denoised); - //demosaicked = demosaic(g_gr_wb, r_r_wb, b_b_wb, g_gb_wb, - // b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b); + // my_demosaicked = my_demosaic(denoised); + // //demosaicked = demosaic(g_gr_wb, r_r_wb, b_b_wb, g_gb_wb, + // // b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b); - Func color_corrected; - color_corrected = color_correct(my_demosaicked, matrix); + // Func color_corrected; + // color_corrected = color_correct(my_demosaicked, matrix); - Func curve; - { + // Func curve; + // { - // BL, WL CHANGE - Expr minRaw = blackLevel; - Expr maxRaw = whiteLevel; - Expr invRange = 1.0f / (maxRaw - minRaw); + // // BL, WL CHANGE + // Expr minRaw = blackLevel; + // Expr maxRaw = whiteLevel; + // Expr invRange = 1.0f / (maxRaw - minRaw); - // BL, WL CHANGE - //Expr xf = clamp(cast(x)/1024.0f, 0.f, 1.f); + // // BL, WL CHANGE + // //Expr xf = clamp(cast(x)/1024.0f, 0.f, 1.f); - Expr xf = clamp(cast(x - minRaw) * invRange, 0.0f, 1.0f); - Expr g = pow(xf, 1.0f/gamma); - Expr b = 2.0f - (float) pow(2.0f, contrast/100.0f); - Expr a = 2.0f - 2.0f*b; - Expr val = select(g > 0.5f, - 1.0f - (a*(1.0f-g)*(1.0f-g) + b*(1.0f-g)), - a*g*g + b*g); + // Expr xf = clamp(cast(x - minRaw) * invRange, 0.0f, 1.0f); + // Expr g = pow(xf, 1.0f/gamma); + // Expr b = 2.0f - (float) pow(2.0f, contrast/100.0f); + // Expr a = 2.0f - 2.0f*b; + // Expr val = select(g > 0.5f, + // 1.0f - (a*(1.0f-g)*(1.0f-g) + b*(1.0f-g)), + // a*g*g + b*g); - // BL, WL CHANGE - //curve(x) = u16(clamp(val*256.0f, 0.0f, 255.0f)); + // // BL, WL CHANGE + // //curve(x) = u16(clamp(val*256.0f, 0.0f, 255.0f)); - //curve(x) = select(x <= minRaw, 0, select(x > maxRaw, u16(255), u16(clamp(val*256.0f, 0.0f, 255.0f)))); - //curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 1.0f, clamp(val, 0.0f, 1.0f))); - //curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 3072.f, clamp(val * 3072.f, 0.0f, 3072.f))); + // //curve(x) = select(x <= minRaw, 0, select(x > maxRaw, u16(255), u16(clamp(val*256.0f, 0.0f, 255.0f)))); + // //curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 1.0f, clamp(val, 0.0f, 1.0f))); + // //curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 3072.f, clamp(val * 3072.f, 0.0f, 3072.f))); - // curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 1023.f, clamp(val * 1023.f, 0.0f, 1023.f))); - curve(x) = select(x <= minRaw, u16(0), select(x > maxRaw, u16(1023), u16(clamp(val * 1023.f, 0.0f, 1023.f)))); + // // curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 1023.f, clamp(val * 1023.f, 0.0f, 1023.f))); + // curve(x) = select(x <= minRaw, u16(0), select(x > maxRaw, u16(1023), u16(clamp(val * 1023.f, 0.0f, 1023.f)))); - //curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 255.f, clamp(val*256.0f, 0.0f, 255.0f))); - //curve(x) = clamp(val*256.0f, 0.0f, 255.0f); - } + // //curve(x) = select(x <= minRaw, 0.0f, select(x > maxRaw, 255.f, clamp(val*256.0f, 0.0f, 255.0f))); + // //curve(x) = clamp(val*256.0f, 0.0f, 255.0f); + // } - Func cp_hw_output, curve_out, output_shuffle, gamma_corr_out; - curve_out = apply_curve(color_corrected, curve); - //curve_out = apply_curve(my_demosaicked, curve); - //gamma_corr_out = gamma_correction(curve_out, 1.1f); - //curve_out = apply_curve(color_corrected, curve); - cp_hw_output(c, x, y) = curve_out(x, y, c); - //cp_hw_output(c, x, y) = demosaicked(x, y, c); - //cp_hw_output(c, x, y) = denoised(x, y); + // Func cp_hw_output, curve_out, output_shuffle, gamma_corr_out; + // curve_out = apply_curve(color_corrected, curve); + // //curve_out = apply_curve(my_demosaicked, curve); + // //gamma_corr_out = gamma_correction(curve_out, 1.1f); + // //curve_out = apply_curve(color_corrected, curve); + // cp_hw_output(c, x, y) = curve_out(x, y, c); + // //cp_hw_output(c, x, y) = demosaicked(x, y, c); + // //cp_hw_output(c, x, y) = denoised(x, y); - Func cp_output; + // Func cp_output; - Var k; - if (get_target().has_feature(Target::Clockwork)) { - int iWidth = tWidth * nTiles / 4; - output_shuffle(c, k, x, y) = u8(cp_hw_output(c, (x%iWidth)*4 + k, x/iWidth + 2*y)); - //output(x, y, c) = output_shuffle(c, y%2 + 2*(x%2), max(x/2 - 1, 0), y/2); - cp_output(x, y, c) = output_shuffle(c, y%2 + 2*(x%2), x/2, y/2); - } else { - //output(x, y, c) = u8(cp_hw_output(c, x+2, y)); + // Var k; + // if (get_target().has_feature(Target::Clockwork)) { + // int iWidth = tWidth * nTiles / 4; + // output_shuffle(c, k, x, y) = u8(cp_hw_output(c, (x%iWidth)*4 + k, x/iWidth + 2*y)); + // //output(x, y, c) = output_shuffle(c, y%2 + 2*(x%2), max(x/2 - 1, 0), y/2); + // cp_output(x, y, c) = output_shuffle(c, y%2 + 2*(x%2), x/2, y/2); + // } else { + // //output(x, y, c) = u8(cp_hw_output(c, x+2, y)); - //cp_output(x, y, c) = u8(cp_hw_output(c, x, y)); - //cp_output(x, y, c) = cp_hw_output(c, x, y)/1023.f; - cp_output(x, y, c) = cp_hw_output(c, x, y); + // //cp_output(x, y, c) = u8(cp_hw_output(c, x, y)); + // //cp_output(x, y, c) = cp_hw_output(c, x, y)/1023.f; + // cp_output(x, y, c) = cp_hw_output(c, x, y); - } + // } - //output(x, y, c) = u8((cp_output(x, y, c)/3072.f) * 255.f); - output(x, y, c) = u8((cp_output(x, y, c)/1023.f) * 255.f); - //output(x, y, c) = u8(cp_output(x, y, c) * 255.f); + // //output(x, y, c) = u8((cp_output(x, y, c)/3072.f) * 255.f); + // output(x, y, c) = u8((cp_output(x, y, c)/1023.f) * 255.f); + // //output(x, y, c) = u8(cp_output(x, y, c) * 255.f); - //curve.bound(x, 0, 256); + // //curve.bound(x, 0, 256); - output.bound(c, 0, 3); - // output.bound(x, 0, 64); - // output.bound(y, 0, 64); + // output.bound(c, 0, 3); + // // output.bound(x, 0, 64); + // // output.bound(y, 0, 64); - //cp_output.trace_stores(); + // //cp_output.trace_stores(); - /* - * END CAMERA PIPELINE - */ + // /* + // * END CAMERA PIPELINE + // */ @@ -1354,28 +1354,28 @@ class HDRPlus : public Halide::Generator { } else if (get_target().has_feature(Target::Clockwork)) { - // merge_output.in().compute_root(); + merge_output.in().compute_root(); - // merge_output.in().tile(x, y, xo, yo, xi, yi, 4, 4) - // .reorder(xi, yi, xo, yo) - // .hw_accelerate(xi, xo); + merge_output.in().tile(x, y, xo, yo, xi, yi, 4, 4) + .reorder(xi, yi, xo, yo) + .hw_accelerate(xi, xo); - // merge_output.tile(x, y, xo, yo, xi, yi, 4, 4) - // .reorder(xi, yi, xo, yo); - // merge_output.compute_at(merge_output.in(), xo); - // merge_output.store_in(MemoryType::GLB); + merge_output.tile(x, y, xo, yo, xi, yi, 4, 4) + .reorder(xi, yi, xo, yo); + merge_output.compute_at(merge_output.in(), xo); + merge_output.store_in(MemoryType::GLB); - merge_output.compute_at(cp_hw_output, xo); + //merge_output.compute_at(merge_output, xo); - final_merge_output.compute_at(cp_hw_output, xo); + final_merge_output.compute_at(merge_output, xo); - output_tiled_normalized_cosined.compute_at(cp_hw_output, xo); - output_tiled.compute_at(cp_hw_output, xo); - val.compute_at(cp_hw_output, xo); - sum_weight.compute_at(cp_hw_output, xo); - weight.compute_at(cp_hw_output, xo); - dist_tile_norm.compute_at(cp_hw_output, xo); - dist_tile.compute_at(cp_hw_output, xo); + output_tiled_normalized_cosined.compute_at(merge_output, xo); + output_tiled.compute_at(merge_output, xo); + val.compute_at(merge_output, xo); + sum_weight.compute_at(merge_output, xo); + weight.compute_at(merge_output, xo); + dist_tile_norm.compute_at(merge_output, xo); + dist_tile.compute_at(merge_output, xo); // align_output.compute_root(); @@ -1387,8 +1387,8 @@ class HDRPlus : public Halide::Generator { for (size_t j = 0; j < gPyramid.size(); ++j) { - //gPyramid[j].compute_at(cp_hw_output, xo); - alignPyramid[j].compute_at(cp_hw_output, xo); + //gPyramid[j].compute_at(merge_output, xo); + alignPyramid[j].compute_at(merge_output, xo); //alignPyramid[j].compute_at(align_output, xo); } @@ -1418,22 +1418,22 @@ class HDRPlus : public Halide::Generator { // gPyramid[3].compute_at(align_output, xo); // gPyramid[4].compute_at(align_output, xo); - gPyramid[0].compute_at(cp_hw_output, xo); - gPyramid[1].compute_at(cp_hw_output, xo); - gPyramid[2].compute_at(cp_hw_output, xo); - gPyramid[3].compute_at(cp_hw_output, xo); - gPyramid[4].compute_at(cp_hw_output, xo); + gPyramid[0].compute_at(merge_output, xo); + gPyramid[1].compute_at(merge_output, xo); + gPyramid[2].compute_at(merge_output, xo); + gPyramid[3].compute_at(merge_output, xo); + gPyramid[4].compute_at(merge_output, xo); - gray.compute_at(cp_hw_output, xo); - hw_input_copy.compute_at(cp_hw_output, xo); + gray.compute_at(merge_output, xo); + hw_input_copy.compute_at(merge_output, xo); //gray.compute_at(align_output, xo); //gray.compute_at(alignPyramid[4], xo); //gray.compute_at(gPyramid[4], xo); - hw_input.in().in().compute_at(cp_hw_output, xo); // represents the mem tile + hw_input.in().in().compute_at(merge_output, xo); // represents the mem tile - hw_input.in().compute_at(cp_hw_output.in(), xo); // represents the glb level + hw_input.in().compute_at(merge_output.in(), xo); // represents the glb level hw_input.in().store_in(MemoryType::GLB); hw_input.compute_root() @@ -1503,383 +1503,383 @@ class HDRPlus : public Halide::Generator { } - // CAMERA PIPELINE SCHEDULE - if (get_target().has_feature(Target::CoreIR)) { - cp_hw_input.store_at(cp_hw_output, xo).compute_at(denoised, x); - cp_hw_output.compute_root(); + // // CAMERA PIPELINE SCHEDULE + // if (get_target().has_feature(Target::CoreIR)) { + // cp_hw_input.store_at(cp_hw_output, xo).compute_at(denoised, x); + // cp_hw_output.compute_root(); - cp_hw_output.accelerate({cp_hw_input}, xi, xo, {}); - cp_hw_output.tile(x, y, xo, yo, xi, yi, 64-6,64-6) - .reorder(c,xi,yi,xo,yo); + // cp_hw_output.accelerate({cp_hw_input}, xi, xo, {}); + // cp_hw_output.tile(x, y, xo, yo, xi, yi, 64-6,64-6) + // .reorder(c,xi,yi,xo,yo); - curve_out.compute_at(cp_hw_output, xo); - //cp_hw_output.unroll(c).unroll(xi, 2); - cp_hw_output.unroll(c); + // curve_out.compute_at(cp_hw_output, xo); + // //cp_hw_output.unroll(c).unroll(xi, 2); + // cp_hw_output.unroll(c); - demosaicked.linebuffer(); - demosaicked.unroll(c); - //demosaicked.reorder(c, x, y); + // demosaicked.linebuffer(); + // demosaicked.unroll(c); + // //demosaicked.reorder(c, x, y); - denoised.linebuffer(); - //.unroll(x).unroll(y); + // denoised.linebuffer(); + // //.unroll(x).unroll(y); - curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM + // curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM - cp_hw_input.stream_to_accelerator(); - - } else if (get_target().has_feature(Target::Clockwork)) { - - if (schedule == 1) { // host and glb tiling - const int numHostTiles = 4; - const int numTiles = 3; - const int tileSize = 58; - const int glbSize = tileSize * numTiles; - const int outputSize = numHostTiles * glbSize; - - cp_output.bound(x, 0, outputSize); - cp_output.bound(y, 0, outputSize); - - cp_hw_output.in().compute_root(); - - cp_hw_output.in() - .tile(x, y, xo, yo, xi, yi, glbSize, glbSize) - .reorder(c, xi, yi, xo, yo) - .hw_accelerate(xi, xo); - cp_hw_output.in().unroll(c); - - Var xii, yii, xio, yio; - cp_hw_output - .tile(x, y, xo, yo, xi, yi, tileSize, tileSize) - .reorder(c, xi, yi, xo, yo); - cp_hw_output.compute_at(cp_hw_output.in(), xo); - cp_hw_output.store_in(MemoryType::GLB); - cp_hw_output.unroll(c); - - curve_out.compute_at(cp_hw_output, xo); - curve_out.unroll(c); + // cp_hw_input.stream_to_accelerator(); + + // } else if (get_target().has_feature(Target::Clockwork)) { + + // if (schedule == 1) { // host and glb tiling + // const int numHostTiles = 4; + // const int numTiles = 3; + // const int tileSize = 58; + // const int glbSize = tileSize * numTiles; + // const int outputSize = numHostTiles * glbSize; + + // cp_output.bound(x, 0, outputSize); + // cp_output.bound(y, 0, outputSize); + + // cp_hw_output.in().compute_root(); + + // cp_hw_output.in() + // .tile(x, y, xo, yo, xi, yi, glbSize, glbSize) + // .reorder(c, xi, yi, xo, yo) + // .hw_accelerate(xi, xo); + // cp_hw_output.in().unroll(c); + + // Var xii, yii, xio, yio; + // cp_hw_output + // .tile(x, y, xo, yo, xi, yi, tileSize, tileSize) + // .reorder(c, xi, yi, xo, yo); + // cp_hw_output.compute_at(cp_hw_output.in(), xo); + // cp_hw_output.store_in(MemoryType::GLB); + // cp_hw_output.unroll(c); + + // curve_out.compute_at(cp_hw_output, xo); + // curve_out.unroll(c); - color_corrected.compute_at(cp_hw_output, xo); - color_corrected.unroll(c); + // color_corrected.compute_at(cp_hw_output, xo); + // color_corrected.unroll(c); - demosaicked.compute_at(cp_hw_output, xo); - demosaicked - .reorder(c, x, y) - .unroll(c); + // demosaicked.compute_at(cp_hw_output, xo); + // demosaicked + // .reorder(c, x, y) + // .unroll(c); - denoised.compute_at(cp_hw_output, xo); - //.unroll(x).unroll(y); + // denoised.compute_at(cp_hw_output, xo); + // //.unroll(x).unroll(y); - g_gr.compute_at(cp_hw_output, xo); - r_r.compute_at(cp_hw_output, xo); - b_b.compute_at(cp_hw_output, xo); - g_gb.compute_at(cp_hw_output, xo); + // g_gr.compute_at(cp_hw_output, xo); + // r_r.compute_at(cp_hw_output, xo); + // b_b.compute_at(cp_hw_output, xo); + // g_gb.compute_at(cp_hw_output, xo); - curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM + // curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM - cp_hw_input.in().compute_at(cp_hw_output.in(), xo); // represents the glb level - cp_hw_input.in().store_in(MemoryType::GLB); - //cp_hw_input.in().unroll(c); // hw input bound + // cp_hw_input.in().compute_at(cp_hw_output.in(), xo); // represents the glb level + // cp_hw_input.in().store_in(MemoryType::GLB); + // //cp_hw_input.in().unroll(c); // hw input bound - cp_hw_input.compute_root() - .accelerator_input(); - - } else if (schedule == 2) { // big parrot - const int tileWidth = 64; - const int tileHeight = 56; - const int numHostTiles = 11; - const int numTiles = 3; - const int glbWidth = tileWidth * numTiles; - const int glbHeight = tileHeight * numTiles; - const int outputWidth = numHostTiles * glbWidth; - const int outputHeight = numHostTiles * glbHeight; - - cp_output.bound(x, 0, outputWidth); - cp_output.bound(y, 0, outputHeight); - - cp_hw_output.in().compute_root(); - - cp_hw_output.in() - .tile(x, y, xo, yo, xi, yi, glbWidth, glbHeight) - .reorder(c, xi, yi, xo, yo) - .hw_accelerate(xi, xo); - cp_hw_output.in().unroll(c); - - Var xii, yii, xio, yio; - cp_hw_output - .tile(x, y, xo, yo, xi, yi, tileWidth, tileHeight) - .reorder(c, xi, yi, xo, yo); - cp_hw_output.compute_at(cp_hw_output.in(), xo); - cp_hw_output.store_in(MemoryType::GLB); - cp_hw_output.unroll(c); - - curve_out.compute_at(cp_hw_output, xo); - curve_out.unroll(c); + // cp_hw_input.compute_root() + // .accelerator_input(); + + // } else if (schedule == 2) { // big parrot + // const int tileWidth = 64; + // const int tileHeight = 56; + // const int numHostTiles = 11; + // const int numTiles = 3; + // const int glbWidth = tileWidth * numTiles; + // const int glbHeight = tileHeight * numTiles; + // const int outputWidth = numHostTiles * glbWidth; + // const int outputHeight = numHostTiles * glbHeight; + + // cp_output.bound(x, 0, outputWidth); + // cp_output.bound(y, 0, outputHeight); + + // cp_hw_output.in().compute_root(); + + // cp_hw_output.in() + // .tile(x, y, xo, yo, xi, yi, glbWidth, glbHeight) + // .reorder(c, xi, yi, xo, yo) + // .hw_accelerate(xi, xo); + // cp_hw_output.in().unroll(c); + + // Var xii, yii, xio, yio; + // cp_hw_output + // .tile(x, y, xo, yo, xi, yi, tileWidth, tileHeight) + // .reorder(c, xi, yi, xo, yo); + // cp_hw_output.compute_at(cp_hw_output.in(), xo); + // cp_hw_output.store_in(MemoryType::GLB); + // cp_hw_output.unroll(c); + + // curve_out.compute_at(cp_hw_output, xo); + // curve_out.unroll(c); - color_corrected.compute_at(cp_hw_output, xo); - color_corrected.unroll(c); + // color_corrected.compute_at(cp_hw_output, xo); + // color_corrected.unroll(c); - demosaicked.compute_at(cp_hw_output, xo); - demosaicked - .reorder(c, x, y) - .unroll(c); - - denoised.compute_at(cp_hw_output, xo); - //.unroll(x).unroll(y); - - g_gr.compute_at(cp_hw_output, xo); - r_r.compute_at(cp_hw_output, xo); - b_b.compute_at(cp_hw_output, xo); - g_gb.compute_at(cp_hw_output, xo); + // demosaicked.compute_at(cp_hw_output, xo); + // demosaicked + // .reorder(c, x, y) + // .unroll(c); + + // denoised.compute_at(cp_hw_output, xo); + // //.unroll(x).unroll(y); + + // g_gr.compute_at(cp_hw_output, xo); + // r_r.compute_at(cp_hw_output, xo); + // b_b.compute_at(cp_hw_output, xo); + // g_gb.compute_at(cp_hw_output, xo); - curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM + // curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM - cp_hw_input.in().compute_at(cp_hw_output.in(), xo); // represents the glb level - cp_hw_input.in().store_in(MemoryType::GLB); - //cp_hw_input.in().unroll(c); // hw input bound + // cp_hw_input.in().compute_at(cp_hw_output.in(), xo); // represents the glb level + // cp_hw_input.in().store_in(MemoryType::GLB); + // //cp_hw_input.in().unroll(c); // hw input bound - cp_hw_input.compute_root() - .accelerator_input(); - - } else if (schedule == 3) { // big parrot with unroll - const int unrollx = 2; - const int unrolly = 2; - //const int tileWidth = 64-8; - const int tileWidth = tWidth;//256-8; - //const int tileHeight = 64-8; - const int tileHeight = tHeight;//192-8; - const int numHostTiles = nTiles;//10; - const int numTiles = 1; // number of tiles in the glb - const int glbWidth = tileWidth * numTiles; - const int glbHeight = tileHeight * numTiles; - const int outputWidth = numHostTiles * glbWidth; - const int outputHeight = numHostTiles * glbHeight; - - cp_output.bound(x, 0, outputWidth); - cp_output.bound(y, 0, outputHeight); - - cp_hw_output.in().compute_root(); - - Var xii, yii, xio, yio; - cp_hw_output.in() - .tile(x, y, xo, yo, xi, yi, glbWidth, glbHeight) - .split(yi, yio, yii, unrolly) - .reorder(c, yii, xi, yio, xo, yo) - .hw_accelerate(xi, xo); - cp_hw_output.in().unroll(c) - .unroll(yii, unrolly, TailStrategy::RoundUp) - .unroll(xi, unrollx, TailStrategy::RoundUp); - - cp_hw_output - .tile(x, y, xo, yo, xi, yi, tileWidth, tileHeight) - .split(yi, yio, yii, unrolly) - .reorder(c, yii, xi, yio, xo, yo); - cp_hw_output.compute_at(cp_hw_output.in(), xo); - cp_hw_output.store_in(MemoryType::GLB); - cp_hw_output.unroll(c) - .unroll(yii, unrolly, TailStrategy::RoundUp) - .unroll(xi, unrollx, TailStrategy::RoundUp); - - curve_out.compute_at(cp_hw_output, xo); - curve_out.unroll(c) - .split(y, yio, yii, unrolly).reorder(c, yii, x, yio) - .unroll(yii, unrolly, TailStrategy::RoundUp) - .unroll(x, unrollx, TailStrategy::RoundUp); + // cp_hw_input.compute_root() + // .accelerator_input(); + + // } else if (schedule == 3) { // big parrot with unroll + // const int unrollx = 2; + // const int unrolly = 2; + // //const int tileWidth = 64-8; + // const int tileWidth = tWidth;//256-8; + // //const int tileHeight = 64-8; + // const int tileHeight = tHeight;//192-8; + // const int numHostTiles = nTiles;//10; + // const int numTiles = 1; // number of tiles in the glb + // const int glbWidth = tileWidth * numTiles; + // const int glbHeight = tileHeight * numTiles; + // const int outputWidth = numHostTiles * glbWidth; + // const int outputHeight = numHostTiles * glbHeight; + + // cp_output.bound(x, 0, outputWidth); + // cp_output.bound(y, 0, outputHeight); + + // cp_hw_output.in().compute_root(); + + // Var xii, yii, xio, yio; + // cp_hw_output.in() + // .tile(x, y, xo, yo, xi, yi, glbWidth, glbHeight) + // .split(yi, yio, yii, unrolly) + // .reorder(c, yii, xi, yio, xo, yo) + // .hw_accelerate(xi, xo); + // cp_hw_output.in().unroll(c) + // .unroll(yii, unrolly, TailStrategy::RoundUp) + // .unroll(xi, unrollx, TailStrategy::RoundUp); + + // cp_hw_output + // .tile(x, y, xo, yo, xi, yi, tileWidth, tileHeight) + // .split(yi, yio, yii, unrolly) + // .reorder(c, yii, xi, yio, xo, yo); + // cp_hw_output.compute_at(cp_hw_output.in(), xo); + // cp_hw_output.store_in(MemoryType::GLB); + // cp_hw_output.unroll(c) + // .unroll(yii, unrolly, TailStrategy::RoundUp) + // .unroll(xi, unrollx, TailStrategy::RoundUp); + + // curve_out.compute_at(cp_hw_output, xo); + // curve_out.unroll(c) + // .split(y, yio, yii, unrolly).reorder(c, yii, x, yio) + // .unroll(yii, unrolly, TailStrategy::RoundUp) + // .unroll(x, unrollx, TailStrategy::RoundUp); - color_corrected.compute_at(cp_hw_output, xo); - color_corrected.unroll(c) - .split(y, yio, yii, unrolly).reorder(c, yii, x, yio) - .unroll(yii, unrolly, TailStrategy::RoundUp) - .unroll(x, unrollx, TailStrategy::RoundUp); + // color_corrected.compute_at(cp_hw_output, xo); + // color_corrected.unroll(c) + // .split(y, yio, yii, unrolly).reorder(c, yii, x, yio) + // .unroll(yii, unrolly, TailStrategy::RoundUp) + // .unroll(x, unrollx, TailStrategy::RoundUp); - demosaicked.compute_at(cp_hw_output, xo); - demosaicked - .split(y, yio, yii, unrolly).reorder(c, yii, x, yio) - .unroll(c) - .unroll(yii, unrolly, TailStrategy::RoundUp) - .unroll(x, unrollx, TailStrategy::RoundUp); - - denoised.compute_at(cp_hw_output, xo) - .split(y, yio, yii, unrolly).reorder(yii, x, yio) - .unroll(yii, unrolly, TailStrategy::RoundUp) - .unroll(x, unrollx); - //.unroll(x).unroll(y); - - bool buffer_memories = true; - if (buffer_memories) { - b_r.compute_at(cp_hw_output, xo); - g_r.compute_at(cp_hw_output, xo); - b_gr.compute_at(cp_hw_output, xo); - r_gr.compute_at(cp_hw_output, xo); - b_gb.compute_at(cp_hw_output, xo); - r_gb.compute_at(cp_hw_output, xo); - r_b.compute_at(cp_hw_output, xo); - g_b.compute_at(cp_hw_output, xo); - } - - g_gr.compute_at(cp_hw_output, xo); - r_r.compute_at(cp_hw_output, xo); - b_b.compute_at(cp_hw_output, xo); - g_gb.compute_at(cp_hw_output, xo); + // demosaicked.compute_at(cp_hw_output, xo); + // demosaicked + // .split(y, yio, yii, unrolly).reorder(c, yii, x, yio) + // .unroll(c) + // .unroll(yii, unrolly, TailStrategy::RoundUp) + // .unroll(x, unrollx, TailStrategy::RoundUp); + + // denoised.compute_at(cp_hw_output, xo) + // .split(y, yio, yii, unrolly).reorder(yii, x, yio) + // .unroll(yii, unrolly, TailStrategy::RoundUp) + // .unroll(x, unrollx); + // //.unroll(x).unroll(y); + + // bool buffer_memories = true; + // if (buffer_memories) { + // b_r.compute_at(cp_hw_output, xo); + // g_r.compute_at(cp_hw_output, xo); + // b_gr.compute_at(cp_hw_output, xo); + // r_gr.compute_at(cp_hw_output, xo); + // b_gb.compute_at(cp_hw_output, xo); + // r_gb.compute_at(cp_hw_output, xo); + // r_b.compute_at(cp_hw_output, xo); + // g_b.compute_at(cp_hw_output, xo); + // } + + // g_gr.compute_at(cp_hw_output, xo); + // r_r.compute_at(cp_hw_output, xo); + // b_b.compute_at(cp_hw_output, xo); + // g_gb.compute_at(cp_hw_output, xo); - if (false) { // these buffers should not be unrolled - g_gr - .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) - .unroll(x, unrollx, TailStrategy::RoundUp) - .unroll(yii, unrolly, TailStrategy::RoundUp); - r_r - .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) - .unroll(x, unrollx, TailStrategy::RoundUp) - .unroll(yii, unrolly, TailStrategy::RoundUp); - b_b - .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) - .unroll(x, unrollx, TailStrategy::RoundUp) - .unroll(yii, unrolly, TailStrategy::RoundUp); - g_gb - .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) - .unroll(x, unrollx, TailStrategy::RoundUp) - .unroll(yii, unrolly, TailStrategy::RoundUp); - } + // if (false) { // these buffers should not be unrolled + // g_gr + // .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) + // .unroll(x, unrollx, TailStrategy::RoundUp) + // .unroll(yii, unrolly, TailStrategy::RoundUp); + // r_r + // .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) + // .unroll(x, unrollx, TailStrategy::RoundUp) + // .unroll(yii, unrolly, TailStrategy::RoundUp); + // b_b + // .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) + // .unroll(x, unrollx, TailStrategy::RoundUp) + // .unroll(yii, unrolly, TailStrategy::RoundUp); + // g_gb + // .split(y, yio, yii, unrolly, TailStrategy::RoundUp).reorder(yii, x, yio) + // .unroll(x, unrollx, TailStrategy::RoundUp) + // .unroll(yii, unrolly, TailStrategy::RoundUp); + // } - curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM - curve.store_in(MemoryType::ROM); - // unroll by x? + // curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM + // curve.store_in(MemoryType::ROM); + // // unroll by x? - cp_hw_input.split(y, yio, yii, unrolly).reorder(yii, x, yio) - .unroll(yii, unrolly, TailStrategy::RoundUp) - .unroll(x, unrollx, TailStrategy::RoundUp); + // cp_hw_input.split(y, yio, yii, unrolly).reorder(yii, x, yio) + // .unroll(yii, unrolly, TailStrategy::RoundUp) + // .unroll(x, unrollx, TailStrategy::RoundUp); - cp_hw_input.compute_at(cp_hw_output, xo); + // cp_hw_input.compute_at(cp_hw_output, xo); - // cp_hw_input.in().in().compute_at(cp_hw_output, xo); // represents the mem tile - // cp_hw_input.in().in() - // .split(y, yio, yii, unrolly).reorder(yii, x, yio) - // .unroll(yii, unrolly, TailStrategy::RoundUp) - // .unroll(x, unrollx, TailStrategy::RoundUp); + // // cp_hw_input.in().in().compute_at(cp_hw_output, xo); // represents the mem tile + // // cp_hw_input.in().in() + // // .split(y, yio, yii, unrolly).reorder(yii, x, yio) + // // .unroll(yii, unrolly, TailStrategy::RoundUp) + // // .unroll(x, unrollx, TailStrategy::RoundUp); - // cp_hw_input.in().compute_at(cp_hw_output.in(), xo); // represents the glb level - // cp_hw_input.in().store_in(MemoryType::GLB); - // cp_hw_input.in() - // .split(y, yio, yii, unrolly).reorder(yii, x, yio) - // .unroll(yii, unrolly, TailStrategy::RoundUp) - // .unroll(x, unrollx, TailStrategy::RoundUp); + // // cp_hw_input.in().compute_at(cp_hw_output.in(), xo); // represents the glb level + // // cp_hw_input.in().store_in(MemoryType::GLB); + // // cp_hw_input.in() + // // .split(y, yio, yii, unrolly).reorder(yii, x, yio) + // // .unroll(yii, unrolly, TailStrategy::RoundUp) + // // .unroll(x, unrollx, TailStrategy::RoundUp); - // cp_hw_input.compute_root() - // .accelerator_input(); + // // cp_hw_input.compute_root() + // // .accelerator_input(); - } else { - cp_output.bound(x, 0, 64-blockSize+1); - cp_output.bound(y, 0, 64-blockSize+1); + // } else { + // cp_output.bound(x, 0, 64-blockSize+1); + // cp_output.bound(y, 0, 64-blockSize+1); - cp_hw_output.compute_root(); - - cp_hw_output.tile(x, y, xo, yo, xi, yi, 64-blockSize+1,64-blockSize+1) - .reorder(c,xi,yi,xo,yo) - .reorder_storage(c, x, y) - .hw_accelerate(xi, xo); - cp_hw_output.unroll(c); - //cp_hw_output.unroll(c).unroll(xi, 2); + // cp_hw_output.compute_root(); + + // cp_hw_output.tile(x, y, xo, yo, xi, yi, 64-blockSize+1,64-blockSize+1) + // .reorder(c,xi,yi,xo,yo) + // .reorder_storage(c, x, y) + // .hw_accelerate(xi, xo); + // cp_hw_output.unroll(c); + // //cp_hw_output.unroll(c).unroll(xi, 2); - //curve_out.reorder(c, x, y).reorder_storage(c, x, y); - curve_out.compute_at(cp_hw_output, xo); - curve_out.unroll(c); + // //curve_out.reorder(c, x, y).reorder_storage(c, x, y); + // curve_out.compute_at(cp_hw_output, xo); + // curve_out.unroll(c); - color_corrected.compute_at(cp_hw_output, xo); - color_corrected.unroll(c); + // color_corrected.compute_at(cp_hw_output, xo); + // color_corrected.unroll(c); - demosaicked.compute_at(cp_hw_output, xo); - demosaicked.unroll(c); - //demosaicked.reorder(c, x, y); - - denoised.compute_at(cp_hw_output, xo); - //.unroll(x).unroll(y); - - g_gr.compute_at(cp_hw_output, xo); - r_r.compute_at(cp_hw_output, xo); - b_b.compute_at(cp_hw_output, xo); - g_gb.compute_at(cp_hw_output, xo); - - //b_r.compute_at(cp_hw_output, xo); - //g_r.compute_at(cp_hw_output, xo); - //b_gr.compute_at(cp_hw_output, xo); - //r_gr.compute_at(cp_hw_output, xo); - //b_gb.compute_at(cp_hw_output, xo); - //r_gb.compute_at(cp_hw_output, xo); - //r_b.compute_at(cp_hw_output, xo); - //g_b.compute_at(cp_hw_output, xo); + // demosaicked.compute_at(cp_hw_output, xo); + // demosaicked.unroll(c); + // //demosaicked.reorder(c, x, y); + + // denoised.compute_at(cp_hw_output, xo); + // //.unroll(x).unroll(y); + + // g_gr.compute_at(cp_hw_output, xo); + // r_r.compute_at(cp_hw_output, xo); + // b_b.compute_at(cp_hw_output, xo); + // g_gb.compute_at(cp_hw_output, xo); + + // //b_r.compute_at(cp_hw_output, xo); + // //g_r.compute_at(cp_hw_output, xo); + // //b_gr.compute_at(cp_hw_output, xo); + // //r_gr.compute_at(cp_hw_output, xo); + // //b_gb.compute_at(cp_hw_output, xo); + // //r_gb.compute_at(cp_hw_output, xo); + // //r_b.compute_at(cp_hw_output, xo); + // //g_b.compute_at(cp_hw_output, xo); - curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM + // curve.compute_at(cp_hw_output, xo).unroll(x); // synthesize curve to a ROM - //cp_hw_input_copy.compute_at(cp_hw_output, xo); - cp_hw_input.stream_to_accelerator(); - //cp_hw_input.compute_root(); - } + // //cp_hw_input_copy.compute_at(cp_hw_output, xo); + // cp_hw_input.stream_to_accelerator(); + // //cp_hw_input.compute_root(); + // } - } else { // schedule to CPU - if (schedule == 1 || schedule == 2 || schedule == 3) { - Var yii; - const int strip_size = 2; - const int vec = 4; - - cp_output - .compute_root() - .reorder(c, x, y) - .split(y, yi, yii, 2, TailStrategy::RoundUp) - .split(yi, yo, yi, strip_size / 2) - .vectorize(x, 2 * vec, TailStrategy::RoundUp) - //.unroll(c) - .parallel(yo); - - denoised - .compute_at(cp_output, yi) - //.compute_at(curve_out, yi) - .store_at(cp_output, yo) - .prefetch(input, y, 2) - //.fold_storage(y, 4) - .tile(x, y, x, y, xi, yi, 2 * vec, 2) - .vectorize(xi) - .unroll(yi); - - //max_g_gr.compute_at(output, yi); - //max_r_r.compute_at(output, yi); - //max_b_b.compute_at(output, yi); - //max_g_gb.compute_at(output, yi); + // } else { // schedule to CPU + // if (schedule == 1 || schedule == 2 || schedule == 3) { + // Var yii; + // const int strip_size = 2; + // const int vec = 4; + + // cp_output + // .compute_root() + // .reorder(c, x, y) + // .split(y, yi, yii, 2, TailStrategy::RoundUp) + // .split(yi, yo, yi, strip_size / 2) + // .vectorize(x, 2 * vec, TailStrategy::RoundUp) + // //.unroll(c) + // .parallel(yo); + + // denoised + // .compute_at(cp_output, yi) + // //.compute_at(curve_out, yi) + // .store_at(cp_output, yo) + // .prefetch(input, y, 2) + // //.fold_storage(y, 4) + // .tile(x, y, x, y, xi, yi, 2 * vec, 2) + // .vectorize(xi) + // .unroll(yi); + + // //max_g_gr.compute_at(output, yi); + // //max_r_r.compute_at(output, yi); + // //max_b_b.compute_at(output, yi); + // //max_g_gb.compute_at(output, yi); - demosaicked - .compute_at(cp_output, yi) - .store_at(cp_output, yo) - .fold_storage(y, 4) - .reorder(c, x, y) - .vectorize(x, 2 * vec, TailStrategy::RoundUp) - .unroll(c); - - curve_out - .compute_at(cp_output, yi) - //.compute_at(output, yo) - .store_at(cp_output, yo) - .reorder(c, x, y) - .tile(x, y, x, y, xi, yi, 2 * vec, 2, TailStrategy::RoundUp) - .vectorize(xi) - .unroll(yi); - //.unroll(c); - - color_corrected - .compute_at(curve_out, x) - .reorder(c, x, y) - .vectorize(x) - .unroll(c); - - //demosaicked->intermed_compute_at.set({processed, yi}); - //demosaicked->intermed_store_at.set({processed, yo}); - //demosaicked->output_compute_at.set({curved, x}); - - // We can generate slightly better code if we know the splits divide the extent. - //processed - //.bound(c, 0, 3); - //.bound(x, 0, ((out_width) / (2 * vec)) * (2 * vec)) - //.bound(y, 0, (out_height / strip_size) * strip_size); - } - } + // demosaicked + // .compute_at(cp_output, yi) + // .store_at(cp_output, yo) + // .fold_storage(y, 4) + // .reorder(c, x, y) + // .vectorize(x, 2 * vec, TailStrategy::RoundUp) + // .unroll(c); + + // curve_out + // .compute_at(cp_output, yi) + // //.compute_at(output, yo) + // .store_at(cp_output, yo) + // .reorder(c, x, y) + // .tile(x, y, x, y, xi, yi, 2 * vec, 2, TailStrategy::RoundUp) + // .vectorize(xi) + // .unroll(yi); + // //.unroll(c); + + // color_corrected + // .compute_at(curve_out, x) + // .reorder(c, x, y) + // .vectorize(x) + // .unroll(c); + + // //demosaicked->intermed_compute_at.set({processed, yi}); + // //demosaicked->intermed_store_at.set({processed, yo}); + // //demosaicked->output_compute_at.set({curved, x}); + + // // We can generate slightly better code if we know the splits divide the extent. + // //processed + // //.bound(c, 0, 3); + // //.bound(x, 0, ((out_width) / (2 * vec)) * (2 * vec)) + // //.bound(y, 0, (out_height / strip_size) * strip_size); + // } + // } } private: //Var x, y, tx, ty, xy, xi, yi, c, n;