Skip to content

Commit

Permalink
C906: fix conv2d_f32 and conv2d_f16 (#2)
Browse files Browse the repository at this point in the history
Co-authored-by: wengan.swg
  • Loading branch information
alter-xp authored and zhangwm-pt committed May 20, 2022
1 parent 2eb318a commit 8f2d250
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 86 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ riscv_elf_build/
x86_build/
install_nn2/
tools/
.vscode/
68 changes: 20 additions & 48 deletions source/c906_opt/convolution_3x3_fp16.c
Original file line number Diff line number Diff line change
Expand Up @@ -675,22 +675,11 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,

/*********************************** dot ***************************************/
// reorder input_tm1_buf
int size_input_tm2 = 0;
if (tiles >= 8) {
size_input_tm2 = 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8;
} else if (tiles >= 4) {
size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4;
} else if (tiles >= 2) {
size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2;
} else {
size_input_tm2 = 64 * tiles * in_c;
}
__fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(size_input_tm2 * sizeof(__fp16));
__fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(64 * tiles * in_c * sizeof(__fp16));

#pragma omp parallel for num_threads(1)
#pragma omp parallel for num_threads(1)
for (int r = 0; r < 64; r++) {

__fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data
__fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data

int t = 0;
for (; t + 7 < tiles; t += 8) {
Expand Down Expand Up @@ -762,7 +751,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,
);
}
for (; t + 3 < tiles; t += 4) {
__fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 8;
Expand Down Expand Up @@ -817,7 +806,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,

}
for (; t + 1 < tiles; t += 2) {
__fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 8;
Expand Down Expand Up @@ -865,7 +854,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,

}
for (; t < tiles; t++) {
__fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 8;
Expand Down Expand Up @@ -923,12 +912,10 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,
__fp16 *kernel0_tm = kernel_data + p * 64 * in_c * 8;

for (int r = 0; r < 64; r++) {

__fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel
__fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel

int t = 0;
for (; t + 7 < tiles; t += 8) {

__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

Expand Down Expand Up @@ -1004,7 +991,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,
);
}
for (; t + 3 < tiles; t += 4) {
__fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8;
__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

asm volatile(
Expand Down Expand Up @@ -1055,7 +1042,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,
);
}
for (; t + 1 < tiles; t += 2) {
__fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8;
__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

asm volatile(
Expand Down Expand Up @@ -1096,8 +1083,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input,
);
}
for (; t < tiles; t++) {

__fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8;
__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

asm volatile(
Expand Down Expand Up @@ -1789,26 +1775,15 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,

/*********************************** dot ***************************************/
// reorder input_tm1_buf
int size_input_tm2 = 0;
if (tiles >= 8) {
size_input_tm2 = 36 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8;
} else if (tiles >= 4) {
size_input_tm2 = 36 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4;
} else if (tiles >= 2) {
size_input_tm2 = 36 * (tiles / 2 + tiles % 2) * in_c * 2;
} else {
size_input_tm2 = 36 * tiles * in_c;
}
__fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(size_input_tm2 * sizeof(__fp16));
__fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(36 * tiles * in_c * sizeof(__fp16));

#pragma omp parallel for num_threads(1)
for (int r = 0; r < 36; r++) {

__fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // input_tm2 r channel data
__fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data

int t = 0;
for (; t + 7 < tiles; t += 8) {
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 8;
Expand All @@ -1830,7 +1805,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,
}
}
for (; t + 3 < tiles; t += 4) {
__fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 8;
Expand All @@ -1847,7 +1822,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,
}
}
for (; t + 1 < tiles; t += 2) {
__fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 8;
Expand All @@ -1862,7 +1837,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,

}
for (; t < tiles; t++) {
__fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data
__fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data
__fp16 *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 8;
Expand All @@ -1888,12 +1863,10 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,
__fp16 *kernel0_tm = kernel_data + p * 36 * in_c * 8; // 8 channel kernel

for (int r = 0; r < 36; r++) {

__fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // img_tm2 第r个channel
__fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel

int t = 0;
for (; t + 7 < tiles; t += 8) {

__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

Expand Down Expand Up @@ -1969,7 +1942,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,
);
}
for (; t + 3 < tiles; t += 4) {
__fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8;
__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

asm volatile(
Expand Down Expand Up @@ -2020,7 +1993,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,
);
}
for (; t + 1 < tiles; t += 2) {
__fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8;
__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

asm volatile(
Expand Down Expand Up @@ -2061,8 +2034,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input,
);
}
for (; t < tiles; t++) {

__fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8;
__fp16 *r0 = img_tm2 + t * in_c;
__fp16 *k0 = kernel0_tm + r * in_c * 8;

asm volatile(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2220,22 +2220,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,

/*********************************** dot ***************************************/
// reorder input_tm1_buf
int size_input_tm2 = 0;
if (tiles >= 8) {
size_input_tm2 = 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8;
} else if (tiles >= 4) {
size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4;
} else if (tiles >= 2) {
size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2;
} else {
size_input_tm2 = 64 * tiles * in_c;
}
float *input_tm2_buf = (float *)csi_mem_alloc(size_input_tm2 * sizeof(float));
float *input_tm2_buf = (float *)csi_mem_alloc(64 * tiles * in_c * sizeof(float));

#pragma omp parallel for num_threads(1)
for (int r = 0; r < 64; r++) {

float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data
float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data

int t = 0;
for (; t + 7 < tiles; t += 8) {
Expand Down Expand Up @@ -2305,7 +2295,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,
);
}
for (; t + 3 < tiles; t += 4) {
float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data
float *tm2 = img_tm2 + t * in_c; // img_tm2 row data
float *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 4;
Expand Down Expand Up @@ -2358,7 +2348,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,
);
}
for (; t + 1 < tiles; t += 2) {
float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data
float *tm2 = img_tm2 + t * in_c; // img_tm2 row data
float *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 4;
Expand Down Expand Up @@ -2406,7 +2396,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,

}
for (; t < tiles; t++) {
float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data
float *tm2 = img_tm2 + t * in_c; // img_tm2 row data
float *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 4;
Expand Down Expand Up @@ -2466,7 +2456,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,

for (int r = 0; r < 64; r++) {

float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel
float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel

int t = 0;
for (; t + 7 < tiles; t += 8) {
Expand Down Expand Up @@ -2546,7 +2536,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,
);
}
for (; t + 3 < tiles; t += 4) {
float *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8;
float *r0 = img_tm2 + t * in_c;
float *k0 = kernel0_tm + r * in_c * 4;

asm volatile(
Expand Down Expand Up @@ -2597,7 +2587,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,
);
}
for (; t + 1 < tiles; t += 2) {
float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8;
float *r0 = img_tm2 + t * in_c;
float *k0 = kernel0_tm + r * in_c * 4;

asm volatile(
Expand Down Expand Up @@ -2639,7 +2629,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input,
}
for (; t < tiles; t++) {

float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8;
float *r0 = img_tm2 + t * in_c;
float *k0 = kernel0_tm + r * in_c * 4;

asm volatile(
Expand Down Expand Up @@ -3320,22 +3310,12 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,

/*********************************** dot ***************************************/
// reorder input_tm1_buf
int size_input_tm2 = 0;
if (tiles >= 8) {
size_input_tm2 = 36 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8;
} else if (tiles >= 4) {
size_input_tm2 = 36 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4;
} else if (tiles >= 2) {
size_input_tm2 = 36 * (tiles / 2 + tiles % 2) * in_c * 2;
} else {
size_input_tm2 = 36 * tiles * in_c;
}
float *input_tm2_buf = (float *)csi_mem_alloc(size_input_tm2 * sizeof(float));
float *input_tm2_buf = (float *)csi_mem_alloc(36 * tiles * in_c * sizeof(float));

#pragma omp parallel for num_threads(1)
for (int r = 0; r < 36; r++) {

float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // input_tm2 r channel data
float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data

int t = 0;
for (; t + 7 < tiles; t += 8) {
Expand All @@ -3361,7 +3341,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,
}
}
for (; t + 3 < tiles; t += 4) {
float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data
float *tm2 = img_tm2 + t * in_c; // img_tm2 row data
float *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 4;
Expand All @@ -3378,7 +3358,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,
}
}
for (; t + 1 < tiles; t += 2) {
float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data
float *tm2 = img_tm2 + t * in_c; // img_tm2 row data
float *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 4;
Expand All @@ -3393,7 +3373,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,

}
for (; t < tiles; t++) {
float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; // img_tm2 row data
float *tm2 = img_tm2 + t * in_c; // img_tm2 row data
float *tm1 = input_tm1_buf;

tm1 += (r * tiles + t) * 4;
Expand All @@ -3420,7 +3400,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,

for (int r = 0; r < 36; r++) {

float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 36; // img_tm2 第r个channel
float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel

int t = 0;
for (; t + 7 < tiles; t += 8) {
Expand Down Expand Up @@ -3500,7 +3480,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,
);
}
for (; t + 3 < tiles; t += 4) {
float *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8;
float *r0 = img_tm2 + t * in_c;
float *k0 = kernel0_tm + r * in_c * 4;

asm volatile(
Expand Down Expand Up @@ -3551,7 +3531,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,
);
}
for (; t + 1 < tiles; t += 2) {
float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8;
float *r0 = img_tm2 + t * in_c;
float *k0 = kernel0_tm + r * in_c * 4;

asm volatile(
Expand Down Expand Up @@ -3593,7 +3573,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input,
}
for (; t < tiles; t++) {

float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8;
float *r0 = img_tm2 + t * in_c;
float *k0 = kernel0_tm + r * in_c * 4;

asm volatile(
Expand Down

0 comments on commit 8f2d250

Please sign in to comment.