A brief introduction to Lab B: loop reorder.
Computers usually have several layers of cache between the processor and main memory, collectively called the memory hierarchy. If you access data sequentially, so that the values you need are already in the cache, you get better performance than with random access. In C/C++, arrays are stored in row-major order, which means that all the values of one row of a matrix are adjacent to each other in memory. In the original loop order (i, j, k), the innermost loop runs over k, so k increases on every iteration. Each iteration of that innermost loop is therefore likely to have a cache miss when loading B[k][j]: every time k increases, the access skips an entire row of the matrix and jumps to a distant memory address, which may lie outside the data currently held in the cache. Now consider swapping the order of the j and k loops. In each iteration of the new innermost loop (over j), because the values are stored in row order, C[i][j] is likely to be in the cache. Similarly, B[k][j] is likely to be cached already, and since i and k do not change, A[i][k] is likely to be in the cache as well. This means that there are far fewer cache misses per iteration of the inner loop.
Pragmas used
- #pragma HLS UNROLL
- #pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
- #pragma HLS ARRAY_PARTITION variable = B dim = 2 complete
- #pragma HLS ARRAY_PARTITION variable = C dim = 2 complete
- #pragma HLS ARRAY_PARTITION variable = temp_sum dim = 1 complete
Read A, B
// Copy size * size values from in1 into A, one value per iteration.
// itr is the flat index into in1; i and j are the row and column written in A.
for (int itr = 0, i = 0, j = 0; itr < size * size; itr++, j++) {
#pragma HLS LOOP_TRIPCOUNT min = c_size* c_size max = c_size * c_size
// When j reaches size the current row is full: wrap j to 0 and advance to the next row.
if (j == size) {
j = 0;
i++;
}
A[i][j] = in1[itr];
}
// Copy size * size values from in2 into B, one value per iteration.
// itr is the flat index into in2; i and j are the row and column written in B.
readB:
for (int itr = 0, i = 0, j = 0; itr < size * size; itr++, j++) {
#pragma HLS LOOP_TRIPCOUNT min = c_size* c_size max = c_size * c_size
// When j reaches size the current row is full: wrap j to 0 and advance to the next row.
if (j == size) {
j = 0;
i++;
}
B[i][j] = in2[itr];
}
Loop Reorder
// Multiply A by B into C with the loops nested in i, k, j order, so the
// innermost loop steps through column index j of B[k][j] and C[i][j] while
// A[i][k] stays fixed.
for (int i = 0; i < size; i++) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
lreorder2:
for (int k = 0; k < size; k++) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
lreorder3:
// NOTE(review): the bound here is MAX_SIZE, not size, so when size < MAX_SIZE
// this also touches columns j >= size that the read loops never filled.
// Presumably this gives the loop a constant trip count for unrolling - confirm.
for (int j = 0; j < MAX_SIZE; j++) {
// On the first k start the partial sum at 0; otherwise continue from temp_sum[j].
int result = (k == 0) ? 0 : temp_sum[j];
result += A[i][k] * B[k][j];
temp_sum[j] = result;
// The sum for C[i][j] is complete only after the last k, so store it then.
if (k == size - 1) C[i][j] = result;
}
}
}
Write C
// Copy size * size values from C into out_r, one value per iteration.
// itr is the flat index into out_r; i and j are the row and column read from C.
for (int itr = 0, i = 0, j = 0; itr < size * size; itr++, j++) {
#pragma HLS LOOP_TRIPCOUNT min = c_size* c_size max = c_size * c_size
// When j reaches size the current row is done: wrap j to 0 and advance to the next row.
if (j == size) {
j = 0;
i++;
}
out_r[itr] = C[i][j];
}
Refer to the file below; in step 3.2.2, change the argument to 'binary_container_1.xclbin'.
2021.2-Workbook-Lab3.pdf