-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use of opencl local address space instead of private in nvidia mm version #222
Comments
Ah, mm fails silently in the test. One problem is that loop unrolling needs to insert an if-guard, as the loop may be taken once, or never, depending on runtime values. Generated kernel, not sure what's going wrong there, the code is pretty massive: "__kernel __attribute__ ((reqd_work_group_size(16, 16, 1)))"
"void k0(global float* restrict output, int n205, int n204, int n206, const global float* restrict e207, const global float* restrict e208, local struct Record_32_4_float_32_256_float* restrict x1550){"
" /* Start of moved local vars */"
" /* End of moved local vars */"
" /* mapWorkGroup */"
" for (int wg_id_1614 = get_group_id(1); wg_id_1614 < (n204 / 4); wg_id_1614 = 16 + wg_id_1614) {"
" /* mapWorkGroup */"
" for (int wg_id_1615 = get_group_id(0); wg_id_1615 < (n205 / 256); wg_id_1615 = 64 + wg_id_1615) {"
" /* oclReduceSeq */"
" {"
" float x1401[256];"
" /* mapLocal */"
" /* unrolling loop of 1 */"
" if (get_local_id(1) < 4) {"
" /* mapLocal */"
" /* unrolling loop of 4 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[4 * get_local_id(0)] = 0.0f;"
" x1401[1 + (4 * get_local_id(0))] = 0.0f;"
" x1401[2 + (4 * get_local_id(0))] = 0.0f;"
" x1401[3 + (4 * get_local_id(0))] = 0.0f;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[64 + (4 * get_local_id(0))] = 0.0f;"
" x1401[65 + (4 * get_local_id(0))] = 0.0f;"
" x1401[66 + (4 * get_local_id(0))] = 0.0f;"
" x1401[67 + (4 * get_local_id(0))] = 0.0f;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[128 + (4 * get_local_id(0))] = 0.0f;"
" x1401[129 + (4 * get_local_id(0))] = 0.0f;"
" x1401[130 + (4 * get_local_id(0))] = 0.0f;"
" x1401[131 + (4 * get_local_id(0))] = 0.0f;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[192 + (4 * get_local_id(0))] = 0.0f;"
" x1401[193 + (4 * get_local_id(0))] = 0.0f;"
" x1401[194 + (4 * get_local_id(0))] = 0.0f;"
" x1401[195 + (4 * get_local_id(0))] = 0.0f;"
" }"
" else {"
" /* skip */"
" }"
" "
" for (int i_1616 = 0; i_1616 < (n206 / 32); i_1616 = 1 + i_1616) {"
" /* mapLocal */"
" for (int l_id_1617 = get_local_id(1); l_id_1617 < 2; l_id_1617 = 16 + l_id_1617) {"
" /* mapLocal */"
" /* iteration count is exactly 1, no loop emitted */"
" int l_id_1618 = get_local_id(0);"
" vstore4(vload4(0, &e207[(((4 * wg_id_1614) + ((16 * l_id_1617) * n204)) + ((32 * i_1616) * n204)) + (l_id_1618 * n204)]), 0, &x1550[0]._fst[(4 * l_id_1618) + (64 * l_id_1617)]);"
" }"
" "
" /* mapLocal */"
" for (int l_id_1619 = get_local_id(1); l_id_1619 < 32; l_id_1619 = 16 + l_id_1619) {"
" /* mapLocal */"
" for (int l_id_1620 = get_local_id(0); l_id_1620 < 64; l_id_1620 = 16 + l_id_1620) {"
" vstore4(vload4(0, &e208[(((4 * l_id_1620) + ((32 * i_1616) * n205)) + (256 * wg_id_1615)) + (l_id_1619 * n205)]), 0, &x1550[0]._snd[(4 * l_id_1620) + (256 * l_id_1619)]);"
" }"
" "
" }"
" "
" barrier(CLK_LOCAL_MEM_FENCE);"
" /* mapLocal */"
" /* unrolling loop of 1 */"
" if (get_local_id(1) < 4) {"
" /* mapLocal */"
" /* unrolling loop of 4 */"
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[4 * get_local_id(0)];"
" x1454[1] = x1401[1 + (4 * get_local_id(0))];"
" x1454[2] = x1401[2 + (4 * get_local_id(0))];"
" x1454[3] = x1401[3 + (4 * get_local_id(0))];"
" for (int i_1621 = 0; i_1621 < 32; i_1621 = 1 + i_1621) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1621) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(4 * get_local_id(0)) + (256 * i_1621)];"
" x1504._snd[1] = x1550[0]._snd[(1 + (4 * get_local_id(0))) + (256 * i_1621)];"
" x1504._snd[2] = x1550[0]._snd[(2 + (4 * get_local_id(0))) + (256 * i_1621)];"
" x1504._snd[3] = x1550[0]._snd[(3 + (4 * get_local_id(0))) + (256 * i_1621)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[4 * get_local_id(0)] = x1454[0];"
" x1401[1 + (4 * get_local_id(0))] = x1454[1];"
" x1401[2 + (4 * get_local_id(0))] = x1454[2];"
" x1401[3 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[64 + (4 * get_local_id(0))];"
" x1454[1] = x1401[65 + (4 * get_local_id(0))];"
" x1454[2] = x1401[66 + (4 * get_local_id(0))];"
" x1454[3] = x1401[67 + (4 * get_local_id(0))];"
" for (int i_1622 = 0; i_1622 < 32; i_1622 = 1 + i_1622) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1622) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(64 + (4 * get_local_id(0))) + (256 * i_1622)];"
" x1504._snd[1] = x1550[0]._snd[(65 + (4 * get_local_id(0))) + (256 * i_1622)];"
" x1504._snd[2] = x1550[0]._snd[(66 + (4 * get_local_id(0))) + (256 * i_1622)];"
" x1504._snd[3] = x1550[0]._snd[(67 + (4 * get_local_id(0))) + (256 * i_1622)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[64 + (4 * get_local_id(0))] = x1454[0];"
" x1401[65 + (4 * get_local_id(0))] = x1454[1];"
" x1401[66 + (4 * get_local_id(0))] = x1454[2];"
" x1401[67 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[128 + (4 * get_local_id(0))];"
" x1454[1] = x1401[129 + (4 * get_local_id(0))];"
" x1454[2] = x1401[130 + (4 * get_local_id(0))];"
" x1454[3] = x1401[131 + (4 * get_local_id(0))];"
" for (int i_1623 = 0; i_1623 < 32; i_1623 = 1 + i_1623) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1623) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(128 + (4 * get_local_id(0))) + (256 * i_1623)];"
" x1504._snd[1] = x1550[0]._snd[(129 + (4 * get_local_id(0))) + (256 * i_1623)];"
" x1504._snd[2] = x1550[0]._snd[(130 + (4 * get_local_id(0))) + (256 * i_1623)];"
" x1504._snd[3] = x1550[0]._snd[(131 + (4 * get_local_id(0))) + (256 * i_1623)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[128 + (4 * get_local_id(0))] = x1454[0];"
" x1401[129 + (4 * get_local_id(0))] = x1454[1];"
" x1401[130 + (4 * get_local_id(0))] = x1454[2];"
" x1401[131 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[192 + (4 * get_local_id(0))];"
" x1454[1] = x1401[193 + (4 * get_local_id(0))];"
" x1454[2] = x1401[194 + (4 * get_local_id(0))];"
" x1454[3] = x1401[195 + (4 * get_local_id(0))];"
" for (int i_1624 = 0; i_1624 < 32; i_1624 = 1 + i_1624) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1624) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(192 + (4 * get_local_id(0))) + (256 * i_1624)];"
" x1504._snd[1] = x1550[0]._snd[(193 + (4 * get_local_id(0))) + (256 * i_1624)];"
" x1504._snd[2] = x1550[0]._snd[(194 + (4 * get_local_id(0))) + (256 * i_1624)];"
" x1504._snd[3] = x1550[0]._snd[(195 + (4 * get_local_id(0))) + (256 * i_1624)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[192 + (4 * get_local_id(0))] = x1454[0];"
" x1401[193 + (4 * get_local_id(0))] = x1454[1];"
" x1401[194 + (4 * get_local_id(0))] = x1454[2];"
" x1401[195 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" }"
" else {"
" /* skip */"
" }"
" "
" barrier(CLK_LOCAL_MEM_FENCE);"
" }"
" "
" /* mapLocal */"
" /* unrolling loop of 1 */"
" if (get_local_id(1) < 4) {"
" /* mapLocal */"
" /* unrolling loop of 4 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[4 * get_local_id(0)], x1401[1 + (4 * get_local_id(0))], x1401[2 + (4 * get_local_id(0))], x1401[3 + (4 * get_local_id(0))]), 0, &output[((((4 * n205) * wg_id_1614) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[64 + (4 * get_local_id(0))], x1401[65 + (4 * get_local_id(0))], x1401[66 + (4 * get_local_id(0))], x1401[67 + (4 * get_local_id(0))]), 0, &output[(((64 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[128 + (4 * get_local_id(0))], x1401[129 + (4 * get_local_id(0))], x1401[130 + (4 * get_local_id(0))], x1401[131 + (4 * get_local_id(0))]), 0, &output[(((128 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[192 + (4 * get_local_id(0))], x1401[193 + (4 * get_local_id(0))], x1401[194 + (4 * get_local_id(0))], x1401[195 + (4 * get_local_id(0))]), 0, &output[(((192 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" }"
" else {"
" /* skip */"
" }"
" "
" }"
" "
" }"
" "
" }"
" "
"}"
""; |
The mm is very heavy for non-gpu devices, so it's allowed to have an |
The execution succeeds on a RTX 2070. |
Problem
Use of private memory fails for some valid parameter configurations for the mm nvidia version if private opencl address space is used. Used to work in older versions. The performance is slightly worse if local address space is used.
Reproduce
Change lines in expression. Then, execute this test. The first parameter configuration will fail if the private version is used.
Output and Performance (RTX 2070)
Local
Private
Todo
Find out reason for
assertion failed
- error.The text was updated successfully, but these errors were encountered: