Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use of opencl local address space instead of private in nvidia mm version #222

Open
johanneslenfers opened this issue Feb 16, 2022 · 4 comments
Labels
bug Something isn't working prio:low

Comments

@johanneslenfers
Copy link
Member

Problem

Use of private memory fails for some valid parameter configurations for the mm nvidia version if private opencl address space is used. Used to work in older versions. The performance is slightly worse if local address space is used.

                // FIXME: there seems to be a bug in AdjustArraySizesForAllocations
//                oclReduceSeq(AddressSpace.Private)(fun((p13, p14) =>
                  oclReduceSeq(AddressSpace.Local)(fun((p13, p14) =>

Reproduce

Change lines in expression. Then, execute this test. The first parameter configuration will fail if the private version is used.

Output and Performance (RTX 2070)

Local

 1. 6.081472 ms
 2. 5.368928 ms

Private

1. CODE_GENERATION_ERROR - assertion failed
2. 4.872192 ms

Todo

Find out reason for assertion failed - error.

@johanneslenfers johanneslenfers added bug Something isn't working prio:low labels Feb 16, 2022
@Bastacyclop
Copy link
Member

"execute convolution" actually fails for me.
image
result.runtime.isRight was false

Bastacyclop added a commit that referenced this issue Feb 17, 2022
@Bastacyclop
Copy link
Member

Ah, mm fails silently in the test. One problem is that loop unrolling needs to insert an if-guard, as the loop may be taken once, or never, depending on runtime values.
I've tried to enable generating an if-guard in https://github.com/rise-lang/shine/tree/issue222, codegen now works but not execution.

Generated kernel, not sure what's going wrong there, the code is pretty massive:

"__kernel __attribute__ ((reqd_work_group_size(16, 16, 1)))"
"void k0(global float* restrict output, int n205, int n204, int n206, const global float* restrict e207, const global float* restrict e208, local struct Record_32_4_float_32_256_float* restrict x1550){"
"  /* Start of moved local vars */"
"  /* End of moved local vars */"
"  /* mapWorkGroup */"
"  for (int wg_id_1614 = get_group_id(1); wg_id_1614 < (n204 / 4); wg_id_1614 = 16 + wg_id_1614) {"
"    /* mapWorkGroup */"
"    for (int wg_id_1615 = get_group_id(0); wg_id_1615 < (n205 / 256); wg_id_1615 = 64 + wg_id_1615) {"
"      /* oclReduceSeq */"
"      {"
"        float x1401[256];"
"        /* mapLocal */"
"        /* unrolling loop of 1 */"
"        if (get_local_id(1) < 4) {"
"          /* mapLocal */"
"          /* unrolling loop of 4 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[4 * get_local_id(0)] = 0.0f;"
"          x1401[1 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[2 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[3 + (4 * get_local_id(0))] = 0.0f;"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[64 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[65 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[66 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[67 + (4 * get_local_id(0))] = 0.0f;"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[128 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[129 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[130 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[131 + (4 * get_local_id(0))] = 0.0f;"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[192 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[193 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[194 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[195 + (4 * get_local_id(0))] = 0.0f;"
"        }"
"         else {"
"          /* skip */"
"        }"
"        "
"        for (int i_1616 = 0; i_1616 < (n206 / 32); i_1616 = 1 + i_1616) {"
"          /* mapLocal */"
"          for (int l_id_1617 = get_local_id(1); l_id_1617 < 2; l_id_1617 = 16 + l_id_1617) {"
"            /* mapLocal */"
"            /* iteration count is exactly 1, no loop emitted */"
"            int l_id_1618 = get_local_id(0);"
"            vstore4(vload4(0, &e207[(((4 * wg_id_1614) + ((16 * l_id_1617) * n204)) + ((32 * i_1616) * n204)) + (l_id_1618 * n204)]), 0, &x1550[0]._fst[(4 * l_id_1618) + (64 * l_id_1617)]);"
"          }"
"          "
"          /* mapLocal */"
"          for (int l_id_1619 = get_local_id(1); l_id_1619 < 32; l_id_1619 = 16 + l_id_1619) {"
"            /* mapLocal */"
"            for (int l_id_1620 = get_local_id(0); l_id_1620 < 64; l_id_1620 = 16 + l_id_1620) {"
"              vstore4(vload4(0, &e208[(((4 * l_id_1620) + ((32 * i_1616) * n205)) + (256 * wg_id_1615)) + (l_id_1619 * n205)]), 0, &x1550[0]._snd[(4 * l_id_1620) + (256 * l_id_1619)]);"
"            }"
"            "
"          }"
"          "
"          barrier(CLK_LOCAL_MEM_FENCE);"
"          /* mapLocal */"
"          /* unrolling loop of 1 */"
"          if (get_local_id(1) < 4) {"
"            /* mapLocal */"
"            /* unrolling loop of 4 */"
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[4 * get_local_id(0)];"
"              x1454[1] = x1401[1 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[2 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[3 + (4 * get_local_id(0))];"
"              for (int i_1621 = 0; i_1621 < 32; i_1621 = 1 + i_1621) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1621) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(4 * get_local_id(0)) + (256 * i_1621)];"
"                  x1504._snd[1] = x1550[0]._snd[(1 + (4 * get_local_id(0))) + (256 * i_1621)];"
"                  x1504._snd[2] = x1550[0]._snd[(2 + (4 * get_local_id(0))) + (256 * i_1621)];"
"                  x1504._snd[3] = x1550[0]._snd[(3 + (4 * get_local_id(0))) + (256 * i_1621)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[4 * get_local_id(0)] = x1454[0];"
"              x1401[1 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[2 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[3 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[64 + (4 * get_local_id(0))];"
"              x1454[1] = x1401[65 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[66 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[67 + (4 * get_local_id(0))];"
"              for (int i_1622 = 0; i_1622 < 32; i_1622 = 1 + i_1622) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1622) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(64 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  x1504._snd[1] = x1550[0]._snd[(65 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  x1504._snd[2] = x1550[0]._snd[(66 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  x1504._snd[3] = x1550[0]._snd[(67 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[64 + (4 * get_local_id(0))] = x1454[0];"
"              x1401[65 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[66 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[67 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[128 + (4 * get_local_id(0))];"
"              x1454[1] = x1401[129 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[130 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[131 + (4 * get_local_id(0))];"
"              for (int i_1623 = 0; i_1623 < 32; i_1623 = 1 + i_1623) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1623) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(128 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  x1504._snd[1] = x1550[0]._snd[(129 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  x1504._snd[2] = x1550[0]._snd[(130 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  x1504._snd[3] = x1550[0]._snd[(131 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[128 + (4 * get_local_id(0))] = x1454[0];"
"              x1401[129 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[130 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[131 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[192 + (4 * get_local_id(0))];"
"              x1454[1] = x1401[193 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[194 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[195 + (4 * get_local_id(0))];"
"              for (int i_1624 = 0; i_1624 < 32; i_1624 = 1 + i_1624) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1624) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(192 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  x1504._snd[1] = x1550[0]._snd[(193 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  x1504._snd[2] = x1550[0]._snd[(194 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  x1504._snd[3] = x1550[0]._snd[(195 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[192 + (4 * get_local_id(0))] = x1454[0];"
"              x1401[193 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[194 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[195 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"          }"
"           else {"
"            /* skip */"
"          }"
"          "
"          barrier(CLK_LOCAL_MEM_FENCE);"
"        }"
"        "
"        /* mapLocal */"
"        /* unrolling loop of 1 */"
"        if (get_local_id(1) < 4) {"
"          /* mapLocal */"
"          /* unrolling loop of 4 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[4 * get_local_id(0)], x1401[1 + (4 * get_local_id(0))], x1401[2 + (4 * get_local_id(0))], x1401[3 + (4 * get_local_id(0))]), 0, &output[((((4 * n205) * wg_id_1614) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[64 + (4 * get_local_id(0))], x1401[65 + (4 * get_local_id(0))], x1401[66 + (4 * get_local_id(0))], x1401[67 + (4 * get_local_id(0))]), 0, &output[(((64 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[128 + (4 * get_local_id(0))], x1401[129 + (4 * get_local_id(0))], x1401[130 + (4 * get_local_id(0))], x1401[131 + (4 * get_local_id(0))]), 0, &output[(((128 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[192 + (4 * get_local_id(0))], x1401[193 + (4 * get_local_id(0))], x1401[194 + (4 * get_local_id(0))], x1401[195 + (4 * get_local_id(0))]), 0, &output[(((192 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"        }"
"         else {"
"          /* skip */"
"        }"
"        "
"      }"
"      "
"    }"
"    "
"  }"
"  "
"}"
"";

@johanneslenfers
Copy link
Member Author

The mm is very heavy for non-gpu devices, so it's allowed to have an EXECUTION_ERROR. Btw. the assert to check that is missing here. If the codegen works it should be fine.

@johanneslenfers
Copy link
Member Author

The mm is very heavy for non-gpu devices, so it's allowed to have an EXECUTION_ERROR. Btw. the assert to check that is missing here. If the codegen works it should be fine.

The execution succeeds on a RTX 2070.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working prio:low
Projects
None yet
Development

No branches or pull requests

2 participants