diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c
index 2c2bbf9a0c..49755e4ad6 100644
--- a/src/gpuarray_reduction.c
+++ b/src/gpuarray_reduction.c
@@ -23,6 +23,10 @@
 /* Defines */
 #define MAX_HW_DIMS      3
+#define KERNEL_PRIMARY   0
+#define KERNEL_AUXILIARY 1
+#define AXIS_FREE        0
+#define AXIS_REDUX       1
@@ -185,7 +189,8 @@ struct redux_ctx{
	const int*  reduxList;

	/* General. */
-	int*        axisList;
+	int*        srcAxisList;
+	int*        dstAxisList;
	gpucontext* gpuCtx;

	/* Source code Generator. */
@@ -203,21 +208,39 @@
	int         ndd;
	int         ndr;
	int         nds;
-	int         ndh;
-	int         ndhd;
-	int         ndhr;
	int         largeCodeModel;
	strb        s;
	char*       sourceCode;
+	size_t      sourceCodeLen;
+	char*       errorString0;
+	char*       errorString1;
+	char*       errorString2;
	GpuKernel   preKernel;
	GpuKernel   kernel;
	GpuKernel   postKernel;

-	/* Scheduler */
-	int         hwAxisList[MAX_HW_DIMS];
-	size_t      blockSize [MAX_HW_DIMS];
-	size_t      gridSize  [MAX_HW_DIMS];
-	size_t      chunkSize [MAX_HW_DIMS];
+	/**
+	 * Scheduler
+	 *
+	 * There are two sets of kernels that may be scheduled:
+	 *   1) The reduction kernel. This is the only kernel scheduled in the
+	 *      large code model.
+	 *   2) The initialization and post-scalar kernels. These are scheduled
+	 *      only in the small code model.
+	 *
+	 * The reduction kernel is the "primary" kernel. The other two, if needed,
+	 * are referred to as "auxiliary" kernels.
+	 */
+
+	struct{
+		int    ndh;
+		int    ndhd;
+		int    ndhr;
+		int    axisList[MAX_HW_DIMS];
+		size_t bs      [MAX_HW_DIMS];
+		size_t gs      [MAX_HW_DIMS];
+		size_t cs      [MAX_HW_DIMS];
+	} pri, aux;

	/* Invoker */
	gpudata*    srcStepsGD;
@@ -257,8 +280,12 @@
 static int   reduxHasDst                   (redux_ctx* ctx);
 static int   reduxHasDstArg                (redux_ctx* ctx);
 static int   reduxKernelRequiresDst        (redux_ctx* ctx);
 static int   reduxKernelRequiresDstArg     (redux_ctx* ctx);
-static int   reduxCanAppendHwAxis          (redux_ctx* ctx, int wantReductionAxis);
-static void  reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis);
+static int   reduxCanAppendHwAxis          (redux_ctx* ctx,
+                                            int        kernelType,
+                                            int        axisType);
+static void  reduxAppendLargestAxisToHwList(redux_ctx* ctx,
+                                            int        kernelType,
+                                            int        axisType);
 static int   reduxSelectHwAxes             (redux_ctx* ctx);
 static int   reduxComputeAxisList          (redux_ctx* ctx);
 static int   reduxGenSource                (redux_ctx* ctx);
@@ -279,10 +306,19 @@
 static void  reduxAppendLoopMacroDefs      (redux_ctx* ctx);
 static void  reduxAppendLoopOuter          (redux_ctx* ctx);
 static void  reduxAppendLoopInner          (redux_ctx* ctx);
 static void  reduxAppendLoopMacroUndefs    (redux_ctx* ctx);
-static int   reduxCompileLarge             (redux_ctx* ctx);
-static int   reduxCompileSmall             (redux_ctx* ctx);
-static int   reduxScheduleLarge            (redux_ctx* ctx);
-static int   reduxInvokeLarge              (redux_ctx* ctx);
+static int   reduxCompile                  (redux_ctx* ctx);
+static int   reduxSchedule                 (redux_ctx* ctx);
+static void  reduxScheduleKernel           (int        ndims,
+                                            uint64_t*  dims,
+                                            uint64_t   warpSize,
+                                            uint64_t   maxLg,
+                                            uint64_t*  maxLs,
+                                            uint64_t   maxGg,
+                                            uint64_t*  maxGs,
+                                            uint64_t*  bs,
+                                            uint64_t*  gs,
+                                            uint64_t*  cs);
+static int   reduxInvoke                   (redux_ctx* ctx);
 static int   reduxCleanup                  (redux_ctx* ctx, int ret);
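The scheduler comment above hinges on the split between free (destination) axes and reduction axes, with pri and aux tracking which kind each kernel gets. As a concrete illustration, here is a standalone C sketch with made-up shapes; the hypothetical axis_is_reduced helper plays the role this file gives to axisInSet:

/* Standalone sketch (not from the patch) of the free/reduction axis split. */
#include <stdio.h>
#include <stddef.h>

static int axis_is_reduced(int axis, const int *reduxList, int ndr){
	int i;
	for (i = 0; i < ndr; i++){
		if (reduxList[i] == axis){ return 1; }
	}
	return 0;
}

int main(void){
	/* Hypothetical example: reduce a (64, 32, 16) tensor over axis 1. */
	size_t dims[3]  = {64, 32, 16};
	int    redux[1] = {1};
	int    nds = 3, ndr = 1, i;

	printf("free axes      :");
	for (i = 0; i < nds; i++){
		if (!axis_is_reduced(i, redux, ndr)){ printf(" %d(len %zu)", i, dims[i]); }
	}
	printf("\nreduction axes :");
	for (i = 0; i < nds; i++){
		if (axis_is_reduced(i, redux, ndr)){ printf(" %d(len %zu)", i, dims[i]); }
	}
	printf("\n");
	return 0;
}

With this input, the large code model would map the free axes 0 and 2 onto the single reduction kernel's hardware dimensions, while the small code model would give reduction axis 1 to the primary kernel and the free axes to the auxiliary initialization/post-scalar kernels.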
@@ -748,23 +784,28 @@ static int reduxCheckargs (redux_ctx* ctx){
	 * We initialize certain parts of the context.
	 */

-	ctx->axisList      = NULL;
+	ctx->srcAxisList   = NULL;
+	ctx->dstAxisList   = NULL;
	ctx->gpuCtx        = NULL;
	ctx->srcTypeStr    = ctx->dstTypeStr = ctx->dstArgTypeStr =
	ctx->accTypeStr    = ctx->idxTypeStr = NULL;
	ctx->initVal       = NULL;
-	ctx->ndh           = 0;
-	ctx->ndhd          = 0;
-	ctx->ndhr          = 0;
+	ctx->pri.ndh       = ctx->aux.ndh  = 0;
+	ctx->pri.ndhd      = ctx->aux.ndhd = 0;
+	ctx->pri.ndhr      = ctx->aux.ndhr = 0;
	ctx->sourceCode    = NULL;
+	ctx->sourceCodeLen = 0;
+	ctx->errorString0  = NULL;
+	ctx->errorString1  = NULL;
+	ctx->errorString2  = NULL;
	strb_init(&ctx->s);
	for (i=0;i<MAX_HW_DIMS;i++){
-		ctx->hwAxisList[i] = 0;
-		ctx->blockSize [i] = 1;
-		ctx->gridSize  [i] = 1;
-		ctx->chunkSize [i] = 1;
+		ctx->aux.axisList[i] = ctx->pri.axisList[i] = 0;
+		ctx->aux.bs      [i] = ctx->pri.bs      [i] = 1;
+		ctx->aux.gs      [i] = ctx->pri.gs      [i] = 1;
+		ctx->aux.cs      [i] = ctx->pri.cs      [i] = 1;
	}
	ctx->srcStepsGD    = ctx->srcSizeGD = ctx->chunkSizeGD =
@@ -1053,75 +1094,108 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){
 }

 /**
- * @brief Check whether we can add another reduction axis
- *        (wantReductionAxis=1) or destination axis (wantReductionAxis=0) to
- *        the hardware axis list.
+ * @brief Check whether we can add another reduction axis or free axis
+ *        to the hardware axis list for either the primary or auxiliary kernel.
  */

-static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){
-	if (ctx->ndh >= MAX_HW_DIMS){
+static int reduxCanAppendHwAxis (redux_ctx* ctx,
+                                 int        kernelType,
+                                 int        axisType){
+	int kernelNdh  = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh  : ctx->aux.ndh;
+	int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr;
+	int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd;
+
+	if (kernelNdh >= MAX_HW_DIMS){
		return 0;
	}else{
-		return wantReductionAxis ? ctx->ndhr < ctx->ndr:
-		                           ctx->ndhd < ctx->ndd;
+		return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr:
+		                                kernelNdhd < ctx->ndd;
	}
 }

 /**
- * @brief Append the largest reduction axis (wantReductionAxis=1) or
- *        destination axis (wantReductionAxis=0) that isn't yet in the hardware
- *        axis list into said hardware axis list.
+ * @brief Append the largest reduction axis or free axis that isn't yet
+ *        in the hardware axis list for either the primary or auxiliary kernel
+ *        into said hardware axis list.
  */

-static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){
+static void reduxAppendLargestAxisToHwList(redux_ctx* ctx,
+                                           int        kernelType,
+                                           int        axisType){
	int    maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar;
-	size_t maxV = 0;
+	int*   hwAxisList, * ndh, * ndhr, * ndhd;
+	size_t v, maxV = 0;
+
+	/* Get pointers to the correct kernel's variables */
+	hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList:
+	                                            ctx->aux.axisList;
+	ndh        = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh:
+	                                            &ctx->aux.ndh;
+	ndhr       = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr:
+	                                            &ctx->aux.ndhr;
+	ndhd       = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd:
+	                                            &ctx->aux.ndhd;

	/* Find */
	for (i=0;i<ctx->nds;i++){
-		isInHwList      = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0);
-		isInReduxList   = axisInSet(i, ctx->reduxList,  ctx->ndr, 0);
-		isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList;
-		isLargestSoFar  = ctx->src->dimensions[i] >= maxV;
+		isInHwList      = axisInSet(i, hwAxisList,     *ndh,     0);
+		isInReduxList   = axisInSet(i, ctx->reduxList, ctx->ndr, 0);
+		isInDesiredList = axisType == AXIS_REDUX ?  isInReduxList:
+		                                           !isInReduxList;
+		v               = ctx->src->dimensions[i];
+		isLargestSoFar  = v >= maxV;
		if (!isInHwList && isInDesiredList && isLargestSoFar){
-			maxV = ctx->src->dimensions[i];
+			maxV = v;
			maxI = i;
		}
	}

	/* Append */
-	ctx->hwAxisList[ctx->ndh++] = maxI;
-	if (wantReductionAxis){
-		ctx->ndhr++;
+	hwAxisList[(*ndh)++] = maxI;
+	if (axisType == AXIS_REDUX){
+		(*ndhr)++;
	}else{
-		ctx->ndhd++;
+		(*ndhd)++;
	}
 }

 /**
  * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware
- *        dimensions.
+ *        dimensions for both the primary and auxiliary kernels.
+ *
+ *        LARGE code model: Up to the MAX_HW_DIMS largest free axes are selected.
+ *                          Because the primary reduction kernel does everything,
+ *                          it's not necessary to compute an auxiliary kernel axis
+ *                          selection (or at least, one distinct from the primary
+ *                          kernel's).
- *
- *        For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor
- *                                    dimensions are selected.
- *        For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest-
- *                                    to-smallest) are selected. If less than
- *                                    MAX_HW_DIMS dimensions were selected,
- *                                    destination tensor dimensions are selected until
- *                                    MAX_HW_DIMS total dimensions are selected, or no
- *                                    destination tensors are left.
+ *
+ *        SMALL code model: For the primary reduction kernel, up to MAX_HW_DIMS
+ *                          reduction axes (largest-to-smallest) are selected. If
+ *                          less than MAX_HW_DIMS axes were selected, free axes are
+ *                          selected until MAX_HW_DIMS total axes are selected, or
+ *                          no free axes are left.
+ *
+ *                          For the auxiliary reduction kernel, up to the MAX_HW_DIMS
+ *                          largest free axes are selected.
  */

 static int reduxSelectHwAxes (redux_ctx* ctx){
-	if (reduxIsSmallCodeModel(ctx)){
-		while (reduxCanAppendHwAxis(ctx, 1)){
-			reduxAppendLargestAxisToHwList(ctx, 1);
+	if (reduxIsLargeCodeModel(ctx)){
+		while (reduxCanAppendHwAxis           (ctx, KERNEL_PRIMARY,   AXIS_FREE)){
+			reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY,   AXIS_FREE);
+		}
+	}else{
+		while (reduxCanAppendHwAxis           (ctx, KERNEL_PRIMARY,   AXIS_REDUX)){
+			reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY,   AXIS_REDUX);
+		}
+		while (reduxCanAppendHwAxis           (ctx, KERNEL_PRIMARY,   AXIS_FREE)){
+			reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY,   AXIS_FREE);
		}
-	}
-	while (reduxCanAppendHwAxis(ctx, 0)){
-		reduxAppendLargestAxisToHwList(ctx, 0);
+		while (reduxCanAppendHwAxis           (ctx, KERNEL_AUXILIARY, AXIS_FREE)){
+			reduxAppendLargestAxisToHwList(ctx, KERNEL_AUXILIARY, AXIS_FREE);
+		}
	}

	return reduxComputeAxisList(ctx);
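reduxSelectHwAxes is a greedy pass: repeatedly take the largest not-yet-selected axis of the wanted kind until MAX_HW_DIMS axes are chosen or that kind runs out. A minimal self-contained sketch of the same loop, with hypothetical dims and reduction lists, behaves like the small-code-model selection described above:

/* Greedy selection sketch: pick up to MAX_HW of the largest axes of one kind. */
#include <stdio.h>
#include <stddef.h>

#define MAX_HW 3

static int in_list(int v, const int *l, int n){
	int i;
	for (i = 0; i < n; i++){ if (l[i] == v) return 1; }
	return 0;
}

static void pick_largest(const size_t *dims, int nds,
                         const int *redux, int ndr,
                         int wantRedux, int *sel, int *nsel){
	while (*nsel < MAX_HW){
		int    i, best = -1;
		size_t bestV = 0;
		for (i = 0; i < nds; i++){
			int isRedux = in_list(i, redux, ndr);
			if (in_list(i, sel, *nsel))         continue; /* already chosen */
			if (wantRedux ? !isRedux : isRedux) continue; /* wrong kind     */
			if (best < 0 || dims[i] >= bestV){ best = i; bestV = dims[i]; }
		}
		if (best < 0) break;                              /* none left      */
		sel[(*nsel)++] = best;
	}
}

int main(void){
	size_t dims[4]  = {8, 1024, 7, 500};
	int    redux[2] = {1, 3};                 /* reduce axes 1 and 3 */
	int    sel[MAX_HW], nsel = 0, i;

	pick_largest(dims, 4, redux, 2, 1, sel, &nsel); /* reduction axes first */
	pick_largest(dims, 4, redux, 2, 0, sel, &nsel); /* then free axes       */
	for (i = 0; i < nsel; i++) printf("hw dim %d -> axis %d\n", i, sel[i]);
	return 0;
}

Running it prints hw dim 0 -> axis 1, hw dim 1 -> axis 3, hw dim 2 -> axis 0: the two reduction axes first (largest to smallest), then the largest remaining free axis.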
@@ -1146,17 +1220,17 @@ static int reduxSelectHwAxes (redux_ctx* ctx){
 static int reduxComputeAxisList (redux_ctx* ctx){
	int i, f=0;

-	ctx->axisList    = malloc(ctx->nds * sizeof(unsigned));
-	if (!ctx->axisList){
+	ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned));
+	if (!ctx->srcAxisList){
		return reduxCleanup(ctx, GA_MEMORY_ERROR);
	}

	for (i=0;i<ctx->nds;i++){
		if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){
-			ctx->axisList[f++] = i;
+			ctx->srcAxisList[f++] = i;
		}
	}
-	memcpy(&ctx->axisList[f],    ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList));
+	memcpy(&ctx->srcAxisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList));

	return reduxGenSource(ctx);
@@ -1170,49 +1244,50 @@ static int reduxComputeAxisList (redux_ctx* ctx){
 static int reduxGenSource (redux_ctx* ctx){
	reduxAppendSource(ctx);
-	ctx->sourceCode    = strb_cstr(&ctx->s);
+	ctx->sourceCodeLen = ctx->s.l;
+	ctx->sourceCode    = strb_cstr(&ctx->s);
	if (!ctx->sourceCode){
		return reduxCleanup(ctx, GA_MEMORY_ERROR);
	}

-	return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx):
-	                                    reduxCompileSmall(ctx);
+	return reduxCompile(ctx);
 }
 static void reduxAppendSource (redux_ctx* ctx){
	reduxAppendTypedefs        (ctx);
	reduxAppendFuncGetInitVal  (ctx);
	reduxAppendFuncLoadVal     (ctx);
	reduxAppendFuncReduxVal    (ctx);
-	reduxAppendFuncPreKernel   (ctx);
+	if (reduxIsSmallCodeModel(ctx)){
+		reduxAppendFuncPreKernel   (ctx);
+		reduxAppendFuncPostKernel  (ctx);
+	}
	reduxAppendFuncKernel      (ctx);
-	reduxAppendFuncPostKernel  (ctx);
 }
 static void reduxAppendTypedefs (redux_ctx* ctx){
-	strb_appends(&ctx->s, "/* Typedefs */\n");
-	strb_appendf(&ctx->s, "typedef %s S;/* The type of the source array. */\n", ctx->srcTypeStr);
-	strb_appendf(&ctx->s, "typedef %s T;/* The type of the destination array. */\n", ctx->dstTypeStr);
-	strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr);
-	strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr);
-	strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. */\n", ctx->accTypeStr);
-	strb_appends(&ctx->s, "\n\n\n");
+	strb_appendf(&ctx->s, "typedef %s S;\n", ctx->srcTypeStr);   /* The type of the source array. */
+	strb_appendf(&ctx->s, "typedef %s T;\n", ctx->dstTypeStr);   /* The type of the destination array. */
+	strb_appendf(&ctx->s, "typedef %s A;\n", ctx->dstArgTypeStr);/* The type of the destination argument array. */
+	strb_appendf(&ctx->s, "typedef %s X;\n", ctx->idxTypeStr);   /* The type of the indices: signed 32/64-bit. */
+	strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr);   /* The type of the accumulator variable. */
 }
 static void reduxAppendFuncGetInitVal (redux_ctx* ctx){
-	strb_appends(&ctx->s, "/**\n");
-	strb_appends(&ctx->s, " * Initial value function.\n");
-	strb_appends(&ctx->s, " */\n\n");
-	strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n");
-	strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal);
-	strb_appends(&ctx->s, "}\n\n\n\n");
+	/**
+	 * Initial value function.
+	 */
+
+	strb_appendf(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"
+	                      "\treturn (%s);\n"
+	                      "}\n\n\n\n", ctx->initVal);
 }
 static void reduxAppendFuncLoadVal (redux_ctx* ctx){
	int i;

-	strb_appends(&ctx->s, "/**\n");
-	strb_appends(&ctx->s, " * Multidimensional source element loader.\n");
-	strb_appends(&ctx->s, " *\n");
-	strb_appends(&ctx->s, " * Also implements prescalar transformations if any.\n");
-	strb_appends(&ctx->s, " */\n");
-	strb_appends(&ctx->s, "\n");
+	/**
+	 * Multidimensional source element loader.
+	 *
+	 * Also implements prescalar transformations if any.
	 */
+
	appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", "");
	if (ctx->nds > 0){
		strb_appends(&ctx->s, ", ");
	}
@@ -1220,7 +1295,7 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){
	strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n");
	strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + ");
	for (i=0;i<ctx->nds;i++){
-		strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]);
+		strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->srcAxisList[i]);
	}
	strb_appends(&ctx->s, "0));\n");
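For orientation, the loadVal generator above emits code of roughly the following shape for a 2-D source. This is a hand-written approximation, not the generator's literal output (the real output indexes srcSteps through srcAxisList and is assembled with strb_appendf), and the CLUDA macros are stubbed so the sketch builds as ordinary C:

/* Approximate shape of the generated loader, compiled as plain C. */
#include <stdio.h>
#include <stdint.h>

#define WITHIN_KERNEL static
#define GLOBAL_MEM            /* device address-space qualifier in CLUDA */

typedef float   S;            /* source element type */
typedef float   K;            /* accumulator type    */
typedef int64_t X;            /* index type          */

WITHIN_KERNEL K loadVal(X i0, X i1,
                        const GLOBAL_MEM S* src,
                        const GLOBAL_MEM X* srcSteps){
	/* srcSteps[] holds byte strides; the real code indexes them via srcAxisList. */
	S v = *(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src +
	                             i0*srcSteps[0] +
	                             i1*srcSteps[1]);
	return (K)v;              /* pre-scalar transformations would go here */
}

int main(void){
	S data[2][3] = {{1, 2, 3}, {4, 5, 6}};
	X steps[2]   = {3*sizeof(S), sizeof(S)};   /* row-major byte strides */
	printf("%g\n", (double)loadVal(1, 2, &data[0][0], steps)); /* prints 6 */
	return 0;
}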
@@ -1233,15 +1308,16 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){
 static void reduxAppendFuncReduxVal (redux_ctx* ctx){
	int i, anyArgsEmitted = 0;

-	/* Function Signature. */
-	strb_appends(&ctx->s, "/**\n");
-	strb_appends(&ctx->s, " * Global memory value reduction function.\n");
-	strb_appends(&ctx->s, " *\n");
-	strb_appends(&ctx->s, " * Responsible for either:\n");
-	strb_appends(&ctx->s, " *   1) Safe writeback of final value to memory, or\n");
-	strb_appends(&ctx->s, " *   2) Safe atomic reduction of partial value into memory.\n");
-	strb_appends(&ctx->s, " */\n");
-	strb_appends(&ctx->s, "\n");
+	/**
+	 * Function Signature.
+	 *
+	 * Global memory value reduction function.
+	 *
+	 * Responsible for either:
+	 *   1) Safe writeback of final value to memory, or
+	 *   2) Safe atomic reduction of partial value into memory.
+	 */
+
	appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", "");
	anyArgsEmitted = ctx->ndd>0;
	if (reduxKernelRequiresDst (ctx)){
@@ -1347,11 +1423,11 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){
	strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n");
	strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n");
	strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n");
-	if (ctx->ndh>0){
+	if (ctx->pri.ndh>0){
		strb_appends(&ctx->s, "\tX ");
-		for (i=0;i<ctx->ndh;i++){
+		for (i=0;i<ctx->pri.ndh;i++){
			strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s",
-			             i, i, (i==ctx->ndh-1)     ? ";\n" : ", ");
+			             i, i, (i==ctx->pri.ndh-1) ? ";\n" : ", ");
		}
	}

@@ -1377,10 +1453,10 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){
	strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n");

	for (i=0;i<ctx->nds;i++){
-		strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]);
+		strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]);
	}
	for (i=0;i<ctx->nds;i++){
-		strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]);
+		strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]);
	}
	for (i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i);
	}
@@ -1406,7 +1482,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){
		 * The others, if any, have to use software looping beginning at 0.
		 */

-		if (axisInSet(ctx->axisList[i],    ctx->hwAxisList,   ctx->ndh,     &hwDim)){
+		if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i);
		}
@@ -1418,7 +1494,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){
		 * The others, if any, have to use software looping beginning at 0.
		 */

-		if (axisInSet(ctx->axisList[i],    ctx->hwAxisList,   ctx->ndh,     &hwDim)){
+		if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i);
		}
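The range calculations above give each hardware-mapped axis a per-thread window: i%dStart = gi * ci and i%dEnd = i%dStart + ci, where gi is the global thread index along that axis and ci the chunk size. The host-side sketch below walks those windows; the ceil-divide chunk size and the final clamp are stand-ins for what the scheduler and the generated loops actually do:

/* Host-side sketch of the per-thread chunk windows along one hardware axis. */
#include <stdio.h>
#include <stddef.h>

int main(void){
	size_t dim = 100, bs = 4, gs = 3;          /* threads per block, blocks   */
	size_t nthreads = bs*gs;
	size_t cs = (dim + nthreads - 1)/nthreads; /* chunk per thread (assumed)  */
	size_t gi;

	for (gi = 0; gi < nthreads; gi++){
		size_t start = gi*cs;                  /* i%dStart = gi%d * ci%d      */
		size_t end   = start + cs;             /* i%dEnd   = i%dStart + ci%d  */
		if (end > dim) end = dim;              /* clamped here for the sketch */
		printf("thread %2zu: [%3zu, %3zu)\n", gi, start, end);
	}
	return 0;
}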
@@ -1618,119 +1694,255 @@ static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){
 /**
  * @brief Compile the kernel from source code.
- *
- * @return
  */

-static int reduxCompileLarge (redux_ctx* ctx){
-	const int    ARG_TYPECODES[]   = {
-		GA_BUFFER, /* src */
-		GA_SIZE,   /* srcOff */
-		GA_BUFFER, /* srcSteps */
-		GA_BUFFER, /* srcSize */
-		GA_BUFFER, /* chnkSize */
-		GA_BUFFER, /* dst */
-		GA_SIZE,   /* dstOff */
-		GA_BUFFER, /* dstSteps */
-		GA_BUFFER, /* dstArg */
-		GA_SIZE,   /* dstArgOff */
-		GA_BUFFER  /* dstArgSteps */
-	};
-	const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES);
-	const char*  SRCS[1]           = {ctx->sourceCode};
-	const size_t SRC_LENS[1]       = {strlen(ctx->sourceCode)};
-	const size_t SRCS_LEN          = sizeof(SRCS)/sizeof(*SRCS);
-
-	int ret = GpuKernel_init(&ctx->kernel,
-	                         ctx->gpuCtx,
-	                         SRCS_LEN,
-	                         SRCS,
-	                         SRC_LENS,
-	                         "redux",
-	                         ARG_TYPECODES_LEN,
-	                         ARG_TYPECODES,
-	                         GA_USE_CLUDA,
-	                         (char**)0);
-
-	if (ret != GA_NO_ERROR){
-		return reduxCleanup(ctx, ret);
-	}else{
-		return reduxScheduleLarge(ctx);
+static int reduxCompile (redux_ctx* ctx){
+	int    ret, i = 0;
+	int    PRI_TYPECODES[11];
+	size_t PRI_TYPECODES_LEN;
+	int*   AUX_TYPECODES;
+	size_t AUX_TYPECODES_LEN;
+
+
+	/**
+	 * Construct Argument Typecode Lists.
+	 */
+
+	PRI_TYPECODES[i++] = GA_BUFFER; /* src */
+	PRI_TYPECODES[i++] = GA_SIZE;   /* srcOff */
+	PRI_TYPECODES[i++] = GA_BUFFER; /* srcSteps */
+	PRI_TYPECODES[i++] = GA_BUFFER; /* srcSize */
+	PRI_TYPECODES[i++] = GA_BUFFER; /* chnkSize */
+	if (reduxKernelRequiresDst(ctx)){
+		PRI_TYPECODES[i++] = GA_BUFFER; /* dst */
+		PRI_TYPECODES[i++] = GA_SIZE;   /* dstOff */
+		PRI_TYPECODES[i++] = GA_BUFFER; /* dstSteps */
+	}
+	if (reduxKernelRequiresDstArg(ctx)){
+		PRI_TYPECODES[i++] = GA_BUFFER; /* dstArg */
+		PRI_TYPECODES[i++] = GA_SIZE;   /* dstArgOff */
+		PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */
+	}
+	PRI_TYPECODES_LEN = i;
+	AUX_TYPECODES     = &PRI_TYPECODES[3];
+	AUX_TYPECODES_LEN = PRI_TYPECODES_LEN-3;
+
+
+	/**
+	 * Compile the kernels.
+	 */
+
+	{
+		ret = GpuKernel_init(&ctx->kernel,
+		                     ctx->gpuCtx,
+		                     1,
+		                     (const char**)&ctx->sourceCode,
+		                     &ctx->sourceCodeLen,
+		                     "redux",
+		                     PRI_TYPECODES_LEN,
+		                     PRI_TYPECODES,
+		                     GA_USE_CLUDA,
+		                     &ctx->errorString0);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
	}
-}
-static int reduxCompileSmall (redux_ctx* ctx){
-	/* BUG: Implement small code model. */
-	return reduxCompileLarge(ctx);
+	if (reduxIsSmallCodeModel(ctx)){
+		ret = GpuKernel_init(&ctx->preKernel,
+		                     ctx->gpuCtx,
+		                     1,
+		                     (const char**)&ctx->sourceCode,
+		                     &ctx->sourceCodeLen,
+		                     "preRedux",
+		                     AUX_TYPECODES_LEN,
+		                     AUX_TYPECODES,
+		                     GA_USE_CLUDA,
+		                     &ctx->errorString1);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
+		ret = GpuKernel_init(&ctx->postKernel,
+		                     ctx->gpuCtx,
+		                     1,
+		                     (const char**)&ctx->sourceCode,
+		                     &ctx->sourceCodeLen,
+		                     "postRedux",
+		                     AUX_TYPECODES_LEN,
+		                     AUX_TYPECODES,
+		                     GA_USE_CLUDA,
+		                     &ctx->errorString2);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
+	}
+
+	return reduxSchedule(ctx);
 }

 /**
- * Compute a good thread block size / grid size / software chunk size for Nvidia.
+ * @brief Compute a good thread block size / grid size / software chunk size
+ *        for the primary/auxiliary kernels.
 */

-static int reduxScheduleLarge (redux_ctx* ctx){
-	int            i;
-	size_t         warpMod;
-	size_t         bestWarpMod  = 1;
-	unsigned       bestWarpAxis = 0;
-	uint64_t       maxLg;
-	uint64_t       maxLs [MAX_HW_DIMS];
-	uint64_t       maxGg;
-	uint64_t       maxGs [MAX_HW_DIMS];
-	uint64_t       dims  [MAX_HW_DIMS];
-	double         slack [MAX_HW_DIMS];
-	ga_factor_list factBS[MAX_HW_DIMS];
-	ga_factor_list factGS[MAX_HW_DIMS];
-	ga_factor_list factCS[MAX_HW_DIMS];
-
-
+static int reduxSchedule (redux_ctx* ctx){
+	int      i, priNdims, auxNdims;
+	uint64_t maxLgRdx, maxLgPre, maxLgPost;
+	uint64_t maxLgPri, maxLgAux;
+	uint64_t maxLs  [MAX_HW_DIMS];
+	uint64_t maxGg;
+	uint64_t maxGs  [MAX_HW_DIMS];
+	uint64_t priDims[MAX_HW_DIMS];
+	uint64_t auxDims[MAX_HW_DIMS];
+	uint64_t bs     [MAX_HW_DIMS];
+	uint64_t gs     [MAX_HW_DIMS];
+	uint64_t cs     [MAX_HW_DIMS];
+	size_t   warpSize,
+	         maxL, maxL0, maxL1, maxL2,
+	         maxG, maxG0, maxG1, maxG2;
+

	/**
	 * Obtain the constraints of our problem.
	 */

-	size_t warpSize,
-	       maxL, maxL0, maxL1, maxL2, /* Maximum total and per-dimension thread/block sizes */
-	       maxG, maxG0, maxG1, maxG2; /* Maximum total and per-dimension block /grid sizes */
-	gpukernel_property(ctx->kernel.k,  GA_KERNEL_PROP_PREFLSIZE, &warpSize);
-	gpukernel_property(ctx->kernel.k,  GA_KERNEL_PROP_MAXLSIZE,  &maxL);
-	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE0,    &maxL0);
-	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE1,    &maxL1);
-	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE2,    &maxL2);
-	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE,     &maxG);
-	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE0,    &maxG0);
-	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE1,    &maxG1);
-	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE2,    &maxG2);
-
+	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE0,    &maxL0);
+	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE1,    &maxL1);
+	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE2,    &maxL2);
+	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE,     &maxG);
+	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE0,    &maxG0);
+	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE1,    &maxG1);
+	gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE2,    &maxG2);
+	gpukernel_property(ctx->kernel.k,  GA_KERNEL_PROP_PREFLSIZE, &warpSize);
+	gpukernel_property(ctx->kernel.k,  GA_KERNEL_PROP_MAXLSIZE,  &maxL);
+	maxLgRdx = maxL;
+	maxLgPri = maxLgRdx;
+	if (reduxIsSmallCodeModel(ctx)){
+		gpukernel_property(ctx->preKernel.k,  GA_KERNEL_PROP_MAXLSIZE, &maxL);
+		maxLgPre  = maxL;
+		gpukernel_property(ctx->postKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL);
+		maxLgPost = maxL;
+		maxLgAux  = maxLgPre<maxLgPost ? maxLgPre : maxLgPost;
+	}
+
+	priNdims = ctx->pri.ndh;
+	maxGs[0] = maxG0;
+	maxGs[1] = maxG1;
+	maxGs[2] = maxG2;
+	maxGg    = maxG;
+	maxLs[0] = maxL0;
+	maxLs[1] = maxL1;
+	maxLs[2] = maxL2;
+	for (i=0;i<priNdims;i++){
+		priDims[i] = ctx->src->dimensions[ctx->pri.axisList[i]];
+	}
+	if (reduxIsSmallCodeModel(ctx)){
+		auxNdims = ctx->aux.ndh;
+		for (i=0;i<auxNdims;i++){
+			auxDims[i] = ctx->src->dimensions[ctx->aux.axisList[i]];
+		}
+	}
+

	/**
-	 * Prepare inputs to the solver.
-	 *
-	 * This involves, amongst others,
-	 * - Initializing the blockSize, gridSize and chunkSize factor lists for all
-	 *   hardware dimensions.
-	 * - Finding on which hardware axis is it optimal to place the warpSize factor.
+	 * Apply the solver.
	 */
+
+	{
+		reduxScheduleKernel(priNdims,
+		                    priDims,
+		                    warpSize,
+		                    maxLgPri, maxLs,
+		                    maxGg,    maxGs,
+		                    bs, gs, cs);
+		for (i=0;i<priNdims;i++){
+			ctx->pri.bs[i] = bs[i];
+			ctx->pri.gs[i] = gs[i];
+			ctx->pri.cs[i] = cs[i];
+		}
+		if (priNdims <= 0){
+			ctx->pri.bs[i] = ctx->pri.gs[i] = ctx->pri.cs[i] = 1;
+		}
+	}
+	if (reduxIsSmallCodeModel(ctx)){
+		reduxScheduleKernel(auxNdims,
+		                    auxDims,
+		                    warpSize,
+		                    maxLgAux, maxLs,
+		                    maxGg,    maxGs,
+		                    bs, gs, cs);
+		for (i=0;i<auxNdims;i++){
+			ctx->aux.bs[i] = bs[i];
+			ctx->aux.gs[i] = gs[i];
+			ctx->aux.cs[i] = cs[i];
		}
+		if (auxNdims <= 0){
+			ctx->aux.bs[i] = ctx->aux.gs[i] = ctx->aux.cs[i] = 1;
+		}
+	}
+
+	return reduxInvoke(ctx);
+}
-	maxLg    = maxL;
-	maxLs[0] = maxL0, maxLs[1]=maxL1, maxLs[2]=maxL2;
-	maxGg    = maxG;
-	maxGs[0] = maxG0, maxGs[1]=maxG1, maxGs[2]=maxG2;
-	dims[0]  = dims[1]  = dims[2]  = 1;
-	slack[0] = slack[1] = slack[2] = 1.1;
+/**
+ * @brief Given the parameters of a kernel scheduling problem, solve it as
+ *        optimally as possible.
+ *
+ * NB: This is the only function in this entire file that should have
+ *     anything to do with the integer factorization APIs.
+ */
-	for (i=0;i<ctx->ndh;i++){
-		dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]];
+static void reduxScheduleKernel (int       ndims,
+                                 uint64_t* dims,
+                                 uint64_t  warpSize,
+                                 uint64_t  maxLg,
+                                 uint64_t* maxLs,
+                                 uint64_t  maxGg,
+                                 uint64_t* maxGs,
+                                 uint64_t* bs,
+                                 uint64_t* gs,
+                                 uint64_t* cs){
+	uint64_t       warpMod, bestWarpMod  = 1;
+	int            i,       bestWarpAxis = 0;
+	uint64_t       roundedDims[MAX_HW_DIMS];
+	double         slack      [MAX_HW_DIMS];
+	ga_factor_list factBS     [MAX_HW_DIMS];
+	ga_factor_list factGS     [MAX_HW_DIMS];
+	ga_factor_list factCS     [MAX_HW_DIMS];
+
+
+	/**
+	 * Quick check for scalar case.
+	 */
+
+	if (ndims <= 0){
+		return;
+	}
+
+
+	/**
+	 * Identify the dimension to which the warp factor will be given.
+	 *
+	 * The current heuristic is to find the dimension that is either
+	 *   1) Evenly divided by the warp size, or
+	 *   2) As close to filling the last warp as possible.
+	 */
+
+	for (i=0;i<ndims;i++){
+		roundedDims[i] = dims[i];
+		slack      [i] = 1.1;
+		warpMod        = roundedDims[i] % warpSize;
		if (bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){
			bestWarpAxis = i;
			bestWarpMod  = warpMod;
		}
	}

-	if (ctx->ndh > 0){
-		dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize;
+	if (ndims > 0){
+		roundedDims[bestWarpAxis] = (roundedDims[bestWarpAxis] + warpSize - 1)/warpSize;
		gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]);
	}
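The warp-placement heuristic can be exercised on its own. The sketch below reuses the same selection condition as the loop above, with made-up dimensions and a warp size of 32; axis 1 (length 96) wins because the warp divides it evenly, and its length is then rounded to warps before factorization:

/* Standalone sketch of the warp-axis heuristic with illustrative sizes. */
#include <stdio.h>
#include <stdint.h>

int main(void){
	uint64_t dims[3]  = {100, 96, 37};
	uint64_t warpSize = 32;
	uint64_t warpMod, bestWarpMod = 1;
	int      i, bestWarpAxis = 0;

	for (i = 0; i < 3; i++){
		warpMod = dims[i] % warpSize;
		if (bestWarpMod > 0 && (warpMod == 0 || warpMod >= bestWarpMod)){
			bestWarpAxis = i;
			bestWarpMod  = warpMod;
		}
	}
	/* The chosen axis is divided by the warp size (rounded up) before
	   factorization, and the warp size is seeded into its block-size factors. */
	printf("axis %d gets the warp (remainder %llu)\n",
	       bestWarpAxis, (unsigned long long)bestWarpMod);
	printf("rounded length: %llu\n",
	       (unsigned long long)((dims[bestWarpAxis] + warpSize - 1)/warpSize));
	return 0;
}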
@@ -1740,8 +1952,11 @@ static int reduxScheduleLarge (redux_ctx* ctx){
	 * chunkSize.
	 */

-	for (i=0;i<ctx->ndh;i++){
-		while (!gaIFactorize(dims[i],        (uint64_t)(dims[i]*slack[i]),        maxLs[i], &factCS[i])){
+	for (i=0;i<ndims;i++){
+		while (!gaIFactorize(roundedDims[i], (uint64_t)(roundedDims[i]*slack[i]), maxLs[i], &factCS[i])){
			slack[i] += 0.1;
		}
	}

-	gaIFLSchedule(ctx->ndh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS);
-
-	/* Output. */
-	for (i=0;i<ctx->ndh;i++){
-		ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]);
-		ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]);
-		ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]);
+	gaIFLSchedule(ndims, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS);
+	for (i=0;i<ndims;i++){
+		bs[i] = gaIFLGetProduct(&factBS[i]);
+		gs[i] = gaIFLGetProduct(&factGS[i]);
+		cs[i] = gaIFLGetProduct(&factCS[i]);
	}
 }

-	ctx->srcStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
-	                                   ctx->src->strides,    flags, 0);
-	ctx->srcSizeGD     = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
-	                                   ctx->src->dimensions, flags, 0);
-	ctx->chunkSizeGD   = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
-	                                   ctx->chunkSize,       flags, 0);
-	if (reduxKernelRequiresDst(ctx)){
+	ctx->srcStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->nds     * sizeof(size_t),
+	                                   ctx->src->strides,    flags, 0);
+	ctx->srcSizeGD     = gpudata_alloc(ctx->gpuCtx, ctx->nds     * sizeof(size_t),
+	                                   ctx->src->dimensions, flags, 0);
+	ctx->chunkSizeGD   = gpudata_alloc(ctx->gpuCtx, ctx->pri.ndh * sizeof(size_t),
+	                                   ctx->pri.cs,          flags, 0);
+
+	args[i++] = (void*) ctx->src->data;
+	args[i++] = (void*)&ctx->src->offset;
+	args[i++] = (void*) ctx->srcStepsGD;
+	args[i++] = (void*) ctx->srcSizeGD;
+	args[i++] = (void*) ctx->chunkSizeGD;
+	if (reduxKernelRequiresDst   (ctx)){
		ctx->dstStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
		                                   ctx->dst->strides,    flags, 0);
+		dstStepsReqAndAlloc    = !!ctx->dstStepsGD;
+		args[i++] = (void*) ctx->dst->data;
+		args[i++] = (void*)&ctx->dst->offset;
+		args[i++] = (void*) ctx->dstStepsGD;
	}
	if (reduxKernelRequiresDstArg(ctx)){
		ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
		                                   ctx->dstArg->strides, flags, 0);
+		dstArgStepsReqAndAlloc = !!ctx->dstArgStepsGD;
+		args[i++] = (void*) ctx->dstArg->data;
+		args[i++] = (void*)&ctx->dstArg->offset;
+		args[i++] = (void*) ctx->dstArgStepsGD;
	}
-	args[ 0] = (void*) ctx->src->data;
-	args[ 1] = (void*)&ctx->src->offset;
-	args[ 2] = (void*) ctx->srcStepsGD;
-	args[ 3] = (void*) ctx->srcSizeGD;
-	args[ 4] = (void*) ctx->chunkSizeGD;
-	if       ( reduxKernelRequiresDst   (ctx) &&  reduxKernelRequiresDstArg(ctx)){
-		args[ 5] = (void*) ctx->dst->data;
-		args[ 6] = (void*)&ctx->dst->offset;
-		args[ 7] = (void*) ctx->dstStepsGD;
-		args[ 8] = (void*) ctx->dstArg->data;
-		args[ 9] = (void*)&ctx->dstArg->offset;
-		args[10] = (void*) ctx->dstArgStepsGD;
-	}else if ( reduxKernelRequiresDst   (ctx) && !reduxKernelRequiresDstArg(ctx)){
-		args[ 5] = (void*) ctx->dst->data;
-		args[ 6] = (void*)&ctx->dst->offset;
-		args[ 7] = (void*) ctx->dstStepsGD;
-	}else if (!reduxKernelRequiresDst   (ctx) &&  reduxKernelRequiresDstArg(ctx)){
-		args[ 5] = (void*) ctx->dstArg->data;
-		args[ 6] = (void*)&ctx->dstArg->offset;
-		args[ 7] = (void*) ctx->dstArgStepsGD;
-	}
-	if (ctx->srcStepsGD    &&
-	    ctx->srcSizeGD     &&
-	    ctx->chunkSizeGD   &&
-	    ctx->dstStepsGD    &&
-	    ctx->dstArgStepsGD){
+
+	/**
+	 * One or three kernels are now invoked, depending on the code model.
+	 */
+
+	if (ctx->srcStepsGD         &&
+	    ctx->srcSizeGD          &&
+	    ctx->chunkSizeGD        &&
+	    dstStepsReqAndAlloc     &&
+	    dstArgStepsReqAndAlloc){
+		/* Pre-kernel invocation, if necessary */
+		if (reduxIsSmallCodeModel(ctx)){
+			ret = GpuKernel_call(&ctx->preKernel,
+			                     ctx->aux.ndh>0 ? ctx->aux.ndh : 1,
+			                     ctx->aux.gs,
+			                     ctx->aux.bs,
+			                     0,
+			                     &args[3]);
+			if (ret != GA_NO_ERROR){
+				return reduxCleanup(ctx, ret);
+			}
+		}
+
+		/* Reduction kernel invocation */
		ret = GpuKernel_call(&ctx->kernel,
-		                     ctx->ndh>0 ? ctx->ndh : 1,
-		                     ctx->gridSize,
-		                     ctx->blockSize,
+		                     ctx->pri.ndh>0 ? ctx->pri.ndh : 1,
+		                     ctx->pri.gs,
+		                     ctx->pri.bs,
		                     0,
-		                     args);
+		                     &args[0]);
+		if (ret != GA_NO_ERROR){
+			return reduxCleanup(ctx, ret);
+		}
+
+		/* Post-kernel invocation, if necessary */
+		if (reduxIsSmallCodeModel(ctx)){
+			ret = GpuKernel_call(&ctx->postKernel,
+			                     ctx->aux.ndh>0 ? ctx->aux.ndh : 1,
+			                     ctx->aux.gs,
+			                     ctx->aux.bs,
+			                     0,
+			                     &args[3]);
+			if (ret != GA_NO_ERROR){
+				return reduxCleanup(ctx, ret);
+			}
+		}
+
		return reduxCleanup(ctx, ret);
	}else{
		return reduxCleanup(ctx, GA_MEMORY_ERROR);
@@ -1843,10 +2085,18 @@ static int reduxInvokeLarge (redux_ctx* ctx){
  */

 static int reduxCleanup (redux_ctx* ctx, int ret){
-	free(ctx->axisList);
+	free(ctx->srcAxisList);
+	free(ctx->dstAxisList);
	free(ctx->sourceCode);
-	ctx->axisList     = NULL;
-	ctx->sourceCode   = NULL;
+	free(ctx->errorString0);
+	free(ctx->errorString1);
+	free(ctx->errorString2);
+	ctx->srcAxisList  = NULL;
+	ctx->dstAxisList  = NULL;
+	ctx->sourceCode   = NULL;
+	ctx->errorString0 = NULL;
+	ctx->errorString1 = NULL;
+	ctx->errorString2 = NULL;

	gpudata_release(ctx->srcStepsGD);
	gpudata_release(ctx->srcSizeGD);
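One detail of the invocation worth calling out: a single argument array is packed once; the primary reduction kernel is called with &args[0], while the pre- and post-kernels receive the suffix starting at &args[3], mirroring AUX_TYPECODES = &PRI_TYPECODES[3]. The sketch below imitates that slicing with ordinary functions standing in for GpuKernel_call; the string stand-ins are purely illustrative:

/* Sketch of one packed argument list shared by primary and auxiliary kernels. */
#include <stdio.h>

static void call_primary(void **args, int n){
	printf("primary kernel gets %d args starting with %s\n", n, (char*)args[0]);
}
static void call_auxiliary(void **args, int n){
	printf("auxiliary kernel gets %d args starting with %s\n", n, (char*)args[0]);
}

int main(void){
	/* Stand-ins for the buffers/offsets the invoker packs, in the same order. */
	void *args[8] = {"src", "srcOff", "srcSteps", "srcSize",
	                 "chunkSize", "dst", "dstOff", "dstSteps"};

	call_primary  (&args[0], 8);    /* full list                        */
	call_auxiliary(&args[3], 5);    /* srcSize onward, as with &args[3] */
	return 0;
}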