diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h index fa9ce53097f..5884dbb39e9 100644 --- a/include/cudaq/Optimizer/Builder/Intrinsics.h +++ b/include/cudaq/Optimizer/Builder/Intrinsics.h @@ -36,11 +36,16 @@ static constexpr const char getCudaqSizeFromTriple[] = // typically specialized to be bit packed). static constexpr const char stdvecBoolCtorFromInitList[] = "__nvqpp_initializer_list_to_vector_bool"; + // Convert a (likely packed) std::vector<bool> into a sequence of bytes, each // holding a boolean value. static constexpr const char stdvecBoolUnpackToInitList[] = "__nvqpp_vector_bool_to_initializer_list"; +// Free any temporary buffers used to hold std::vector<bool> data. +static constexpr const char stdvecBoolFreeTemporaryLists[] = + "__nvqpp_vector_bool_free_temporary_initlists"; + // The internal data of the cudaq::state object must be `2**n` in length. This // function returns the value `n`. static constexpr const char getNumQubitsFromCudaqState[] = diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index 5a4e5cb43b0..5c090d4271d 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -321,6 +321,22 @@ cc::StructType factory::stlVectorType(Type eleTy) { return cc::StructType::get(ctx, ArrayRef{ptrTy, ptrTy, ptrTy}); } +// Note that this is the raw host type, where std::vector<bool> is distinct. +// When converting to the device side, the distinction is deliberately removed +// making std::vector<bool> the same format as std::vector<char>. +static cc::StructType stlHostVectorType(Type eleTy) { + MLIRContext *ctx = eleTy.getContext(); + if (eleTy != IntegerType::get(ctx, 1)) { + // std::vector<T> where T != bool. + return factory::stlVectorType(eleTy); + } + // std::vector<bool> is a different type than std::vector<T>. 
+ auto ptrTy = cc::PointerType::get(eleTy); + auto i8Ty = IntegerType::get(ctx, 8); + auto padout = cc::ArrayType::get(ctx, i8Ty, 32); + return cc::StructType::get(ctx, ArrayRef{ptrTy, padout}); +} + // FIXME: Give these front-end names so we can disambiguate more types. cc::StructType factory::getDynamicBufferType(MLIRContext *ctx) { auto ptrTy = cc::PointerType::get(IntegerType::get(ctx, 8)); @@ -344,8 +360,7 @@ Type factory::getSRetElementType(FunctionType funcTy) { Type factory::convertToHostSideType(Type ty) { if (auto memrefTy = dyn_cast(ty)) - return factory::stlVectorType( - convertToHostSideType(memrefTy.getElementType())); + return stlHostVectorType(convertToHostSideType(memrefTy.getElementType())); if (isa(ty)) return cc::PointerType::get(IntegerType::get(ty.getContext(), 8)); if (isa(ty)) diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 315743f057d..db19c28df1f 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -307,11 +307,17 @@ static constexpr IntrinsicCode intrinsicTable[] = { return %0 : !cc.ptr })#"}, + // __nvqpp_vector_bool_free_temporary_initlists + {cudaq::stdvecBoolFreeTemporaryLists, + {}, + R"#( + func.func private @__nvqpp_vector_bool_free_temporary_initlists(!cc.ptr) -> ())#"}, + // __nvqpp_vector_bool_to_initializer_list {cudaq::stdvecBoolUnpackToInitList, {}, R"#( - func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) -> ())#"}, + func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) -> ())#"}, {"__nvqpp_zeroDynamicResult", {}, R"#( func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 4db2c7992b9..7e450c2da7d 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ 
b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -55,615 +55,905 @@ static bool isStateType(Type ty) { return false; } -/// This pass adds a `.thunk` function and a rewritten C++ host -/// side (mangled) stub to the code for every entry-point kernel in the module. -/// It may also generate a `.argsCreator` function. Finally, it -/// creates registration hooks for the CUDA-Q runtime to be able to find the -/// kernel by name and, as appropriate, the `.argsCreator` -/// function. -namespace { -class GenerateKernelExecution - : public cudaq::opt::impl::GenerateKernelExecutionBase< - GenerateKernelExecution> { -public: - using GenerateKernelExecutionBase::GenerateKernelExecutionBase; +/// Creates the function signature for a thunk function. The signature is always +/// the same for all thunk functions. +/// +/// Every thunk function has an identical signature, making it callable from a +/// generic "kernel launcher" in the CUDA-Q runtime. +/// +/// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. +/// +/// The first argument is a pointer to a data buffer that encodes all the +/// arguments (and static return) values to (and from) the kernel in the +/// pointer-free encoding. The second argument indicates if this call is to a +/// remote process (if true). The result is a pointer and size (span) if the +/// kernel returns a dynamically sized result, otherwise it will be +/// `{nullptr, 0}`. It is the responsibility of calling code to free any +/// dynamic result buffer(s) and convert those to `std::vector` objects. +static FunctionType getThunkType(MLIRContext *ctx) { + auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); + return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, + {cudaq::opt::factory::getDynamicBufferType(ctx)}); +} -private: - /// Creates the function signature for a thunk function. The signature is - /// always the same for all thunk functions. 
- /// - /// Every thunk function has an identical signature, making it callable from a - /// generic "kernel launcher" in the CUDA-Q runtime. - /// - /// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. - /// - /// The first argument is a pointer to a data buffer that encodes all the - /// arguments (and static return) values to (and from) the kernel in the - /// pointer-free encoding. The second argument indicates if this call is to a - /// remote process (if true). The result is a pointer and size (span) if the - /// kernel returns a dynamically sized result, otherwise it will be - /// `{nullptr, 0}`. It is the responsibility of calling code to free any - /// dynamic result buffer(s) and convert those to `std::vector` objects. - FunctionType getThunkType(MLIRContext *ctx) { - auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); - return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, - {cudaq::opt::factory::getDynamicBufferType(ctx)}); - } +/// Generate code to read the length from a host-side string object. (On the +/// device side, a string is encoded as a span.) The length of a string is the +/// number of bytes of data. +/// +/// In order to handle a std::string value it is assumed to be laid out in +/// memory as the following structure. +/// +/// +/// struct vector { +/// i8* data; +/// i64 length; +/// [i8 x 16] inlinedata; +/// }; +/// +/// +/// This implementation does \e not support wide characters. 
+static Value genStringLength(Location loc, OpBuilder &builder, + Value stringArg) { + Type stringTy = stringArg.getType(); + assert(isa(stringTy) && + isa( + cast(stringTy).getElementType()) && + cast( + cast(stringTy).getElementType()) + .getMember(1) == builder.getI64Type() && + "host side string expected"); + auto ptrTy = cast(stringTy); + auto strTy = cast(ptrTy.getElementType()); + auto lenPtr = builder.create( + loc, cudaq::cc::PointerType::get(strTy.getMember(1)), stringArg, + ArrayRef{1}); + return builder.create(loc, lenPtr); +} - /// Generate code to read the length from a host-side string object. (On the - /// device side, a string is encoded as a span.) The length of a string is the - /// number of bytes of data. - /// - /// In order to handle a std::string value it is assumed to be laid out in - /// memory as the following structure. - /// - /// - /// struct vector { - /// i8* data; - /// i64 length; - /// [i8 x 16] inlinedata; - /// }; - /// - /// - /// This implementation does \e not support wide characters. - Value genStringLength(Location loc, OpBuilder &builder, Value stringArg) { - Type stringTy = stringArg.getType(); - assert(isa(stringTy) && - isa( - cast(stringTy).getElementType()) && - cast( - cast(stringTy).getElementType()) - .getMember(1) == builder.getI64Type() && - "host side string expected"); - auto ptrTy = cast(stringTy); - auto strTy = cast(ptrTy.getElementType()); - auto lenPtr = builder.create( - loc, cudaq::cc::PointerType::get(strTy.getMember(1)), stringArg, - ArrayRef{1}); - return builder.create(loc, lenPtr); - } +/// Generate code that computes the size in bytes of a `std::vector` array +/// in the same way as a `std::vector::size()`. This assumes the vector is +/// laid out in memory as the following structure. +/// +/// +/// struct vector { +/// T* begin; +/// T* end; +/// T* allocated_end; +/// }; +/// +/// +/// The first two elements are pointers to the beginning and end of the data +/// in the vector, respectively. 
This data is kept in a contiguous memory +/// range. The following implementation follows what Clang CodeGen produces +/// for `std::vector::size()` without the final `sdiv` op that divides the +/// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required +/// memory size for the vector data itself in \e bytes. +static Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { + auto vecTy = cast(vecArg.getType()); + auto vecStructTy = cast(vecTy.getElementType()); + assert(vecStructTy.getNumMembers() == 3 && + vecStructTy.getMember(0) == vecStructTy.getMember(1) && + vecStructTy.getMember(0) == vecStructTy.getMember(2) && + "host side vector expected"); + auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); + + // Get the pointer to the pointer of the end of the array + Value endPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{1}); + + // Get the pointer to the pointer of the beginning of the array + Value beginPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{0}); + + // Load to a T* + endPtr = builder.create(loc, endPtr); + beginPtr = builder.create(loc, beginPtr); + + // Map those pointers to integers + Type i64Ty = builder.getI64Type(); + Value endInt = builder.create(loc, i64Ty, endPtr); + Value beginInt = builder.create(loc, i64Ty, beginPtr); + + // Subtracting these will give us the size in bytes. + return builder.create(loc, endInt, beginInt); +} - /// Generate code that computes the size in bytes of a `std::vector` array - /// in the same way as a `std::vector::size()`. This assumes the vector is - /// laid out in memory as the following structure. - /// - /// - /// struct vector { - /// T* begin; - /// T* end; - /// T* allocated_end; - /// }; - /// - /// - /// The first two elements are pointers to the beginning and end of the data - /// in the vector, respectively. This data is kept in a contiguous memory - /// range. 
The following implementation follows what Clang CodeGen produces - /// for `std::vector::size()` without the final `sdiv` op that divides the - /// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required - /// memory size for the vector data itself in \e bytes. - Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { - auto vecTy = cast(vecArg.getType()); - auto vecStructTy = cast(vecTy.getElementType()); - assert(vecStructTy.getNumMembers() == 3 && - vecStructTy.getMember(0) == vecStructTy.getMember(1) && - vecStructTy.getMember(0) == vecStructTy.getMember(2) && - "host side vector expected"); - auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); - - // Get the pointer to the pointer of the end of the array - Value endPtr = builder.create( - loc, vecElePtrTy, vecArg, ArrayRef{1}); - - // Get the pointer to the pointer of the beginning of the array - Value beginPtr = builder.create( - loc, vecElePtrTy, vecArg, ArrayRef{0}); - - // Load to a T* - endPtr = builder.create(loc, endPtr); - beginPtr = builder.create(loc, beginPtr); - - // Map those pointers to integers - Type i64Ty = builder.getI64Type(); - Value endInt = builder.create(loc, i64Ty, endPtr); - Value beginInt = builder.create(loc, i64Ty, beginPtr); +static Value genComputeReturnOffset(Location loc, OpBuilder &builder, + FunctionType funcTy, + cudaq::cc::StructType msgStructTy) { + if (funcTy.getNumResults() == 0) + return builder.create(loc, NoResultOffset, 64); + std::int32_t numKernelArgs = funcTy.getNumInputs(); + auto i64Ty = builder.getI64Type(); + return builder.create( + loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs}); +} - // Subtracting these will give us the size in bytes. - return builder.create(loc, endInt, beginInt); - } +/// Create a function that determines the return value offset in the message +/// buffer. 
+static void genReturnOffsetFunction(Location loc, OpBuilder &builder, + FunctionType devKernelTy, + cudaq::cc::StructType msgStructTy, + const std::string &classNameStr) { + auto *ctx = builder.getContext(); + auto i64Ty = builder.getI64Type(); + auto funcTy = FunctionType::get(ctx, {}, {i64Ty}); + auto returnOffsetFunc = + builder.create(loc, classNameStr + ".returnOffset", funcTy); + OpBuilder::InsertionGuard guard(builder); + auto *entry = returnOffsetFunc.addEntryBlock(); + builder.setInsertionPointToStart(entry); + auto result = genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy); + builder.create(loc, result); +} - /// Helper that converts a byte length to a length of i64. - Value convertLengthBytesToLengthI64(Location loc, OpBuilder &builder, - Value length) { - auto eight = builder.create(loc, 8, 64); - return builder.create(loc, length, eight); - } +static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) { + return cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(builder.getI8Type())); +} + +static cudaq::cc::PointerType getPointerToPointerType(OpBuilder &builder) { + return cudaq::cc::PointerType::get( + cudaq::cc::PointerType::get(builder.getI8Type())); +} + +static bool isDynamicSignature(FunctionType devFuncTy) { + for (auto t : devFuncTy.getInputs()) + if (cudaq::cc::isDynamicType(t)) + return true; + for (auto t : devFuncTy.getResults()) + if (cudaq::cc::isDynamicType(t)) + return true; + return false; +} - Value genComputeReturnOffset(Location loc, OpBuilder &builder, - FunctionType funcTy, - cudaq::cc::StructType msgStructTy) { - if (funcTy.getNumResults() == 0) - return builder.create(loc, NoResultOffset, 64); - std::int32_t numKernelArgs = funcTy.getNumInputs(); +static std::pair +genByteSizeAndElementCount(Location loc, OpBuilder &builder, Type eleTy, + Value size, Value arg, Type t) { + // If this is a vector>, convert the bytes of vector to bytes of + // length (i64). 
+ if (auto sty = dyn_cast(eleTy)) { + auto eTy = cast(arg.getType()).getElementType(); + auto fTy = cast(eTy).getMember(0); + auto tTy = cast(fTy).getElementType(); auto i64Ty = builder.getI64Type(); - return builder.create( - loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs}); + auto eleSize = builder.create(loc, i64Ty, tTy); + Value count = builder.create(loc, size, eleSize); + auto ate = builder.create(loc, 8, 64); + size = builder.create(loc, count, ate); + return {size, count}; } - /// Create a function that determines the return value offset in the message - /// buffer. - void genReturnOffsetFunction(Location loc, OpBuilder &builder, - FunctionType devKernelTy, - cudaq::cc::StructType msgStructTy, - const std::string &classNameStr) { - auto *ctx = builder.getContext(); + // If this is a vector, convert the bytes of string to bytes of length + // (i64). + if (isa(eleTy)) { + auto fore = builder.create(loc, 4, 64); + size = builder.create(loc, size, fore); + auto ate = builder.create(loc, 8, 64); + Value count = builder.create(loc, size, ate); + return {size, count}; + } + + // If this is a vector>, convert the bytes of struct to bytes of + // struct with converted members. 
+ if (isa(eleTy)) { + auto eleTy = cast(arg.getType()).getElementType(); auto i64Ty = builder.getI64Type(); - auto funcTy = FunctionType::get(ctx, {}, {i64Ty}); - auto returnOffsetFunc = builder.create( - loc, classNameStr + ".returnOffset", funcTy); - OpBuilder::InsertionGuard guard(builder); - auto *entry = returnOffsetFunc.addEntryBlock(); - builder.setInsertionPointToStart(entry); - auto result = - genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy); - builder.create(loc, result); + auto hostStrSize = builder.create(loc, i64Ty, eleTy); + Value count = builder.create(loc, size, hostStrSize); + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + auto packSize = builder.create(loc, i64Ty, packedTy); + size = builder.create(loc, count, packSize); + return {size, count}; } + return {}; +} - static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) { - return cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(builder.getI8Type())); +static bool isStdVectorBool(Type ty) { + auto stdvecTy = dyn_cast(ty); + return stdvecTy && + (stdvecTy.getElementType() == IntegerType::get(ty.getContext(), 1)); +} + +/// Recursively check if \p ty contains a `std::vector<bool>`. +static bool hasStdVectorBool(Type ty) { + if (isStdVectorBool(ty)) + return true; + if (auto sty = dyn_cast(ty)) + return hasStdVectorBool(sty.getElementType()); + if (auto sty = dyn_cast(ty)) + for (auto mem : sty.getMembers()) + if (hasStdVectorBool(mem)) + return true; + return false; +} + +// The host-side type of a `std::vector<bool>` is distinct from the transient +// type for a `std::vector<bool>`. The former is a unique data type with a size +// of 40 bytes. The latter is identical to `std::vector<char>` (which has a size +// of 24 bytes). 
+static Type convertToTransientType(Type ty) { + if (isStdVectorBool(ty)) { + auto *ctx = ty.getContext(); + return cudaq::opt::factory::stlVectorType(IntegerType::get(ctx, 1)); } + if (auto sty = dyn_cast(ty)) + return cudaq::opt::factory::stlVectorType( + convertToTransientType(sty.getElementType())); + if (auto sty = dyn_cast(ty)) { + SmallVector newMems; + for (auto mem : sty.getMembers()) + newMems.push_back(convertToTransientType(mem)); + auto *ctx = ty.getContext(); + return cudaq::cc::StructType::get(ctx, newMems); + } + return cudaq::opt::factory::convertToHostSideType(ty); +} - static cudaq::cc::PointerType getPointerToPointerType(OpBuilder &builder) { - return cudaq::cc::PointerType::get( - cudaq::cc::PointerType::get(builder.getI8Type())); +static std::pair +convertAllStdVectorBool(Location loc, OpBuilder &builder, Value arg, Type ty, + Value heapTracker, + std::optional preallocated = std::nullopt) { + // If we are here, `ty` must be a `std::vector<bool>` or recursively contain a + // `std::vector<bool>`. + + // Handle `std::vector<bool>`. + if (isStdVectorBool(ty)) { + auto stdvecTy = cast(ty); + Type stdvecHostTy = + cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); + Value tmp = preallocated.has_value() + ? *preallocated + : builder.create(loc, stdvecHostTy); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, arg, heapTracker}); + return {tmp, true}; } - static bool isDynamicSignature(FunctionType devFuncTy) { - for (auto t : devFuncTy.getInputs()) - if (cudaq::cc::isDynamicType(t)) - return true; - for (auto t : devFuncTy.getResults()) - if (cudaq::cc::isDynamicType(t)) - return true; - return false; + // Handle `std::vector<T>` where `T` != `bool`. + if (auto sty = dyn_cast(ty)) { + // arg is a std::vector<T>. + // Its type must be ptr<struct<ptr<T>, ptr<T>, ptr<T>>>. 
+ auto seleTy = sty.getElementType(); + auto ptrArgTy = cast(arg.getType()); + auto argVecTy = cast(ptrArgTy.getElementType()); + auto subVecPtrTy = cudaq::cc::PointerType::get(argVecTy.getMember(0)); + // Compute the pointer to the pointer to the first T element. + auto inputRef = builder.create( + loc, subVecPtrTy, arg, ArrayRef{0}); + auto startInput = builder.create(loc, inputRef); + auto startTy = startInput.getType(); + auto subArrTy = cudaq::cc::ArrayType::get( + cast(startTy).getElementType()); + auto input = builder.create( + loc, cudaq::cc::PointerType::get(subArrTy), startInput); + auto transientTy = convertToTransientType(sty); + Value tmp = builder.create(loc, transientTy); + Value sizeDelta = genVectorSize(loc, builder, arg); + auto count = [&]() -> Value { + if (cudaq::cc::isDynamicType(seleTy)) { + auto p = genByteSizeAndElementCount(loc, builder, seleTy, sizeDelta, + arg, sty); + return p.second; + } + auto sizeEle = builder.create( + loc, builder.getI64Type(), seleTy); + return builder.create(loc, sizeDelta, sizeEle); + }(); + auto sizeTransientTy = builder.create( + loc, builder.getI64Type(), transientTy); + Value sizeInBytes = + builder.create(loc, count, sizeTransientTy); + + // Create a new vector that we'll store the converted data into. + Value byteBuffer = builder.create( + loc, builder.getI8Type(), sizeInBytes); + + // Initialize the temporary vector. 
+ auto transEleTy = cast(transientTy).getMember(0); + auto vecEleTy = cudaq::cc::PointerType::get(transEleTy); + auto tmpBegin = builder.create( + loc, vecEleTy, tmp, ArrayRef{0}); + auto bufferBegin = + builder.create(loc, transEleTy, byteBuffer); + builder.create(loc, bufferBegin, tmpBegin); + auto tmpEnd = builder.create( + loc, vecEleTy, tmp, ArrayRef{1}); + auto byteBufferEnd = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI8Type()), byteBuffer, + ArrayRef{sizeInBytes}); + auto bufferEnd = + builder.create(loc, transEleTy, byteBufferEnd); + builder.create(loc, bufferEnd, tmpEnd); + auto tmpEnd2 = builder.create( + loc, vecEleTy, tmp, ArrayRef{2}); + builder.create(loc, bufferEnd, tmpEnd2); + + // Loop over each element in the outer vector and initialize it to the inner + // vector value. (The data may be heap allocated.) + auto transientEleTy = convertToTransientType(seleTy); + auto transientBufferTy = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(transientEleTy)); + auto buffer = + builder.create(loc, transientBufferTy, byteBuffer); + + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + Value inp = builder.create( + loc, startTy, input, ArrayRef{i}); + auto currentVector = builder.create( + loc, cudaq::cc::PointerType::get(transientEleTy), buffer, + ArrayRef{i}); + convertAllStdVectorBool(loc, builder, inp, seleTy, heapTracker, + currentVector); + }); + return {tmp, true}; } - static std::pair - genByteSizeAndElementCount(Location loc, OpBuilder &builder, Type eleTy, - Value size, Value arg, Type t) { - // If this is a vector>, convert the bytes of vector - // to bytes of length (i64). 
- if (isa(eleTy)) { - auto three = builder.create(loc, 3, 64); - size = builder.create(loc, size, three); - auto ate = builder.create(loc, 8, 64); - Value count = builder.create(loc, size, ate); - return {size, count}; - } - // If this is a vector, convert the bytes of string to - // bytes of length (i64). - if (isa(eleTy)) { - auto fore = builder.create(loc, 4, 64); - size = builder.create(loc, size, fore); - auto ate = builder.create(loc, 8, 64); - Value count = builder.create(loc, size, ate); - return {size, count}; - } - // If this is a vector>, convert the bytes of struct - // to bytes of struct with converted members. - if (isa(eleTy)) { - auto eleTy = cast(arg.getType()).getElementType(); - auto i64Ty = builder.getI64Type(); - auto hostStrSize = builder.create(loc, i64Ty, eleTy); - Value count = builder.create(loc, size, hostStrSize); - Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); - auto packSize = builder.create(loc, i64Ty, packedTy); - size = builder.create(loc, count, packSize); - return {size, count}; + // Handle `struct { ... };`. + if (auto sty = dyn_cast(ty)) { + auto bufferTy = convertToTransientType(ty); + auto argPtrTy = cast(arg.getType()); + auto argStrTy = cast(argPtrTy.getElementType()); + + // Create a new struct that we'll store the converted data into. + Value buffer = builder.create(loc, bufferTy); + + // Loop over each element. Replace each with the converted value. 
+ for (auto iter : llvm::enumerate(sty.getMembers())) { + std::int32_t i = iter.index(); + Type memTy = iter.value(); + auto fromPtr = builder.create( + loc, cudaq::cc::PointerType::get(argStrTy.getMember(i)), arg, + ArrayRef{i}); + auto transientTy = convertToTransientType(memTy); + Value toPtr = builder.create( + loc, cudaq::cc::PointerType::get(transientTy), buffer, + ArrayRef{i}); + convertAllStdVectorBool(loc, builder, fromPtr, memTy, heapTracker, toPtr); } - return {}; + return {buffer, true}; } + return {arg, false}; +} - Value descendThroughDynamicType(Location loc, OpBuilder &builder, Type ty, - Value addend, Value arg, Value tmp) { - auto i64Ty = builder.getI64Type(); - Value tySize = - TypeSwitch(ty) - // A char span is dynamic, but it is not recursively dynamic. Just - // read the length of the string out. - .Case([&](cudaq::cc::CharspanType t) -> Value { - return genStringLength(loc, builder, arg); - }) - // A std::vector is dynamic and may be recursive dynamic as well. - .Case([&](cudaq::cc::StdvecType t) -> Value { - // Compute the byte span of the vector. - Value size = genVectorSize(loc, builder, arg); - auto eleTy = t.getElementType(); - if (!cudaq::cc::isDynamicType(eleTy)) - return size; - - // Otherwise, we have a recursively dynamic case. - auto [bytes, count] = - genByteSizeAndElementCount(loc, builder, eleTy, size, arg, t); - assert(count && "vector must have elements"); - size = bytes; - - // At this point, arg is a known vector of elements of dynamic - // type, so walk over the vector and recurse on each element. - // `size` is already the proper size of the lengths of each of the - // elements in turn. 
- builder.create(loc, size, tmp); - auto ptrTy = cast(arg.getType()); - auto strTy = cast(ptrTy.getElementType()); - auto memTy = cast(strTy.getMember(0)); - auto arrTy = - cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(memTy.getElementType()))); - auto castPtr = builder.create(loc, arrTy, arg); - auto castArg = builder.create(loc, castPtr); - auto castPtrTy = - cudaq::cc::PointerType::get(memTy.getElementType()); - cudaq::opt::factory::createInvariantLoop( - builder, loc, count, - [&](OpBuilder &builder, Location loc, Region &, - Block &block) { - Value i = block.getArgument(0); - auto ai = builder.create( - loc, castPtrTy, castArg, - ArrayRef{i}); - auto tmpVal = builder.create(loc, tmp); - Value innerSize = descendThroughDynamicType( - loc, builder, eleTy, tmpVal, ai, tmp); - builder.create(loc, innerSize, tmp); - }); - return builder.create(loc, tmp); - }) - // A struct can be dynamic if it contains dynamic members. Get the - // static portion of the struct first, which will have length slots. - // Then get the dynamic sizes for the dynamic members. 
- .Case([&](cudaq::cc::StructType t) -> Value { - if (cudaq::cc::isDynamicType(t)) { - Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); - Value strSize = - builder.create(loc, i64Ty, packedTy); - for (auto [i, m] : llvm::enumerate(t.getMembers())) { - if (cudaq::cc::isDynamicType(m)) { - auto hostPtrTy = - cast(arg.getType()); - auto hostStrTy = - cast(hostPtrTy.getElementType()); - auto pm = - cudaq::cc::PointerType::get(hostStrTy.getMember(i)); - auto ai = builder.create( - loc, pm, arg, ArrayRef{i}); - strSize = descendThroughDynamicType(loc, builder, m, - strSize, ai, tmp); - } - } - return strSize; - } - return builder.create(loc, i64Ty, t); - }) - .Default([&](Type t) -> Value { - return builder.create(loc, i64Ty, t); - }); - return builder.create(loc, tySize, addend); - } +static std::pair unpackAnyStdVectorBool(Location loc, + OpBuilder &builder, + Value arg, Type ty, + Value heapTracker) { + if (hasStdVectorBool(ty)) + return convertAllStdVectorBool(loc, builder, arg, ty, heapTracker); + return {arg, false}; +} - // Take the list of host-side arguments and device side argument types and zip - // them together logically with the position. Generates any fixup code that's - // needed, like when the device side uses a pair of arguments for a single - // logical device side argument. May drop some arguments on the floor if they - // cannot be encoded. - template - SmallVector> - zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, - TypeRange types, - SmallVectorImpl &freeVectorBuffers) { - SmallVector> result; - if constexpr (argsAreReferences) { - // Simple case: the number of args must be equal to the types. - assert(args.size() == types.size() && - "arguments and types must have same size"); - auto *ctx = builder.getContext(); - for (auto iter : llvm::enumerate(llvm::zip(args, types))) { - // Remove the reference. 
- Value v = std::get(iter.value()); - Type ty = std::get(iter.value()); - if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) || - isa(ty))) - v = builder.create(loc, v); - // Python will pass a std::vector to us here. Unpack it. - if (auto stdvecTy = dyn_cast(ty)) - if (stdvecTy.getElementType() == IntegerType::get(ctx, 1)) { - Type stdvecHostTy = - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); - Value tmp = builder.create(loc, stdvecHostTy); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{tmp, v}); - freeVectorBuffers.push_back(tmp); - v = tmp; - } - result.emplace_back(iter.index(), v, ty); +// Take the list of host-side arguments and device side argument types and zip +// them together logically with the position. Generates any fixup code that's +// needed, like when the device side uses a pair of arguments for a single +// logical device side argument. May drop some arguments on the floor if they +// cannot be encoded. +template +static SmallVector> +zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, + TypeRange types, Value heapTracker) { + SmallVector> result; + if constexpr (argsAreReferences) { + // Simple case: the number of args must be equal to the types. + assert(args.size() == types.size() && + "arguments and types must have same size"); + for (auto iter : llvm::enumerate(llvm::zip(args, types))) { + // Remove the reference. + Value v = std::get(iter.value()); + Type ty = std::get(iter.value()); + if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) || + isa(ty))) + v = builder.create(loc, v); + // Python will pass a std::vector to us here. Unpack it. + auto pear = unpackAnyStdVectorBool(loc, builder, v, ty, heapTracker); + v = pear.first; + result.emplace_back(iter.index(), v, ty); + } + } else /*constexpr*/ { + // In this case, we *may* have logical arguments that are passed in pairs. 
+ auto *ctx = builder.getContext(); + auto *parent = builder.getBlock()->getParentOp(); + auto module = parent->getParentOfType(); + auto lastArg = args.end(); + auto tyIter = types.begin(); + unsigned argPos = 0; + for (auto argIter = args.begin(); argIter != lastArg; + ++argIter, ++tyIter, ++argPos) { + assert(tyIter != types.end()); + Type devTy = *tyIter; + + // std::vector isn't really a std::vector<>. Use the helper + // function to unpack it so it looks like any other vector. + auto pear = + unpackAnyStdVectorBool(loc, builder, *argIter, devTy, heapTracker); + if (pear.second) { + result.emplace_back(argPos, pear.first, devTy); + continue; } - } else /*constexpr*/ { - // In this case, we *may* have logical arguments that are passed in pairs. - auto *ctx = builder.getContext(); - auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto lastArg = args.end(); - auto tyIter = types.begin(); - unsigned argPos = 0; - for (auto argIter = args.begin(); argIter != lastArg; - ++argIter, ++tyIter, ++argPos) { - assert(tyIter != types.end()); - Type devTy = *tyIter; - - // std::vector isn't really a std::vector<>. Use the helper - // function to unpack it so it looks like any other vector. - if (auto stdvecTy = dyn_cast(devTy)) - if (stdvecTy.getElementType() == IntegerType::get(ctx, 1)) { - Type stdvecHostTy = - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); - Value tmp = builder.create(loc, stdvecHostTy); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{tmp, *argIter}); - result.emplace_back(argPos, tmp, devTy); - freeVectorBuffers.push_back(tmp); - continue; - } - // Check for a struct passed in a pair of arguments. 
- if (isa(devTy) && - !isa((*argIter).getType()) && - cudaq::opt::factory::isX86_64(module) && - cudaq::opt::factory::structUsesTwoArguments(devTy)) { - auto first = *argIter++; - auto second = *argIter; - // TODO: Investigate if it's correct to assume the register layout - // will match the memory layout of the small struct. - auto pairTy = cudaq::cc::StructType::get( - ctx, ArrayRef{first.getType(), second.getType()}); - auto tmp = builder.create(loc, pairTy); - auto tmp1 = builder.create( - loc, cudaq::cc::PointerType::get(first.getType()), tmp); - builder.create(loc, first, tmp1); - auto tmp2 = builder.create( - loc, cudaq::cc::PointerType::get(second.getType()), tmp, - ArrayRef{1}); - builder.create(loc, second, tmp2); - auto devPtrTy = cudaq::cc::PointerType::get(devTy); - Value devVal = builder.create(loc, devPtrTy, tmp); - if (!cudaq::cc::isDynamicType(devTy)) - devVal = builder.create(loc, devVal); - result.emplace_back(argPos, devVal, devTy); - continue; - } + // Check for a struct passed in a pair of arguments. + if (isa(devTy) && + !isa((*argIter).getType()) && + cudaq::opt::factory::isX86_64(module) && + cudaq::opt::factory::structUsesTwoArguments(devTy)) { + auto first = *argIter++; + auto second = *argIter; + // TODO: Investigate if it's correct to assume the register layout + // will match the memory layout of the small struct. 
+ auto pairTy = cudaq::cc::StructType::get( + ctx, ArrayRef{first.getType(), second.getType()}); + auto tmp = builder.create(loc, pairTy); + auto tmp1 = builder.create( + loc, cudaq::cc::PointerType::get(first.getType()), tmp); + builder.create(loc, first, tmp1); + auto tmp2 = builder.create( + loc, cudaq::cc::PointerType::get(second.getType()), tmp, + ArrayRef{1}); + builder.create(loc, second, tmp2); + auto devPtrTy = cudaq::cc::PointerType::get(devTy); + Value devVal = builder.create(loc, devPtrTy, tmp); + if (!cudaq::cc::isDynamicType(devTy)) + devVal = builder.create(loc, devVal); + result.emplace_back(argPos, devVal, devTy); + continue; + } - // Is this a static struct passed as a byval pointer? - if (isa(devTy) && - isa((*argIter).getType()) && - !cudaq::cc::isDynamicType(devTy)) { - Value devVal = builder.create(loc, *argIter); - result.emplace_back(argPos, devVal, devTy); - continue; - } - result.emplace_back(argPos, *argIter, devTy); + // Is this a static struct passed as a byval pointer? + if (isa(devTy) && + isa((*argIter).getType()) && + !cudaq::cc::isDynamicType(devTy)) { + Value devVal = builder.create(loc, *argIter); + result.emplace_back(argPos, devVal, devTy); + continue; } + result.emplace_back(argPos, *argIter, devTy); } - return result; } + return result; +} - Value genSizeOfDynamicMessageBuffer( - Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, - ArrayRef> zippy, Value tmp) { - auto i64Ty = builder.getI64Type(); - Value initSize = builder.create(loc, i64Ty, structTy); - for (auto [_, a, t] : zippy) - if (cudaq::cc::isDynamicType(t)) - initSize = descendThroughDynamicType(loc, builder, t, initSize, a, tmp); - return initSize; - } +static Value descendThroughDynamicType(Location loc, OpBuilder &builder, + Type ty, Value addend, Value arg, + Value tmp) { + auto i64Ty = builder.getI64Type(); + Value tySize = + TypeSwitch(ty) + // A char span is dynamic, but it is not recursively dynamic. 
Just + // read the length of the string out. + .Case([&](cudaq::cc::CharspanType t) -> Value { + return genStringLength(loc, builder, arg); + }) + // A std::vector is dynamic and may be recursive dynamic as well. + .Case([&](cudaq::cc::StdvecType t) -> Value { + // Compute the byte span of the vector. + Value size = genVectorSize(loc, builder, arg); + auto eleTy = t.getElementType(); + if (!cudaq::cc::isDynamicType(eleTy)) + return size; + + // Otherwise, we have a recursively dynamic case. + auto [bytes, count] = + genByteSizeAndElementCount(loc, builder, eleTy, size, arg, t); + assert(count && "vector must have elements"); + size = bytes; + + // At this point, arg is a known vector of elements of dynamic + // type, so walk over the vector and recurse on each element. + // `size` is already the proper size of the lengths of each of the + // elements in turn. + builder.create(loc, size, tmp); + auto ptrTy = cast(arg.getType()); + auto strTy = cast(ptrTy.getElementType()); + auto memTy = cast(strTy.getMember(0)); + auto arrTy = + cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(memTy.getElementType()))); + auto castPtr = builder.create(loc, arrTy, arg); + auto castArg = builder.create(loc, castPtr); + auto castPtrTy = + cudaq::cc::PointerType::get(memTy.getElementType()); + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + auto ai = builder.create( + loc, castPtrTy, castArg, + ArrayRef{i}); + auto tmpVal = builder.create(loc, tmp); + Value innerSize = descendThroughDynamicType( + loc, builder, eleTy, tmpVal, ai, tmp); + builder.create(loc, innerSize, tmp); + }); + return builder.create(loc, tmp); + }) + // A struct can be dynamic if it contains dynamic members. Get the + // static portion of the struct first, which will have length slots. + // Then get the dynamic sizes for the dynamic members. 
+ .Case([&](cudaq::cc::StructType t) -> Value { + if (cudaq::cc::isDynamicType(t)) { + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + Value strSize = + builder.create(loc, i64Ty, packedTy); + for (auto [i, m] : llvm::enumerate(t.getMembers())) { + if (cudaq::cc::isDynamicType(m)) { + auto hostPtrTy = cast(arg.getType()); + auto hostStrTy = + cast(hostPtrTy.getElementType()); + auto pm = cudaq::cc::PointerType::get(hostStrTy.getMember(i)); + auto ai = builder.create( + loc, pm, arg, ArrayRef{i}); + strSize = descendThroughDynamicType(loc, builder, m, strSize, + ai, tmp); + } + } + return strSize; + } + return builder.create(loc, i64Ty, t); + }) + .Default([&](Type t) -> Value { + return builder.create(loc, i64Ty, t); + }); + return builder.create(loc, tySize, addend); +} - Value populateStringAddendum(Location loc, OpBuilder &builder, Value host, - Value sizeSlot, Value addendum) { - Value size = genStringLength(loc, builder, host); - builder.create(loc, size, sizeSlot); - auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - auto ptrPtrI8 = getPointerToPointerType(builder); - auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); - auto fromPtr = builder.create(loc, fromPtrPtr); - auto notVolatile = builder.create(loc, 0, 1); - auto toPtr = builder.create(loc, ptrI8Ty, addendum); - builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{toPtr, fromPtr, size, notVolatile}); - auto ptrI8Arr = getByteAddressableType(builder); - auto addBytes = builder.create(loc, ptrI8Arr, addendum); - return builder.create( - loc, ptrI8Ty, addBytes, ArrayRef{size}); - } +static Value genSizeOfDynamicMessageBuffer( + Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, + ArrayRef> zippy, Value tmp) { + auto i64Ty = builder.getI64Type(); + Value initSize = builder.create(loc, i64Ty, structTy); + for (auto [_, a, t] : zippy) + if (cudaq::cc::isDynamicType(t)) + initSize = descendThroughDynamicType(loc, builder, t, initSize, 
a, tmp); + return initSize; +} - // Simple case when the vector data is known to not hold dynamic data. - Value populateVectorAddendum(Location loc, OpBuilder &builder, Value host, - Value sizeSlot, Value addendum) { - Value size = genVectorSize(loc, builder, host); - builder.create(loc, size, sizeSlot); - auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - auto ptrPtrI8 = getPointerToPointerType(builder); - auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); - auto fromPtr = builder.create(loc, fromPtrPtr); - auto notVolatile = builder.create(loc, 0, 1); - auto toPtr = builder.create(loc, ptrI8Ty, addendum); - builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{toPtr, fromPtr, size, notVolatile}); - auto ptrI8Arr = getByteAddressableType(builder); - auto addBytes = builder.create(loc, ptrI8Arr, addendum); - return builder.create( - loc, ptrI8Ty, addBytes, ArrayRef{size}); - } +static Value populateStringAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, + Value addendum) { + Value size = genStringLength(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} - Value populateDynamicAddendum(Location loc, OpBuilder &builder, Type devArgTy, - Value host, Value sizeSlot, Value addendum, - Value addendumScratch) { - if (isa(devArgTy)) - return populateStringAddendum(loc, builder, host, 
sizeSlot, addendum); - if (auto vecTy = dyn_cast(devArgTy)) { - auto eleTy = vecTy.getElementType(); - if (cudaq::cc::isDynamicType(eleTy)) { - // Recursive case. Visit each dynamic element, copying it. - Value size = genVectorSize(loc, builder, host); - auto [bytes, count] = genByteSizeAndElementCount(loc, builder, eleTy, - size, host, devArgTy); - size = bytes; - builder.create(loc, size, sizeSlot); - // Convert from bytes to vector length in elements. - // Compute new addendum start. - auto addrTy = getByteAddressableType(builder); - auto castEnd = builder.create(loc, addrTy, addendum); - Value newAddendum = builder.create( - loc, addendum.getType(), castEnd, - ArrayRef{size}); - builder.create(loc, newAddendum, addendumScratch); - auto sizeBlockTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(builder.getI64Type())); - auto ptrI64Ty = cudaq::cc::PointerType::get(builder.getI64Type()); - // In the recursive case, the next block of addendum is a vector of - // sizes in bytes. Each size will be the size of the vector at that - // offset. - auto sizeBlock = - builder.create(loc, sizeBlockTy, addendum); - auto ptrPtrBlockTy = cudaq::cc::PointerType::get( - cast( - cast(host.getType()).getElementType()) - .getMember(0)); - // The host argument is a std::vector, so we want to get the address of - // "front" out of the vector (the first pointer in the triple) and step - // over the contiguous range of vectors in the host block. The vector of - // vectors forms a ragged array structure in host memory. - auto hostBeginPtrRef = builder.create( - loc, ptrPtrBlockTy, host, ArrayRef{0}); - auto hostBegin = - builder.create(loc, hostBeginPtrRef); - auto hostEleTy = cast(hostBegin.getType()); - auto hostBlockTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(hostEleTy.getElementType())); - auto hostBlock = - builder.create(loc, hostBlockTy, hostBegin); - // Loop over each vector element in the vector (recursively). 
- cudaq::opt::factory::createInvariantLoop( - builder, loc, count, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - Value addm = - builder.create(loc, addendumScratch); - auto subSlot = builder.create( - loc, ptrI64Ty, sizeBlock, - ArrayRef{i}); - auto subHost = builder.create( - loc, hostEleTy, hostBlock, - ArrayRef{i}); - Value newAddm = populateDynamicAddendum( - loc, builder, eleTy, subHost, subSlot, addm, addendumScratch); - builder.create(loc, newAddm, addendumScratch); - }); - return builder.create(loc, addendumScratch); - } - return populateVectorAddendum(loc, builder, host, sizeSlot, addendum); +// Simple case when the vector data is known to not hold dynamic data. +static Value populateVectorAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, + Value addendum) { + Value size = genVectorSize(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} + +static Value populateDynamicAddendum(Location loc, OpBuilder &builder, + Type devArgTy, Value host, Value sizeSlot, + Value addendum, Value addendumScratch) { + if (isa(devArgTy)) + return populateStringAddendum(loc, builder, host, sizeSlot, addendum); + if (auto vecTy = dyn_cast(devArgTy)) { + auto eleTy = vecTy.getElementType(); + if (cudaq::cc::isDynamicType(eleTy)) { + // Recursive case. 
Visit each dynamic element, copying it. + Value size = genVectorSize(loc, builder, host); + auto [bytes, count] = + genByteSizeAndElementCount(loc, builder, eleTy, size, host, devArgTy); + size = bytes; + builder.create(loc, size, sizeSlot); + + // Convert from bytes to vector length in elements. + // Compute new addendum start. + auto addrTy = getByteAddressableType(builder); + auto castEnd = builder.create(loc, addrTy, addendum); + Value newAddendum = builder.create( + loc, addendum.getType(), castEnd, + ArrayRef{size}); + builder.create(loc, newAddendum, addendumScratch); + Type dataTy = cudaq::opt::factory::genArgumentBufferType(eleTy); + auto arrDataTy = cudaq::cc::ArrayType::get(dataTy); + auto sizeBlockTy = cudaq::cc::PointerType::get(arrDataTy); + auto ptrDataTy = cudaq::cc::PointerType::get(dataTy); + + // In the recursive case, the next block of addendum is a vector of + // elements which are either sizes or contain sizes. The sizes are i64 + // and expressed in bytes. Each size will be the size of the span of the + // element (or its subfields) at that offset. + auto sizeBlock = + builder.create(loc, sizeBlockTy, addendum); + auto hostEleTy = + cast(host.getType()).getElementType(); + auto ptrPtrBlockTy = cudaq::cc::PointerType::get( + cast(hostEleTy).getMember(0)); + + // The host argument is a std::vector, so we want to get the address of + // "front" out of the vector (the first pointer in the triple) and step + // over the contiguous range of vectors in the host block. The vector of + // vectors forms a ragged array structure in host memory. 
+ auto hostBeginPtrRef = builder.create( + loc, ptrPtrBlockTy, host, ArrayRef{0}); + auto hostBegin = builder.create(loc, hostBeginPtrRef); + auto hostBeginEleTy = cast(hostBegin.getType()); + auto hostBlockTy = cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(hostBeginEleTy.getElementType())); + auto hostBlock = + builder.create(loc, hostBlockTy, hostBegin); + + // Loop over each vector element in the vector (recursively). + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + Value addm = + builder.create(loc, addendumScratch); + auto subSlot = builder.create( + loc, ptrDataTy, sizeBlock, + ArrayRef{i}); + auto subHost = builder.create( + loc, hostBeginEleTy, hostBlock, + ArrayRef{i}); + Value newAddm = populateDynamicAddendum( + loc, builder, eleTy, subHost, subSlot, addm, addendumScratch); + builder.create(loc, newAddm, addendumScratch); + }); + return builder.create(loc, addendumScratch); } - auto devStrTy = cast(devArgTy); - auto hostStrTy = cast( - cast(sizeSlot.getType()).getElementType()); - assert(devStrTy.getNumMembers() == hostStrTy.getNumMembers()); - for (auto iter : llvm::enumerate(devStrTy.getMembers())) { - std::int32_t iterIdx = iter.index(); - auto hostPtrTy = cast(host.getType()); - auto hostMemTy = cast(hostPtrTy.getElementType()) - .getMember(iterIdx); - auto val = builder.create( - loc, cudaq::cc::PointerType::get(hostMemTy), host, + return populateVectorAddendum(loc, builder, host, sizeSlot, addendum); + } + auto devStrTy = cast(devArgTy); + auto hostStrTy = cast( + cast(sizeSlot.getType()).getElementType()); + assert(devStrTy.getNumMembers() == hostStrTy.getNumMembers()); + for (auto iter : llvm::enumerate(devStrTy.getMembers())) { + std::int32_t iterIdx = iter.index(); + auto hostPtrTy = cast(host.getType()); + auto hostMemTy = cast(hostPtrTy.getElementType()) + .getMember(iterIdx); + auto val = builder.create( + loc, 
cudaq::cc::PointerType::get(hostMemTy), host, + ArrayRef{iterIdx}); + Type iterTy = iter.value(); + if (cudaq::cc::isDynamicType(iterTy)) { + Value fieldInSlot = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot, ArrayRef{iterIdx}); - Type iterTy = iter.value(); - if (cudaq::cc::isDynamicType(iterTy)) { - Value fieldInSlot = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot, - ArrayRef{iterIdx}); - addendum = populateDynamicAddendum( - loc, builder, iterTy, val, fieldInSlot, addendum, addendumScratch); - } else { - Value fieldInSlot = builder.create( - loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, - ArrayRef{iterIdx}); - auto v = builder.create(loc, val); - builder.create(loc, v, fieldInSlot); - } + addendum = populateDynamicAddendum(loc, builder, iterTy, val, fieldInSlot, + addendum, addendumScratch); + } else { + Value fieldInSlot = builder.create( + loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, + ArrayRef{iterIdx}); + auto v = builder.create(loc, val); + builder.create(loc, v, fieldInSlot); } - return addendum; } + return addendum; +} - void populateMessageBuffer(Location loc, OpBuilder &builder, - Value msgBufferBase, - ArrayRef> zippy, - Value addendum = {}, Value addendumScratch = {}) { - auto structTy = cast( - cast(msgBufferBase.getType()).getElementType()); - // Loop over all the arguments and populate the message buffer. - for (auto [idx, arg, devArgTy] : zippy) { - if (cudaq::cc::isDynamicType(devArgTy)) { - assert(addendum && "must have addendum to encode dynamic argument(s)"); - // Get the address of the slot to be filled. - auto memberTy = cast(structTy).getMember(idx); - auto ptrTy = cudaq::cc::PointerType::get(memberTy); - auto slot = builder.create( - loc, ptrTy, msgBufferBase, ArrayRef{idx}); - addendum = populateDynamicAddendum(loc, builder, devArgTy, arg, slot, - addendum, addendumScratch); - continue; - } - - // If the argument is a callable, skip it. 
- if (isa(devArgTy)) - continue; - // If the argument is an empty struct, skip it. - if (auto strTy = dyn_cast(devArgTy); - strTy && strTy.isEmpty()) - continue; - +static void +populateMessageBuffer(Location loc, OpBuilder &builder, Value msgBufferBase, + ArrayRef> zippy, + Value addendum = {}, Value addendumScratch = {}) { + auto structTy = cast( + cast(msgBufferBase.getType()).getElementType()); + // Loop over all the arguments and populate the message buffer. + for (auto [idx, arg, devArgTy] : zippy) { + if (cudaq::cc::isDynamicType(devArgTy)) { + assert(addendum && "must have addendum to encode dynamic argument(s)"); // Get the address of the slot to be filled. auto memberTy = cast(structTy).getMember(idx); auto ptrTy = cudaq::cc::PointerType::get(memberTy); - Value slot = builder.create( + auto slot = builder.create( loc, ptrTy, msgBufferBase, ArrayRef{idx}); + addendum = populateDynamicAddendum(loc, builder, devArgTy, arg, slot, + addendum, addendumScratch); + continue; + } - // Argument is a packaged kernel. In this case, the argument is some - // unknown kernel that may be called. The packaged argument is coming - // from opaque C++ host code, so we need to identify what kernel it - // references and then pass its name as a span of characters to the - // launch kernel. - if (isa(devArgTy)) { - auto i64Ty = builder.getI64Type(); - auto kernKey = builder.create( - loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); - builder.create(loc, kernKey.getResult(0), slot); - continue; - } + // If the argument is a callable, skip it. + if (isa(devArgTy)) + continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(devArgTy); + strTy && strTy.isEmpty()) + continue; + + // Get the address of the slot to be filled. 
+ auto memberTy = cast(structTy).getMember(idx); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + Value slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{idx}); + + // Argument is a packaged kernel. In this case, the argument is some + // unknown kernel that may be called. The packaged argument is coming + // from opaque C++ host code, so we need to identify what kernel it + // references and then pass its name as a span of characters to the + // launch kernel. + if (isa(devArgTy)) { + auto i64Ty = builder.getI64Type(); + auto kernKey = builder.create( + loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); + builder.create(loc, kernKey.getResult(0), slot); + continue; + } - // Just pass the raw pointer. The buffer is supposed to be pointer-free - // since it may be unpacked in a different address space. However, if this - // is a simulation and things are in the same address space, we pass the - // pointer for convenience. - if (isa(devArgTy)) - arg = builder.create(loc, memberTy, arg); - - if (isa(arg.getType()) && - (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) { - slot = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), slot); - } - builder.create(loc, arg, slot); + // Just pass the raw pointer. The buffer is supposed to be pointer-free + // since it may be unpacked in a different address space. However, if this + // is a simulation and things are in the same address space, we pass the + // pointer for convenience. + if (isa(devArgTy)) + arg = builder.create(loc, memberTy, arg); + + if (isa(arg.getType()) && + (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) { + slot = builder.create( + loc, cudaq::cc::PointerType::get(arg.getType()), slot); } + builder.create(loc, arg, slot); } +} + +/// A kernel function that takes a quantum type argument (also known as a pure +/// device kernel) cannot be called directly from C++ (classical) code. It must +/// be called via other quantum code. 
+static bool hasLegalType(FunctionType funTy) { + for (auto ty : funTy.getInputs()) + if (quake::isQuantumType(ty)) + return false; + for (auto ty : funTy.getResults()) + if (quake::isQuantumType(ty)) + return false; + return true; +} + +static MutableArrayRef +dropAnyHiddenArguments(MutableArrayRef args, FunctionType funcTy, + bool hasThisPointer) { + const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); + const unsigned count = + cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet); + if (count > 0 && args.size() >= count && + std::all_of(args.begin(), args.begin() + count, [](auto i) { + return isa(i.getType()); + })) + return args.drop_front(count); + return args; +} + +static std::pair +lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module, + func::FuncOp funcOp) { + if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") || + mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) { + // No host entry point needed. + return {false, func::FuncOp{}}; + } + if (auto *decl = module.lookupSymbol(mangledEntryPointName)) + if (auto func = dyn_cast(decl)) { + func.eraseBody(); + return {true, func}; + } + funcOp.emitOpError("could not generate the host-side kernel function (" + + mangledEntryPointName + ")"); + return {true, func::FuncOp{}}; +} + +/// Generate code to initialize the std::vector, \p sret, from an initializer +/// list with data at \p data and length \p size. Use the library helper +/// routine. This function takes two !llvm.ptr arguments. 
+static void genStdvecBoolFromInitList(Location loc, OpBuilder &builder, + Value sret, Value data, Value size) { + auto ptrTy = cudaq::cc::PointerType::get(builder.getContext()); + auto castData = builder.create(loc, ptrTy, data); + auto castSret = builder.create(loc, ptrTy, sret); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolCtorFromInitList, + ArrayRef{castSret, castData, size}); +} + +/// Generate a `std::vector` (where `T != bool`) from an initializer list. +/// This is done with the assumption that `std::vector` is implemented as a +/// triple of pointers. The original content of the vector is freed and the new +/// content, which is already on the stack, is moved into the `std::vector`. +static void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret, + Value data, Value tSize, Value vecSize) { + auto i8Ty = builder.getI8Type(); + auto stlVectorTy = + cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty)); + auto ptrTy = cudaq::cc::PointerType::get(i8Ty); + auto castSret = builder.create(loc, stlVectorTy, sret); + auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); + auto sret0 = builder.create( + loc, ptrPtrTy, castSret, SmallVector{0}); + auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); + auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); + auto buffPtr0 = builder.create(loc, ptrTy, data); + builder.create(loc, buffPtr0, sret0); + auto sret1 = builder.create( + loc, ptrPtrTy, castSret, SmallVector{1}); + Value byteLen = builder.create(loc, tSize, vecSize); + auto buffPtr = builder.create(loc, ptrArrTy, data); + auto endPtr = builder.create( + loc, ptrTy, buffPtr, SmallVector{byteLen}); + builder.create(loc, endPtr, sret1); + auto sret2 = builder.create( + loc, ptrPtrTy, castSret, SmallVector{2}); + builder.create(loc, endPtr, sret2); +} + +// Alloca a pointer to a pointer and initialize it to nullptr. 
+static Value createEmptyHeapTracker(Location loc, OpBuilder &builder) { + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto result = builder.create(loc, ptrI8Ty); + auto zero = builder.create(loc, 0, 64); + auto null = builder.create(loc, ptrI8Ty, zero); + builder.create(loc, null, result); + return result; +} + +// If there are temporaries, call the helper to free them. +static void maybeFreeHeapAllocations(Location loc, OpBuilder &builder, + Value heapTracker) { + auto head = builder.create(loc, heapTracker); + auto zero = builder.create(loc, 0, 64); + auto headAsInt = + builder.create(loc, builder.getI64Type(), head); + auto cmp = builder.create(loc, arith::CmpIPredicate::ne, + headAsInt, zero); + // If there are no std::vector to unpack, then the heapTracker will be + // set to `nullptr` and otherwise unused. That will allow the compiler to DCE + // this call after constant propagation. + builder.create( + loc, TypeRange{}, cmp, + [&](OpBuilder &builder, Location loc, Region ®ion) { + region.push_back(new Block()); + auto &body = region.front(); + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(&body); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolFreeTemporaryLists, + ArrayRef{head}); + builder.create(loc); + }); +} + +/// This pass adds a `.thunk` function and a rewritten C++ host +/// side (mangled) stub to the code for every entry-point kernel in the module. +/// It may also generate a `.argsCreator` function. Finally, it +/// creates registration hooks for the CUDA-Q runtime to be able to find the +/// kernel by name and, as appropriate, the `.argsCreator` +/// function. 
+namespace { +class GenerateKernelExecution + : public cudaq::opt::impl::GenerateKernelExecutionBase< + GenerateKernelExecution> { +public: + using GenerateKernelExecutionBase::GenerateKernelExecutionBase; /// Creates a function that can take a block of pointers to argument values /// and using the compiler's knowledge of a kernel encodes those argument @@ -738,9 +1028,9 @@ class GenerateKernelExecution // Zip the arguments with the device side argument types. Recall that some // of the (left-most) arguments may have been dropped on the floor. const bool hasDynamicSignature = isDynamicSignature(devKernelTy); - SmallVector freeVectorBuffers; + Value heapTracker = createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, pseudoArgs, passedDevArgTys, freeVectorBuffers); + loc, builder, pseudoArgs, passedDevArgTys, heapTracker); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) @@ -774,18 +1064,7 @@ class GenerateKernelExecution populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); } - if (!freeVectorBuffers.empty()) { - // Need to free any temporary vector-like buffers. These arise when - // there is a std::vector argument, which we translate into a - // std::vector to reuse the same code as any other std::vector. - for (auto vecVar : freeVectorBuffers) { - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); - auto ptrPtr = builder.create(loc, ptrPtrTy, vecVar); - Value freeMe = builder.create(loc, ptrPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{freeMe}); - } - } + maybeFreeHeapAllocations(loc, builder, heapTracker); // Return the message buffer and its size in bytes. builder.create(loc, rawMessageBuffer, @@ -1086,82 +1365,6 @@ class GenerateKernelExecution return thunk; } - /// Generate code to initialize the std::vector, \p sret, from an - /// initializer list with data at \p data and length \p size. Use the library - /// helper routine. 
This function takes two !llvm.ptr arguments. - void genStdvecBoolFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value size) { - auto ptrTy = cudaq::cc::PointerType::get(builder.getContext()); - auto castData = builder.create(loc, ptrTy, data); - auto castSret = builder.create(loc, ptrTy, sret); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolCtorFromInitList, - ArrayRef{castSret, castData, size}); - } - - /// Generate a `std::vector` (where `T != bool`) from an initializer list. - /// This is done with the assumption that `std::vector` is implemented as a - /// triple of pointers. The original content of the vector is freed and the - /// new content, which is already on the stack, is moved into the - /// `std::vector`. - void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value tSize, Value vecSize) { - auto i8Ty = builder.getI8Type(); - auto stlVectorTy = - cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty)); - auto ptrTy = cudaq::cc::PointerType::get(i8Ty); - auto castSret = builder.create(loc, stlVectorTy, sret); - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); - auto sret0 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{0}); - auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); - auto buffPtr0 = builder.create(loc, ptrTy, data); - builder.create(loc, buffPtr0, sret0); - auto sret1 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{1}); - Value byteLen = builder.create(loc, tSize, vecSize); - auto buffPtr = builder.create(loc, ptrArrTy, data); - auto endPtr = builder.create( - loc, ptrTy, buffPtr, SmallVector{byteLen}); - builder.create(loc, endPtr, sret1); - auto sret2 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{2}); - builder.create(loc, endPtr, sret2); - } - - static MutableArrayRef - dropAnyHiddenArguments(MutableArrayRef args, - FunctionType funcTy, bool hasThisPointer) { - const bool 
hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); - const unsigned count = - cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet); - if (count > 0 && args.size() >= count && - std::all_of(args.begin(), args.begin() + count, [](auto i) { - return isa(i.getType()); - })) - return args.drop_front(count); - return args; - } - - static std::pair - lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module, - func::FuncOp funcOp) { - if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") || - mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) { - // No host entry point needed. - return {false, func::FuncOp{}}; - } - if (auto *decl = module.lookupSymbol(mangledEntryPointName)) - if (auto func = dyn_cast(decl)) { - func.eraseBody(); - return {true, func}; - } - funcOp.emitOpError("could not generate the host-side kernel function (" + - mangledEntryPointName + ")"); - return {true, func::FuncOp{}}; - } - /// Generate an all new entry point body, calling someLaunchKernel in /// the runtime library. Pass along the thunk, so the runtime can call the /// quantum circuit. These entry points may be `operator()` member functions @@ -1188,9 +1391,9 @@ class GenerateKernelExecution SmallVector blockValues(blockArgs.size()); std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); const bool hasDynamicSignature = isDynamicSignature(devFuncTy); - SmallVector freeVectorBuffers; + Value heapTracker = createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, blockValues, devFuncTy.getInputs(), freeVectorBuffers); + loc, builder, blockValues, devFuncTy.getInputs(), heapTracker); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) @@ -1224,20 +1427,7 @@ class GenerateKernelExecution populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); } - if (!freeVectorBuffers.empty()) { - // Need to free any temporary vector-like buffers. 
These arise when - // there is a std::vector argument, which we translate into a - // std::vector to reuse the same code as any other std::vector. - for (auto vecVar : freeVectorBuffers) { - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); - auto ptrPtr = - builder.create(loc, ptrPtrTy, vecVar); - Value freeMe = builder.create(loc, ptrPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{freeMe}); - } - } - + maybeFreeHeapAllocations(loc, builder, heapTracker); extendedStructSize = messageBufferSize; Value loadThunk = builder.create(loc, thunkTy, thunkFunc.getName()); @@ -1485,19 +1675,6 @@ class GenerateKernelExecution builder.create(loc, results); } - /// A kernel function that takes a quantum type argument (also known as a pure - /// device kernel) cannot be called directly from C++ (classical) code. It - /// must be called via other quantum code. - bool hasLegalType(FunctionType funTy) { - for (auto ty : funTy.getInputs()) - if (quake::isQuantumType(ty)) - return false; - for (auto ty : funTy.getResults()) - if (quake::isQuantumType(ty)) - return false; - return true; - } - /// Generate a function to be executed at load-time which will register the /// kernel with the runtime. 
LLVM::LLVMFuncOp registerKernelWithRuntimeForExecution( @@ -1618,6 +1795,10 @@ class GenerateKernelExecution irBuilder.loadIntrinsic(module, cudaq::stdvecBoolUnpackToInitList))) return module.emitError(std::string("could not load ") + cudaq::stdvecBoolUnpackToInitList); + if (failed(irBuilder.loadIntrinsic(module, + cudaq::stdvecBoolFreeTemporaryLists))) + return module.emitError(std::string("could not load ") + + cudaq::stdvecBoolFreeTemporaryLists); if (failed(irBuilder.loadIntrinsic(module, cudaq::llvmMemCopyIntrinsic))) return module.emitError(std::string("could not load ") + cudaq::llvmMemCopyIntrinsic); @@ -1628,7 +1809,6 @@ class GenerateKernelExecution return success(); } -public: void runOnOperation() override { auto module = getOperation(); auto *ctx = module.getContext(); diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp index 10ecc3b914a..d6cbc3c2270 100644 --- a/runtime/cudaq/cudaq.cpp +++ b/runtime/cudaq/cudaq.cpp @@ -470,20 +470,37 @@ void __nvqpp_initializer_list_to_vector_bool(std::vector &result, /// `std::vector` overload. The conversion turns the `std::vector` /// into a mock vector structure that looks like `std::vector`. The /// calling routine must cleanup the buffer allocated by this code. -void __nvqpp_vector_bool_to_initializer_list(void *outData, - const std::vector &inVec) { +/// This helper routine may only be called on the host side. +void __nvqpp_vector_bool_to_initializer_list( + void *outData, const std::vector &inVec, + std::vector **allocations) { // The MockVector must be allocated by the caller. struct MockVector { char *start; char *end; + char *end2; }; MockVector *mockVec = reinterpret_cast(outData); auto outSize = inVec.size(); // The buffer allocated here must be freed by the caller. 
- mockVec->start = static_cast(malloc(outSize)); - mockVec->end = mockVec->start + outSize; + if (!*allocations) + *allocations = new std::vector; + char *newData = static_cast(malloc(outSize)); + (*allocations)->push_back(newData); + mockVec->start = newData; + mockVec->end2 = mockVec->end = newData + outSize; for (unsigned i = 0; i < outSize; ++i) - (mockVec->start)[i] = static_cast(inVec[i]); + newData[i] = static_cast(inVec[i]); +} + +/// This helper routine deletes the vector that tracks all the temporaries that +/// were created as well as the temporaries themselves. +/// This routine may only be called on the host side. +void __nvqpp_vector_bool_free_temporary_initlists( + std::vector *allocations) { + for (auto *p : *allocations) + free(p); + delete allocations; } } } // namespace cudaq::support diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h index c83dffe844f..cb3e2a6a735 100644 --- a/runtime/cudaq/qis/qubit_qis.h +++ b/runtime/cudaq/qis/qubit_qis.h @@ -828,11 +828,13 @@ std::vector mz(qubit &q, Qs &&...qs) { } namespace support { -// Helper to initialize a `vector` data structure. +// Helpers to deal with the `vector` specialized template type. extern "C" { void __nvqpp_initializer_list_to_vector_bool(std::vector &, char *, std::size_t); -void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector &); +void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector &, + std::vector **); +void __nvqpp_vector_bool_free_temporary_initlists(std::vector *); } } // namespace support diff --git a/targettests/SeparateCompilation/arith_spans.cpp b/targettests/SeparateCompilation/arith_spans.cpp new file mode 100644 index 00000000000..67dc8f329e6 --- /dev/null +++ b/targettests/SeparateCompilation/arith_spans.cpp @@ -0,0 +1,229 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: if [ command -v split-file ]; then \ +// RUN: split-file %s %t && \ +// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_dumps.cpp -o %t/span_dumps.o && \ +// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_exercise.cpp -o %t/span_exercise.o && \ +// RUN: nvq++ %cpp_std --enable-mlir %t/span_dumps.o %t/span_exercise.o -o %t/spanaroo.out && \ +// RUN: %t/spanaroo.out | FileCheck %s ; else \ +// RUN: echo "skipping" ; fi +// clang-format on + +//--- span_dumps.cpp + +#include +#include +#include + +extern "C" { +void dump_bool_vector(std::span x) { + std::cout << "booleans: "; + for (auto i : x) + std::cout << i << ' '; + std::cout << '\n'; +} + +void dump_int_vector(std::span x) { + std::cout << "integers: "; + for (auto i : x) + std::cout << i << ' '; + std::cout << '\n'; +} + +void dump_double_vector(std::span x) { + std::cout << "doubles: "; + for (auto d : x) + std::cout << d << ' '; + std::cout << '\n'; +} +} + +//--- span_exercise.cpp + +#include +#include + +// Fake host C++ signature that matches. 
+extern "C" { +void dump_int_vector(const std::vector &pw); +void dump_bool_vector(const std::vector &pw); +void dump_double_vector(const std::vector &pw); +} + +__qpu__ void kern1(std::vector arg) { dump_int_vector(arg); } + +__qpu__ void kern2(std::vector> arg) { + for (unsigned i = 0; i < arg.size(); ++i) + dump_int_vector(arg[i]); +} + +struct IntVectorPair { + std::vector _0; + std::vector _1; +}; + +__qpu__ void kern3(IntVectorPair ivp) { + dump_int_vector(ivp._0); + dump_int_vector(ivp._1); +} + +__qpu__ void kern4(std::vector vivp) { + for (unsigned i = 0; i < vivp.size(); ++i) { + dump_int_vector(vivp[i]._0); + dump_int_vector(vivp[i]._1); + } +} + +__qpu__ void qern1(std::vector arg) { dump_double_vector(arg); } + +__qpu__ void qern2(std::vector> arg) { + for (unsigned i = 0; i < arg.size(); ++i) + dump_double_vector(arg[i]); +} + +struct DoubleVectorPair { + std::vector _0; + std::vector _1; +}; + +__qpu__ void qern3(DoubleVectorPair ivp) { + dump_double_vector(ivp._0); + dump_double_vector(ivp._1); +} + +__qpu__ void qern4(std::vector vivp) { + for (unsigned i = 0; i < vivp.size(); ++i) { + dump_double_vector(vivp[i]._0); + dump_double_vector(vivp[i]._1); + } +} + +__qpu__ void cern1(std::vector arg) { dump_bool_vector(arg); } + +__qpu__ void cern2(std::vector> arg) { + for (unsigned i = 0; i < arg.size(); ++i) + dump_bool_vector(arg[i]); +} + +struct BoolVectorPair { + std::vector _0; + std::vector _1; +}; + +__qpu__ void cern3(BoolVectorPair ivp) { + dump_bool_vector(ivp._0); + dump_bool_vector(ivp._1); +} + +__qpu__ void cern4(std::vector vivp) { + for (unsigned i = 0; i < vivp.size(); ++i) { + dump_bool_vector(vivp[i]._0); + dump_bool_vector(vivp[i]._1); + } +} + +int main() { + std::vector pw0 = {345, 1, 2}; + std::cout << "---\n"; + kern1(pw0); + std::vector pw1 = {92347, 3, 4}; + std::vector pw2 = {2358, 5, 6}; + std::vector pw3 = {45, 7, 18}; + std::vector> vpw{pw0, pw1, pw2, pw3}; + std::cout << "---\n"; + kern2(vpw); + + IntVectorPair ivp = 
{{8, 238, 44}, {0, -4, 81, 92745}}; + std::cout << "---\n"; + kern3(ivp); + + IntVectorPair ivp2 = {{5, -87, 43, 1, 76}, {0, 0, 2, 1}}; + IntVectorPair ivp3 = {{1}, {-2, 3}}; + IntVectorPair ivp4 = {{-4, -5, 6}, {-7, -8, -9, 88}}; + std::vector vivp = {ivp, ivp2, ivp3, ivp4}; + std::cout << "---\n"; + // kern4(vivp); + + std::vector dpw0 = {3.45, 1., 2.}; + std::cout << "---\n"; + qern1(dpw0); + std::vector dpw1 = {92.347, 2.3, 4.}; + std::vector dpw2 = {235.8, 5.5, 6.4}; + std::vector dpw3 = {4.5, 77.7, 18.2}; + std::vector> vdpw{dpw0, dpw1, dpw2, dpw3}; + std::cout << "---\n"; + qern2(vdpw); + + DoubleVectorPair dvp = {{8., 2.38, 4.4}, {0., -4.99, 81.5, 92.745}}; + std::cout << "---\n"; + qern3(dvp); + + DoubleVectorPair dvp2 = {{5., -8.7, 4.3, 1., 7.6}, {0., 0., 2., 1.}}; + DoubleVectorPair dvp3 = {{1.}, {-2., 3.}}; + DoubleVectorPair dvp4 = {{-4., -5., 6.}, {-7., -8., -9., .88}}; + std::vector vdvp = {dvp, dvp2, dvp3, dvp4}; + std::cout << "---\n"; + // qern4(vdvp); + + std::vector bpw0 = {true, false}; + std::cout << "---\n"; + cern1(bpw0); + std::vector bpw1 = {false, false, false}; + std::vector bpw2 = {false, true, false, true}; + std::vector bpw3 = {false, false, true, false, true}; + std::vector> vbpw{bpw0, bpw1, bpw2, bpw3}; + std::cout << "---\n"; + cern2(vbpw); + + BoolVectorPair bvp = {{false, false}, {false, true, true, false}}; + std::cout << "---\n"; + cern3(bvp); + + BoolVectorPair bvp2 = {{false, true, true, false, true, false}, + {false, true, true, false, false, false, true, false}}; + BoolVectorPair bvp3 = {{false}, {true, true}}; + BoolVectorPair bvp4 = {{true, false, false}, {false, true, false, true}}; + std::vector vbvp = {bvp, bvp2, bvp3, bvp4}; + std::cout << "---\n"; + // cern4(vbvp); + + return 0; +} + +// CHECK: --- +// CHECK: integers: 345 1 2 +// CHECK: --- +// CHECK: integers: 345 1 2 +// CHECK: integers: 92347 3 4 +// CHECK: integers: 2358 5 6 +// CHECK: integers: 45 7 18 +// CHECK: --- +// CHECK: integers: 8 238 44 +// CHECK: 
integers: 0 -4 81 92745 +// CHECK: --- +// CHECK: doubles: 3.45 1 2 +// CHECK: --- +// CHECK: doubles: 3.45 1 2 +// CHECK: doubles: 92.347 2.3 4 +// CHECK: doubles: 235.8 5.5 6.4 +// CHECK: doubles: 4.5 77.7 18.2 +// CHECK: --- +// CHECK: doubles: 8 2.38 4.4 +// CHECK: doubles: 0 -4.99 81.5 92.745 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK: booleans: 0 0 0 +// CHECK: booleans: 0 1 0 1 +// CHECK: booleans: 0 0 1 0 1 +// CHECK: --- +// CHECK: booleans: 0 0 +// CHECK: booleans: 0 1 1 0 diff --git a/test/AST-Quake/calling_convention.cpp b/test/AST-Quake/calling_convention.cpp index 3d2c6e2e4a4..fcf7c26cdac 100644 --- a/test/AST-Quake/calling_convention.cpp +++ b/test/AST-Quake/calling_convention.cpp @@ -278,9 +278,7 @@ struct V3 { // CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, // CHECK-SAME: %[[VAL_3:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) // CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, -// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, %[[VAL_2:.*]]: !cc.ptr, !cc.array}>>) // clang-format on //===----------------------------------------------------------------------===// diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 044bf937824..9bae7ecebf2 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -123,7 +123,7 @@ module attributes {quake.mangled_name_map = { // ALT: func.func private @malloc(i64) -> !cc.ptr // ALT: func.func private @free(!cc.ptr) // ALT: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// ALT: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// ALT: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, 
!cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // ALT: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // ALT-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { @@ -250,7 +250,7 @@ module attributes {quake.mangled_name_map = { // STREAMLINED: func.func private @malloc(i64) -> !cc.ptr // STREAMLINED: func.func private @free(!cc.ptr) // STREAMLINED: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// STREAMLINED: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// STREAMLINED: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // STREAMLINED: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // STREAMLINED-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { @@ -359,7 +359,7 @@ module attributes {quake.mangled_name_map = { // HYBRID: func.func private @malloc(i64) -> !cc.ptr // HYBRID: func.func private @free(!cc.ptr) // HYBRID: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// HYBRID: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// HYBRID: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // HYBRID: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // HYBRID-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index e8be1ab6acd..ebc29811a10 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -131,7 +131,7 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: func.func private @malloc(i64) -> !cc.ptr // CHECK: func.func private @free(!cc.ptr) // 
CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 0c706ca7b13..bba89bb5dd8 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -28,7 +28,7 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec { +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { // CHECK: %[[VAL_1:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 256 : i64 // CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr @@ -37,72 +37,79 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_4:.*]] = arith.constant 8 : i64 -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_7:.*]] = cc.alloca i64 -// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.alloca !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast 
%[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: cc.store %[[VAL_2]], %[[VAL_12]] : !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.func_ptr %[[VAL_6]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// CHECK: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_18]], %[[VAL_19]] : !cc.ptr>> -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64 -// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_4]] : i64 -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (i64) -> !cc.ptr> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_22]], %[[VAL_23]] : !cc.ptr>> -// CHECK: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_22]], %[[VAL_24]] : !cc.ptr>> -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_2]], %[[VAL_26]] : !cc.ptr -// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_28:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// CHECK: %[[VAL_29:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_29]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_31:.*]] = call @hybridLaunchKernel(%[[VAL_30]], %[[VAL_13]], %[[VAL_14]], %[[VAL_8]], %[[VAL_15]], %[[VAL_28]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) 
-> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_32:.*]] = cc.extract_value %[[VAL_31]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_32]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_33]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_34]], ^bb1, ^bb2 +// CHECK: %[[VAL_5:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_7:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.alloca i64 +// CHECK: %[[VAL_10:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_11:.*]] = cc.alloca i8{{\[}}%[[VAL_10]] : i64] +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_13:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_14]] : !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_6]] : i64 +// CHECK: cc.if(%[[VAL_17]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_15]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_18:.*]] = cc.func_ptr %[[VAL_5]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_20:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_22:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> 
!cc.ptr>> +// CHECK: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> +// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_26]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_21]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_29]] : !cc.ptr>> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_31]] : !cc.ptr +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_32]], %[[VAL_30]] : !cc.ptr> +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_34:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_36:.*]] = call @hybridLaunchKernel(%[[VAL_35]], %[[VAL_18]], %[[VAL_19]], %[[VAL_10]], %[[VAL_20]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_37:.*]] = cc.extract_value %[[VAL_36]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_38]], %[[VAL_6]] : i64 +// CHECK: cf.cond_br %[[VAL_39]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_32]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_36]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_40:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: 
%[[VAL_41:.*]] = cc.compute_ptr %[[VAL_40]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_37]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_38:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_38]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr> -// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_38]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_42:.*]] = cc.load %[[VAL_41]] : !cc.ptr -// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_45:.*]] = cc.cast %[[VAL_40]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_45]], %[[VAL_44]] : !cc.ptr> -// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_43]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_47:.*]] = arith.muli %[[VAL_42]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_40]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_48]]{{\[}}%[[VAL_47]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_49]], %[[VAL_46]] : !cc.ptr> -// CHECK: %[[VAL_50:.*]] = cc.compute_ptr %[[VAL_43]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_49]], %[[VAL_50]] : !cc.ptr> -// CHECK: call @free(%[[VAL_32]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_42:.*]] = cc.compute_ptr %[[VAL_12]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_42]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_43:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_43]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.load %[[VAL_44]] : !cc.ptr> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_43]][1] : 
(!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.load %[[VAL_46]] : !cc.ptr +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_50]], %[[VAL_49]] : !cc.ptr> +// CHECK: %[[VAL_51:.*]] = cc.compute_ptr %[[VAL_48]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_52:.*]] = arith.muli %[[VAL_47]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_53:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_53]]{{\[}}%[[VAL_52]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_54]], %[[VAL_51]] : !cc.ptr> +// CHECK: %[[VAL_55:.*]] = cc.compute_ptr %[[VAL_48]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_54]], %[[VAL_55]] : !cc.ptr> +// CHECK: call @free(%[[VAL_37]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } @@ -117,10 +124,9 @@ func.func @__nvqpp__mlirgen__test_1(%arg0: i32) -> !cc.stdvec { func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %1: !cc.ptr, %2: i32) { return } -} // CHECK-LABEL: func.func @__nvqpp__mlirgen__test_1( -// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { // CHECK: %[[VAL_1:.*]] = arith.constant 9 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 520 : i64 // CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr @@ -129,73 +135,83 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { 
// CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_5:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_6:.*]] = cc.alloca i64 -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.ptr -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr -// CHECK: cc.store %[[VAL_2]], %[[VAL_11]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_5]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_15:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// CHECK: %[[VAL_16:.*]] = cc.alloca !cc.array x 1> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>> -// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> i64 -// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr> -// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_15]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_15]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>> -// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_25:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_2]], %[[VAL_25]] : !cc.ptr -// 
CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_14]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_4]] : i64 -// CHECK: cf.cond_br %[[VAL_33]], ^bb1, ^bb2 +// CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_6:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_7]], %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.alloca i64 +// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] +// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_12:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_13]] : !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_5]] : i64 +// CHECK: cc.if(%[[VAL_16]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_14]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_17:.*]] = 
cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_20:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_22]], %[[VAL_23]] : !cc.ptr>> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_27]] : !cc.ptr>> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_20]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_30:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_30]] : !cc.ptr +// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_31]], %[[VAL_29]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_33:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_33]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_35:.*]] = call @hybridLaunchKernel(%[[VAL_34]], %[[VAL_17]], %[[VAL_18]], %[[VAL_9]], %[[VAL_19]], %[[VAL_32]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_36:.*]] = cc.extract_value %[[VAL_35]][0] : (!cc.struct<{!cc.ptr, 
i64}>) -> !cc.ptr +// CHECK: %[[VAL_37:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_38:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_5]] : i64 +// CHECK: cf.cond_br %[[VAL_38]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_35]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_39]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_40]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_9]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_36]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_37:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_39:.*]] = cc.load %[[VAL_38]] : !cc.ptr> -// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_37]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_41:.*]] = cc.load %[[VAL_40]] : !cc.ptr -// CHECK: %[[VAL_42:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_44]], %[[VAL_43]] : !cc.ptr> -// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_42]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_46:.*]] = arith.muli %[[VAL_41]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_47:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_48:.*]] = cc.compute_ptr %[[VAL_47]]{{\[}}%[[VAL_46]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_48]], %[[VAL_45]] : !cc.ptr> -// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_42]][2] : (!cc.ptr, 
!cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_48]], %[[VAL_49]] : !cc.ptr> -// CHECK: call @free(%[[VAL_31]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_42:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_42]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.load %[[VAL_43]] : !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_42]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_46:.*]] = cc.load %[[VAL_45]] : !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_49]], %[[VAL_48]] : !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.compute_ptr %[[VAL_47]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_51:.*]] = arith.muli %[[VAL_46]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_52:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_53:.*]] = cc.compute_ptr %[[VAL_52]]{{\[}}%[[VAL_51]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_53]], %[[VAL_50]] : !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_47]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_53]], %[[VAL_54]] : !cc.ptr> +// CHECK: call @free(%[[VAL_36]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } + +} + // CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> // CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) // CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} @@ -204,7 +220,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr) // CHECK: 
func.func private @free(!cc.ptr) // CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> {