From 895d224ba76cb06514129c2d32e2034d7061e018 Mon Sep 17 00:00:00 2001
From: Eric Schweitz
Date: Wed, 30 Oct 2024 18:04:08 -0700
Subject: [PATCH 01/19] [pauli word] Rework the implementation from front to
 back.

This changes the pauli_word implementation to be compatible with std::string
and to use the core character literal support. This changes the code
generation and provides a potential way to perform optimizations on
quake.exp_pauli ops.

This PR also does a complete rewrite of the GKE (GenKernelExecution) code.
The rewrite fuses the C++ host entry point argument processing with the
.argsCreator support function. It removes several special cases that are no
longer germane, since the set of supported C++ argument types has expanded
considerably. It remains fully backwards compatible with the old pointer-free
argument-packing format.

The .argsCreator function remains tightly coupled to the Python
implementation and launcher. C++ should use the streamlined launcher, which
has greater flexibility.

Fixes tests and updates them to use the hybrid launcher where appropriate.
Adds new tests.

Signed-off-by: Eric Schweitz
---
 include/cudaq/Optimizer/Builder/Factory.h | 3 +
 include/cudaq/Optimizer/Dialect/CC/CCTypes.td | 5 +
 lib/Optimizer/Builder/Factory.cpp | 15 +-
 lib/Optimizer/Dialect/CC/CCTypes.cpp | 2 +-
 .../Transforms/DecompositionPatterns.cpp | 5 +-
 .../Transforms/GenKernelExecution.cpp | 1341 ++++++++---------
 lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 25 +-
 python/utils/OpaqueArguments.h | 8 +-
 runtime/common/ArgumentConversion.cpp | 37 +-
 runtime/cudaq/qis/pauli_word.h | 29 +-
 runtime/test/test_argument_conversion.cpp | 25 +-
 targettests/Remote-Sim/pauli_word.cpp | 1 -
 .../SeparateCompilation/pauli_words.cpp | 65 +
 targettests/execution/exp_pauli.cpp | 17 +-
 test/Quake/kernel_exec-1.qke | 519 +++++--
 test/Quake/kernel_exec-2.qke | 274 ++--
 test/Quake/lambda_kernel_exec.qke | 4 +-
 test/Quake/return_vector.qke | 348 +++--
 test/Translate/argument.qke | 288 ++--
 test/Translate/return_values.qke | 486 +++---
 20 files changed, 2029 insertions(+), 1468 deletions(-)
 create mode 100644 targettests/SeparateCompilation/pauli_words.cpp

diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
index 24e933117a..dccca9be24 100644
--- a/include/cudaq/Optimizer/Builder/Factory.h
+++ b/include/cudaq/Optimizer/Builder/Factory.h
@@ -246,6 +246,9 @@ bool hasSRet(mlir::func::FuncOp funcOp);
 mlir::FunctionType toHostSideFuncType(mlir::FunctionType funcTy,
                                       bool addThisPtr, mlir::ModuleOp module);

+/// Convert device type, \p ty, to host side type.
+mlir::Type convertToHostSideType(mlir::Type ty);
+
 // Return `true` if the given type corresponds to a standard vector type
 // according to our convention.
 // The convention is a `ptr<struct<ptr<T>, ptr<T>, ptr<T>>>`.
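// To make the convention above concrete: a hypothetical C++ mirror of the
// host-side std::vector<T> object that `ptr<struct<ptr<T>, ptr<T>, ptr<T>>>`
// models (a sketch of the common libstdc++-style layout assumed throughout
// this patch; the names are illustrative, not runtime identifiers):

#include <cstdint>

template <typename T>
struct HostVectorLayout {
  T *begin;    // start of the contiguous data
  T *end;      // one past the last element
  T *allocEnd; // one past the end of the allocation
};

// The generated size computation is (end - begin) in bytes, deliberately
// omitting the division by sizeof(T) that std::vector<T>::size() performs;
// the casts to i64 followed by a subtraction mirror the emitted IR.
template <typename T>
std::int64_t byteSize(const HostVectorLayout<T> &v) {
  return reinterpret_cast<std::int64_t>(v.end) -
         reinterpret_cast<std::int64_t>(v.begin);
}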
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td index d8a5820abe..b7cf72d234 100644 --- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td +++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td @@ -119,7 +119,12 @@ def cc_StructType : CCType<"Struct", "struct", ]; let extraClassDeclaration = [{ + // O(1) bool isEmpty() const { return getMembers().empty(); } + + // O(n) + std::size_t getNumMembers() const { return getMembers().size(); } + Type getMember(unsigned position) { return getMembers()[position]; } }]; } diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index 421943c6c9..5a4e5cb43b 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -342,19 +342,14 @@ Type factory::getSRetElementType(FunctionType funcTy) { return funcTy.getResult(0); } -static Type convertToHostSideType(Type ty) { +Type factory::convertToHostSideType(Type ty) { if (auto memrefTy = dyn_cast(ty)) - return convertToHostSideType( - factory::stlVectorType(memrefTy.getElementType())); + return factory::stlVectorType( + convertToHostSideType(memrefTy.getElementType())); if (isa(ty)) return cc::PointerType::get(IntegerType::get(ty.getContext(), 8)); - if (auto memrefTy = dyn_cast(ty)) { - // `pauli_word` is an object with a std::vector in the header files at - // present. This data type *must* be updated if it becomes a std::string - // once again. - return convertToHostSideType( - factory::stlVectorType(IntegerType::get(ty.getContext(), 8))); - } + if (isa(ty)) + return factory::stlStringType(ty.getContext()); auto *ctx = ty.getContext(); if (auto structTy = dyn_cast(ty)) { SmallVector newMembers; diff --git a/lib/Optimizer/Dialect/CC/CCTypes.cpp b/lib/Optimizer/Dialect/CC/CCTypes.cpp index 816695e173..0543a12a51 100644 --- a/lib/Optimizer/Dialect/CC/CCTypes.cpp +++ b/lib/Optimizer/Dialect/CC/CCTypes.cpp @@ -158,7 +158,7 @@ Type cc::SpanLikeType::getElementType() const { } bool isDynamicType(Type ty) { - if (isa(ty)) + if (isa(ty)) return true; if (auto strTy = dyn_cast(ty)) { for (auto memTy : strTy.getMembers()) diff --git a/lib/Optimizer/Transforms/DecompositionPatterns.cpp b/lib/Optimizer/Transforms/DecompositionPatterns.cpp index de32b86e45..bdf8e9244c 100644 --- a/lib/Optimizer/Transforms/DecompositionPatterns.cpp +++ b/lib/Optimizer/Transforms/DecompositionPatterns.cpp @@ -362,6 +362,9 @@ struct ExpPauliDecomposition : public OpRewritePattern { auto strAttr = cast(attr.value()); optPauliWordStr = strAttr.getValue(); } + } else if (auto lit = addrOp.getDefiningOp< + cudaq::cc::CreateStringLiteralOp>()) { + optPauliWordStr = lit.getStringLiteral(); } } } @@ -369,7 +372,7 @@ struct ExpPauliDecomposition : public OpRewritePattern { // Assert that we have a constant known pauli word if (!optPauliWordStr.has_value()) - return failure(); + return expPauliOp.emitOpError("cannot determine pauli word string"); auto pauliWordStr = optPauliWordStr.value(); diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 2e45c8df96..78a968d822 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -15,6 +15,7 @@ #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "clang/Basic/Version.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ToolOutputFile.h" @@ -48,6 +49,12 @@ static bool 
isCodegenArgumentGather(std::size_t kind) { return kind == 0 || kind == 2; } +static bool isStateType(Type ty) { + if (auto ptrTy = dyn_cast(ty)) + return isa(ptrTy.getElementType()); + return false; +} + /// This pass adds a `.thunk` function and a rewritten C++ host /// side (mangled) stub to the code for every entry-point kernel in the module. /// It may also generate a `.argsCreator` function. Finally, it @@ -61,6 +68,7 @@ class GenerateKernelExecution public: using GenerateKernelExecutionBase::GenerateKernelExecutionBase; +private: /// Creates the function signature for a thunk function. The signature is /// always the same for all thunk functions. /// @@ -82,9 +90,42 @@ class GenerateKernelExecution {cudaq::opt::factory::getDynamicBufferType(ctx)}); } - /// Add LLVM code with the OpBuilder that computes the size in bytes - /// of a `std::vector` array in the same way as a `std::vector::size()`. - /// This assumes the vector is laid out in memory as the following structure. + /// Generate code to read the length from a host-side string object. (On the + /// device side, a string is encoded as a span.) The length of a string is the + /// number of bytes of data. + /// + /// In order to handle a std::string value it is assumed to be laid out in + /// memory as the following structure. + /// + /// + /// struct vector { + /// i8* data; + /// i64 length; + /// [i8 x 16] inlinedata; + /// }; + /// + /// + /// This implementation does \e not support wide characters. + Value genStringLength(Location loc, OpBuilder &builder, Value stringArg) { + Type stringTy = stringArg.getType(); + assert(isa(stringTy) && + isa( + cast(stringTy).getElementType()) && + cast( + cast(stringTy).getElementType()) + .getMember(1) == builder.getI64Type() && + "host side string expected"); + auto ptrTy = cast(stringTy); + auto strTy = cast(ptrTy.getElementType()); + auto lenPtr = builder.create( + loc, cudaq::cc::PointerType::get(strTy.getMember(1)), stringArg, + ArrayRef{1}); + return builder.create(loc, lenPtr); + } + + /// Generate code that computes the size in bytes of a `std::vector` array + /// in the same way as a `std::vector::size()`. This assumes the vector is + /// laid out in memory as the following structure. /// /// /// struct vector { @@ -100,51 +141,29 @@ class GenerateKernelExecution /// for `std::vector::size()` without the final `sdiv` op that divides the /// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required /// memory size for the vector data itself in \e bytes. - /// - /// In order to handle a std::string value it is assumed to be laid out in - /// memory as the following structure. - /// - /// - /// struct vector { - /// i8* data; - /// i64 length; - /// [i8 x 16] inlinedata; - /// }; - /// - /// - /// In the string case, the size can just be read from the data structure. - Value getVectorSize(Location loc, OpBuilder &builder, - cudaq::cc::PointerType ptrTy, Value arg) { - // Create the i64 type - Type i64Ty = builder.getI64Type(); - - // We're given ptr>, get that struct type (struct) - auto inpStructTy = cast(ptrTy.getElementType()); - - if (inpStructTy.getMember(1) == i64Ty) { - // This is a string, so just read the length out. 
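      // (Per the layout documented above, a sketch of the host-side
      // std::string assumed here, with illustrative field names:
      //
      //   struct HostStringLayout {
      //     char *data;           // points at inlinedata for short strings
      //     std::uint64_t length; // byte length, read directly below
      //     char inlinedata[16];  // small-string (SSO) storage
      //   };
      //
      // so member #1 is already the byte length and no end-pointer
      // arithmetic is needed.)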
- auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); - auto lenPtr = builder.create( - loc, ptrI64Ty, arg, SmallVector{1}); - return builder.create(loc, lenPtr); - } - - // For the following GEP calls, we'll expect them to return T** - auto ptrTtype = cudaq::cc::PointerType::get(inpStructTy.getMember(0)); + Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { + auto vecTy = cast(vecArg.getType()); + auto vecStructTy = cast(vecTy.getElementType()); + assert(vecStructTy.getNumMembers() == 3 && + vecStructTy.getMember(0) == vecStructTy.getMember(1) && + vecStructTy.getMember(0) == vecStructTy.getMember(2) && + "host side vector expected"); + auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); // Get the pointer to the pointer of the end of the array Value endPtr = builder.create( - loc, ptrTtype, arg, SmallVector{1}); + loc, vecElePtrTy, vecArg, ArrayRef{1}); // Get the pointer to the pointer of the beginning of the array Value beginPtr = builder.create( - loc, ptrTtype, arg, SmallVector{0}); + loc, vecElePtrTy, vecArg, ArrayRef{0}); // Load to a T* endPtr = builder.create(loc, endPtr); beginPtr = builder.create(loc, beginPtr); // Map those pointers to integers + Type i64Ty = builder.getI64Type(); Value endInt = builder.create(loc, i64Ty, endPtr); Value beginInt = builder.create(loc, i64Ty, beginPtr); @@ -159,143 +178,6 @@ class GenerateKernelExecution return builder.create(loc, length, eight); } - /// This computes a vector's size and handles recursive vector types. This - /// first value returned is the size of the top level (outermost) vector in - /// bytes. The second value is the recursive size of all the vectors within - /// the outer vector. - std::pair - computeRecursiveVectorSize(Location loc, OpBuilder &builder, Value hostArg, - cudaq::cc::PointerType hostVecTy, - cudaq::cc::SpanLikeType stdvecTy) { - Value topLevelSize; - Value recursiveSize; - auto eleTy = stdvecTy.getElementType(); - if (auto sTy = dyn_cast(eleTy)) { - // This is the recursive case. vector>. Convert size of - // vectors to i64s. - topLevelSize = computeHostVectorLengthInBytes( - loc, builder, hostArg, stdvecTy.getElementType(), hostVecTy); - auto nested = fetchHostVectorFront(loc, builder, hostArg, hostVecTy); - auto tmp = builder.create(loc, builder.getI64Type()); - builder.create(loc, topLevelSize, tmp); - // Convert bytes to units of i64. (Divide by 8) - auto topLevelCount = - convertLengthBytesToLengthI64(loc, builder, topLevelSize); - // Now walk the vectors recursively. - auto topLevelIndex = builder.create( - loc, builder.getI64Type(), topLevelCount, - cudaq::cc::CastOpMode::Unsigned); - cudaq::opt::factory::createInvariantLoop( - builder, loc, topLevelIndex, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - auto sub = builder.create(loc, hostVecTy, - nested, i); - auto p = - computeRecursiveVectorSize(loc, builder, sub, hostVecTy, sTy); - auto subSz = builder.create(loc, tmp); - auto sum = builder.create(loc, p.second, subSz); - builder.create(loc, sum, tmp); - }); - recursiveSize = builder.create(loc, tmp); - } else { - // Non-recusive case. Just compute the size of the top-level vector. - topLevelSize = getVectorSize(loc, builder, hostVecTy, hostArg); - recursiveSize = topLevelSize; - } - return {topLevelSize, recursiveSize}; - } - - /// This computes a dynamic struct's size and handles recursive dynamic types. 
- /// This first value returned is the initial value of the top level - /// (outermost) struct to be saved in the buffer. More specifically, any - /// (recursive) member that is a vector is replaced by a i64 byte size. The - /// offset of the trailing data is, as always, implicit. The second value is - /// the recursive size of all the dynamic components within the outer struct. - std::pair computeRecursiveDynamicStructSize( - Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, - Value arg, Value totalSize, cudaq::cc::StructType genTy) { - Value retval = builder.create(loc, genTy); - auto argTy = cast(arg.getType()); - for (auto iter : llvm::enumerate(structTy.getMembers())) { - auto memTy = iter.value(); - std::int32_t off = iter.index(); - auto structMemTy = - cast(argTy.getElementType()).getMember(off); - auto structMemPtrTy = cudaq::cc::PointerType::get(structMemTy); - auto memPtrVal = builder.create( - loc, structMemPtrTy, arg, ArrayRef{off}); - if (cudaq::cc::isDynamicType(memTy)) { - if (auto sTy = dyn_cast(memTy)) { - auto gTy = cast(structMemTy); - auto pr = computeRecursiveDynamicStructSize( - loc, builder, sTy, memPtrVal, totalSize, gTy); - retval = builder.create( - loc, retval.getType(), retval, pr.first, off); - totalSize = builder.create(loc, totalSize, pr.second); - continue; - } - auto memStdVecTy = cast(memTy); - Type eTy = memStdVecTy.getElementType(); - auto stlVecTy = cudaq::opt::factory::stlVectorType(eTy); - auto ptrMemTy = cudaq::cc::PointerType::get(stlVecTy); - auto pr = computeRecursiveVectorSize(loc, builder, memPtrVal, ptrMemTy, - memStdVecTy); - retval = builder.create( - loc, retval.getType(), retval, pr.second, off); - totalSize = builder.create(loc, totalSize, pr.first); - continue; - } - auto memVal = builder.create(loc, memPtrVal); - retval = builder.create(loc, retval.getType(), - retval, memVal, off); - } - return {retval, totalSize}; - } - - /// Copy a vector's data, which must be \p bytes in length, from \p hostArg to - /// \p outputBuffer. The hostArg must have a pointer type that is compatible - /// with the triple pointer std::vector base implementation. - Value copyVectorData(Location loc, OpBuilder &builder, Value bytes, - Value hostArg, Value outputBuffer) { - auto notVolatile = builder.create(loc, 0, 1); - auto inStructTy = cast( - cast(hostArg.getType()).getElementType()); - auto beginPtr = builder.create( - loc, cudaq::cc::PointerType::get(inStructTy.getMember(0)), hostArg, - SmallVector{0}); - auto fromBuff = builder.create(loc, beginPtr); - auto i8Ty = builder.getI8Type(); - auto vecFromBuff = cudaq::opt::factory::createCast( - builder, loc, cudaq::cc::PointerType::get(i8Ty), fromBuff); - builder.create( - loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{outputBuffer, vecFromBuff, bytes, notVolatile}); - auto i8ArrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)); - auto buf1 = - cudaq::opt::factory::createCast(builder, loc, i8ArrTy, outputBuffer); - // Increment outputBuffer by size bytes. - return builder.create( - loc, outputBuffer.getType(), buf1, SmallVector{bytes}); - } - - /// Given that \p arg is a SpanLikeType value, compute its extent size (the - /// number of elements in the outermost vector times `sizeof(int64_t)`) and - /// total recursive size (both values are in bytes). We add the extent size - /// into the message buffer field and increase the size of the addend by the - /// total recursive size. 
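  // The buffer encoding described above, sketched in plain C++ (illustrative
  // only; the real packing is generated IR and this helper name is
  // hypothetical): the argument's slot in the static prefix receives the i64
  // extent size, and the vector's bytes are appended to the trailing
  // addendum.

  #include <cstdint>
  #include <cstring>
  #include <vector>

  char *packVectorArg(const std::vector<double> &v, std::int64_t *sizeSlot,
                      char *addendum) {
    auto bytes = static_cast<std::int64_t>(v.size() * sizeof(double));
    *sizeSlot = bytes;                      // extent size into the prefix slot
    std::memcpy(addendum, v.data(), bytes); // data into the addendum
    return addendum + bytes;                // new end of the addendum
  }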
- std::pair insertVectorSizeAndIncrementExtraBytes( - Location loc, OpBuilder &builder, Value arg, - cudaq::cc::PointerType ptrInTy, cudaq::cc::SpanLikeType stdvecTy, - Value stVal, std::int32_t idx, Value extraBytes) { - auto [extentSize, recursiveSize] = - computeRecursiveVectorSize(loc, builder, arg, ptrInTy, stdvecTy); - stVal = builder.create(loc, stVal.getType(), - stVal, extentSize, idx); - extraBytes = builder.create(loc, extraBytes, recursiveSize); - return {stVal, extraBytes}; - } - Value genComputeReturnOffset(Location loc, OpBuilder &builder, FunctionType funcTy, cudaq::cc::StructType msgStructTy) { @@ -326,6 +208,455 @@ class GenerateKernelExecution builder.create(loc, result); } + static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) { + return cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(builder.getI8Type())); + } + + static cudaq::cc::PointerType getPointerToPointerType(OpBuilder &builder) { + return cudaq::cc::PointerType::get( + cudaq::cc::PointerType::get(builder.getI8Type())); + } + + static bool isDynamicSignature(FunctionType devFuncTy) { + for (auto t : devFuncTy.getInputs()) + if (cudaq::cc::isDynamicType(t)) + return true; + for (auto t : devFuncTy.getResults()) + if (cudaq::cc::isDynamicType(t)) + return true; + return false; + } + + static std::pair + genByteSizeAndElementCount(Location loc, OpBuilder &builder, Type eleTy, + Value size, Value arg, Type t) { + // If this is a vector>, convert the bytes of vector + // to bytes of length (i64). + if (isa(eleTy)) { + auto three = builder.create(loc, 3, 64); + size = builder.create(loc, size, three); + auto ate = builder.create(loc, 8, 64); + Value count = builder.create(loc, size, ate); + return {size, count}; + } + // If this is a vector, convert the bytes of string to + // bytes of length (i64). + if (isa(eleTy)) { + auto fore = builder.create(loc, 4, 64); + size = builder.create(loc, size, fore); + auto ate = builder.create(loc, 8, 64); + Value count = builder.create(loc, size, ate); + return {size, count}; + } + // If this is a vector>, convert the bytes of struct + // to bytes of struct with converted members. + if (isa(eleTy)) { + auto eleTy = cast(arg.getType()).getElementType(); + auto i64Ty = builder.getI64Type(); + auto hostStrSize = builder.create(loc, i64Ty, eleTy); + Value count = builder.create(loc, size, hostStrSize); + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + auto packSize = builder.create(loc, i64Ty, packedTy); + size = builder.create(loc, count, packSize); + return {size, count}; + } + return {}; + } + + Value descendThroughDynamicType(Location loc, OpBuilder &builder, Type ty, + Value addend, Value arg, Value tmp) { + auto i64Ty = builder.getI64Type(); + Value tySize = + TypeSwitch(ty) + // A char span is dynamic, but it is not recursively dynamic. Just + // read the length of the string out. + .Case([&](cudaq::cc::CharspanType t) -> Value { + return genStringLength(loc, builder, arg); + }) + // A std::vector is dynamic and may be recursive dynamic as well. + .Case([&](cudaq::cc::StdvecType t) -> Value { + // Compute the byte span of the vector. + Value size = genVectorSize(loc, builder, arg); + auto eleTy = t.getElementType(); + if (!cudaq::cc::isDynamicType(eleTy)) + return size; + + // Otherwise, we have a recursively dynamic case. 
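              // (The conversions in genByteSizeAndElementCount appear to
              // follow from the host layouts sketched earlier: a std::vector
              // is three pointers = 24 bytes and a std::string is 32 bytes,
              // while the buffer encodes one i64 length (8 bytes) per
              // element; hence the divisions by 3 and 4, respectively, to
              // get buffer bytes, then by 8 to recover the element count.)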
+ auto [bytes, count] = + genByteSizeAndElementCount(loc, builder, eleTy, size, arg, t); + assert(count && "vector must have elements"); + size = bytes; + + // At this point, arg is a known vector of elements of dynamic + // type, so walk over the vector and recurse on each element. + // `size` is already the proper size of the lengths of each of the + // elements in turn. + builder.create(loc, size, tmp); + auto ptrTy = cast(arg.getType()); + auto strTy = cast(ptrTy.getElementType()); + auto memTy = cast(strTy.getMember(0)); + auto arrTy = + cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(memTy.getElementType()))); + auto castPtr = builder.create(loc, arrTy, arg); + auto castArg = builder.create(loc, castPtr); + auto castPtrTy = + cudaq::cc::PointerType::get(memTy.getElementType()); + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, + Block &block) { + Value i = block.getArgument(0); + auto ai = builder.create( + loc, castPtrTy, castArg, + ArrayRef{i}); + auto tmpVal = builder.create(loc, tmp); + Value innerSize = descendThroughDynamicType( + loc, builder, eleTy, tmpVal, ai, tmp); + builder.create(loc, innerSize, tmp); + }); + return builder.create(loc, tmp); + }) + // A struct can be dynamic if it contains dynamic members. Get the + // static portion of the struct first, which will have length slots. + // Then get the dynamic sizes for the dynamic members. + .Case([&](cudaq::cc::StructType t) -> Value { + if (cudaq::cc::isDynamicType(t)) { + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + Value strSize = + builder.create(loc, i64Ty, packedTy); + for (auto [i, m] : llvm::enumerate(t.getMembers())) { + if (cudaq::cc::isDynamicType(m)) { + auto hostPtrTy = + cast(arg.getType()); + auto hostStrTy = + cast(hostPtrTy.getElementType()); + auto pm = + cudaq::cc::PointerType::get(hostStrTy.getMember(i)); + auto ai = builder.create( + loc, pm, arg, ArrayRef{i}); + strSize = descendThroughDynamicType(loc, builder, m, + strSize, ai, tmp); + } + } + return strSize; + } + return builder.create(loc, i64Ty, t); + }) + .Default([&](Type t) -> Value { + return builder.create(loc, i64Ty, t); + }); + return builder.create(loc, tySize, addend); + } + + // Take the list of host-side arguments and device side argument types and zip + // them together logically with the position. Generates any fixup code that's + // needed, like when the device side uses a pair of arguments for a single + // logical device side argument. May drop some arguments on the floor if they + // cannot be encoded. + template + SmallVector> zipArgumentsWithDeviceTypes( + Location loc, OpBuilder &builder, ValueRange args, TypeRange types, + SmallVectorImpl *freeVectorBuffers = nullptr) { + SmallVector> result; + if constexpr (argsAreReferences) { + // Simple case: the number of args must be equal to the types. + assert(args.size() == types.size() && + "arguments and types must have same size"); + for (auto iter : llvm::enumerate(llvm::zip(args, types))) { + // Remove the reference. + Value v = std::get(iter.value()); + Type ty = std::get(iter.value()); + if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) || + isa(ty))) + v = builder.create(loc, v); + // NB: Will a vector be passed as a C++ object or "unrolled" by + // the caller into a contiguous string of bytes, where each byte is a + // bool? Assume the latter for now, since it's likely the way python + // will do / continue to do it. 
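        // A behavioral sketch of the unpacking convention assumed here (one
        // byte per bool, per the cudaq::stdvecBoolUnpackToInitList helper
        // used below; this function is illustrative, not the runtime one):
        //
        //   std::vector<char> unpackBools(const std::vector<bool> &in) {
        //     std::vector<char> out;
        //     out.reserve(in.size());
        //     for (bool b : in)
        //       out.push_back(b ? 1 : 0); // widen each packed bit to a byte
        //     return out;
        //   }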
+ result.emplace_back(iter.index(), v, ty); + } + } else /*constexpr*/ { + // In this case, we *may* have logical arguments that are passed in pairs. + auto *ctx = builder.getContext(); + auto *parent = builder.getBlock()->getParentOp(); + auto module = parent->getParentOfType(); + auto lastArg = args.end(); + auto tyIter = types.begin(); + unsigned argPos = 0; + for (auto argIter = args.begin(); argIter != lastArg; + ++argIter, ++tyIter, ++argPos) { + assert(tyIter != types.end()); + Type devTy = *tyIter; + + // std::vector isn't really a std::vector<>. Use the helper + // function to unpack it so it looks like any other vector. + if (auto stdvecTy = dyn_cast(devTy)) + if (stdvecTy.getElementType() == IntegerType::get(ctx, 1)) { + Type stdvecHostTy = + cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); + Value tmp = builder.create(loc, stdvecHostTy); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, *argIter}); + result.emplace_back(argPos, tmp, devTy); + assert(freeVectorBuffers && + "must have a vector to return heap allocations"); + freeVectorBuffers->push_back(tmp); + continue; + } + + // Check for a struct passed in a pair of arguments. + if (isa(devTy) && + !isa((*argIter).getType()) && + cudaq::opt::factory::isX86_64(module) && + cudaq::opt::factory::structUsesTwoArguments(devTy)) { + auto first = *argIter++; + auto second = *argIter; + // TODO: Investigate if it's correct to assume the register layout + // will match the memory layout of the small struct. + auto pairTy = cudaq::cc::StructType::get( + ctx, ArrayRef{first.getType(), second.getType()}); + auto tmp = builder.create(loc, pairTy); + auto tmp1 = builder.create( + loc, cudaq::cc::PointerType::get(first.getType()), tmp); + builder.create(loc, first, tmp1); + auto tmp2 = builder.create( + loc, cudaq::cc::PointerType::get(second.getType()), tmp, + ArrayRef{1}); + builder.create(loc, second, tmp2); + auto devPtrTy = cudaq::cc::PointerType::get(devTy); + Value devVal = builder.create(loc, devPtrTy, tmp); + if (!cudaq::cc::isDynamicType(devTy)) + devVal = builder.create(loc, devVal); + result.emplace_back(argPos, devVal, devTy); + continue; + } + + // Is this a static struct passed as a byval pointer? 
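        // (If so, the caller passed a pointer to caller-owned memory that
        // already holds the struct, so a single load yields the value;
        // contrast with the x86-64 two-register case handled just above,
        // where the struct first had to be reassembled in a temporary.)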
+ if (isa(devTy) && + isa((*argIter).getType()) && + !cudaq::cc::isDynamicType(devTy)) { + Value devVal = builder.create(loc, *argIter); + result.emplace_back(argPos, devVal, devTy); + continue; + } + result.emplace_back(argPos, *argIter, devTy); + } + } + return result; + } + + Value genSizeOfDynamicMessageBuffer( + Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, + ArrayRef> zippy, Value tmp) { + auto i64Ty = builder.getI64Type(); + Value initSize = builder.create(loc, i64Ty, structTy); + for (auto [_, a, t] : zippy) + if (cudaq::cc::isDynamicType(t)) + initSize = descendThroughDynamicType(loc, builder, t, initSize, a, tmp); + return initSize; + } + + Value populateStringAddendum(Location loc, OpBuilder &builder, Value host, + Value sizeSlot, Value addendum) { + Value size = genStringLength(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); + } + + // Simple case when the vector data is known to not hold dynamic data. + Value populateVectorAddendum(Location loc, OpBuilder &builder, Value host, + Value sizeSlot, Value addendum) { + Value size = genVectorSize(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); + } + + Value populateDynamicAddendum(Location loc, OpBuilder &builder, Type devArgTy, + Value host, Value sizeSlot, Value addendum, + Value addendumScratch) { + if (isa(devArgTy)) + return populateStringAddendum(loc, builder, host, sizeSlot, addendum); + if (auto vecTy = dyn_cast(devArgTy)) { + auto eleTy = vecTy.getElementType(); + if (cudaq::cc::isDynamicType(eleTy)) { + // Recursive case. Visit each dynamic element, copying it. + Value size = genVectorSize(loc, builder, host); + auto [bytes, count] = genByteSizeAndElementCount(loc, builder, eleTy, + size, host, devArgTy); + size = bytes; + builder.create(loc, size, sizeSlot); + // Convert from bytes to vector length in elements. + // Compute new addendum start. 
+ auto addrTy = getByteAddressableType(builder); + auto castEnd = builder.create(loc, addrTy, addendum); + Value newAddendum = builder.create( + loc, addendum.getType(), castEnd, + ArrayRef{size}); + builder.create(loc, newAddendum, addendumScratch); + auto sizeBlockTy = cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(builder.getI64Type())); + auto ptrI64Ty = cudaq::cc::PointerType::get(builder.getI64Type()); + // In the recursive case, the next block of addendum is a vector of + // sizes in bytes. Each size will be the size of the vector at that + // offset. + auto sizeBlock = + builder.create(loc, sizeBlockTy, addendum); + auto ptrPtrBlockTy = cudaq::cc::PointerType::get( + cast( + cast(host.getType()).getElementType()) + .getMember(0)); + // The host argument is a std::vector, so we want to get the address of + // "front" out of the vector (the first pointer in the triple) and step + // over the contiguous range of vectors in the host block. The vector of + // vectors forms a ragged array structure in host memory. + auto hostBeginPtrRef = builder.create( + loc, ptrPtrBlockTy, host, ArrayRef{0}); + auto hostBegin = + builder.create(loc, hostBeginPtrRef); + auto hostEleTy = cast(hostBegin.getType()); + auto hostBlockTy = cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(hostEleTy.getElementType())); + auto hostBlock = + builder.create(loc, hostBlockTy, hostBegin); + // Loop over each vector element in the vector (recursively). + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + Value addm = + builder.create(loc, addendumScratch); + auto subSlot = builder.create( + loc, ptrI64Ty, sizeBlock, + ArrayRef{i}); + auto subHost = builder.create( + loc, hostEleTy, hostBlock, + ArrayRef{i}); + Value newAddm = populateDynamicAddendum( + loc, builder, eleTy, subHost, subSlot, addm, addendumScratch); + builder.create(loc, newAddm, addendumScratch); + }); + return builder.create(loc, addendumScratch); + } + return populateVectorAddendum(loc, builder, host, sizeSlot, addendum); + } + auto devStrTy = cast(devArgTy); + auto hostStrTy = cast( + cast(sizeSlot.getType()).getElementType()); + assert(devStrTy.getNumMembers() == hostStrTy.getNumMembers()); + for (auto iter : llvm::enumerate(devStrTy.getMembers())) { + std::int32_t iterIdx = iter.index(); + auto hostPtrTy = cast(host.getType()); + auto hostMemTy = cast(hostPtrTy.getElementType()) + .getMember(iterIdx); + auto val = builder.create( + loc, cudaq::cc::PointerType::get(hostMemTy), host, + ArrayRef{iterIdx}); + Type iterTy = iter.value(); + if (cudaq::cc::isDynamicType(iterTy)) { + Value fieldInSlot = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot, + ArrayRef{iterIdx}); + addendum = populateDynamicAddendum( + loc, builder, iterTy, val, fieldInSlot, addendum, addendumScratch); + } else { + Value fieldInSlot = builder.create( + loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, + ArrayRef{iterIdx}); + auto v = builder.create(loc, val); + builder.create(loc, v, fieldInSlot); + } + } + return addendum; + } + + void populateMessageBuffer(Location loc, OpBuilder &builder, + Value msgBufferBase, + ArrayRef> zippy, + Value addendum = {}, Value addendumScratch = {}) { + auto structTy = cast( + cast(msgBufferBase.getType()).getElementType()); + // Loop over all the arguments and populate the message buffer. 
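    // (For example, hypothetically, for a kernel taking
    // (std::vector<double>, i32) the prefix struct is { i64, i32 }: the i64
    // slot receives the vector's byte count and its data lands in the
    // addendum, while the i32 is stored directly in its slot.)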
+ for (auto [idx, arg, devArgTy] : zippy) { + if (cudaq::cc::isDynamicType(devArgTy)) { + assert(addendum && "must have addendum to encode dynamic argument(s)"); + // Get the address of the slot to be filled. + auto memberTy = cast(structTy).getMember(idx); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + auto slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{idx}); + addendum = populateDynamicAddendum(loc, builder, devArgTy, arg, slot, + addendum, addendumScratch); + continue; + } + + // If the argument is a callable, skip it. + if (isa(devArgTy)) + continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(devArgTy); + strTy && strTy.isEmpty()) + continue; + + // Get the address of the slot to be filled. + auto memberTy = cast(structTy).getMember(idx); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + Value slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{idx}); + + // Argument is a packaged kernel. In this case, the argument is some + // unknown kernel that may be called. The packaged argument is coming + // from opaque C++ host code, so we need to identify what kernel it + // references and then pass its name as a span of characters to the + // launch kernel. + if (isa(devArgTy)) { + auto i64Ty = builder.getI64Type(); + auto kernKey = builder.create( + loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); + builder.create(loc, kernKey.getResult(0), slot); + continue; + } + + // Just pass the raw pointer. The buffer is supposed to be pointer-free + // since it may be unpacked in a different address space. However, if this + // is a simulation and things are in the same address space, we pass the + // pointer for convenience. + if (isa(devArgTy)) + arg = builder.create(loc, memberTy, arg); + + if (isa(arg.getType()) && + (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) { + slot = builder.create( + loc, cudaq::cc::PointerType::get(arg.getType()), slot); + } + builder.create(loc, arg, slot); + } + } + /// Creates a function that can take a block of pointers to argument values /// and using the compiler's knowledge of a kernel encodes those argument /// values into a message buffer. The message buffer is a pointer-free block @@ -348,14 +679,18 @@ class GenerateKernelExecution auto *ctx = builder.getContext(); Type i8Ty = builder.getI8Type(); Type ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto ptrPtrType = cudaq::cc::PointerType::get(ptrI8Ty); + auto ptrPtrType = getPointerToPointerType(builder); Type i64Ty = builder.getI64Type(); auto structPtrTy = cudaq::cc::PointerType::get(msgStructTy); - auto getHostArgType = [&](unsigned idx) { - bool hasSRet = cudaq::opt::factory::hasHiddenSRet(hostFuncTy); - unsigned count = cudaq::cc::numberOfHiddenArgs(hasThisPtr, hasSRet); - return hostFuncTy.getInput(count + idx); - }; + auto passedDevArgTys = devKernelTy.getInputs().drop_front(startingArgIdx); + + SmallVector passedHostArgTys; + for (auto ty : passedDevArgTys) { + Type hostTy = cudaq::opt::factory::convertToHostSideType(ty); + if (cudaq::cc::isDynamicType(ty)) + hostTy = cudaq::cc::PointerType::get(hostTy); + passedHostArgTys.push_back(hostTy); + } // Create the function that we'll fill. 
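    // (In C++ terms the generated .argsCreator has, roughly, the shape
    //    extern "C" std::int64_t argsCreator(void **args, void **msgBuffer);
    // it reads each argument value through args[i], heap-allocates the
    // pointer-free message buffer, hands it back through *msgBuffer, and
    // returns the buffer's total size in bytes.)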
auto funcType = FunctionType::get(ctx, {ptrPtrType, ptrPtrType}, {i64Ty}); @@ -365,195 +700,79 @@ class GenerateKernelExecution auto *entry = argsCreatorFunc.addEntryBlock(); builder.setInsertionPointToStart(entry); - // Get the original function args - auto kernelArgTypes = devKernelTy.getInputs().drop_front(startingArgIdx); + // Convert all the arguments passed in the array of void* to appear as if + // they had been naturally passed as C++ arguments. + // This means, casting to the correct type (host-side) and removing the + // outer pointer by a dereference. Each argument must be a valid reference + // at this point, so if the dereference fails (say it is a nullptr), it is a + // bug in the code that is calling this argsCreator. - // Init the struct - Value stVal = builder.create(loc, msgStructTy); - - // Get the variadic void* args - auto variadicArgs = builder.create( + // Get the array of void* args. + auto argsArray = builder.create( loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(ptrI8Ty)), entry->getArgument(0)); - // Initialize the counter for extra size. - Value zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - - // Process all the arguments for the original call by looping over the - // kernel's arguments. - bool hasTrailingData = false; - DenseMap replacementArgs; - for (auto kaIter : llvm::enumerate(kernelArgTypes)) { - std::int32_t idx = kaIter.index(); - - // The current cudaq kernel arg and message buffer element type. - Type currArgTy = kaIter.value(); - Type currEleTy = msgStructTy.getMember(idx); - - // Skip any elements that are callables or empty structures. - if (isa(currEleTy)) - continue; - if (auto strTy = dyn_cast(currEleTy)) - if (strTy.isEmpty()) - continue; - - // Get the pointer to the argument from out of the block of pointers, - // which are the variadic args. - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - SmallVector{idx}); - Value argPtr = builder.create(loc, ptrI8Ty, argPtrPtr); - - if (auto stdvecTy = dyn_cast(currArgTy)) { - // If this is a vector argument, then we will add data to the message - // buffer's addendum (unless the vector is length 0). - auto ptrInTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType())); - - Value arg = builder.create(loc, ptrInTy, argPtr); - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value temp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{temp, arg}); - replacementArgs[idx] = temp; - arg = temp; - } - - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } + // Loop over the array and cast the void* to the host-side type. + SmallVector pseudoArgs; + for (auto iter : llvm::enumerate(passedHostArgTys)) { + std::int32_t i = iter.index(); + auto parg = builder.create( + loc, ptrPtrType, argsArray, ArrayRef{i}); + Type ty = iter.value(); + // parg is a pointer to a pointer as it is an element of an array of + // pointers. Always dereference the first layer here. 
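      // (I.e., parg is &args[i]; the load below produces args[i], the void*
      // that points at the i-th argument object, which is then cast to a
      // pointer to the host-side type.)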
+ Value deref = builder.create(loc, parg); + if (!isa(ty)) + ty = cudaq::cc::PointerType::get(ty); + pseudoArgs.push_back(builder.create(loc, ty, deref)); + } - if (auto strTy = dyn_cast(currArgTy)) { - Value v = argPtr; - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - v = builder.create( - loc, cudaq::cc::PointerType::get(currEleTy), v); - v = builder.create(loc, v); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); - continue; - } - auto genTy = cast(currEleTy); - Value zero = builder.create(loc, 0, 64); - Type hostArgTy = getHostArgType(idx); - v = builder.create(loc, hostArgTy, v); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, v, zero, genTy); - stVal = builder.create(loc, stVal.getType(), - stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; - } - if (auto ptrTy = dyn_cast(currEleTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just pass - // the pointer. We can do that in this case because the synthesis step - // (which will receive the argument data) is assumed to run in the - // same memory space. - argPtr = builder.create(loc, currEleTy, argPtr); - stVal = builder.create(loc, stVal.getType(), - stVal, argPtr, idx); - } - continue; - } + // Zip the arguments with the device side argument types. Recall that some + // of the (left-most) arguments may have been dropped on the floor. + const bool hasDynamicSignature = isDynamicSignature(devKernelTy); + auto zippy = zipArgumentsWithDeviceTypes( + loc, builder, pseudoArgs, passedDevArgTys); + auto sizeScratch = builder.create(loc, i64Ty); + auto messageBufferSize = [&]() -> Value { + if (hasDynamicSignature) + return genSizeOfDynamicMessageBuffer(loc, builder, msgStructTy, zippy, + sizeScratch); + return builder.create(loc, i64Ty, msgStructTy); + }(); - // cast to the struct element type, void* -> TYPE * - argPtr = builder.create( - loc, cudaq::cc::PointerType::get(currEleTy), argPtr); - Value loadedVal = - builder.create(loc, currEleTy, argPtr); - stVal = builder.create(loc, stVal.getType(), - stVal, loadedVal, idx); + // Allocate the message buffer on the heap. It must outlive this call. + auto buff = builder.create(loc, ptrI8Ty, "malloc", + ValueRange(messageBufferSize)); + Value rawMessageBuffer = buff.getResult(0); + Value msgBufferPrefix = + builder.create(loc, structPtrTy, rawMessageBuffer); + + // Populate the message buffer with the pointer-free argument values. + if (hasDynamicSignature) { + auto addendumScratch = builder.create(loc, ptrI8Ty); + Value prefixSize = + builder.create(loc, i64Ty, msgStructTy); + auto arrMessageBuffer = builder.create( + loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)), + rawMessageBuffer); + // Compute the position of the addendum. + Value addendumPtr = builder.create( + loc, ptrI8Ty, arrMessageBuffer, + ArrayRef{prefixSize}); + populateMessageBuffer(loc, builder, msgBufferPrefix, zippy, addendumPtr, + addendumScratch); + } else { + populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); } - // Compute the struct size - Value structSize = - builder.create(loc, i64Ty, msgStructTy); - - // Here we do have vector args - Value extendedStructSize = - hasTrailingData - ? 
builder.create(loc, structSize, extraBytes) - : structSize; - // If no vector args, handle this simple case and drop out - Value buff = builder - .create(loc, ptrI8Ty, "malloc", - ValueRange(extendedStructSize)) - .getResult(0); - - Value casted = builder.create(loc, structPtrTy, buff); - builder.create(loc, stVal, casted); - if (hasTrailingData) { - auto arrTy = cudaq::cc::ArrayType::get(i8Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrTy); - auto cast1 = builder.create(loc, ptrArrTy, buff); - Value vecToBuffer = builder.create( - loc, ptrI8Ty, cast1, SmallVector{structSize}); - for (auto iter : llvm::enumerate(msgStructTy.getMembers())) { - std::int32_t idx = iter.index(); - if (idx == static_cast(kernelArgTypes.size())) - break; - // Get the corresponding cudaq kernel arg type - auto currArgTy = kernelArgTypes[idx]; - if (auto stdvecTy = dyn_cast(currArgTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - ArrayRef{idx}); - auto ptrInTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType())); - Value arg = - builder.create(loc, ptrI8Ty, argPtrPtr); - arg = builder.create(loc, ptrInTy, arg); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - assert(replacementArgs.count(idx) && "must be in map"); - auto arg = replacementArgs[idx]; - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = builder.create(loc, heapPtr); - auto i8Ty = builder.getI8Type(); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - } else if (auto strTy = dyn_cast(currArgTy)) { - if (cudaq::cc::isDynamicType(strTy)) { - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - ArrayRef{idx}); - Value arg = - builder.create(loc, ptrI8Ty, argPtrPtr); - Type hostArgTy = getHostArgType(idx); - arg = builder.create(loc, hostArgTy, arg); - auto structPtrArrTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(msgStructTy)); - auto temp = - builder.create(loc, structPtrArrTy, buff); - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); - } - } - } - } - builder.create(loc, buff, entry->getArgument(1)); - builder.create(loc, ValueRange{extendedStructSize}); + // Return the message buffer and its size in bytes. + builder.create(loc, rawMessageBuffer, + entry->getArgument(1)); + builder.create(loc, ValueRange{messageBufferSize}); + + // Note: the .argsCreator will have allocated space for a static result in + // the message buffer. If the kernel returns a dynamic result, the launch + // kernel code will have to properly return it in the appropriate context. return argsCreatorFunc; } @@ -590,9 +809,8 @@ class GenerateKernelExecution // passed. A span structure is a pointer and a size (in element // units). Note that this structure may be recursive. 
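  // (The device-side analogue, as a sketch:
  //    struct Span { void *data; std::int64_t size; };
  // where `size` counts elements, not bytes.)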
auto i8Ty = builder.getI8Type(); - auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto bytesTy = cudaq::cc::PointerType::get(arrI8Ty); + auto bytesTy = getByteAddressableType(builder); Type eleTy = stdvecTy.getElementType(); auto innerStdvecTy = dyn_cast(eleTy); std::size_t eleSize = @@ -904,137 +1122,6 @@ class GenerateKernelExecution return args; } - // Return the vector's length, computed on the CPU side, in bytes. - Value computeHostVectorLengthInBytes(Location loc, OpBuilder &builder, - Value hostArg, Type eleTy, - cudaq::cc::PointerType hostVecTy) { - auto rawSize = getVectorSize(loc, builder, hostVecTy, hostArg); - if (isa(eleTy)) { - auto three = builder.create(loc, 3, 64); - return builder.create(loc, rawSize, three); - } - return rawSize; - } - - Value fetchHostVectorFront(Location loc, OpBuilder &builder, Value hostArg, - cudaq::cc::PointerType hostVecTy) { - auto inpStructTy = cast(hostVecTy.getElementType()); - auto ptrTtype = cudaq::cc::PointerType::get(inpStructTy.getMember(0)); - auto beginPtr = builder.create( - loc, ptrTtype, hostArg, SmallVector{0}); - auto ptrArrSTy = cudaq::opt::factory::getIndexedObjectType(inpStructTy); - auto vecPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrArrSTy), beginPtr); - return builder.create(loc, vecPtr); - } - - Value recursiveVectorDataCopy(Location loc, OpBuilder &builder, Value hostArg, - Value buffPtr, cudaq::cc::SpanLikeType stdvecTy, - cudaq::cc::PointerType hostVecTy) { - auto vecLen = computeHostVectorLengthInBytes(loc, builder, hostArg, - stdvecTy, hostVecTy); - auto nested = fetchHostVectorFront(loc, builder, hostArg, hostVecTy); - auto vecLogicalLen = convertLengthBytesToLengthI64(loc, builder, vecLen); - auto vecLenIndex = builder.create( - loc, builder.getI64Type(), vecLogicalLen, - cudaq::cc::CastOpMode::Unsigned); - auto buffPtrTy = cast(buffPtr.getType()); - auto tmp = builder.create(loc, buffPtrTy); - auto buffArrTy = cudaq::cc::ArrayType::get(buffPtrTy.getElementType()); - auto castPtr = builder.create( - loc, cudaq::cc::PointerType::get(buffArrTy), buffPtr); - auto newEnd = builder.create( - loc, buffPtrTy, castPtr, SmallVector{vecLen}); - builder.create(loc, newEnd, tmp); - auto i64Ty = builder.getI64Type(); - auto arrI64Ty = cudaq::cc::ArrayType::get(i64Ty); - auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrI64Ty); - auto vecBasePtr = builder.create(loc, ptrArrTy, buffPtr); - auto nestedArr = builder.create(loc, hostVecTy, nested); - auto hostArrVecTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(hostVecTy.getElementType())); - cudaq::opt::factory::createInvariantLoop( - builder, loc, vecLenIndex, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - auto currBuffPtr = builder.create( - loc, ptrI64Ty, vecBasePtr, ArrayRef{i}); - auto upCast = - builder.create(loc, hostArrVecTy, nestedArr); - auto hostSubVec = builder.create( - loc, hostVecTy, upCast, ArrayRef{i}); - Value buff = builder.create(loc, tmp); - // Compute and save the byte size. - auto vecSz = computeHostVectorLengthInBytes( - loc, builder, hostSubVec, stdvecTy.getElementType(), hostVecTy); - builder.create(loc, vecSz, currBuffPtr); - // Recursively copy vector data. 
- auto endBuff = encodeVectorData(loc, builder, vecSz, stdvecTy, - hostSubVec, buff, hostVecTy); - builder.create(loc, endBuff, tmp); - }); - return builder.create(loc, tmp); - } - - /// Recursively encode a `std::vector` into a buffer's addendum. The data is - /// read from \p hostArg. The data is \p bytes size long if this is a leaf - /// vector, otherwise the size is computed on-the-fly during the encoding of - /// the ragged array. - /// \return The new pointer to the end of the addendum block. - Value encodeVectorData(Location loc, OpBuilder &builder, Value bytes, - cudaq::cc::SpanLikeType stdvecTy, Value hostArg, - Value bufferAddendum, cudaq::cc::PointerType ptrInTy) { - auto eleTy = stdvecTy.getElementType(); - if (auto subVecTy = dyn_cast(eleTy)) - return recursiveVectorDataCopy(loc, builder, hostArg, bufferAddendum, - subVecTy, ptrInTy); - return copyVectorData(loc, builder, bytes, hostArg, bufferAddendum); - } - - /// Recursively encode a struct which has dynamically sized members (such as - /// vectors). The vector members are encoded as i64 sizes with the data - /// attached to the buffer addendum. - /// \return The new pointer to the end of the addendum block. - Value encodeDynamicStructData(Location loc, OpBuilder &builder, - cudaq::cc::StructType deviceTy, Value hostArg, - Value bufferArg, Value bufferAddendum) { - for (auto iter : llvm::enumerate(deviceTy.getMembers())) { - auto memTy = iter.value(); - if (auto vecTy = dyn_cast(memTy)) { - Type eTy = vecTy.getElementType(); - auto hostTy = cudaq::opt::factory::stlVectorType(eTy); - auto ptrHostTy = cudaq::cc::PointerType::get(hostTy); - auto ptrI64Ty = cudaq::cc::PointerType::get(builder.getI64Type()); - std::int32_t offset = iter.index(); - auto sizeAddr = builder.create( - loc, ptrI64Ty, bufferArg, - ArrayRef{0, 0, offset}); - auto size = builder.create(loc, sizeAddr); - auto vecAddr = builder.create( - loc, ptrHostTy, hostArg, - ArrayRef{offset}); - bufferAddendum = encodeVectorData(loc, builder, size, vecTy, vecAddr, - bufferAddendum, ptrHostTy); - } else if (auto strTy = dyn_cast(memTy)) { - if (cudaq::cc::isDynamicType(strTy)) { - auto ptrStrTy = cudaq::cc::PointerType::get(strTy); - std::int32_t idx = iter.index(); - auto strAddr = builder.create( - loc, ptrStrTy, bufferArg, - ArrayRef{idx}); - bufferAddendum = encodeDynamicStructData(loc, builder, strTy, strAddr, - bufferArg, bufferAddendum); - } - } else if (auto arrTy = dyn_cast(memTy)) { - // This is like vector type if the array has dynamic size. If it has a - // constant size, it is like a struct with n identical members. - TODO_loc(loc, "array type"); - } - } - return bufferAddendum; - } - static std::pair lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module, func::FuncOp funcOp) { @@ -1053,10 +1140,10 @@ class GenerateKernelExecution return {true, func::FuncOp{}}; } - /// Generate an all new entry point body, calling launchKernel in the runtime - /// library. Pass along the thunk, so the runtime can call the quantum - /// circuit. These entry points are `operator()` member functions in a class, - /// so account for the `this` argument here. + /// Generate an all new entry point body, calling someLaunchKernel in + /// the runtime library. Pass along the thunk, so the runtime can call the + /// quantum circuit. These entry points may be `operator()` member functions + /// in a class, so account for the `this` argument here. 
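  /// For example (hypothetical): a kernel written as a class with
  /// `void operator()(double theta)` yields a host entry point roughly of
  /// the form `void entry(Kernel *thisPtr, double theta)`, and the generated
  /// body must skip `thisPtr` before packing `theta` into the message
  /// buffer.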
void genNewHostEntryPoint(Location loc, OpBuilder &builder, FunctionType devFuncTy, LLVM::GlobalOp kernelNameObj, func::FuncOp hostFunc, @@ -1064,220 +1151,78 @@ class GenerateKernelExecution func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - std::int32_t offset = devFuncTy.getNumInputs(); + auto i8Ty = builder.getI8Type(); + auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); - Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); + const std::int32_t offset = devFuncTy.getNumInputs(); + Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(hostFuncEntryBlock); - auto i8Ty = builder.getI8Type(); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - Value temp; + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + SmallVector blockValues(blockArgs.size()); + std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); + const bool hasDynamicSignature = isDynamicSignature(devFuncTy); + SmallVector freeVectorBuffers; + auto zippy = zipArgumentsWithDeviceTypes( + loc, builder, blockValues, devFuncTy.getInputs(), &freeVectorBuffers); + auto sizeScratch = builder.create(loc, i64Ty); + auto messageBufferSize = [&]() -> Value { + if (hasDynamicSignature) + return genSizeOfDynamicMessageBuffer(loc, builder, structTy, zippy, + sizeScratch); + return builder.create(loc, i64Ty, structTy); + }(); + + Value msgBufferPrefix; Value castTemp; Value resultOffset; Value castLoadThunk; Value extendedStructSize; if (isCodegenPackedData(codegenKind)) { - Value stVal = builder.create(loc, structTy); - - // Process all the arguments for the original call, ignoring any hidden - // arguments (such as the `this` pointer). - auto zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - bool hasTrailingData = false; - SmallVector blockArgs{dropAnyHiddenArguments( - hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; - std::int32_t idx = 0; - SmallVector blockValues(blockArgs.size()); - std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); - for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; - ++iter, ++idx) { - Value arg = *iter; - Type inTy = arg.getType(); - Type quakeTy = devFuncTy.getInput(idx); - // If the argument is a callable, skip it. - if (isa(quakeTy)) - continue; - - // Argument is a packaged kernel. In this case, the argument is some - // unknown kernel that may be called. The packaged argument is coming - // from opaque C++ host code, so we need to identify what kernel it - // references and then pass its name as a span of characters to the - // launch kernel. - if (isa(quakeTy)) { - auto kernKey = builder.create( - loc, i64Ty, cudaq::runtime::getLinkableKernelKey, - ValueRange{arg}); - stVal = builder.create( - loc, stVal.getType(), stVal, kernKey.getResult(0), idx); - continue; - } - - // If the argument is an empty struct, skip it. - if (auto strTy = dyn_cast(quakeTy)) - if (strTy.isEmpty()) - continue; - - if (auto stdvecTy = dyn_cast(quakeTy)) { - // Per the CUDA-Q spec, an entry point kernel must take a `[const] - // std::vector` value argument. - // Should the spec stipulate that pure device kernels must pass by - // read-only reference, i.e., take `const std::vector &` arguments? - auto ptrInTy = cast(inTy); - // If this is a std::vector, unpack it. 
- if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value tmp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ValueRange{tmp, arg}); - arg = blockValues[idx] = tmp; - } - // FIXME: call the `size` member function. For expediency, assume this - // is an std::vector and the size is the scaled delta between the - // first two pointers. Use the unscaled size for now. - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (!isa(arg.getType())) { - // If argument is not a pointer, then struct was promoted into a - // register. - auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto tmp = builder.create(loc, quakeTy); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), tmp); - if (cudaq::opt::factory::isX86_64(module)) { - builder.create(loc, arg, cast); - if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { - auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arrTy), tmp); - auto hiPtr = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, - cudaq::cc::ComputePtrArg{8}); - ++iter; - Value nextArg = *iter; - auto cast2 = builder.create( - loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); - builder.create(loc, nextArg, cast2); - } - } else { - builder.create(loc, arg, cast); - } - // Load the assembled (sub-)struct and insert into the buffer value. - Value v = builder.create(loc, tmp); - stVal = builder.create( - loc, stVal.getType(), stVal, v, idx); - continue; - } - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - Value v = builder.create(loc, arg); - stVal = builder.create( - loc, stVal.getType(), stVal, v, idx); - continue; - } - auto genTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - Value zero = builder.create(loc, 0, 64); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, arg, zero, genTy); - stVal = builder.create( - loc, stVal.getType(), stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; - } - if (auto ptrTy = dyn_cast(inTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just - // pass the pointer. We can do that in this case because the - // synthesis step (which will receive the argument data) is assumed - // to run in the same memory space. 
- Value argPtr = builder.create(loc, inTy, arg);
- stVal = builder.create(
- loc, stVal.getType(), stVal, argPtr, idx);
- }
- continue;
- }
-
- stVal = builder.create(loc, stVal.getType(),
- stVal, arg, idx);
+ auto rawMessageBuffer =
+ builder.create(loc, i8Ty, messageBufferSize);
+ msgBufferPrefix =
+ builder.create(loc, structPtrTy, rawMessageBuffer);
+
+ if (hasDynamicSignature) {
+ auto addendumScratch =
+ builder.create(loc, ptrI8Ty);
+ Value prefixSize =
+ builder.create(loc, i64Ty, structTy);
+ Value addendumPtr = builder.create(
+ loc, ptrI8Ty, rawMessageBuffer,
+ ArrayRef{prefixSize});
+ populateMessageBuffer(loc, builder, msgBufferPrefix, zippy, addendumPtr,
+ addendumScratch);
+ } else {
+ populateMessageBuffer(loc, builder, msgBufferPrefix, zippy);
}
- // Compute the struct size without the trailing bytes, structSize, and
- // with the trailing bytes, extendedStructSize.
- Value structSize =
- builder.create(loc, i64Ty, structTy);
- extendedStructSize =
- builder.create(loc, structSize, extraBytes);
-
- // Allocate our struct to save the argument to.
- auto buff =
- builder.create(loc, i8Ty, extendedStructSize);
-
- temp = builder.create(loc, structPtrTy, buff);
-
- // Store the arguments to the argument section.
- builder.create(loc, stVal, temp);
-
- auto structPtrArrTy =
- cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy));
- temp = builder.create(loc, structPtrArrTy, buff);
-
- // Append the vector data to the end of the struct.
- if (hasTrailingData) {
- Value vecToBuffer = builder.create(
- loc, ptrI8Ty, buff, SmallVector{structSize});
- // Ignore any hidden `this` argument.
- for (auto inp : llvm::enumerate(blockValues)) {
- Value arg = inp.value();
- Type inTy = arg.getType();
- std::int32_t idx = inp.index();
- Type quakeTy = devFuncTy.getInput(idx);
- if (auto stdvecTy = dyn_cast(quakeTy)) {
- auto bytes = builder.create(loc, i64Ty,
- stVal, idx);
- assert(stdvecTy == devFuncTy.getInput(idx));
- auto ptrInTy = cast(inTy);
- vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg,
- vecToBuffer, ptrInTy);
- if (stdvecTy.getElementType() == builder.getI1Type()) {
- auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type());
- auto heapPtr = builder.create(
- loc, cudaq::cc::PointerType::get(ptrI1Ty), arg,
- ArrayRef{0});
- auto loadHeapPtr =
- builder.create(loc, heapPtr);
- Value heapCast = builder.create(
- loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr);
- builder.create(loc, std::nullopt, "free",
- ArrayRef{heapCast});
- }
- continue;
- }
- if (auto strTy = dyn_cast(quakeTy)) {
- if (cudaq::cc::isDynamicType(strTy))
- vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg,
- temp, vecToBuffer);
- }
+ if (!freeVectorBuffers.empty()) {
+ // Need to free any temporary vector-like buffers. These arise when
+ // there is a std::vector<bool> argument, which we translate into a
+ // std::vector<char> to reuse the same code as any other std::vector.
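+ // For example, a std::vector<bool> host argument is first unpacked into a
+ // heap-allocated buffer holding one byte per bool so it can be marshaled
+ // like any other vector; that temporary buffer is released here once the
+ // message buffer has been populated.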
+ for (auto vecVar : freeVectorBuffers) { + auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); + auto ptrPtr = + builder.create(loc, ptrPtrTy, vecVar); + Value freeMe = builder.create(loc, ptrPtr); + builder.create(loc, std::nullopt, "free", + ArrayRef{freeMe}); } } + + extendedStructSize = messageBufferSize; Value loadThunk = builder.create(loc, thunkTy, thunkFunc.getName()); castLoadThunk = builder.create(loc, ptrI8Ty, loadThunk); - castTemp = builder.create(loc, ptrI8Ty, temp); + castTemp = + builder.create(loc, ptrI8Ty, msgBufferPrefix); resultOffset = genComputeReturnOffset(loc, builder, devFuncTy, structTy); } @@ -1397,7 +1342,8 @@ class GenerateKernelExecution builder.setInsertionPointToEnd(elseBlock); // span was returned in the original buffer. Value mRes = builder.create( - loc, ptrResTy, temp, ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); builder.create(loc, endifBlock, ArrayRef{mRes}); builder.setInsertionPointToEnd(endifBlock); launchResult = endifBlock->getArgument(0); @@ -1454,7 +1400,8 @@ class GenerateKernelExecution if (resultVal) { // Static values. std::vector are necessarily sret, see below. auto resPtr = builder.create( - loc, ptrResTy, temp, ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); auto castResPtr = [&]() -> Value { if (castToTy == ptrResTy) @@ -1496,8 +1443,8 @@ class GenerateKernelExecution // type for the memcpy, so the device should return an (aggregate) // value of suitable size. auto resPtr = builder.create( - loc, ptrResTy, temp, - ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); auto castMsgBuff = builder.create(loc, ptrI8Ty, resPtr); Type eleTy = @@ -1659,6 +1606,7 @@ class GenerateKernelExecution return success(); } +public: void runOnOperation() override { auto module = getOperation(); auto *ctx = module.getContext(); @@ -1784,6 +1732,7 @@ class GenerateKernelExecution out.keep(); } +private: const DataLayout *dataLayout = nullptr; }; } // namespace diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp index 166f558275..7365c03370 100644 --- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp +++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp @@ -122,15 +122,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, ATTR arrayAttr, MAKER makeElementValue) { auto *ctx = builder.getContext(); auto argTy = argument.getType(); - assert(isa(argTy) || - isa(argTy)); - ELETY eleTy = [&]() -> ELETY { - if (auto strTy = dyn_cast(argTy)) - return cast(strTy.getElementType()); - // Force cast this to ELETY. This will only happen for CharspanType. 
- return cast(cudaq::opt::factory::getCharType(ctx)); - }(); - auto strTy = cudaq::cc::StdvecType::get(ctx, eleTy); + assert(isa(argTy)); + auto strTy = cast(argTy); + auto eleTy = cast(strTy.getElementType()); builder.setInsertionPointToStart(argument.getOwner()); auto argLoc = argument.getLoc(); auto conArray = builder.create( @@ -621,19 +615,6 @@ class QuakeSynthesizer continue; } - if (auto charSpanTy = dyn_cast(type)) { - const char *ptrToSizeInBuffer = - static_cast(args) + offset; - auto sizeFromBuffer = - *reinterpret_cast(ptrToSizeInBuffer); - std::size_t bytesInType = sizeof(char); - auto vectorSize = sizeFromBuffer / bytesInType; - stdVecInfo.emplace_back( - argNum, cudaq::opt::factory::getCharType(builder.getContext()), - vectorSize); - continue; - } - funcOp.emitOpError("We cannot synthesize argument(s) of this type."); signalPassFailure(); return; diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h index 3e410a07b6..46afd2fedc 100644 --- a/python/utils/OpaqueArguments.h +++ b/python/utils/OpaqueArguments.h @@ -101,7 +101,7 @@ inline py::args simplifiedValidateInputArguments(py::args &args) { arg = args[i].attr("tolist")(); } else if (py::isinstance(arg)) { - arg = cudaq::pauli_word(py::cast(arg)); + arg = py::cast(arg); } else if (py::isinstance(arg)) { py::list arg_list = py::cast(arg); const bool all_strings = [&]() { @@ -330,8 +330,7 @@ inline void packArgs(OpaqueArguments &argData, py::args args, addArgument(argData, arg.cast()); }) .Case([&](cudaq::cc::CharspanType ty) { - addArgument(argData, - cudaq::pauli_word(arg.cast().str())); + addArgument(argData, arg.cast().str()); }) .Case([&](cudaq::cc::PointerType ty) { if (isa(ty.getElementType())) { @@ -432,8 +431,7 @@ inline void packArgs(OpaqueArguments &argData, py::args args, .Case([&](cudaq::cc::CharspanType type) { genericVecAllocator.template operator()( [](py::handle element, int index, int elementIndex) { - auto pw = element.cast(); - return cudaq::pauli_word(pw.str()); + return element.cast().str(); }); return; }) diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp index 0de2589752..c310966a07 100644 --- a/runtime/common/ArgumentConversion.cpp +++ b/runtime/common/ArgumentConversion.cpp @@ -77,14 +77,16 @@ static Value genConstant(OpBuilder &builder, FloatType fltTy, long double *v) { static Value genConstant(OpBuilder &builder, const std::string &v, ModuleOp substMod) { auto loc = builder.getUnknownLoc(); - cudaq::IRBuilder irBuilder(builder); - auto cString = irBuilder.genCStringLiteralAppendNul(loc, substMod, v); - auto addr = builder.create( - loc, cudaq::cc::PointerType::get(cString.getType()), cString.getName()); - auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - auto cast = builder.create(loc, i8PtrTy, addr); + auto *ctx = builder.getContext(); + auto i8Ty = builder.getI8Type(); + auto strLitTy = cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(ctx, i8Ty, v.size() + 1)); + auto strLit = + builder.create(loc, strLitTy, v); + auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); + auto cast = builder.create(loc, i8PtrTy, strLit); auto size = builder.create(loc, v.size(), 64); - auto chSpanTy = cudaq::cc::CharspanType::get(builder.getContext()); + auto chSpanTy = cudaq::cc::CharspanType::get(ctx); return builder.create(loc, chSpanTy, cast, size); } @@ -218,6 +220,21 @@ Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod, .Default({}); } +// Get the size of \p eleTy on the host side in bytes. 
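+// Owning containers are the special cases here: on the host side a stdvec
+// element is a std::vector and a charspan element is a std::string, so their
+// sizes must come from the C++ ABI rather than the device-side data layout.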
+static std::size_t getHostSideElementSize(Type eleTy, + llvm::DataLayout &layout) { + if (isa(eleTy)) + return sizeof(std::vector); + if (isa(eleTy)) { + // char span type is a std::string on host side. + return sizeof(std::string); + } + // Note: we want the size on the host side, but `getDataSize()` returns the + // size on the device side. This is ok for now since they are the same for + // most types and the special cases are handled above. + return cudaq::opt::getDataSize(layout, eleTy); +} + Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p, ModuleOp substMod, llvm::DataLayout &layout) { typedef const char *VectorType[3]; @@ -227,11 +244,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p, return {}; auto eleTy = vecTy.getElementType(); auto elePtrTy = cudaq::cc::PointerType::get(eleTy); - auto eleSize = cudaq::opt::getDataSize(layout, eleTy); - if (isa(eleTy)) { - // char span type (i.e. pauli word) is a `vector` - eleSize = sizeof(VectorType); - } + auto eleSize = getHostSideElementSize(eleTy, layout); assert(eleSize && "element must have a size"); auto loc = builder.getUnknownLoc(); diff --git a/runtime/cudaq/qis/pauli_word.h b/runtime/cudaq/qis/pauli_word.h index afcd446e77..2265003083 100644 --- a/runtime/cudaq/qis/pauli_word.h +++ b/runtime/cudaq/qis/pauli_word.h @@ -8,20 +8,31 @@ #pragma once #include -#include namespace cudaq { -/// @brief The `pauli_word` is a thin wrapper on a -/// Pauli tensor product string, e.g. `XXYZ` on 4 -// qubits. +/// @brief The `pauli_word` is a thin wrapper on a Pauli tensor product string, +/// e.g. `XXYZ` on 4 qubits. class pauli_word { private: - std::vector term; + std::string term; public: pauli_word() = default; - pauli_word(const std::string t) : term(t.begin(), t.end()) {} - std::string str() const { return std::string(term.begin(), term.end()); } - const std::vector &data() const { return term; } + pauli_word(std::string &&t) : term{std::move(t)} {} + pauli_word(const std::string &t) : term(t) {} + pauli_word(const char *const p) : term{p} {} + pauli_word &operator=(const std::string &t) { + term = t; + return *this; + } + pauli_word &operator=(const char *const p) { + term = p; + return *this; + } + + std::string str() const { return term; } + + // TODO: Obsolete? Used by KernelWrapper.h only. 
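+ // Note that it builds and returns a fresh byte vector copied from the
+ // term rather than a view of the underlying string.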
+ const std::vector data() const { return {term.begin(), term.end()}; } }; -} // namespace cudaq \ No newline at end of file +} // namespace cudaq diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp index 987bfd4c34..c16b43ddb7 100644 --- a/runtime/test/test_argument_conversion.cpp +++ b/runtime/test/test_argument_conversion.cpp @@ -202,12 +202,11 @@ void test_scalars(mlir::MLIRContext *ctx) { // CHECK: Substitution module: // CHECK-LABEL: cc.arg_subst[0] { -// CHECK: %[[VAL_0:.*]] = cc.address_of @cstr.58595A00 : !cc.ptr> -// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_0:.*]] = cc.string_literal "XYZ" : !cc.ptr> +// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_2:.*]] = arith.constant 3 : i64 // CHECK: %[[VAL_3:.*]] = cc.stdvec_init %[[VAL_1]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: } -// CHECK-DAG: llvm.mlir.global private constant @cstr.58595A00("XYZ\00") {addr_space = 0 : i32} // clang-format on } @@ -250,14 +249,14 @@ void test_vectors(mlir::MLIRContext *ctx) { // clang-format off // CHECK-LABEL: cc.arg_subst[0] { // CHECK: %[[VAL_0:.*]] = cc.alloca !cc.array -// CHECK: %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr> -// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_1:.*]] = cc.string_literal "XX" : !cc.ptr> +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_2]], %[[VAL_3]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr>) -> !cc.ptr // CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.address_of @cstr.585900 : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.string_literal "XY" : !cc.ptr> +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_9:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_8]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr>) -> !cc.ptr @@ -265,8 +264,6 @@ void test_vectors(mlir::MLIRContext *ctx) { // CHECK: %[[VAL_11:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_0]], %[[VAL_11]] : (!cc.ptr>, i64) -> !cc.stdvec // CHECK: } -// CHECK-DAG: llvm.mlir.global private constant @cstr.585800("XX\00") {addr_space = 0 : i32} -// CHECK-DAG: llvm.mlir.global private constant @cstr.585900("XY\00") {addr_space = 0 : i32} // clang-format on } @@ -502,14 +499,14 @@ void test_combinations(mlir::MLIRContext *ctx) { // CHECK-DAG: func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr, i64) -> !cc.ptr // CHECK-LABEL: cc.arg_subst[2] { // CHECK: %[[VAL_0:.*]] = cc.alloca !cc.array -// CHECK: %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr> -// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_1:.*]] = cc.string_literal "XX" : !cc.ptr> +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_2]], %[[VAL_3]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr>) -> !cc.ptr // CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.address_of @cstr.585900 : !cc.ptr> -// 
CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr
+// CHECK: %[[VAL_6:.*]] = cc.string_literal "XY" : !cc.ptr>
+// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr
// CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64
// CHECK: %[[VAL_9:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_8]] : (!cc.ptr, i64) -> !cc.charspan
// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr>) -> !cc.ptr
@@ -517,8 +514,6 @@ void test_combinations(mlir::MLIRContext *ctx) {
// CHECK: %[[VAL_11:.*]] = arith.constant 2 : i64
// CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_0]], %[[VAL_11]] : (!cc.ptr>, i64) -> !cc.stdvec
// CHECK: }
-// CHECK-DAG: llvm.mlir.global private constant @cstr.585800("XX\00") {addr_space = 0 : i32}
-// CHECK-DAG: llvm.mlir.global private constant @cstr.585900("XY\00") {addr_space = 0 : i32}
// clang-format on
}
diff --git a/targettests/Remote-Sim/pauli_word.cpp b/targettests/Remote-Sim/pauli_word.cpp
index cd68042325..7624d948c0 100644
--- a/targettests/Remote-Sim/pauli_word.cpp
+++ b/targettests/Remote-Sim/pauli_word.cpp
@@ -10,7 +10,6 @@
// clang-format off
// RUN: nvq++ %cpp_std --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t
-// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t
// clang-format on
#include "remote_test_assert.h"
diff --git a/targettests/SeparateCompilation/pauli_words.cpp b/targettests/SeparateCompilation/pauli_words.cpp
new file mode 100644
index 0000000000..31ac339e0c
--- /dev/null
+++ b/targettests/SeparateCompilation/pauli_words.cpp
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. *
+ * All rights reserved. *
+ * *
+ * This source code and the accompanying materials are made available under *
+ * the terms of the Apache License 2.0 which accompanies this distribution. *
+ ******************************************************************************/
+
+// clang-format off
+// RUN: if command -v split-file; then \
+// RUN: split-file %s %t && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/pauli_word_display.cpp -o %t/pauli_word_display.o && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/pauli_wordle.cpp -o %t/pauli_wordle.o && \
+// RUN: nvq++ %cpp_std --enable-mlir %t/pauli_word_display.o %t/pauli_wordle.o -o %t/pauli_wordle.out && \
+// RUN: %t/pauli_wordle.out | FileCheck %s ; else \
+// RUN: echo "skipping" ; fi
+// clang-format on
+
+//--- pauli_word_display.cpp
+
+#include <iostream>
+#include <span>
+#include <string>
+
+extern "C" {
+void display(std::span<char> x) {
+ std::string s{x.data(), x.size()};
+ std::cout << "pauli word: " << s << '\n';
+}
+}
+
+//--- pauli_wordle.cpp
+
+#include "cudaq.h"
+
+// Fake host C++ signature that matches. Since this is called on the device side,
+// the pauli_word will have been converted to a span.
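+// Because the declaration is extern "C", there is no name mangling, so this
+// deliberately different signature still links against the span-taking
+// definition in pauli_word_display.cpp.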
+extern "C" void display(const cudaq::pauli_word &pw);
+
+__qpu__ void kerny(std::vector<cudaq::pauli_word> arg) {
+ display(arg[0]);
+ display(arg[1]);
+ display(arg[2]);
+ display(arg[3]);
+}
+
+__qpu__ void kernub(cudaq::pauli_word arg) { display(arg); }
+
+int main() {
+ cudaq::pauli_word pw0 = "YYZ";
+ kernub(pw0);
+
+ cudaq::pauli_word pw1 = "ZIZ";
+ cudaq::pauli_word pw2 = "XXXY";
+ cudaq::pauli_word pw3 = "YIIII";
+ std::vector<cudaq::pauli_word> vpw{pw0, pw1, pw2, pw3};
+ kerny(vpw);
+ return 0;
+}
+
+// CHECK: pauli word: YYZ
+// CHECK: pauli word: YYZ
+// CHECK: pauli word: ZIZ
+// CHECK: pauli word: XXXY
+// CHECK: pauli word: YIIII
diff --git a/targettests/execution/exp_pauli.cpp b/targettests/execution/exp_pauli.cpp
index bf7ed5bac1..014d86ccf6 100644
--- a/targettests/execution/exp_pauli.cpp
+++ b/targettests/execution/exp_pauli.cpp
@@ -8,17 +8,18 @@
// clang-format off
// Simulators
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 --enable-mlir -target remote-mqpu %s -o %t && %t | FileCheck %s
//
// Quantum emulators
-// RUN: nvq++ %cpp_std --target quantinuum --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target ionq --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target quantinuum --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target ionq --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target oqc --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target anyon --emulate %s -o %t && %t | FileCheck %s
+
// 2 different IQM machines for 2 different topologies
-// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target oqc --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target anyon --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
// clang-format on
#include "cudaq.h"
diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke
index 37ac7c7229..044bf93782 100644
--- a/test/Quake/kernel_exec-1.qke
+++ b/test/Quake/kernel_exec-1.qke
@@ -6,15 +6,13 @@
// the terms of the Apache License 2.0 which accompanies this distribution.
// // ========================================================================== // -// RUN: cudaq-opt --kernel-execution=codegen=1 %s | FileCheck %s -// RUN: cudaq-opt --kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAM %s -// RUN: cudaq-opt --kernel-execution %s | FileCheck --check-prefix=HYBRID %s +// RUN: cudaq-opt -kernel-execution=codegen=1 %s | FileCheck --check-prefix=ALT %s +// RUN: cudaq-opt -kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAMLINED %s +// RUN: cudaq-opt -kernel-execution %s | FileCheck --check-prefix=HYBRID %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} { -// CHECK-LABEL: func.func @__nvqpp__mlirgen__ghz( - func.func @__nvqpp__mlirgen__ghz(%arg0: i32) -> f64 { %0 = cc.alloca i32 cc.store %arg0, %0 : !cc.ptr @@ -83,174 +81,369 @@ module attributes {quake.mangled_name_map = { } } -// Check the generated code. +// ALT-LABEL: func.func @_ZN3ghzclEi( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// ALT: %[[VAL_2:.*]] = cc.alloca i64 +// ALT: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] +// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> +// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr +// ALT: %[[VAL_7:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_8:.*]] = cc.func_ptr %[[VAL_7]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// ALT: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// ALT: %[[VAL_11:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// ALT: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!llvm.ptr>) -> !cc.ptr +// ALT: %[[VAL_13:.*]] = call @altLaunchKernel(%[[VAL_12]], %[[VAL_8]], %[[VAL_9]], %[[VAL_3]], %[[VAL_10]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_14:.*]] = cc.extract_value %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// ALT: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// ALT: %[[VAL_16:.*]] = arith.constant 0 : i64 +// ALT: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_16]] : i64 +// ALT: cf.cond_br %[[VAL_17]], ^bb1, ^bb2 +// ALT: ^bb1: +// ALT: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cf.br ^bb3(%[[VAL_19]] : !cc.ptr) +// ALT: ^bb2: +// ALT: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cf.br ^bb3(%[[VAL_20]] : !cc.ptr) +// ALT: ^bb3(%[[VAL_21:.*]]: !cc.ptr): +// ALT: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr +// ALT: return %[[VAL_23]] : f64 +// ALT: } +// ALT: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// ALT: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// ALT: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// ALT: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// ALT: func.func private @cudaqRegisterKernelName(!cc.ptr) +// ALT: func.func private @malloc(i64) -> !cc.ptr +// ALT: func.func private 
@free(!cc.ptr) +// ALT: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// ALT: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// ALT: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// ALT-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_0:.*]] = arith.constant 0 : i64 +// ALT: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// ALT: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } + +// ALT-LABEL: func.func private @__nvqpp_createDynamicResult( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i64, +// ALT-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// ALT-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// ALT: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// ALT: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// ALT: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// ALT: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_9:.*]] = arith.constant false +// ALT: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// ALT: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// ALT: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// ALT: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// ALT: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// ALT: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// ALT: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } +// ALT: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} + +// ALT-LABEL: func.func @ghz.returnOffset() -> i64 { +// ALT: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// ALT: return %[[VAL_0]] : i64 +// ALT: } + +// ALT-LABEL: func.func @ghz.thunk( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> +// ALT: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]]{{\[}}%[[VAL_4]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: %[[VAL_7:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, f64}>) -> i32 +// ALT: %[[VAL_8:.*]] = call 
@__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 +// ALT: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr +// ALT: %[[VAL_10:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// ALT: return %[[VAL_10]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } -// CHECK-LABEL: func.func @_ZN3ghzclEi( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] -// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i64 -// CHECK: cf.cond_br %[[VAL_20]], ^bb1, ^bb2 -// CHECK: ^bb1: -// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: cf.br ^bb3(%[[VAL_22]] : !cc.ptr) -// CHECK: ^bb2: -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr): -// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr -// CHECK: return %[[VAL_26]] : f64 -// CHECK: } +// ALT-LABEL: func.func @ghz.argsCreator( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// ALT-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// ALT: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// ALT: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// ALT: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// ALT: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// ALT: %[[VAL_7:.*]] = cc.alloca i64 +// ALT: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_9:.*]] = call @malloc(%[[VAL_8]]) : (i64) -> !cc.ptr +// ALT: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_6]], %[[VAL_11]] 
: !cc.ptr +// ALT: cc.store %[[VAL_9]], %[[VAL_1]] : !cc.ptr> +// ALT: return %[[VAL_8]] : i64 +// ALT: } -// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// ALT-LABEL: llvm.func @ghz.kernelRegFunc() { +// ALT: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// ALT: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// ALT: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// ALT: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// ALT: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// ALT: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// ALT: llvm.return +// ALT: } +// ALT: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} -// CHECK: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} +// STREAMLINED-LABEL: func.func @_ZN3ghzclEi( +// STREAMLINED-SAME: %[[VAL_0:.*]]: !cc.ptr, +// STREAMLINED-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// STREAMLINED: %[[VAL_2:.*]] = cc.alloca i64 +// STREAMLINED: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// STREAMLINED: %[[VAL_4:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// STREAMLINED: %[[VAL_5:.*]] = cc.alloca !cc.array x 1> +// STREAMLINED: %[[VAL_6:.*]] = cc.sizeof !cc.array x 1> : i64 +// STREAMLINED: %[[VAL_7:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_8:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_7]], %[[VAL_8]] : !cc.ptr>> +// STREAMLINED: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr x 1>>) -> i64 +// STREAMLINED: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_6]] : i64 +// STREAMLINED: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (i64) -> !cc.ptr> +// STREAMLINED: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_11]], %[[VAL_12]] : !cc.ptr>> +// STREAMLINED: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_4]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_11]], %[[VAL_13]] : !cc.ptr>> +// STREAMLINED: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_15:.*]] = cc.alloca i32 +// STREAMLINED: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr +// STREAMLINED: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr +// STREAMLINED: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> +// STREAMLINED: %[[VAL_17:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// STREAMLINED: %[[VAL_18:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAMLINED: %[[VAL_19:.*]] = cc.cast %[[VAL_18]] : (!llvm.ptr>) -> !cc.ptr +// STREAMLINED: call @streamlinedLaunchKernel(%[[VAL_19]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr) -> () +// STREAMLINED: %[[VAL_20:.*]] = cc.undef f64 +// STREAMLINED: return %[[VAL_20]] : f64 +// STREAMLINED: } +// STREAMLINED: func.func private @streamlinedLaunchKernel(!cc.ptr, !cc.ptr) +// STREAMLINED: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// STREAMLINED: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// STREAMLINED: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// STREAMLINED: func.func private 
@__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// STREAMLINED: func.func private @cudaqRegisterKernelName(!cc.ptr) +// STREAMLINED: func.func private @malloc(i64) -> !cc.ptr +// STREAMLINED: func.func private @free(!cc.ptr) +// STREAMLINED: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// STREAMLINED: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// STREAMLINED: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -// CHECK-LABEL: func.func @ghz.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { -// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_20]][%[[VAL_7]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, f64}>) -> i32 -// CHECK: %[[VAL_10:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_9]]) : (i32) -> f64 -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: cc.store %[[VAL_10]], %[[VAL_11]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> -// CHECK: } +// STREAMLINED-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// STREAMLINED: %[[VAL_0:.*]] = arith.constant 0 : i64 +// STREAMLINED: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// STREAMLINED: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: } -// CHECK-LABEL: func.func @ghz.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_14]][0] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_11:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_12:.*]] = call @malloc(%[[VAL_11]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr) -> !cc.ptr> -// CHECK: cc.store %[[VAL_8]], %[[VAL_13]] : !cc.ptr> -// CHECK: cc.store %[[VAL_12]], %[[VAL_1]] : !cc.ptr> -// CHECK: return %[[VAL_11]] : i64 -// CHECK: } +// STREAMLINED-LABEL: func.func private @__nvqpp_createDynamicResult( +// STREAMLINED-SAME: %[[VAL_0:.*]]: !cc.ptr, +// STREAMLINED-SAME: %[[VAL_1:.*]]: i64, +// STREAMLINED-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// STREAMLINED-SAME: %[[VAL_3:.*]]: i64) -> 
!cc.struct<{!cc.ptr, i64}> { +// STREAMLINED: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// STREAMLINED: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// STREAMLINED: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// STREAMLINED: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// STREAMLINED: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// STREAMLINED: %[[VAL_9:.*]] = arith.constant false +// STREAMLINED: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// STREAMLINED: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// STREAMLINED: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// STREAMLINED: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// STREAMLINED: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// STREAMLINED: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// STREAMLINED: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// STREAMLINED: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: } +// STREAMLINED: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} -// CHECK-LABEL: llvm.func @ghz.kernelRegFunc() { -// CHECK: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () -// CHECK: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 -// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr -// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () -// CHECK: llvm.return -// CHECK: } +// STREAMLINED-LABEL: llvm.func @ghz.kernelRegFunc() { +// STREAMLINED: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAMLINED: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// STREAMLINED: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// STREAMLINED: llvm.return +// STREAMLINED: } +// STREAMLINED: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} -// STREAM-LABEL: func.func @_ZN3ghzclEi( -// STREAM-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// STREAM: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// STREAM: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> -// STREAM: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 -// STREAM: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> -// STREAM: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> -// STREAM: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 -// STREAM: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 -// STREAM: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> 
!cc.ptr> -// STREAM: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> -// STREAM: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> -// STREAM: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// STREAM: %[[VAL_15:.*]] = cc.alloca i32 -// STREAM: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr -// STREAM: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr -// STREAM: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> -// STREAM: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// STREAM: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// STREAM: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr -// STREAM: call @streamlinedLaunchKernel(%[[VAL_21]], %[[VAL_19]]) : (!cc.ptr, !cc.ptr) -> () -// STREAM: %[[VAL_22:.*]] = cc.undef f64 -// STREAM: return %[[VAL_22]] : f64 -// STREAM: } // HYBRID-LABEL: func.func @_ZN3ghzclEi( -// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 -// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 -// HYBRID: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] -// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> -// HYBRID: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> -// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> -// HYBRID: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// HYBRID: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// HYBRID: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// HYBRID: %[[VAL_14:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// HYBRID: %[[VAL_15:.*]] = cc.alloca !cc.array x 1> -// HYBRID: %[[VAL_16:.*]] = cc.sizeof !cc.array x 1> : i64 -// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>> -// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> i64 -// HYBRID: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] : i64 -// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr> -// HYBRID: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_14]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>> -// HYBRID: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_14]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>> -// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_15]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_25:.*]] = cc.alloca i32 -// HYBRID: cc.store %[[VAL_1]], %[[VAL_25]] : !cc.ptr -// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// HYBRID: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr> -// HYBRID: %[[VAL_27:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, 
!cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// HYBRID: %[[VAL_28:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// HYBRID: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> -// HYBRID: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 -// HYBRID: %[[VAL_33:.*]] = arith.constant 0 : i64 -// HYBRID: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_33]] : i64 -// HYBRID: cf.cond_br %[[VAL_34]], ^bb1, ^bb2 +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// HYBRID: %[[VAL_2:.*]] = cc.alloca i64 +// HYBRID: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> +// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr +// HYBRID: %[[VAL_7:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_8:.*]] = cc.func_ptr %[[VAL_7]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: %[[VAL_11:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// HYBRID: %[[VAL_12:.*]] = cc.alloca !cc.array x 1> +// HYBRID: %[[VAL_13:.*]] = cc.sizeof !cc.array x 1> : i64 +// HYBRID: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_15:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_14]], %[[VAL_15]] : !cc.ptr>> +// HYBRID: %[[VAL_16:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr x 1>>) -> i64 +// HYBRID: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_13]] : i64 +// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_18]], %[[VAL_19]] : !cc.ptr>> +// HYBRID: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_11]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_18]], %[[VAL_20]] : !cc.ptr>> +// HYBRID: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_12]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_22:.*]] = cc.alloca i32 +// HYBRID: cc.store %[[VAL_1]], %[[VAL_22]] : !cc.ptr +// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr) -> !cc.ptr +// HYBRID: cc.store %[[VAL_23]], %[[VAL_21]] : !cc.ptr> +// HYBRID: %[[VAL_24:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// HYBRID: %[[VAL_25:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_27:.*]] = call @hybridLaunchKernel(%[[VAL_26]], %[[VAL_8]], %[[VAL_9]], %[[VAL_3]], %[[VAL_10]], %[[VAL_24]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_28:.*]] = cc.extract_value %[[VAL_27]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!cc.ptr) -> i64 +// HYBRID: %[[VAL_30:.*]] = arith.constant 0 : i64 +// HYBRID: 
%[[VAL_31:.*]] = arith.cmpi ne, %[[VAL_29]], %[[VAL_30]] : i64 +// HYBRID: cf.cond_br %[[VAL_31]], ^bb1, ^bb2 // HYBRID: ^bb1: -// HYBRID: %[[VAL_35:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr> -// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr>) -> !cc.ptr -// HYBRID: cf.br ^bb3(%[[VAL_36]] : !cc.ptr) +// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_28]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_33:.*]] = cc.compute_ptr %[[VAL_32]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_33]] : !cc.ptr) // HYBRID: ^bb2: -// HYBRID: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: cf.br ^bb3(%[[VAL_37]] : !cc.ptr) -// HYBRID: ^bb3(%[[VAL_38:.*]]: !cc.ptr): -// HYBRID: %[[VAL_39:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr -// HYBRID: return %[[VAL_40]] : f64 +// HYBRID: %[[VAL_34:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_34]] : !cc.ptr) +// HYBRID: ^bb3(%[[VAL_35:.*]]: !cc.ptr): +// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_37:.*]] = cc.load %[[VAL_36]] : !cc.ptr +// HYBRID: return %[[VAL_37]] : f64 +// HYBRID: } +// HYBRID: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// HYBRID: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// HYBRID: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// HYBRID: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// HYBRID: func.func private @cudaqRegisterKernelName(!cc.ptr) +// HYBRID: func.func private @malloc(i64) -> !cc.ptr +// HYBRID: func.func private @free(!cc.ptr) +// HYBRID: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// HYBRID: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// HYBRID: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// HYBRID-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_0:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// HYBRID: } + +// HYBRID-LABEL: func.func private @__nvqpp_createDynamicResult( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i64, +// HYBRID-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// HYBRID-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// HYBRID: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// HYBRID: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_9:.*]] = arith.constant false +// HYBRID: call 
@llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// HYBRID: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// HYBRID: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// HYBRID: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// HYBRID: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// HYBRID: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> // HYBRID: } +// HYBRID: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} + +// HYBRID-LABEL: func.func @ghz.returnOffset() -> i64 { +// HYBRID: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: return %[[VAL_0]] : i64 +// HYBRID: } + +// HYBRID-LABEL: func.func @ghz.thunk( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> +// HYBRID: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]]{{\[}}%[[VAL_4]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: %[[VAL_7:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, f64}>) -> i32 +// HYBRID: %[[VAL_8:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 +// HYBRID: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr +// HYBRID: %[[VAL_10:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: return %[[VAL_10]] : !cc.struct<{!cc.ptr, i64}> +// HYBRID: } + +// HYBRID-LABEL: func.func @ghz.argsCreator( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// HYBRID-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// HYBRID: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// HYBRID: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// HYBRID: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// HYBRID: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// HYBRID: %[[VAL_7:.*]] = cc.alloca i64 +// HYBRID: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_9:.*]] = call @malloc(%[[VAL_8]]) : (i64) -> !cc.ptr +// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_6]], %[[VAL_11]] : !cc.ptr +// HYBRID: cc.store %[[VAL_9]], %[[VAL_1]] : !cc.ptr> +// HYBRID: return %[[VAL_8]] : i64 +// HYBRID: } + +// HYBRID-LABEL: llvm.func @ghz.kernelRegFunc() { +// HYBRID: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: 
%[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// HYBRID: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// HYBRID: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// HYBRID: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// HYBRID: llvm.return +// HYBRID: } +// HYBRID: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} + diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index a9b04b8449..e8be1ab6ac 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -6,7 +6,7 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --kernel-execution=codegen=1 %s | FileCheck %s +// RUN: cudaq-opt -kernel-execution %s | FileCheck %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__function_hawaiian = "shirt", @@ -36,120 +36,209 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { } } +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_cargo( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.stdvec, +// CHECK-SAME: %[[VAL_1:.*]]: !quake.ref) attributes {"cudaq-kernel", no_this} { +// CHECK: return +// CHECK: } + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_hawaiian( +// CHECK-SAME: %[[VAL_0:.*]]: i1, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.stdvec) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.ref +// CHECK: cc.if(%[[VAL_0]]) { +// CHECK: quake.x %[[VAL_4]] : (!quake.ref) -> () +// CHECK: } +// CHECK: call @__nvqpp__mlirgen__function_cargo(%[[VAL_1]], %[[VAL_4]]) : (!cc.stdvec, !quake.ref) -> () +// CHECK: return +// CHECK: } + // CHECK-LABEL: func.func @shirt( -// CHECK-SAME: %[[VAL_0:.*]]: i1, %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i1, i64}> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_2]][0] : (!cc.struct<{i1, i64}>, i1) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK-SAME: %[[VAL_0:.*]]: i1, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) { +// CHECK: %[[VAL_2:.*]] = cc.alloca i64 +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_4]] : !cc.ptr> // CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_5]] : !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_9]], %[[VAL_10]] : i64 -// CHECK: %[[VAL_12:.*]] = cc.insert_value %[[VAL_11]], %[[VAL_4]][1] : (!cc.struct<{i1, i64}>, i64) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_13:.*]] = 
arith.addi %[[VAL_3]], %[[VAL_11]] : i64 -// CHECK: %[[VAL_16:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_18:.*]] = cc.alloca i8[%[[VAL_17]] : i64] -// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_18]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_12]], %[[VAL_19]] : !cc.ptr> -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_18]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_18]][%[[VAL_16]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_22:.*]] = cc.extract_value %[[VAL_12]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_23:.*]] = arith.constant false -// CHECK: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_25:.*]] = cc.load %[[VAL_24]] : !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_21]], %[[VAL_26]], %[[VAL_22]], %[[VAL_23]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_21]] : -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_90]][%[[VAL_22]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_29:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_31:.*]] = cc.func_ptr %[[VAL_29]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64 -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_8]], %[[VAL_9]] : i64 +// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_12:.*]] = cc.alloca i8{{\[}}%[[VAL_11]] : i64] +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr>) -> !cc.ptr> +// CHECK: %[[VAL_14:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_12]]{{\[}}%[[VAL_15]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.compute_ptr %[[VAL_13]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_0]], %[[VAL_17]] : !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_13]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_19]] : !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_20]] : !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.subi %[[VAL_23]], %[[VAL_24]] : i64 +// CHECK: cc.store %[[VAL_25]], %[[VAL_18]] : !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.load %[[VAL_26]] : !cc.ptr> +// CHECK: %[[VAL_28:.*]] = arith.constant false +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_29]], %[[VAL_27]], %[[VAL_25]], %[[VAL_28]]) : (!cc.ptr, 
!cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_25]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_32:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_33:.*]] = cc.func_ptr %[[VAL_32]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_13]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_35:.*]] = arith.constant 2147483647 : i64 +// CHECK: %[[VAL_36:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_37:.*]] = cc.alloca !cc.array x 2> +// CHECK: %[[VAL_38:.*]] = cc.sizeof !cc.array x 2> : i64 +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr x 2>>) -> !cc.ptr> +// CHECK: %[[VAL_40:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_39]], %[[VAL_40]] : !cc.ptr>> +// CHECK: %[[VAL_41:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr x 2>>) -> i64 +// CHECK: %[[VAL_42:.*]] = arith.addi %[[VAL_41]], %[[VAL_38]] : i64 +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_42]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.compute_ptr %[[VAL_36]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_43]], %[[VAL_44]] : !cc.ptr>> +// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_36]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_43]], %[[VAL_45]] : !cc.ptr>> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_37]][0] : (!cc.ptr x 2>>) -> !cc.ptr> +// CHECK: %[[VAL_47:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_0]], %[[VAL_47]] : !cc.ptr +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_47]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_48]], %[[VAL_46]] : !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_37]][1] : (!cc.ptr x 2>>) -> !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr +// CHECK: cc.store %[[VAL_50]], %[[VAL_49]] : !cc.ptr> +// CHECK: %[[VAL_51:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_52:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> +// CHECK: %[[VAL_53:.*]] = cc.cast %[[VAL_52]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_54:.*]] = call @hybridLaunchKernel(%[[VAL_53]], %[[VAL_33]], %[[VAL_34]], %[[VAL_11]], %[[VAL_35]], %[[VAL_51]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> // CHECK: return // CHECK: } - -// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK-DAG: func.func private @cudaqRegisterKernelName(!cc.ptr) -// CHECK-DAG: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) -// CHECK-DAG: func.func private @malloc(i64) -> !cc.ptr -// CHECK-DAG: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// CHECK-DAG: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) +// CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// CHECK: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// CHECK: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// CHECK: func.func 
private @cudaqRegisterKernelName(!cc.ptr) +// CHECK: func.func private @malloc(i64) -> !cc.ptr +// CHECK: func.func private @free(!cc.ptr) +// CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } // CHECK-LABEL: func.func private @__nvqpp_createDynamicResult( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: !cc.ptr, i64}>>, %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_9:.*]] = arith.constant false +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } +// CHECK: llvm.mlir.global external constant @function_hawaiian.kernelName("function_hawaiian\00") {addr_space = 0 : i32} -// CHECK: llvm.mlir.global external constant @function_hawaiian.kernelName("function +// CHECK-LABEL: func.func @function_hawaiian.returnOffset() -> i64 { +// CHECK: %[[VAL_0:.*]] = arith.constant 2147483647 : i64 +// CHECK: return %[[VAL_0]] : i64 +// CHECK: } // CHECK-LABEL: func.func @function_hawaiian.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> // CHECK: %[[VAL_3:.*]] = cc.load 
%[[VAL_2]] : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_20]][%[[VAL_7]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i1, i64}>) -> i1 -// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_3]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.constant 4 : i64 -// CHECK: %[[VAL_12:.*]] = arith.divsi %[[VAL_10]], %[[VAL_11]] : i64 -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.stdvec_init %[[VAL_13]], %[[VAL_12]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_8]] : -// CHECK: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_90]][%[[VAL_10]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_9]], %[[VAL_14]]) : (i1, !cc.stdvec) -> () -// CHECK: %[[VAL_16:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_16]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]]{{\[}}%[[VAL_4]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i1, i64}>) -> i1 +// CHECK: %[[VAL_8:.*]] = cc.extract_value %[[VAL_3]][1] : (!cc.struct<{i1, i64}>) -> i64 +// CHECK: %[[VAL_9:.*]] = arith.constant 4 : i64 +// CHECK: %[[VAL_10:.*]] = arith.divsi %[[VAL_8]], %[[VAL_9]] : i64 +// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_11]], %[[VAL_10]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_13]]{{\[}}%[[VAL_8]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_7]], %[[VAL_12]]) : (i1, !cc.stdvec) -> () +// CHECK: %[[VAL_15:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> // CHECK: } // CHECK-LABEL: func.func @function_hawaiian.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, %[[VAL_1:.*]]: !cc.ptr>) -> i64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i1, i64}> -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_90]][0] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_2]][0] : (!cc.struct<{i1, i64}>, i1) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_90]][1] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_9]] : !cc.ptr> -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_11]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] 
: (!cc.ptr>) -> !cc.ptr x ?>> +// CHECK: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr x ?>>) -> !cc.ptr> +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_9:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_10:.*]] = cc.alloca i64 +// CHECK: %[[VAL_11:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_8]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> // CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_12]] : !cc.ptr> // CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_13]] : !cc.ptr> // CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_16]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_19:.*]] = cc.insert_value %[[VAL_18]], %[[VAL_8]][1] : (!cc.struct<{i1, i64}>, i64) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_3]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_11]] : i64 +// CHECK: %[[VAL_20:.*]] = call @malloc(%[[VAL_19]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.alloca !cc.ptr // CHECK: %[[VAL_23:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_20]] : i64 -// CHECK: %[[VAL_25:.*]] = call @malloc(%[[VAL_24]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr> -// CHECK: cc.store %[[VAL_19]], %[[VAL_26]] : !cc.ptr> -// CHECK: %[[VAL_80:.*]] = cc.cast %[[VAL_25]] : -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_80]][%[[VAL_23]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.extract_value %[[VAL_19]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_90]][1] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_30:.*]] = cc.load %[[VAL_29]] : !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_32:.*]] = arith.constant false -// CHECK: %[[VAL_33:.*]] = cc.compute_ptr %[[VAL_31]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_33]] : !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!cc.ptr) -> !cc.ptr -// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_27]], %[[VAL_35]], %[[VAL_28]], %[[VAL_32]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () -// CHECK: %[[VAL_83:.*]] = cc.cast %[[VAL_27]] : -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_83]][%[[VAL_28]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_25]], %[[VAL_1]] : !cc.ptr> -// CHECK: return %[[VAL_24]] : i64 +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_24]]{{\[}}%[[VAL_23]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_21]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_9]], %[[VAL_26]] : !cc.ptr +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, !cc.ptr, 
!cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_8]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_30:.*]] = cc.load %[[VAL_28]] : !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.load %[[VAL_29]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64 +// CHECK: cc.store %[[VAL_34]], %[[VAL_27]] : !cc.ptr +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_36:.*]] = cc.load %[[VAL_35]] : !cc.ptr> +// CHECK: %[[VAL_37:.*]] = arith.constant false +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_38]], %[[VAL_36]], %[[VAL_34]], %[[VAL_37]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_39]]{{\[}}%[[VAL_34]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_20]], %[[VAL_1]] : !cc.ptr> +// CHECK: return %[[VAL_19]] : i64 // CHECK: } // CHECK-LABEL: llvm.func @function_hawaiian.kernelRegFunc() { @@ -161,6 +250,5 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () // CHECK: llvm.return // CHECK: } - // CHECK: llvm.mlir.global_ctors {ctors = [@function_hawaiian.kernelRegFunc], priorities = [17 : i32]} diff --git a/test/Quake/lambda_kernel_exec.qke b/test/Quake/lambda_kernel_exec.qke index 606b644ffe..aedb9564b5 100644 --- a/test/Quake/lambda_kernel_exec.qke +++ b/test/Quake/lambda_kernel_exec.qke @@ -15,7 +15,7 @@ // CHECK: llvm.call @cudaqRegisterLambdaName(%[[VAL_1]], %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> () module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHaveMultiple = "_ZZ4mainENK3$_1clEv", __nvqpp__mlirgen__lambda.main.test = "_ZZ4mainENK3$_0clEv"}} { - func.func @__nvqpp__mlirgen__lambda.main.test() attributes {"cudaq-entrypoint"} { + func.func @__nvqpp__mlirgen__lambda.main.test() attributes {"cudaq-entrypoint", no_this} { %c2_i32 = arith.constant 2 : i32 %0 = arith.extsi %c2_i32 : i32 to i64 %1 = quake.alloca !quake.veq[%0 : i64] @@ -54,7 +54,7 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHa // CHECK-NEXT: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!llvm.ptr>) -> !llvm.ptr // CHECK: llvm.call @cudaqRegisterLambdaName(%[[VAL_4]], %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> () - func.func @__nvqpp__mlirgen__lambda.main.canHaveMultiple() attributes {"cudaq-entrypoint"} { + func.func @__nvqpp__mlirgen__lambda.main.canHaveMultiple() attributes {"cudaq-entrypoint", no_this} { %c2_i32 = arith.constant 2 : i32 %0 = arith.extsi %c2_i32 : i32 to i64 %1 = quake.alloca !quake.veq[%0 : i64] diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 90ccc90610..0c706ca7b1 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -6,8 +6,7 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s \ -// RUN: | FileCheck %s +// RUN: cudaq-opt -add-dealloc -kernel-execution -canonicalize %s | FileCheck %s // NB: the mangled name map is required for the kernel-execution pass. 
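The return_vector tests below exercise the full hybrid launch path, so it helps to restate the host-side contract those CHECK lines encode. The sketch below is for orientation only: the six-argument launcher shape, the (buffer, bool) thunk shape, and the {pointer, size} result span are read off the CHECK lines above, while the parameter names, typedef names, and the meaning of the bool flag are illustrative assumptions, not the runtime's actual declarations.

#include <cstdint>

extern "C" {
// Span returned from a launch: {nullptr, 0} unless the kernel produced a
// dynamically sized result that had to be heap-allocated by the callee.
struct ResultSpan {
  void *data;
  std::uint64_t size;
};

// <kernel>.thunk: unpack the flat argument buffer and invoke the kernel.
// The bool's meaning (local vs. client/server launch) is an assumption.
typedef ResultSpan (*ThunkFn)(void *argsBuffer, bool isClientServer);

// <kernel>.argsCreator: pack an array of argument addresses into a
// malloc'd buffer, return its size, and hand the buffer back via `out`.
typedef std::uint64_t (*ArgsCreatorFn)(void **argAddrs, void **out);

// The streamlined launcher: kernel name, thunk, packed argument buffer
// and its size, byte offset of the return slot inside that buffer, and
// the raw block of per-argument pointers built at the call site.
ResultSpan hybridLaunchKernel(const char *kernelName, ThunkFn thunk,
                              void *argsBuffer, std::uint64_t argsSize,
                              std::uint64_t resultOffset, void *rawArgs);
}

In the tests, resultOffset is computed with cc.offsetof when the kernel returns a value and is the sentinel 2147483647 when it does not; the <kernel>.kernelRegFunc constructors then register the matching argsCreator through cudaqRegisterArgsCreator at load time.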
module attributes{ quake.mangled_name_map = { @@ -29,61 +28,81 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec { -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 8 : i64 -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 256 : i64 -// CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: return %[[VAL_5]] : !cc.stdvec +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK: %[[VAL_1:.*]] = arith.constant 8 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 256 : i64 +// CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_3]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_4]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @test_0( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 -// CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> +// CHECK: %[[VAL_6:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_7:.*]] = cc.alloca i64 // CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] // CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: %[[VAL_11:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_12]] : !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.func_ptr %[[VAL_6]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_16:.*]] = cc.alloca 
!cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_18]], %[[VAL_19]] : !cc.ptr>> +// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_22]], %[[VAL_23]] : !cc.ptr>> +// CHECK: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_22]], %[[VAL_24]] : !cc.ptr>> +// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_26:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_26]] : !cc.ptr +// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr> +// CHECK: %[[VAL_28:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_29:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_29]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_31:.*]] = call @hybridLaunchKernel(%[[VAL_30]], %[[VAL_13]], %[[VAL_14]], %[[VAL_8]], %[[VAL_15]], %[[VAL_28]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_32:.*]] = cc.extract_value %[[VAL_31]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_32]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_33]], %[[VAL_5]] : i64 +// CHECK: cf.cond_br %[[VAL_34]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_32]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_36]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> 
-// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> -// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_37]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_38:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_38]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr> +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_38]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_42:.*]] = cc.load %[[VAL_41]] : !cc.ptr +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.cast %[[VAL_40]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_45]], %[[VAL_44]] : !cc.ptr> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_43]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_47:.*]] = arith.muli %[[VAL_42]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_40]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_48]]{{\[}}%[[VAL_47]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_49]], %[[VAL_46]] : !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.compute_ptr %[[VAL_43]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_49]], %[[VAL_50]] : !cc.ptr> +// CHECK: call @free(%[[VAL_32]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } @@ -98,70 +117,139 @@ func.func @__nvqpp__mlirgen__test_1(%arg0: i32) -> !cc.stdvec { func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %1: !cc.ptr, %2: i32) { return } +} // CHECK-LABEL: func.func @__nvqpp__mlirgen__test_1( -// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !cc.stdvec { -// CHECK: %[[VAL_2:.*]] = arith.constant 9 : i64 -// CHECK: %[[VAL_3:.*]] = arith.constant 520 : i64 -// CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: return +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK: %[[VAL_1:.*]] = arith.constant 9 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 520 : i64 +// CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_3]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_4]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @test_1( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 -// CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_7:.*]] = cc.insert_value 
%[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_5:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_6:.*]] = cc.alloca i64 +// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_11]] : !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_5]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_15:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_16:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>> +// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_15]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>> +// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_15]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_25:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_25]] : !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof 
@test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_14]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_4]] : i64 +// CHECK: cf.cond_br %[[VAL_33]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_35]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> -// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_9]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_36]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_37:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_39:.*]] = cc.load %[[VAL_38]] : !cc.ptr> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_37]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_41:.*]] = cc.load %[[VAL_40]] : !cc.ptr +// CHECK: %[[VAL_42:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_44]], %[[VAL_43]] : !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_42]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: 
%[[VAL_46:.*]] = arith.muli %[[VAL_41]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_47:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_48:.*]] = cc.compute_ptr %[[VAL_47]]{{\[}}%[[VAL_46]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_48]], %[[VAL_45]] : !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_42]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_48]], %[[VAL_49]] : !cc.ptr> +// CHECK: call @free(%[[VAL_31]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } +// CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// CHECK: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// CHECK: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// CHECK: func.func private @free(!cc.ptr) +// CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -} +// CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } + +// CHECK-LABEL: func.func private @__nvqpp_createDynamicResult( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i64, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// CHECK-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_4:.*]] = arith.constant false +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_7:.*]] = arith.addi %[[VAL_1]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_8:.*]] = call @malloc(%[[VAL_7]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> !cc.ptr> +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_8]], %[[VAL_0]], %[[VAL_1]], %[[VAL_4]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_6]], %[[VAL_4]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_14:.*]] = cc.insert_value %[[VAL_8]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_15:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_16:.*]] = 
cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } +// CHECK: llvm.mlir.global external constant @test_0.kernelName("test_0\00") {addr_space = 0 : i32} + +// CHECK-LABEL: func.func @test_0.returnOffset() -> i64 { +// CHECK: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: return %[[VAL_0]] : i64 +// CHECK: } // CHECK-LABEL: func.func @test_0.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>> // CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 @@ -181,8 +269,42 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}> // CHECK: } +// CHECK-LABEL: func.func @test_0.argsCreator( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr> +// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.alloca i64 +// CHECK: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr +// CHECK: cc.store %[[VAL_7]], %[[VAL_1]] : !cc.ptr> +// CHECK: return %[[VAL_6]] : i64 +// CHECK: } + +// CHECK-LABEL: llvm.func @test_0.kernelRegFunc() { +// CHECK: %[[VAL_0:.*]] = func.constant @test_0.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// CHECK: %[[VAL_1:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_2]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_0]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_2]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// CHECK: llvm.return +// CHECK: } +// CHECK: llvm.mlir.global_ctors {ctors = [@test_0.kernelRegFunc], priorities = [17 : i32]} +// CHECK: llvm.mlir.global external constant @test_1.kernelName("test_1\00") {addr_space = 0 : i32} + +// CHECK-LABEL: func.func @test_1.returnOffset() -> i64 { +// CHECK: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: return %[[VAL_0]] : i64 +// CHECK: } + // CHECK-LABEL: func.func @test_1.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>> // CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 @@ -201,3 +323,31 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.struct<{!cc.ptr, i64}> // CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> // 
CHECK: } + +// CHECK-LABEL: func.func @test_1.argsCreator( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr> +// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.alloca i64 +// CHECK: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr +// CHECK: cc.store %[[VAL_7]], %[[VAL_1]] : !cc.ptr> +// CHECK: return %[[VAL_6]] : i64 +// CHECK: } + +// CHECK-LABEL: llvm.func @test_1.kernelRegFunc() { +// CHECK: %[[VAL_0:.*]] = func.constant @test_1.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// CHECK: %[[VAL_1:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_2]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_0]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_2]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// CHECK: llvm.return +// CHECK: } +// CHECK: llvm.mlir.global_ctors {ctors = [@test_1.kernelRegFunc], priorities = [17 : i32]} + diff --git a/test/Translate/argument.qke b/test/Translate/argument.qke index 6a3532805a..82c7179b1f 100644 --- a/test/Translate/argument.qke +++ b/test/Translate/argument.qke @@ -6,7 +6,7 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --kernel-execution=codegen=1 --canonicalize %s | \ +// RUN: cudaq-opt -kernel-execution -canonicalize %s | \ // RUN: cudaq-translate --convert-to=qir | FileCheck %s // NB: the mangled name map is required for the kernel-execution pass. 
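The dynamic-result helpers @__nvqpp_zeroDynamicResult and @__nvqpp_createDynamicResult recur in every file above; the following C++ restatement of their checked bodies is a sketch, derived line by line from those CHECK sequences, to make the buffer layout explicit. Parameter names are illustrative.

#include <cstdint>
#include <cstdlib>
#include <cstring>

struct ResultSpan {
  void *data;
  std::uint64_t size;
};

// @__nvqpp_zeroDynamicResult: the "no dynamic payload" span.
extern "C" ResultSpan __nvqpp_zeroDynamicResult() { return {nullptr, 0}; }

// @__nvqpp_createDynamicResult: append the dynamic payload to a fresh
// heap copy of the argument buffer, then patch the span pointer at the
// return offset so the caller can locate the payload in the new buffer.
extern "C" ResultSpan __nvqpp_createDynamicResult(void *args,
                                                  std::uint64_t argsSize,
                                                  ResultSpan *dynRes,
                                                  std::uint64_t resultOffset) {
  std::uint64_t total = argsSize + dynRes->size;
  char *buffer = static_cast<char *>(std::malloc(total));
  std::memcpy(buffer, args, argsSize);                        // static part
  std::memcpy(buffer + argsSize, dynRes->data, dynRes->size); // payload
  char **slot = reinterpret_cast<char **>(buffer + resultOffset);
  *slot = buffer + argsSize;
  return {buffer, total};
}

The host wrappers treat a null span pointer as "the result is still in the stack buffer" and otherwise read the payload through the patched slot and free the heap copy, which is exactly the cf.cond_br / call @free sequence checked in test_0 and test_1 above.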
@@ -31,7 +31,7 @@ func.func @test_0(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, !cc.std func.func @test_3(%0: !cc.ptr, %1: !cc.ptr, !cc.ptr, !cc.ptr}>, !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}>>) { return } -} // CHECK-LABEL: define void @__nvqpp__mlirgen__test_3({ { i16*, i64 }, { float*, i64 } } -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { // CHECK: %[[VAL_1:.*]] = extractvalue { { i16*, i64 }, { float*, i64 } } %[[VAL_0]], 0 // CHECK: %[[VAL_2:.*]] = extractvalue { i16*, i64 } %[[VAL_1]], 0 // CHECK: %[[VAL_3:.*]] = extractvalue { i16*, i64 } %[[VAL_1]], 1 @@ -202,7 +246,7 @@ func.func @test_3(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr !cc.stdvec { func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %this: !cc.ptr, %2: i32) { return } - -// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0( -// CHECK-SAME: i32 %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = sext i32 %[[VAL_1]] to i64 -// CHECK: %[[VAL_3:.*]] = tail call %[[VAL_4:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_2]]) -// CHECK: %[[VAL_5:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_4]]* %[[VAL_3]]) -// CHECK: %[[VAL_6:.*]] = icmp sgt i64 %[[VAL_5]], 0 -// CHECK: br i1 %[[VAL_6]], label %[[VAL_7:.*]], label %[[VAL_8:.*]] -// CHECK: ._crit_edge.thread: ; preds = %[[VAL_9:.*]] -// CHECK: %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_5]], align 1 -// CHECK: br label %[[VAL_11:.*]] -// CHECK: .lr.ph: ; preds = %[[VAL_9]], %[[VAL_7]] -// CHECK: %[[VAL_12:.*]] = phi i64 [ %[[VAL_13:.*]], %[[VAL_7]] ], [ 0, %[[VAL_9]] ] -// CHECK: %[[VAL_14:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_4]]* %[[VAL_3]], i64 %[[VAL_12]]) -// CHECK: %[[VAL_15:.*]] = bitcast i8* %[[VAL_14]] to %[[VAL_16:.*]]** -// CHECK: %[[VAL_17:.*]] = load %[[VAL_16]]*, %[[VAL_16]]** %[[VAL_15]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_16]]* %[[VAL_17]]) -// CHECK: %[[VAL_13]] = add nuw nsw i64 %[[VAL_12]], 1 -// CHECK: %[[VAL_18:.*]] = icmp eq i64 %[[VAL_13]], %[[VAL_5]] -// CHECK: br i1 %[[VAL_18]], label %[[VAL_19:.*]], label %[[VAL_7]] -// CHECK: ._crit_edge: ; preds = %[[VAL_7]] -// CHECK: %[[VAL_20:.*]] = alloca i8, i64 %[[VAL_5]], align 1 -// CHECK: br i1 %[[VAL_6]], label %[[VAL_21:.*]], label %[[VAL_11]] -// CHECK: .lr.ph4: ; preds = %[[VAL_19]], %[[VAL_21]] -// CHECK: %[[VAL_22:.*]] = phi i64 [ %[[VAL_23:.*]], %[[VAL_21]] ], [ 0, %[[VAL_19]] ] -// CHECK: %[[VAL_24:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_4]]* %[[VAL_3]], i64 %[[VAL_22]]) -// CHECK: %[[VAL_25:.*]] = bitcast i8* %[[VAL_24]] to %[[VAL_16]]** -// CHECK: %[[VAL_26:.*]] = load %[[VAL_16]]*, %[[VAL_16]]** %[[VAL_25]], align 8 -// CHECK: %[[VAL_27:.*]] = tail call %[[VAL_28:.*]]* @__quantum__qis__mz(%[[VAL_16]]* %[[VAL_26]]) -// CHECK: %[[VAL_29:.*]] = bitcast %[[VAL_28]]* %[[VAL_27]] to i1* -// CHECK: %[[VAL_30:.*]] = load i1, i1* %[[VAL_29]], align 1 -// CHECK: %[[VAL_31:.*]] = getelementptr i8, i8* %[[VAL_20]], i64 %[[VAL_22]] -// CHECK: %[[VAL_32:.*]] = zext i1 %[[VAL_30]] to i8 -// CHECK: store i8 %[[VAL_32]], i8* %[[VAL_31]], align 1 -// CHECK: %[[VAL_23]] = add nuw nsw i64 %[[VAL_22]], 1 -// CHECK: %[[VAL_33:.*]] = icmp eq i64 %[[VAL_23]], %[[VAL_5]] -// CHECK: br i1 %[[VAL_33]], label %[[VAL_11]], 
label %[[VAL_21]] -// CHECK: ._crit_edge5: ; preds = %[[VAL_21]], %[[VAL_8]], %[[VAL_19]] -// CHECK: %[[VAL_34:.*]] = phi i8* [ %[[VAL_10]], %[[VAL_8]] ], [ %[[VAL_20]], %[[VAL_19]] ], [ %[[VAL_20]], %[[VAL_21]] ] -// CHECK: %[[VAL_35:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_34]], i64 %[[VAL_5]], i64 1) -// CHECK: %[[VAL_36:.*]] = bitcast i8* %[[VAL_35]] to i1* -// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_36]], 0 -// CHECK: %[[VAL_38:.*]] = insertvalue { i1*, i64 } %[[VAL_37]], i64 %[[VAL_5]], 1 -// CHECK: call void @__quantum__rt__qubit_release_array(%Array* %[[VAL_3]]) -// CHECK: ret { i1*, i64 } %[[VAL_38]] +// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0(i32 +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = sext i32 %[[VAL_0]] to i64 +// CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_1]]) +// CHECK: %[[VAL_4:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_3]]* %[[VAL_2]]) +// CHECK: %[[VAL_5:.*]] = icmp sgt i64 %[[VAL_4]], 0 +// CHECK: br i1 %[[VAL_5]], label %[[VAL_6:.*]], label %[[VAL_7:.*]] +// CHECK: ._crit_edge.thread: ; preds = %[[VAL_8:.*]] +// CHECK: %[[VAL_9:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br label %[[VAL_10:.*]] +// CHECK: .lr.ph: ; preds = %[[VAL_8]], %[[VAL_6]] +// CHECK: %[[VAL_11:.*]] = phi i64 [ %[[VAL_12:.*]], %[[VAL_6]] ], [ 0, %[[VAL_8]] ] +// CHECK: %[[VAL_13:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_11]]) +// CHECK: %[[VAL_14:.*]] = bitcast i8* %[[VAL_13]] to %[[VAL_15:.*]]** +// CHECK: %[[VAL_16:.*]] = load %[[VAL_15]]*, %[[VAL_15]]** %[[VAL_14]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_15]]* %[[VAL_16]]) +// CHECK: %[[VAL_12]] = add nuw nsw i64 %[[VAL_11]], 1 +// CHECK: %[[VAL_17:.*]] = icmp eq i64 %[[VAL_12]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_17]], label %[[VAL_18:.*]], label %[[VAL_6]] +// CHECK: ._crit_edge: ; preds = %[[VAL_6]] +// CHECK: %[[VAL_19:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br i1 %[[VAL_5]], label %[[VAL_20:.*]], label %[[VAL_10]] +// CHECK: .lr.ph4: ; preds = %[[VAL_18]], %[[VAL_20]] +// CHECK: %[[VAL_21:.*]] = phi i64 [ %[[VAL_22:.*]], %[[VAL_20]] ], [ 0, %[[VAL_18]] ] +// CHECK: %[[VAL_23:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_21]]) +// CHECK: %[[VAL_24:.*]] = bitcast i8* %[[VAL_23]] to %[[VAL_15]]** +// CHECK: %[[VAL_25:.*]] = load %[[VAL_15]]*, %[[VAL_15]]** %[[VAL_24]], align 8 +// CHECK: %[[VAL_26:.*]] = tail call %[[VAL_27:.*]]* @__quantum__qis__mz(%[[VAL_15]]* %[[VAL_25]]) +// CHECK: %[[VAL_28:.*]] = bitcast %[[VAL_27]]* %[[VAL_26]] to i1* +// CHECK: %[[VAL_29:.*]] = load i1, i1* %[[VAL_28]], align 1 +// CHECK: %[[VAL_30:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 %[[VAL_21]] +// CHECK: %[[VAL_31:.*]] = zext i1 %[[VAL_29]] to i8 +// CHECK: store i8 %[[VAL_31]], i8* %[[VAL_30]], align 1 +// CHECK: %[[VAL_22]] = add nuw nsw i64 %[[VAL_21]], 1 +// CHECK: %[[VAL_32:.*]] = icmp eq i64 %[[VAL_22]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_32]], label %[[VAL_10]], label %[[VAL_20]] +// CHECK: ._crit_edge5: ; preds = %[[VAL_20]], %[[VAL_7]], %[[VAL_18]] +// CHECK: %[[VAL_33:.*]] = phi i8* [ %[[VAL_9]], %[[VAL_7]] ], [ %[[VAL_19]], %[[VAL_18]] ], [ %[[VAL_19]], %[[VAL_20]] ] +// CHECK: %[[VAL_34:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_33]], i64 %[[VAL_4]], i64 1) +// CHECK: %[[VAL_35:.*]] = bitcast i8* %[[VAL_34]] to 
i1* +// CHECK: %[[VAL_36:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_35]], 0 +// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } %[[VAL_36]], i64 %[[VAL_4]], 1 +// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_3]]* %[[VAL_2]]) +// CHECK: ret { i1*, i64 } %[[VAL_37]] // CHECK: } // CHECK-LABEL: define void @test_0({ i8*, i8*, i8* }* sret({ i8*, i8*, i8* }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]], i32 %[[VAL_2:.*]]) {{.*}}{ -// CHECK: %[[VAL_3:.*]] = alloca { i32, { i1*, i64 } }, align 8 +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]], i32 +// CHECK-SAME: %[[VAL_2:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_3:.*]] = alloca { i32, { i1*, i64 } }, align 4 // CHECK: %[[VAL_4:.*]] = bitcast { i32, { i1*, i64 } }* %[[VAL_3]] to i8* // CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 0 -// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 8 -// CHECK: %[[VAL_6:.*]] = call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) -// CHECK: %[[VAL_7:.*]] = extractvalue { i8*, i64 } %[[VAL_6]], 0 -// CHECK: %[[VAL_8:.*]] = icmp eq i8* %[[VAL_7]], null -// CHECK: %[[VAL_9:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 8 -// CHECK: %[[VAL_10:.*]] = bitcast i8* %[[VAL_9]] to { i1*, i64 }* -// CHECK: %[[VAL_11:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 -// CHECK: %[[VAL_12:.*]] = select i1 %[[VAL_8]], { i1*, i64 }* %[[VAL_11]], { i1*, i64 }* %[[VAL_10]] -// CHECK: %[[VAL_13:.*]] = bitcast { i1*, i64 }* %[[VAL_12]] to i8** -// CHECK: %[[VAL_14:.*]] = load i8*, i8** %[[VAL_13]], align 8 -// CHECK: %[[VAL_15:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 -// CHECK: %[[VAL_16:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 16 -// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to i64* -// CHECK: %[[VAL_18:.*]] = select i1 %[[VAL_8]], i64* %[[VAL_15]], i64* %[[VAL_17]] -// CHECK: %[[VAL_19:.*]] = load i64, i64* %[[VAL_18]], align 4 -// CHECK: %[[VAL_20:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* -// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_20]], i8* %[[VAL_14]], i64 %[[VAL_19]]) -// CHECK: call void @free(i8* %[[VAL_7]]) +// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 4 +// CHECK: %[[VAL_6:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_7:.*]] = alloca [1 x i8*], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %[[VAL_7]], i64 0, i64 0 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_8]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = ptrtoint [1 x i8*]* %[[VAL_7]] to i64 +// CHECK: %[[VAL_11:.*]] = add i64 %[[VAL_10]], 8 +// CHECK: %[[VAL_12:.*]] = inttoptr i64 %[[VAL_11]] to i8** +// CHECK: %[[VAL_13:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_12]], i8*** %[[VAL_13]], align 8 +// CHECK: %[[VAL_14:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_12]], i8*** %[[VAL_14]], align 8 +// CHECK: %[[VAL_15:.*]] = alloca i32, align 4 
+// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_15]], align 4 +// CHECK: %[[VAL_16:.*]] = bitcast [1 x i8*]* %[[VAL_7]] to i32** +// CHECK: store i32* %[[VAL_15]], i32** %[[VAL_16]], align 8 +// CHECK: %[[VAL_17:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_6]] to i8* +// CHECK: %[[VAL_18:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8, i8* nonnull %[[VAL_17]]) +// CHECK: %[[VAL_19:.*]] = extractvalue { i8*, i64 } %[[VAL_18]], 0 +// CHECK: %[[VAL_20:.*]] = icmp eq i8* %[[VAL_19]], null +// CHECK: %[[VAL_21:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 8 +// CHECK: %[[VAL_22:.*]] = bitcast i8* %[[VAL_21]] to { i1*, i64 }* +// CHECK: %[[VAL_23:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 +// CHECK: %[[VAL_24:.*]] = select i1 %[[VAL_20]], { i1*, i64 }* %[[VAL_23]], { i1*, i64 }* %[[VAL_22]] +// CHECK: %[[VAL_25:.*]] = bitcast { i1*, i64 }* %[[VAL_24]] to i8** +// CHECK: %[[VAL_26:.*]] = load i8*, i8** %[[VAL_25]], align 8 +// CHECK: %[[VAL_27:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 +// CHECK: %[[VAL_28:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 16 +// CHECK: %[[VAL_29:.*]] = bitcast i8* %[[VAL_28]] to i64* +// CHECK: %[[VAL_30:.*]] = select i1 %[[VAL_20]], i64* %[[VAL_27]], i64* %[[VAL_29]] +// CHECK: %[[VAL_31:.*]] = load i64, i64* %[[VAL_30]], align 4 +// CHECK: %[[VAL_32:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* +// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_32]], i8* %[[VAL_26]], i64 %[[VAL_31]]) +// CHECK: call void @free(i8* %[[VAL_19]]) // CHECK: ret void // CHECK: } @@ -152,35 +171,45 @@ func.func @test_1(%this: !cc.ptr) -> i16 { return %0 : i16 } -// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() -// CHECK: %[[VAL_1:.*]] = tail call %[[VAL_2:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) -// CHECK: %[[VAL_3:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast i8* %[[VAL_3]] to %[[VAL_5:.*]]** -// CHECK: %[[VAL_6:.*]] = load %[[VAL_5]]*, %[[VAL_5]]** %[[VAL_4]], align 8 -// CHECK: %[[VAL_7:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 1) -// CHECK: %[[VAL_8:.*]] = bitcast i8* %[[VAL_7]] to %[[VAL_5]]** -// CHECK: %[[VAL_9:.*]] = load %[[VAL_5]]*, %[[VAL_5]]** %[[VAL_8]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: tail call void (i64, void (%[[VAL_2]]*, %[[VAL_5]]*)*, ...) 
@invokeWithControlQubits(i64 1, void (%[[VAL_2]]*, %[[VAL_5]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_5]]* %[[VAL_6]], %[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_10:.*]] = tail call %[[VAL_11:.*]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: %[[VAL_12:.*]] = bitcast %Result* %[[VAL_10]] to i1* -// CHECK: %[[VAL_13:.*]] = load i1, i1* %[[VAL_12]], align 1 -// CHECK: %[[VAL_14:.*]] = tail call %[[VAL_11]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_15:.*]] = bitcast %Result* %[[VAL_14]] to i1* -// CHECK: %[[VAL_16:.*]] = load i1, i1* %[[VAL_15]], align 1 -// CHECK: %[[VAL_20:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_13]], 0 -// CHECK: %[[VAL_19:.*]] = insertvalue { i1, i1 } %[[VAL_20]], i1 %[[VAL_16]], 1 -// CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_2]]* %[[VAL_1]]) -// CHECK: ret { i1, i1 } %[[VAL_19]] -// CHECK: } - -// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) -// CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 -// CHECK: ret i16 %[[VAL_4]] +// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() local_unnamed_addr { +// CHECK: %[[VAL_0:.*]] = tail call %[[VAL_1:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) +// CHECK: %[[VAL_2:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 0) +// CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to %[[VAL_4:.*]]** +// CHECK: %[[VAL_5:.*]] = load %[[VAL_4]]*, %[[VAL_4]]** %[[VAL_3]], align 8 +// CHECK: %[[VAL_6:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 1) +// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to %[[VAL_4]]** +// CHECK: %[[VAL_8:.*]] = load %[[VAL_4]]*, %[[VAL_4]]** %[[VAL_7]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_4]]* %[[VAL_5]]) +// CHECK: tail call void (i64, void (%[[VAL_1]]*, %[[VAL_4]]*)*, ...) 
@invokeWithControlQubits(i64 1, void (%[[VAL_1]]*, %[[VAL_4]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_4]]* %[[VAL_5]], %[[VAL_4]]* %[[VAL_8]]) +// CHECK: %[[VAL_9:.*]] = tail call %[[VAL_10:.*]]* @__quantum__qis__mz(%[[VAL_4]]* %[[VAL_5]]) +// CHECK: %[[VAL_11:.*]] = bitcast %[[VAL_10]]* %[[VAL_9]] to i1* +// CHECK: %[[VAL_12:.*]] = load i1, i1* %[[VAL_11]], align 1 +// CHECK: %[[VAL_13:.*]] = tail call %[[VAL_10]]* @__quantum__qis__mz(%[[VAL_4]]* %[[VAL_8]]) +// CHECK: %[[VAL_14:.*]] = bitcast %[[VAL_10]]* %[[VAL_13]] to i1* +// CHECK: %[[VAL_15:.*]] = load i1, i1* %[[VAL_14]], align 1 +// CHECK: %[[VAL_16:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_12]], 0 +// CHECK: %[[VAL_17:.*]] = insertvalue { i1, i1 } %[[VAL_16]], i1 %[[VAL_15]], 1 +// CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_1]]* %[[VAL_0]]) +// CHECK: ret { i1, i1 } %[[VAL_17]] +// CHECK: } + +// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_2:.*]] = alloca i16, align 2 +// CHECK: %[[VAL_3:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_4:.*]] = bitcast i16* %[[VAL_2]] to i8* +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_1]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_6]], align 8 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_3]] to i8* +// CHECK: %[[VAL_10:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_4]], i64 2, i64 0, i8* nonnull %[[VAL_9]]) +// CHECK: %[[VAL_11:.*]] = load i16, i16* %[[VAL_2]], align 2 +// CHECK: ret i16 %[[VAL_11]] // CHECK: } // struct{i16, f32, f64, i64} -> sret ptr @@ -201,20 +230,32 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc return } -// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() +// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() local_unnamed_addr {{.*}} { // CHECK: ret { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 } // CHECK: } // CHECK-LABEL: define void @test_2({ i16, float, double, i64 }* nocapture writeonly sret({ i16, float, double, i64 }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* 
noundef nonnull align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_2:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_3:.*]] = alloca [24 x i8], align 1 +// CHECK: %[[VAL_4:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [24 x i8], [24 x i8]* %[[VAL_3]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_4]] to i8* +// CHECK: %[[VAL_11:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_5]], i64 24, i64 0, i8* nonnull %[[VAL_10]]) +// CHECK: %[[VAL_12:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_12]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_5]], i64 24, i1 false) // CHECK: ret void // CHECK: } + // array -> sret ptr func.func @__nvqpp__mlirgen__test_3() -> !cc.array { %rv = cc.undef !cc.array @@ -235,17 +276,28 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.struct return } -// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() {{.*}}{ +// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() local_unnamed_addr {{.*}} { // CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 } // CHECK: } // CHECK-LABEL: define void @test_4({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false) +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_2:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_3:.*]] = alloca [16 x i8], align 1 +// CHECK: %[[VAL_4:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[VAL_3]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** 
}, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_4]] to i8* +// CHECK: %[[VAL_11:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_5]], i64 16, i64 0, i8* nonnull %[[VAL_10]]) +// CHECK: %[[VAL_12:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_12]], i8* noundef nonnull align 1 dereferenceable(16) %[[VAL_5]], i64 16, i1 false) // CHECK: ret void // CHECK: } @@ -284,102 +347,114 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct return } -// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() {{.*}}{ +// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() local_unnamed_addr {{.*}} { // CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 } // CHECK: } // CHECK-LABEL: define void @test_5({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8 -// CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) -// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false) +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_2:.*]] = alloca [16 x i8], align 1 +// CHECK: %[[VAL_3:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_1]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_6]], align 8 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_3]] to i8* +// CHECK: %[[VAL_10:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_4]], i64 16, i64 0, i8* nonnull %[[VAL_9]]) +// 
CHECK: %[[VAL_11:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_11]], i8* noundef nonnull align 1 dereferenceable(16) %[[VAL_4]], i64 16, i1 false) // CHECK: ret void // CHECK: } } - //===----------------------------------------------------------------------===// -// CHECK-LABEL: define i64 @test_0.returnOffset() +// CHECK-LABEL: define i64 @test_0.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 8 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_0.thunk(i8* nocapture -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i32* // CHECK: %[[VAL_3:.*]] = load i32, i32* %[[VAL_2]], align 4 -// CHECK: %[[VAL_5:.*]] = sext i32 %[[VAL_3]] to i64 -// CHECK: %[[VAL_6:.*]] = tail call %[[VAL_7:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_5]]) -// CHECK: %[[VAL_8:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_7]]* %[[VAL_6]]) -// CHECK: %[[VAL_9:.*]] = icmp sgt i64 %[[VAL_8]], 0 -// CHECK: br i1 %[[VAL_9]], label %[[VAL_10:.*]], label %[[VAL_11:.*]] -// CHECK: ._crit_edge.thread: ; preds = %[[VAL_12:.*]] -// CHECK: %[[VAL_13:.*]] = alloca i8, i64 %[[VAL_8]], align 1 -// CHECK: br label %[[VAL_14:.*]] -// CHECK: .lr.ph: ; preds = %[[VAL_12]], %[[VAL_10]] -// CHECK: %[[VAL_15:.*]] = phi i64 [ %[[VAL_16:.*]], %[[VAL_10]] ], [ 0, %[[VAL_12]] ] -// CHECK: %[[VAL_17:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_15]]) -// CHECK: %[[VAL_18:.*]] = bitcast i8* %[[VAL_17]] to %[[VAL_19:.*]]** -// CHECK: %[[VAL_20:.*]] = load %[[VAL_19]]*, %[[VAL_19]]** %[[VAL_18]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_19]]* %[[VAL_20]]) -// CHECK: %[[VAL_16]] = add nuw nsw i64 %[[VAL_15]], 1 -// CHECK: %[[VAL_21:.*]] = icmp eq i64 %[[VAL_16]], %[[VAL_8]] -// CHECK: br i1 %[[VAL_21]], label %[[VAL_22:.*]], label %[[VAL_10]] -// CHECK: ._crit_edge: ; preds = %[[VAL_10]] -// CHECK: %[[VAL_23:.*]] = alloca i8, i64 %[[VAL_8]], align 1 -// CHECK: br i1 %[[VAL_9]], label %[[VAL_24:.*]], label %[[VAL_14]] -// CHECK: [[VAL_24]]: ; preds = %[[VAL_22]], %[[VAL_24]] -// CHECK: %[[VAL_25:.*]] = phi i64 [ %[[VAL_26:.*]], %[[VAL_24]] ], [ 0, %[[VAL_22]] ] -// CHECK: %[[VAL_27:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_25]]) -// CHECK: %[[VAL_28:.*]] = bitcast i8* %[[VAL_27]] to %[[VAL_19]]** -// CHECK: %[[VAL_29:.*]] = load %[[VAL_19]]*, %[[VAL_19]]** %[[VAL_28]], align 8 -// CHECK: %[[VAL_30:.*]] = tail call %[[VAL_31:.*]]* @__quantum__qis__mz(%[[VAL_19]]* %[[VAL_29]]) -// CHECK: %[[VAL_32:.*]] = bitcast %[[VAL_31]]* %[[VAL_30]] to i1* -// CHECK: %[[VAL_33:.*]] = load i1, i1* %[[VAL_32]], align 1 -// CHECK: %[[VAL_34:.*]] = getelementptr i8, i8* %[[VAL_23]], i64 %[[VAL_25]] -// CHECK: %[[VAL_35:.*]] = zext i1 %[[VAL_33]] to i8 -// CHECK: store i8 %[[VAL_35]], i8* %[[VAL_34]], align 1 -// CHECK: %[[VAL_26]] = add nuw nsw i64 %[[VAL_25]], 1 -// CHECK: %[[VAL_36:.*]] = icmp eq i64 %[[VAL_26]], %[[VAL_8]] -// CHECK: br i1 %[[VAL_36]], label %[[VAL_14]], label %[[VAL_24]] -// CHECK: [[VAL_14]]: ; preds = %[[VAL_24]], %[[VAL_11]], %[[VAL_22]] -// CHECK: %[[VAL_37:.*]] = phi i8* [ %[[VAL_13]], %[[VAL_11]] ], [ %[[VAL_23]], %[[VAL_22]] ], [ %[[VAL_23]], %[[VAL_24]] ] -// CHECK: %[[VAL_38:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_37]], i64 
%[[VAL_8]], i64 1) -// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_7]]* %[[VAL_6]]) -// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 -// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8** -// CHECK: store i8* %[[VAL_38]], i8** %[[VAL_51]], align 8 -// CHECK: %[[VAL_52:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16 -// CHECK: %[[VAL_53:.*]] = bitcast i8* %[[VAL_52]] to i64* -// CHECK: store i64 %[[VAL_8]], i64* %[[VAL_53]], align 8 +// CHECK: %[[VAL_4:.*]] = sext i32 %[[VAL_3]] to i64 +// CHECK: %[[VAL_5:.*]] = tail call %[[VAL_6:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_4]]) +// CHECK: %[[VAL_7:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_6]]* %[[VAL_5]]) +// CHECK: %[[VAL_8:.*]] = icmp sgt i64 %[[VAL_7]], 0 +// CHECK: br i1 %[[VAL_8]], label %[[VAL_9:.*]], label %[[VAL_10:.*]] +// CHECK: ._crit_edge.thread: ; preds = %[[VAL_11:.*]] +// CHECK: %[[VAL_12:.*]] = alloca i8, i64 %[[VAL_7]], align 1 +// CHECK: br label %[[VAL_13:.*]] +// CHECK: .lr.ph: ; preds = %[[VAL_11]], %[[VAL_9]] +// CHECK: %[[VAL_14:.*]] = phi i64 [ %[[VAL_15:.*]], %[[VAL_9]] ], [ 0, %[[VAL_11]] ] +// CHECK: %[[VAL_16:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_6]]* %[[VAL_5]], i64 %[[VAL_14]]) +// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to %[[VAL_18:.*]]** +// CHECK: %[[VAL_19:.*]] = load %[[VAL_18]]*, %[[VAL_18]]** %[[VAL_17]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_18]]* %[[VAL_19]]) +// CHECK: %[[VAL_15]] = add nuw nsw i64 %[[VAL_14]], 1 +// CHECK: %[[VAL_20:.*]] = icmp eq i64 %[[VAL_15]], %[[VAL_7]] +// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_9]] +// CHECK: ._crit_edge: ; preds = %[[VAL_9]] +// CHECK: %[[VAL_22:.*]] = alloca i8, i64 %[[VAL_7]], align 1 +// CHECK: br i1 %[[VAL_8]], label %[[VAL_23:.*]], label %[[VAL_13]] +// CHECK: .lr.ph6: ; preds = %[[VAL_21]], %[[VAL_23]] +// CHECK: %[[VAL_24:.*]] = phi i64 [ %[[VAL_25:.*]], %[[VAL_23]] ], [ 0, %[[VAL_21]] ] +// CHECK: %[[VAL_26:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_6]]* %[[VAL_5]], i64 %[[VAL_24]]) +// CHECK: %[[VAL_27:.*]] = bitcast i8* %[[VAL_26]] to %[[VAL_18]]** +// CHECK: %[[VAL_28:.*]] = load %[[VAL_18]]*, %[[VAL_18]]** %[[VAL_27]], align 8 +// CHECK: %[[VAL_29:.*]] = tail call %[[VAL_30:.*]]* @__quantum__qis__mz(%[[VAL_18]]* %[[VAL_28]]) +// CHECK: %[[VAL_31:.*]] = bitcast %[[VAL_30]]* %[[VAL_29]] to i1* +// CHECK: %[[VAL_32:.*]] = load i1, i1* %[[VAL_31]], align 1 +// CHECK: %[[VAL_33:.*]] = getelementptr i8, i8* %[[VAL_22]], i64 %[[VAL_24]] +// CHECK: %[[VAL_34:.*]] = zext i1 %[[VAL_32]] to i8 +// CHECK: store i8 %[[VAL_34]], i8* %[[VAL_33]], align 1 +// CHECK: %[[VAL_25]] = add nuw nsw i64 %[[VAL_24]], 1 +// CHECK: %[[VAL_35:.*]] = icmp eq i64 %[[VAL_25]], %[[VAL_7]] +// CHECK: br i1 %[[VAL_35]], label %[[VAL_13]], label %[[VAL_23]] +// CHECK: ._crit_edge7: ; preds = %[[VAL_23]], %[[VAL_10]], %[[VAL_21]] +// CHECK: %[[VAL_36:.*]] = phi i8* [ %[[VAL_12]], %[[VAL_10]] ], [ %[[VAL_22]], %[[VAL_21]] ], [ %[[VAL_22]], %[[VAL_23]] ] +// CHECK: %[[VAL_37:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_36]], i64 %[[VAL_7]], i64 1) +// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_6]]* %[[VAL_5]]) +// CHECK: %[[VAL_38:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 +// CHECK: %[[VAL_39:.*]] = bitcast i8* %[[VAL_38]] to i8** +// CHECK: store i8* %[[VAL_37]], i8** %[[VAL_39]], align 8 +// CHECK: %[[VAL_40:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16 +// CHECK: 
%[[VAL_41:.*]] = bitcast i8* %[[VAL_40]] to i64* +// CHECK: store i64 %[[VAL_7]], i64* %[[VAL_41]], align 8 // CHECK: br i1 %[[VAL_1]], label %[[VAL_42:.*]], label %[[VAL_43:.*]] -// CHECK: [[VAL_43]]: ; preds = %[[VAL_14]], %[[VAL_42]] -// CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_14]] ] +// CHECK: common.ret: ; preds = %[[VAL_13]], %[[VAL_42]] +// CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_13]] ] // CHECK: ret { i8*, i64 } %[[VAL_44]] -// CHECK: [[VAL_42]]: ; preds = %[[VAL_14]] -// CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_8]], 24 +// CHECK: 31: ; preds = %[[VAL_13]] +// CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_7]], 24 // CHECK: %[[VAL_47:.*]] = call i8* @malloc(i64 %[[VAL_46]]) // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_47]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_0]], i64 24, i1 false) // CHECK: %[[VAL_48:.*]] = getelementptr i8, i8* %[[VAL_47]], i64 24 -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[VAL_48]], i8* align 1 %[[VAL_38]], i64 %[[VAL_8]], i1 false) +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[VAL_48]], i8* align 1 %[[VAL_37]], i64 %[[VAL_7]], i1 false) // CHECK: %[[VAL_49:.*]] = insertvalue { i8*, i64 } undef, i8* %[[VAL_47]], 0 // CHECK: %[[VAL_45]] = insertvalue { i8*, i64 } %[[VAL_49]], i64 %[[VAL_46]], 1 +// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_47]], i64 8 +// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8** +// CHECK: store i8* %[[VAL_48]], i8** %[[VAL_51]], align 8 // CHECK: br label %[[VAL_43]] // CHECK: } // CHECK-LABEL: define i64 @test_0.argsCreator(i8** nocapture readonly -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8** %[[VAL_0]] to i32** // CHECK: %[[VAL_3:.*]] = load i32*, i32** %[[VAL_2]], align 8 // CHECK: %[[VAL_4:.*]] = load i32, i32* %[[VAL_3]], align 4 -// CHECK: %[[VAL_5:.*]] = insertvalue { i32, { i1*, i64 } } undef, i32 %[[VAL_4]], 0 -// CHECK: %[[VAL_6:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) -// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to { i32, { i1*, i64 } }* -// CHECK: store { i32, { i1*, i64 } } %[[VAL_5]], { i32, { i1*, i64 } }* %[[VAL_7]], align 8 -// CHECK: store i8* %[[VAL_6]], i8** %[[VAL_1]], align 8 +// CHECK: %[[VAL_5:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) +// CHECK: %[[VAL_6:.*]] = bitcast i8* %[[VAL_5]] to i32* +// CHECK: store i32 %[[VAL_4]], i32* %[[VAL_6]], align 4 +// CHECK: store i8* %[[VAL_5]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 24 // CHECK: } @@ -389,13 +464,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_1.returnOffset() +// CHECK-LABEL: define i64 @test_1.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_1.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 -// CHECK-SAME: %[[VAL_1:.*]]) { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) { // CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) // CHECK: %[[VAL_4:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 0) // CHECK: %[[VAL_5:.*]] = 
bitcast i8* %[[VAL_4]] to %[[VAL_6:.*]]** @@ -421,8 +496,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_1.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(2) i8* @malloc(i64 2) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 2 @@ -434,21 +509,21 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_2.returnOffset() +// CHECK-LABEL: define i64 @test_2.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_2.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to { i16, float, double, i64 }* // CHECK: store { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 }, { i16, float, double, i64 }* %[[VAL_2]], align 8 // CHECK: ret { i8*, i64 } zeroinitializer // CHECK: } // CHECK-LABEL: define i64 @test_2.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 24 @@ -460,12 +535,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_3.returnOffset() +// CHECK-LABEL: define i64 @test_3.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_3.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 5, i64* %[[VAL_2]], align 4 // CHECK: %[[VAL_3:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 8 @@ -484,8 +560,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_3.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(40) i8* @malloc(i64 40) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 40 @@ -497,12 +573,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_4.returnOffset() +// CHECK-LABEL: define i64 @test_4.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_4.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 537892, i64* 
%[[VAL_2]], align 4 // CHECK: %[[VAL_3:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 @@ -512,8 +589,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_4.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(16) i8* @malloc(i64 16) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 16 @@ -525,12 +602,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_5.returnOffset() +// CHECK-LABEL: define i64 @test_5.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_5.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 537892, i64* %[[VAL_2]], align 4 // CHECK: %[[VAL_3:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 @@ -540,8 +618,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_5.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(16) i8* @malloc(i64 16) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 16 From 1a9d9ca97d7415842685f6374396f2041a8af02f Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Thu, 31 Oct 2024 11:41:03 -0700 Subject: [PATCH 02/19] Fix bug in the Python layers. The Python layers convert a list of booleans to a std::vector<bool>, so that conversion needs to be undone. Signed-off-by: Eric Schweitz --- .../Transforms/GenKernelExecution.cpp | 46 ++++++++++++++----- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 78a968d822..4db2c7992b 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -356,14 +356,16 @@ class GenerateKernelExecution // logical device side argument. May drop some arguments on the floor if they // cannot be encoded. template - SmallVector> zipArgumentsWithDeviceTypes( - Location loc, OpBuilder &builder, ValueRange args, TypeRange types, - SmallVectorImpl *freeVectorBuffers = nullptr) { + SmallVector> + zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, + TypeRange types, + SmallVectorImpl &freeVectorBuffers) { SmallVector> result; if constexpr (argsAreReferences) { // Simple case: the number of args must be equal to the types. assert(args.size() == types.size() && "arguments and types must have same size"); + auto *ctx = builder.getContext(); for (auto iter : llvm::enumerate(llvm::zip(args, types))) { // Remove the reference. 
Value v = std::get(iter.value()); @@ -371,10 +373,18 @@ class GenerateKernelExecution if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) || isa(ty))) v = builder.create(loc, v); - // NB: Will a vector be passed as a C++ object or "unrolled" by - // the caller into a contiguous string of bytes, where each byte is a - // bool? Assume the latter for now, since it's likely the way python - // will do / continue to do it. + // Python will pass a std::vector to us here. Unpack it. + if (auto stdvecTy = dyn_cast(ty)) + if (stdvecTy.getElementType() == IntegerType::get(ctx, 1)) { + Type stdvecHostTy = + cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); + Value tmp = builder.create(loc, stdvecHostTy); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, v}); + freeVectorBuffers.push_back(tmp); + v = tmp; + } result.emplace_back(iter.index(), v, ty); } } else /*constexpr*/ { @@ -401,9 +411,7 @@ class GenerateKernelExecution cudaq::stdvecBoolUnpackToInitList, ArrayRef{tmp, *argIter}); result.emplace_back(argPos, tmp, devTy); - assert(freeVectorBuffers && - "must have a vector to return heap allocations"); - freeVectorBuffers->push_back(tmp); + freeVectorBuffers.push_back(tmp); continue; } @@ -730,8 +738,9 @@ class GenerateKernelExecution // Zip the arguments with the device side argument types. Recall that some // of the (left-most) arguments may have been dropped on the floor. const bool hasDynamicSignature = isDynamicSignature(devKernelTy); + SmallVector freeVectorBuffers; auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, pseudoArgs, passedDevArgTys); + loc, builder, pseudoArgs, passedDevArgTys, freeVectorBuffers); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) @@ -765,6 +774,19 @@ class GenerateKernelExecution populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); } + if (!freeVectorBuffers.empty()) { + // Need to free any temporary vector-like buffers. These arise when + // there is a std::vector argument, which we translate into a + // std::vector to reuse the same code as any other std::vector. + for (auto vecVar : freeVectorBuffers) { + auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); + auto ptrPtr = builder.create(loc, ptrPtrTy, vecVar); + Value freeMe = builder.create(loc, ptrPtr); + builder.create(loc, std::nullopt, "free", + ArrayRef{freeMe}); + } + } + // Return the message buffer and its size in bytes. builder.create(loc, rawMessageBuffer, entry->getArgument(1)); @@ -1168,7 +1190,7 @@ class GenerateKernelExecution const bool hasDynamicSignature = isDynamicSignature(devFuncTy); SmallVector freeVectorBuffers; auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, blockValues, devFuncTy.getInputs(), &freeVectorBuffers); + loc, builder, blockValues, devFuncTy.getInputs(), freeVectorBuffers); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) From 2d22ae60ec18946257d1add3726c0397a6ef554e Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Thu, 31 Oct 2024 13:29:13 -0700 Subject: [PATCH 03/19] Fix test by adding a cast. 
Signed-off-by: Eric Schweitz --- include/cudaq/Optimizer/Dialect/CC/CCTypes.td | 4 ++-- python/tests/kernel/test_observe_kernel.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td index b7cf72d234..aa03aedc07 100644 --- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td +++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td @@ -121,10 +121,10 @@ def cc_StructType : CCType<"Struct", "struct", let extraClassDeclaration = [{ // O(1) bool isEmpty() const { return getMembers().empty(); } - + // O(n) std::size_t getNumMembers() const { return getMembers().size(); } - + Type getMember(unsigned position) { return getMembers()[position]; } }]; } diff --git a/python/tests/kernel/test_observe_kernel.py b/python/tests/kernel/test_observe_kernel.py index 5bf9d5a812..24c63ba90a 100644 --- a/python/tests/kernel/test_observe_kernel.py +++ b/python/tests/kernel/test_observe_kernel.py @@ -302,8 +302,7 @@ def test_pack_args_pauli_list(): def generateRandomPauliStrings(numQubits, numPaulis): s = ['X', 'Y', 'Z', 'I'] return [ - ''.join([random.choice(s) - for i in range(numQubits)]) + ''.join([random.choice(s) for i in range(numQubits)]) for i in range(numPaulis) ] @@ -336,7 +335,8 @@ def gqeCirc2(N: int, thetas: list[float], paulis: list[cudaq.pauli_word]): ts = np.random.rand(len(pauliStings)) exp_val1 = cudaq.observe_async(gqeCirc1, obs, numQubits, list(ts), - pauliStings[0]).get().expectation() + cudaq.pauli_word( + pauliStings[0])).get().expectation() print('observe_async exp_val1', exp_val1) exp_val2 = cudaq.observe_async(gqeCirc2, obs, numQubits, list(ts), pauliStings).get().expectation() From 0a2b517ced579431b5e6640308cea678bc8ece9d Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Thu, 31 Oct 2024 15:22:05 -0700 Subject: [PATCH 04/19] Add another new executable test. Not fully enabled until bugs in the thunk unpacking code are fixed. Move member functions to static functions. Add handling of std::vector<bool>. std::vector<bool> is a class that is distinct from all other std::vector<T> instantiations, and it needs to be handled with special code on the host side. (On the device side, it is forced to look like any other span.) Signed-off-by: Eric Schweitz --- include/cudaq/Optimizer/Builder/Intrinsics.h | 5 + lib/Optimizer/Builder/Factory.cpp | 19 +- lib/Optimizer/Builder/Intrinsics.cpp | 8 +- .../Transforms/GenKernelExecution.cpp | 1548 +++++++++-------- runtime/cudaq/cudaq.cpp | 27 +- runtime/cudaq/qis/qubit_qis.h | 6 +- .../SeparateCompilation/arith_spans.cpp | 229 +++ test/AST-Quake/calling_convention.cpp | 4 +- test/Quake/kernel_exec-1.qke | 6 +- test/Quake/kernel_exec-2.qke | 2 +- test/Quake/return_vector.qke | 272 +-- 11 files changed, 1297 insertions(+), 829 deletions(-) create mode 100644 targettests/SeparateCompilation/arith_spans.cpp diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h index fa9ce53097..5884dbb39e 100644 --- a/include/cudaq/Optimizer/Builder/Intrinsics.h +++ b/include/cudaq/Optimizer/Builder/Intrinsics.h @@ -36,11 +36,16 @@ static constexpr const char getCudaqSizeFromTriple[] = // typically specialized to be bit packed). static constexpr const char stdvecBoolCtorFromInitList[] = "__nvqpp_initializer_list_to_vector_bool"; + // Convert a (likely packed) std::vector<bool> into a sequence of bytes, each // holding a boolean value. 
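Because std::vector<bool> is typically bit packed, its storage cannot be handed over as one byte per element; the unpacking intrinsic described by the comment above (and named just below) exists to expand it first. A minimal sketch of that expansion contract, not the runtime's actual implementation:

#include <cstdint>
#include <vector>

// Expand a bit-packed std::vector<bool> into one byte per element so it
// can be marshaled like any other std::vector<T>. The temporary buffer
// must be released later, which is the role of the companion
// __nvqpp_vector_bool_free_temporary_initlists hook added below.
std::vector<std::uint8_t> unpackBools(const std::vector<bool> &v) {
  std::vector<std::uint8_t> bytes;
  bytes.reserve(v.size());
  for (bool b : v)
    bytes.push_back(b ? 1 : 0);
  return bytes;
}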
static constexpr const char stdvecBoolUnpackToInitList[] = "__nvqpp_vector_bool_to_initializer_list"; +// Free any temporary buffers used to hold std::vector data. +static constexpr const char stdvecBoolFreeTemporaryLists[] = + "__nvqpp_vector_bool_free_temporary_initlists"; + // The internal data of the cudaq::state object must be `2**n` in length. This // function returns the value `n`. static constexpr const char getNumQubitsFromCudaqState[] = diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index 5a4e5cb43b..5c090d4271 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -321,6 +321,22 @@ cc::StructType factory::stlVectorType(Type eleTy) { return cc::StructType::get(ctx, ArrayRef{ptrTy, ptrTy, ptrTy}); } +// Note that this is the raw host type, where std::vector is distinct. +// When converting to the device side, the distinction is deliberately removed +// making std::vector the same format as std::vector. +static cc::StructType stlHostVectorType(Type eleTy) { + MLIRContext *ctx = eleTy.getContext(); + if (eleTy != IntegerType::get(ctx, 1)) { + // std::vector where T != bool. + return factory::stlVectorType(eleTy); + } + // std::vector is a different type than std::vector. + auto ptrTy = cc::PointerType::get(eleTy); + auto i8Ty = IntegerType::get(ctx, 8); + auto padout = cc::ArrayType::get(ctx, i8Ty, 32); + return cc::StructType::get(ctx, ArrayRef{ptrTy, padout}); +} + // FIXME: Give these front-end names so we can disambiguate more types. cc::StructType factory::getDynamicBufferType(MLIRContext *ctx) { auto ptrTy = cc::PointerType::get(IntegerType::get(ctx, 8)); @@ -344,8 +360,7 @@ Type factory::getSRetElementType(FunctionType funcTy) { Type factory::convertToHostSideType(Type ty) { if (auto memrefTy = dyn_cast(ty)) - return factory::stlVectorType( - convertToHostSideType(memrefTy.getElementType())); + return stlHostVectorType(convertToHostSideType(memrefTy.getElementType())); if (isa(ty)) return cc::PointerType::get(IntegerType::get(ty.getContext(), 8)); if (isa(ty)) diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 1774475b1b..d0db1bdf82 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -307,11 +307,17 @@ static constexpr IntrinsicCode intrinsicTable[] = { return %0 : !cc.ptr })#"}, + // __nvqpp_vector_bool_free_temporary_lists + {cudaq::stdvecBoolFreeTemporaryLists, + {}, + R"#( + func.func private @__nvqpp_vector_bool_free_temporary_initlists(!cc.ptr) -> ())#"}, + // __nvqpp_vector_bool_to_initializer_list {cudaq::stdvecBoolUnpackToInitList, {}, R"#( - func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) -> ())#"}, + func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) -> ())#"}, {"__nvqpp_zeroDynamicResult", {}, R"#( func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 4db2c7992b..7e450c2da7 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -55,615 +55,905 @@ static bool isStateType(Type ty) { return false; } -/// This pass adds a `.thunk` function and a rewritten C++ host -/// side (mangled) stub to the code for every entry-point kernel in the module. 
-/// It may also generate a `.argsCreator` function. Finally, it -/// creates registration hooks for the CUDA-Q runtime to be able to find the -/// kernel by name and, as appropriate, the `.argsCreator` -/// function. -namespace { -class GenerateKernelExecution - : public cudaq::opt::impl::GenerateKernelExecutionBase< - GenerateKernelExecution> { -public: - using GenerateKernelExecutionBase::GenerateKernelExecutionBase; +/// Creates the function signature for a thunk function. The signature is always +/// the same for all thunk functions. +/// +/// Every thunk function has an identical signature, making it callable from a +/// generic "kernel launcher" in the CUDA-Q runtime. +/// +/// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. +/// +/// The first argument is a pointer to a data buffer that encodes all the +/// arguments (and static return) values to (and from) the kernel in the +/// pointer-free encoding. The second argument indicates if this call is to a +/// remote process (if true). The result is a pointer and size (span) if the +/// kernel returns a dynamically sized result, otherwise it will be +/// `{nullptr, 0}`. It is the responsibility of calling code to free any +/// dynamic result buffer(s) and convert those to `std::vector` objects. +static FunctionType getThunkType(MLIRContext *ctx) { + auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); + return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, + {cudaq::opt::factory::getDynamicBufferType(ctx)}); +} -private: - /// Creates the function signature for a thunk function. The signature is - /// always the same for all thunk functions. - /// - /// Every thunk function has an identical signature, making it callable from a - /// generic "kernel launcher" in the CUDA-Q runtime. - /// - /// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. - /// - /// The first argument is a pointer to a data buffer that encodes all the - /// arguments (and static return) values to (and from) the kernel in the - /// pointer-free encoding. The second argument indicates if this call is to a - /// remote process (if true). The result is a pointer and size (span) if the - /// kernel returns a dynamically sized result, otherwise it will be - /// `{nullptr, 0}`. It is the responsibility of calling code to free any - /// dynamic result buffer(s) and convert those to `std::vector` objects. - FunctionType getThunkType(MLIRContext *ctx) { - auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); - return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, - {cudaq::opt::factory::getDynamicBufferType(ctx)}); - } +/// Generate code to read the length from a host-side string object. (On the +/// device side, a string is encoded as a span.) The length of a string is the +/// number of bytes of data. +/// +/// In order to handle a std::string value it is assumed to be laid out in +/// memory as the following structure. +/// +/// +/// struct vector { +/// i8* data; +/// i64 length; +/// [i8 x 16] inlinedata; +/// }; +/// +/// +/// This implementation does \e not support wide characters. 
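The doc comment above assumes a typical 32-byte small-string-optimized std::string (libstdc++ style). Restated as a plain struct before the implementation that follows (an assumption the pass relies on rather than verifies):

#include <cstdint>

struct HostString {
  char *data;         // points into `inlined` for short strings
  std::uint64_t size; // byte length; the only field genStringLength reads
  char inlined[16];   // small-string buffer / capacity storage
};
static_assert(sizeof(HostString) == 32, "layout assumed by the pass");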
+static Value genStringLength(Location loc, OpBuilder &builder, + Value stringArg) { + Type stringTy = stringArg.getType(); + assert(isa(stringTy) && + isa( + cast(stringTy).getElementType()) && + cast( + cast(stringTy).getElementType()) + .getMember(1) == builder.getI64Type() && + "host side string expected"); + auto ptrTy = cast(stringTy); + auto strTy = cast(ptrTy.getElementType()); + auto lenPtr = builder.create( + loc, cudaq::cc::PointerType::get(strTy.getMember(1)), stringArg, + ArrayRef{1}); + return builder.create(loc, lenPtr); +} - /// Generate code to read the length from a host-side string object. (On the - /// device side, a string is encoded as a span.) The length of a string is the - /// number of bytes of data. - /// - /// In order to handle a std::string value it is assumed to be laid out in - /// memory as the following structure. - /// - /// - /// struct vector { - /// i8* data; - /// i64 length; - /// [i8 x 16] inlinedata; - /// }; - /// - /// - /// This implementation does \e not support wide characters. - Value genStringLength(Location loc, OpBuilder &builder, Value stringArg) { - Type stringTy = stringArg.getType(); - assert(isa(stringTy) && - isa( - cast(stringTy).getElementType()) && - cast( - cast(stringTy).getElementType()) - .getMember(1) == builder.getI64Type() && - "host side string expected"); - auto ptrTy = cast(stringTy); - auto strTy = cast(ptrTy.getElementType()); - auto lenPtr = builder.create( - loc, cudaq::cc::PointerType::get(strTy.getMember(1)), stringArg, - ArrayRef{1}); - return builder.create(loc, lenPtr); - } +/// Generate code that computes the size in bytes of a `std::vector` array +/// in the same way as a `std::vector::size()`. This assumes the vector is +/// laid out in memory as the following structure. +/// +/// +/// struct vector { +/// T* begin; +/// T* end; +/// T* allocated_end; +/// }; +/// +/// +/// The first two elements are pointers to the beginning and end of the data +/// in the vector, respectively. This data is kept in a contiguous memory +/// range. The following implementation follows what Clang CodeGen produces +/// for `std::vector::size()` without the final `sdiv` op that divides the +/// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required +/// memory size for the vector data itself in \e bytes. +static Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { + auto vecTy = cast(vecArg.getType()); + auto vecStructTy = cast(vecTy.getElementType()); + assert(vecStructTy.getNumMembers() == 3 && + vecStructTy.getMember(0) == vecStructTy.getMember(1) && + vecStructTy.getMember(0) == vecStructTy.getMember(2) && + "host side vector expected"); + auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); + + // Get the pointer to the pointer of the end of the array + Value endPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{1}); + + // Get the pointer to the pointer of the beginning of the array + Value beginPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{0}); + + // Load to a T* + endPtr = builder.create(loc, endPtr); + beginPtr = builder.create(loc, beginPtr); + + // Map those pointers to integers + Type i64Ty = builder.getI64Type(); + Value endInt = builder.create(loc, i64Ty, endPtr); + Value beginInt = builder.create(loc, i64Ty, beginPtr); + + // Subtracting these will give us the size in bytes. 
+ return builder.create(loc, endInt, beginInt); +} - /// Generate code that computes the size in bytes of a `std::vector` array - /// in the same way as a `std::vector::size()`. This assumes the vector is - /// laid out in memory as the following structure. - /// - /// - /// struct vector { - /// T* begin; - /// T* end; - /// T* allocated_end; - /// }; - /// - /// - /// The first two elements are pointers to the beginning and end of the data - /// in the vector, respectively. This data is kept in a contiguous memory - /// range. The following implementation follows what Clang CodeGen produces - /// for `std::vector::size()` without the final `sdiv` op that divides the - /// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required - /// memory size for the vector data itself in \e bytes. - Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { - auto vecTy = cast(vecArg.getType()); - auto vecStructTy = cast(vecTy.getElementType()); - assert(vecStructTy.getNumMembers() == 3 && - vecStructTy.getMember(0) == vecStructTy.getMember(1) && - vecStructTy.getMember(0) == vecStructTy.getMember(2) && - "host side vector expected"); - auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); - - // Get the pointer to the pointer of the end of the array - Value endPtr = builder.create( - loc, vecElePtrTy, vecArg, ArrayRef{1}); - - // Get the pointer to the pointer of the beginning of the array - Value beginPtr = builder.create( - loc, vecElePtrTy, vecArg, ArrayRef{0}); - - // Load to a T* - endPtr = builder.create(loc, endPtr); - beginPtr = builder.create(loc, beginPtr); - - // Map those pointers to integers - Type i64Ty = builder.getI64Type(); - Value endInt = builder.create(loc, i64Ty, endPtr); - Value beginInt = builder.create(loc, i64Ty, beginPtr); +static Value genComputeReturnOffset(Location loc, OpBuilder &builder, + FunctionType funcTy, + cudaq::cc::StructType msgStructTy) { + if (funcTy.getNumResults() == 0) + return builder.create(loc, NoResultOffset, 64); + std::int32_t numKernelArgs = funcTy.getNumInputs(); + auto i64Ty = builder.getI64Type(); + return builder.create( + loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs}); +} - // Subtracting these will give us the size in bytes. - return builder.create(loc, endInt, beginInt); - } +/// Create a function that determines the return value offset in the message +/// buffer. +static void genReturnOffsetFunction(Location loc, OpBuilder &builder, + FunctionType devKernelTy, + cudaq::cc::StructType msgStructTy, + const std::string &classNameStr) { + auto *ctx = builder.getContext(); + auto i64Ty = builder.getI64Type(); + auto funcTy = FunctionType::get(ctx, {}, {i64Ty}); + auto returnOffsetFunc = + builder.create(loc, classNameStr + ".returnOffset", funcTy); + OpBuilder::InsertionGuard guard(builder); + auto *entry = returnOffsetFunc.addEntryBlock(); + builder.setInsertionPointToStart(entry); + auto result = genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy); + builder.create(loc, result); +} - /// Helper that converts a byte length to a length of i64. 
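In plain C++ terms, the genVectorSize computation above amounts to the following pointer arithmetic (a sketch of the semantics, not code the pass emits):

#include <cstdint>
#include <vector>

// Byte distance between a vector's begin and end pointers, i.e.
// v.size() * sizeof(T) without the final division by sizeof(T) that
// std::vector<T>::size() would perform.
template <typename T>
std::uint64_t vectorSizeInBytes(const std::vector<T> &v) {
  auto *b = reinterpret_cast<const char *>(v.data());
  auto *e = reinterpret_cast<const char *>(v.data() + v.size());
  return static_cast<std::uint64_t>(e - b);
}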
- Value convertLengthBytesToLengthI64(Location loc, OpBuilder &builder, - Value length) { - auto eight = builder.create(loc, 8, 64); - return builder.create(loc, length, eight); - } +static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) { + return cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(builder.getI8Type())); +} + +static cudaq::cc::PointerType getPointerToPointerType(OpBuilder &builder) { + return cudaq::cc::PointerType::get( + cudaq::cc::PointerType::get(builder.getI8Type())); +} + +static bool isDynamicSignature(FunctionType devFuncTy) { + for (auto t : devFuncTy.getInputs()) + if (cudaq::cc::isDynamicType(t)) + return true; + for (auto t : devFuncTy.getResults()) + if (cudaq::cc::isDynamicType(t)) + return true; + return false; +} - Value genComputeReturnOffset(Location loc, OpBuilder &builder, - FunctionType funcTy, - cudaq::cc::StructType msgStructTy) { - if (funcTy.getNumResults() == 0) - return builder.create(loc, NoResultOffset, 64); - std::int32_t numKernelArgs = funcTy.getNumInputs(); +static std::pair +genByteSizeAndElementCount(Location loc, OpBuilder &builder, Type eleTy, + Value size, Value arg, Type t) { + // If this is a vector>, convert the bytes of vector to bytes of + // length (i64). + if (auto sty = dyn_cast(eleTy)) { + auto eTy = cast(arg.getType()).getElementType(); + auto fTy = cast(eTy).getMember(0); + auto tTy = cast(fTy).getElementType(); auto i64Ty = builder.getI64Type(); - return builder.create( - loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs}); + auto eleSize = builder.create(loc, i64Ty, tTy); + Value count = builder.create(loc, size, eleSize); + auto ate = builder.create(loc, 8, 64); + size = builder.create(loc, count, ate); + return {size, count}; } - /// Create a function that determines the return value offset in the message - /// buffer. - void genReturnOffsetFunction(Location loc, OpBuilder &builder, - FunctionType devKernelTy, - cudaq::cc::StructType msgStructTy, - const std::string &classNameStr) { - auto *ctx = builder.getContext(); + // If this is a vector, convert the bytes of string to bytes of length + // (i64). + if (isa(eleTy)) { + auto fore = builder.create(loc, 4, 64); + size = builder.create(loc, size, fore); + auto ate = builder.create(loc, 8, 64); + Value count = builder.create(loc, size, ate); + return {size, count}; + } + + // If this is a vector>, convert the bytes of struct to bytes of + // struct with converted members. 
+ if (isa(eleTy)) { + auto eleTy = cast(arg.getType()).getElementType(); auto i64Ty = builder.getI64Type(); - auto funcTy = FunctionType::get(ctx, {}, {i64Ty}); - auto returnOffsetFunc = builder.create( - loc, classNameStr + ".returnOffset", funcTy); - OpBuilder::InsertionGuard guard(builder); - auto *entry = returnOffsetFunc.addEntryBlock(); - builder.setInsertionPointToStart(entry); - auto result = - genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy); - builder.create(loc, result); + auto hostStrSize = builder.create(loc, i64Ty, eleTy); + Value count = builder.create(loc, size, hostStrSize); + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + auto packSize = builder.create(loc, i64Ty, packedTy); + size = builder.create(loc, count, packSize); + return {size, count}; } + return {}; +} - static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) { - return cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(builder.getI8Type())); +static bool isStdVectorBool(Type ty) { + auto stdvecTy = dyn_cast(ty); + return stdvecTy && + (stdvecTy.getElementType() == IntegerType::get(ty.getContext(), 1)); +} + +/// Recursively check if \p ty contains a `std::vector`. +static bool hasStdVectorBool(Type ty) { + if (isStdVectorBool(ty)) + return true; + if (auto sty = dyn_cast(ty)) + return hasStdVectorBool(sty.getElementType()); + if (auto sty = dyn_cast(ty)) + for (auto mem : sty.getMembers()) + if (hasStdVectorBool(mem)) + return true; + return false; +} + +// The host-side type of a `std::vector` is distinct from the transient +// type for a `std::vector`. The former is a unique data type with a size +// of 40 bytes. The latter is identical to `std::vector` (which has a size +// of 24 bytes). +static Type convertToTransientType(Type ty) { + if (isStdVectorBool(ty)) { + auto *ctx = ty.getContext(); + return cudaq::opt::factory::stlVectorType(IntegerType::get(ctx, 1)); } + if (auto sty = dyn_cast(ty)) + return cudaq::opt::factory::stlVectorType( + convertToTransientType(sty.getElementType())); + if (auto sty = dyn_cast(ty)) { + SmallVector newMems; + for (auto mem : sty.getMembers()) + newMems.push_back(convertToTransientType(mem)); + auto *ctx = ty.getContext(); + return cudaq::cc::StructType::get(ctx, newMems); + } + return cudaq::opt::factory::convertToHostSideType(ty); +} - static cudaq::cc::PointerType getPointerToPointerType(OpBuilder &builder) { - return cudaq::cc::PointerType::get( - cudaq::cc::PointerType::get(builder.getI8Type())); +static std::pair +convertAllStdVectorBool(Location loc, OpBuilder &builder, Value arg, Type ty, + Value heapTracker, + std::optional preallocated = std::nullopt) { + // If we are here, `ty` must be a `std::vector` or recursively contain a + // `std::vector`. + + // Handle `std::vector`. + if (isStdVectorBool(ty)) { + auto stdvecTy = cast(ty); + Type stdvecHostTy = + cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); + Value tmp = preallocated.has_value() + ? *preallocated + : builder.create(loc, stdvecHostTy); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, arg, heapTracker}); + return {tmp, true}; } - static bool isDynamicSignature(FunctionType devFuncTy) { - for (auto t : devFuncTy.getInputs()) - if (cudaq::cc::isDynamicType(t)) - return true; - for (auto t : devFuncTy.getResults()) - if (cudaq::cc::isDynamicType(t)) - return true; - return false; + // Handle `std::vector` where `T` != `bool`. + if (auto sty = dyn_cast(ty)) { + // arg is a std::vector. 
+    // Its type must be ptr, ptr, ptr>>.
+    auto seleTy = sty.getElementType();
+    auto ptrArgTy = cast(arg.getType());
+    auto argVecTy = cast(ptrArgTy.getElementType());
+    auto subVecPtrTy = cudaq::cc::PointerType::get(argVecTy.getMember(0));
+    // Compute the pointer to the pointer to the first T element.
+    auto inputRef = builder.create(
+        loc, subVecPtrTy, arg, ArrayRef{0});
+    auto startInput = builder.create(loc, inputRef);
+    auto startTy = startInput.getType();
+    auto subArrTy = cudaq::cc::ArrayType::get(
+        cast(startTy).getElementType());
+    auto input = builder.create(
+        loc, cudaq::cc::PointerType::get(subArrTy), startInput);
+    auto transientTy = convertToTransientType(sty);
+    Value tmp = builder.create(loc, transientTy);
+    Value sizeDelta = genVectorSize(loc, builder, arg);
+    auto count = [&]() -> Value {
+      if (cudaq::cc::isDynamicType(seleTy)) {
+        auto p = genByteSizeAndElementCount(loc, builder, seleTy, sizeDelta,
+                                            arg, sty);
+        return p.second;
+      }
+      auto sizeEle = builder.create(
+          loc, builder.getI64Type(), seleTy);
+      return builder.create(loc, sizeDelta, sizeEle);
+    }();
+    auto sizeTransientTy = builder.create(
+        loc, builder.getI64Type(), transientTy);
+    Value sizeInBytes =
+        builder.create(loc, count, sizeTransientTy);
+
+    // Create a new vector that we'll store the converted data into.
+    Value byteBuffer = builder.create(
+        loc, builder.getI8Type(), sizeInBytes);
+
+    // Initialize the temporary vector.
+    auto transEleTy = cast(transientTy).getMember(0);
+    auto vecEleTy = cudaq::cc::PointerType::get(transEleTy);
+    auto tmpBegin = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{0});
+    auto bufferBegin =
+        builder.create(loc, transEleTy, byteBuffer);
+    builder.create(loc, bufferBegin, tmpBegin);
+    auto tmpEnd = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{1});
+    auto byteBufferEnd = builder.create(
+        loc, cudaq::cc::PointerType::get(builder.getI8Type()), byteBuffer,
+        ArrayRef{sizeInBytes});
+    auto bufferEnd =
+        builder.create(loc, transEleTy, byteBufferEnd);
+    builder.create(loc, bufferEnd, tmpEnd);
+    auto tmpEnd2 = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{2});
+    builder.create(loc, bufferEnd, tmpEnd2);
+
+    // Loop over each element in the outer vector and initialize it to the inner
+    // vector value. The data may be heap allocated.
+    auto transientEleTy = convertToTransientType(seleTy);
+    auto transientBufferTy =
+        cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(transientEleTy));
+    auto buffer =
+        builder.create(loc, transientBufferTy, byteBuffer);
+
+    cudaq::opt::factory::createInvariantLoop(
+        builder, loc, count,
+        [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+          Value i = block.getArgument(0);
+          Value inp = builder.create(
+              loc, startTy, input, ArrayRef{i});
+          auto currentVector = builder.create(
+              loc, cudaq::cc::PointerType::get(transientEleTy), buffer,
+              ArrayRef{i});
+          convertAllStdVectorBool(loc, builder, inp, seleTy, heapTracker,
+                                  currentVector);
+        });
+    return {tmp, true};
+  }
 
-  static std::pair
-  genByteSizeAndElementCount(Location loc, OpBuilder &builder, Type eleTy,
-                             Value size, Value arg, Type t) {
-    // If this is a vector>, convert the bytes of vector
-    // to bytes of length (i64).
-    if (isa(eleTy)) {
-      auto three = builder.create(loc, 3, 64);
-      size = builder.create(loc, size, three);
-      auto ate = builder.create(loc, 8, 64);
-      Value count = builder.create(loc, size, ate);
-      return {size, count};
-    }
-    // If this is a vector, convert the bytes of string to
-    // bytes of length (i64).
- if (isa(eleTy)) { - auto fore = builder.create(loc, 4, 64); - size = builder.create(loc, size, fore); - auto ate = builder.create(loc, 8, 64); - Value count = builder.create(loc, size, ate); - return {size, count}; - } - // If this is a vector>, convert the bytes of struct - // to bytes of struct with converted members. - if (isa(eleTy)) { - auto eleTy = cast(arg.getType()).getElementType(); - auto i64Ty = builder.getI64Type(); - auto hostStrSize = builder.create(loc, i64Ty, eleTy); - Value count = builder.create(loc, size, hostStrSize); - Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); - auto packSize = builder.create(loc, i64Ty, packedTy); - size = builder.create(loc, count, packSize); - return {size, count}; + // Handle `struct { ... };`. + if (auto sty = dyn_cast(ty)) { + auto bufferTy = convertToTransientType(ty); + auto argPtrTy = cast(arg.getType()); + auto argStrTy = cast(argPtrTy.getElementType()); + + // Create a new struct that we'll store the converted data into. + Value buffer = builder.create(loc, bufferTy); + + // Loop over each element. Replace each with the converted value. + for (auto iter : llvm::enumerate(sty.getMembers())) { + std::int32_t i = iter.index(); + Type memTy = iter.value(); + auto fromPtr = builder.create( + loc, cudaq::cc::PointerType::get(argStrTy.getMember(i)), arg, + ArrayRef{i}); + auto transientTy = convertToTransientType(memTy); + Value toPtr = builder.create( + loc, cudaq::cc::PointerType::get(transientTy), buffer, + ArrayRef{i}); + convertAllStdVectorBool(loc, builder, fromPtr, memTy, heapTracker, toPtr); } - return {}; + return {buffer, true}; } + return {arg, false}; +} - Value descendThroughDynamicType(Location loc, OpBuilder &builder, Type ty, - Value addend, Value arg, Value tmp) { - auto i64Ty = builder.getI64Type(); - Value tySize = - TypeSwitch(ty) - // A char span is dynamic, but it is not recursively dynamic. Just - // read the length of the string out. - .Case([&](cudaq::cc::CharspanType t) -> Value { - return genStringLength(loc, builder, arg); - }) - // A std::vector is dynamic and may be recursive dynamic as well. - .Case([&](cudaq::cc::StdvecType t) -> Value { - // Compute the byte span of the vector. - Value size = genVectorSize(loc, builder, arg); - auto eleTy = t.getElementType(); - if (!cudaq::cc::isDynamicType(eleTy)) - return size; - - // Otherwise, we have a recursively dynamic case. - auto [bytes, count] = - genByteSizeAndElementCount(loc, builder, eleTy, size, arg, t); - assert(count && "vector must have elements"); - size = bytes; - - // At this point, arg is a known vector of elements of dynamic - // type, so walk over the vector and recurse on each element. - // `size` is already the proper size of the lengths of each of the - // elements in turn. 
- builder.create(loc, size, tmp); - auto ptrTy = cast(arg.getType()); - auto strTy = cast(ptrTy.getElementType()); - auto memTy = cast(strTy.getMember(0)); - auto arrTy = - cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(memTy.getElementType()))); - auto castPtr = builder.create(loc, arrTy, arg); - auto castArg = builder.create(loc, castPtr); - auto castPtrTy = - cudaq::cc::PointerType::get(memTy.getElementType()); - cudaq::opt::factory::createInvariantLoop( - builder, loc, count, - [&](OpBuilder &builder, Location loc, Region &, - Block &block) { - Value i = block.getArgument(0); - auto ai = builder.create( - loc, castPtrTy, castArg, - ArrayRef{i}); - auto tmpVal = builder.create(loc, tmp); - Value innerSize = descendThroughDynamicType( - loc, builder, eleTy, tmpVal, ai, tmp); - builder.create(loc, innerSize, tmp); - }); - return builder.create(loc, tmp); - }) - // A struct can be dynamic if it contains dynamic members. Get the - // static portion of the struct first, which will have length slots. - // Then get the dynamic sizes for the dynamic members. - .Case([&](cudaq::cc::StructType t) -> Value { - if (cudaq::cc::isDynamicType(t)) { - Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); - Value strSize = - builder.create(loc, i64Ty, packedTy); - for (auto [i, m] : llvm::enumerate(t.getMembers())) { - if (cudaq::cc::isDynamicType(m)) { - auto hostPtrTy = - cast(arg.getType()); - auto hostStrTy = - cast(hostPtrTy.getElementType()); - auto pm = - cudaq::cc::PointerType::get(hostStrTy.getMember(i)); - auto ai = builder.create( - loc, pm, arg, ArrayRef{i}); - strSize = descendThroughDynamicType(loc, builder, m, - strSize, ai, tmp); - } - } - return strSize; - } - return builder.create(loc, i64Ty, t); - }) - .Default([&](Type t) -> Value { - return builder.create(loc, i64Ty, t); - }); - return builder.create(loc, tySize, addend); - } +static std::pair unpackAnyStdVectorBool(Location loc, + OpBuilder &builder, + Value arg, Type ty, + Value heapTracker) { + if (hasStdVectorBool(ty)) + return convertAllStdVectorBool(loc, builder, arg, ty, heapTracker); + return {arg, false}; +} - // Take the list of host-side arguments and device side argument types and zip - // them together logically with the position. Generates any fixup code that's - // needed, like when the device side uses a pair of arguments for a single - // logical device side argument. May drop some arguments on the floor if they - // cannot be encoded. - template - SmallVector> - zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, - TypeRange types, - SmallVectorImpl &freeVectorBuffers) { - SmallVector> result; - if constexpr (argsAreReferences) { - // Simple case: the number of args must be equal to the types. - assert(args.size() == types.size() && - "arguments and types must have same size"); - auto *ctx = builder.getContext(); - for (auto iter : llvm::enumerate(llvm::zip(args, types))) { - // Remove the reference. - Value v = std::get(iter.value()); - Type ty = std::get(iter.value()); - if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) || - isa(ty))) - v = builder.create(loc, v); - // Python will pass a std::vector to us here. Unpack it. 
- if (auto stdvecTy = dyn_cast(ty)) - if (stdvecTy.getElementType() == IntegerType::get(ctx, 1)) { - Type stdvecHostTy = - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); - Value tmp = builder.create(loc, stdvecHostTy); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{tmp, v}); - freeVectorBuffers.push_back(tmp); - v = tmp; - } - result.emplace_back(iter.index(), v, ty); +// Take the list of host-side arguments and device side argument types and zip +// them together logically with the position. Generates any fixup code that's +// needed, like when the device side uses a pair of arguments for a single +// logical device side argument. May drop some arguments on the floor if they +// cannot be encoded. +template +static SmallVector> +zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, + TypeRange types, Value heapTracker) { + SmallVector> result; + if constexpr (argsAreReferences) { + // Simple case: the number of args must be equal to the types. + assert(args.size() == types.size() && + "arguments and types must have same size"); + for (auto iter : llvm::enumerate(llvm::zip(args, types))) { + // Remove the reference. + Value v = std::get(iter.value()); + Type ty = std::get(iter.value()); + if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) || + isa(ty))) + v = builder.create(loc, v); + // Python will pass a std::vector to us here. Unpack it. + auto pear = unpackAnyStdVectorBool(loc, builder, v, ty, heapTracker); + v = pear.first; + result.emplace_back(iter.index(), v, ty); + } + } else /*constexpr*/ { + // In this case, we *may* have logical arguments that are passed in pairs. + auto *ctx = builder.getContext(); + auto *parent = builder.getBlock()->getParentOp(); + auto module = parent->getParentOfType(); + auto lastArg = args.end(); + auto tyIter = types.begin(); + unsigned argPos = 0; + for (auto argIter = args.begin(); argIter != lastArg; + ++argIter, ++tyIter, ++argPos) { + assert(tyIter != types.end()); + Type devTy = *tyIter; + + // std::vector isn't really a std::vector<>. Use the helper + // function to unpack it so it looks like any other vector. + auto pear = + unpackAnyStdVectorBool(loc, builder, *argIter, devTy, heapTracker); + if (pear.second) { + result.emplace_back(argPos, pear.first, devTy); + continue; } - } else /*constexpr*/ { - // In this case, we *may* have logical arguments that are passed in pairs. - auto *ctx = builder.getContext(); - auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto lastArg = args.end(); - auto tyIter = types.begin(); - unsigned argPos = 0; - for (auto argIter = args.begin(); argIter != lastArg; - ++argIter, ++tyIter, ++argPos) { - assert(tyIter != types.end()); - Type devTy = *tyIter; - - // std::vector isn't really a std::vector<>. Use the helper - // function to unpack it so it looks like any other vector. - if (auto stdvecTy = dyn_cast(devTy)) - if (stdvecTy.getElementType() == IntegerType::get(ctx, 1)) { - Type stdvecHostTy = - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); - Value tmp = builder.create(loc, stdvecHostTy); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{tmp, *argIter}); - result.emplace_back(argPos, tmp, devTy); - freeVectorBuffers.push_back(tmp); - continue; - } - // Check for a struct passed in a pair of arguments. 
-        if (isa(devTy) &&
-            !isa((*argIter).getType()) &&
-            cudaq::opt::factory::isX86_64(module) &&
-            cudaq::opt::factory::structUsesTwoArguments(devTy)) {
-          auto first = *argIter++;
-          auto second = *argIter;
-          // TODO: Investigate if it's correct to assume the register layout
-          // will match the memory layout of the small struct.
-          auto pairTy = cudaq::cc::StructType::get(
-              ctx, ArrayRef{first.getType(), second.getType()});
-          auto tmp = builder.create(loc, pairTy);
-          auto tmp1 = builder.create(
-              loc, cudaq::cc::PointerType::get(first.getType()), tmp);
-          builder.create(loc, first, tmp1);
-          auto tmp2 = builder.create(
-              loc, cudaq::cc::PointerType::get(second.getType()), tmp,
-              ArrayRef{1});
-          builder.create(loc, second, tmp2);
-          auto devPtrTy = cudaq::cc::PointerType::get(devTy);
-          Value devVal = builder.create(loc, devPtrTy, tmp);
-          if (!cudaq::cc::isDynamicType(devTy))
-            devVal = builder.create(loc, devVal);
-          result.emplace_back(argPos, devVal, devTy);
-          continue;
-        }
+      // Check for a struct passed in a pair of arguments.
+      if (isa(devTy) &&
+          !isa((*argIter).getType()) &&
+          cudaq::opt::factory::isX86_64(module) &&
+          cudaq::opt::factory::structUsesTwoArguments(devTy)) {
+        auto first = *argIter++;
+        auto second = *argIter;
+        // TODO: Investigate if it's correct to assume the register layout
+        // will match the memory layout of the small struct.
+        auto pairTy = cudaq::cc::StructType::get(
+            ctx, ArrayRef{first.getType(), second.getType()});
+        auto tmp = builder.create(loc, pairTy);
+        auto tmp1 = builder.create(
+            loc, cudaq::cc::PointerType::get(first.getType()), tmp);
+        builder.create(loc, first, tmp1);
+        auto tmp2 = builder.create(
+            loc, cudaq::cc::PointerType::get(second.getType()), tmp,
+            ArrayRef{1});
+        builder.create(loc, second, tmp2);
+        auto devPtrTy = cudaq::cc::PointerType::get(devTy);
+        Value devVal = builder.create(loc, devPtrTy, tmp);
+        if (!cudaq::cc::isDynamicType(devTy))
+          devVal = builder.create(loc, devVal);
+        result.emplace_back(argPos, devVal, devTy);
+        continue;
+      }
 
-        // Is this a static struct passed as a byval pointer?
-        if (isa(devTy) &&
-            isa((*argIter).getType()) &&
-            !cudaq::cc::isDynamicType(devTy)) {
-          Value devVal = builder.create(loc, *argIter);
-          result.emplace_back(argPos, devVal, devTy);
-          continue;
-        }
-        result.emplace_back(argPos, *argIter, devTy);
+      // Is this a static struct passed as a byval pointer?
+      if (isa(devTy) &&
+          isa((*argIter).getType()) &&
+          !cudaq::cc::isDynamicType(devTy)) {
+        Value devVal = builder.create(loc, *argIter);
+        result.emplace_back(argPos, devVal, devTy);
+        continue;
       }
+      result.emplace_back(argPos, *argIter, devTy);
     }
-    return result;
   }
+  return result;
+}
 
-  Value genSizeOfDynamicMessageBuffer(
-      Location loc, OpBuilder &builder, cudaq::cc::StructType structTy,
-      ArrayRef> zippy, Value tmp) {
-    auto i64Ty = builder.getI64Type();
-    Value initSize = builder.create(loc, i64Ty, structTy);
-    for (auto [_, a, t] : zippy)
-      if (cudaq::cc::isDynamicType(t))
-        initSize = descendThroughDynamicType(loc, builder, t, initSize, a, tmp);
-    return initSize;
-  }
+static Value descendThroughDynamicType(Location loc, OpBuilder &builder,
+                                       Type ty, Value addend, Value arg,
+                                       Value tmp) {
+  auto i64Ty = builder.getI64Type();
+  Value tySize =
+      TypeSwitch(ty)
+          // A char span is dynamic, but it is not recursively dynamic. Just
+          // read the length of the string out.
+          .Case([&](cudaq::cc::CharspanType t) -> Value {
+            return genStringLength(loc, builder, arg);
+          })
+          // A std::vector is dynamic and may be recursively dynamic as well.
+ .Case([&](cudaq::cc::StdvecType t) -> Value { + // Compute the byte span of the vector. + Value size = genVectorSize(loc, builder, arg); + auto eleTy = t.getElementType(); + if (!cudaq::cc::isDynamicType(eleTy)) + return size; + + // Otherwise, we have a recursively dynamic case. + auto [bytes, count] = + genByteSizeAndElementCount(loc, builder, eleTy, size, arg, t); + assert(count && "vector must have elements"); + size = bytes; + + // At this point, arg is a known vector of elements of dynamic + // type, so walk over the vector and recurse on each element. + // `size` is already the proper size of the lengths of each of the + // elements in turn. + builder.create(loc, size, tmp); + auto ptrTy = cast(arg.getType()); + auto strTy = cast(ptrTy.getElementType()); + auto memTy = cast(strTy.getMember(0)); + auto arrTy = + cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(memTy.getElementType()))); + auto castPtr = builder.create(loc, arrTy, arg); + auto castArg = builder.create(loc, castPtr); + auto castPtrTy = + cudaq::cc::PointerType::get(memTy.getElementType()); + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + auto ai = builder.create( + loc, castPtrTy, castArg, + ArrayRef{i}); + auto tmpVal = builder.create(loc, tmp); + Value innerSize = descendThroughDynamicType( + loc, builder, eleTy, tmpVal, ai, tmp); + builder.create(loc, innerSize, tmp); + }); + return builder.create(loc, tmp); + }) + // A struct can be dynamic if it contains dynamic members. Get the + // static portion of the struct first, which will have length slots. + // Then get the dynamic sizes for the dynamic members. + .Case([&](cudaq::cc::StructType t) -> Value { + if (cudaq::cc::isDynamicType(t)) { + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + Value strSize = + builder.create(loc, i64Ty, packedTy); + for (auto [i, m] : llvm::enumerate(t.getMembers())) { + if (cudaq::cc::isDynamicType(m)) { + auto hostPtrTy = cast(arg.getType()); + auto hostStrTy = + cast(hostPtrTy.getElementType()); + auto pm = cudaq::cc::PointerType::get(hostStrTy.getMember(i)); + auto ai = builder.create( + loc, pm, arg, ArrayRef{i}); + strSize = descendThroughDynamicType(loc, builder, m, strSize, + ai, tmp); + } + } + return strSize; + } + return builder.create(loc, i64Ty, t); + }) + .Default([&](Type t) -> Value { + return builder.create(loc, i64Ty, t); + }); + return builder.create(loc, tySize, addend); +} - Value populateStringAddendum(Location loc, OpBuilder &builder, Value host, - Value sizeSlot, Value addendum) { - Value size = genStringLength(loc, builder, host); - builder.create(loc, size, sizeSlot); - auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - auto ptrPtrI8 = getPointerToPointerType(builder); - auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); - auto fromPtr = builder.create(loc, fromPtrPtr); - auto notVolatile = builder.create(loc, 0, 1); - auto toPtr = builder.create(loc, ptrI8Ty, addendum); - builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{toPtr, fromPtr, size, notVolatile}); - auto ptrI8Arr = getByteAddressableType(builder); - auto addBytes = builder.create(loc, ptrI8Arr, addendum); - return builder.create( - loc, ptrI8Ty, addBytes, ArrayRef{size}); - } +static Value genSizeOfDynamicMessageBuffer( + Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, + ArrayRef> zippy, Value tmp) { + auto i64Ty = 
builder.getI64Type(); + Value initSize = builder.create(loc, i64Ty, structTy); + for (auto [_, a, t] : zippy) + if (cudaq::cc::isDynamicType(t)) + initSize = descendThroughDynamicType(loc, builder, t, initSize, a, tmp); + return initSize; +} - // Simple case when the vector data is known to not hold dynamic data. - Value populateVectorAddendum(Location loc, OpBuilder &builder, Value host, - Value sizeSlot, Value addendum) { - Value size = genVectorSize(loc, builder, host); - builder.create(loc, size, sizeSlot); - auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - auto ptrPtrI8 = getPointerToPointerType(builder); - auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); - auto fromPtr = builder.create(loc, fromPtrPtr); - auto notVolatile = builder.create(loc, 0, 1); - auto toPtr = builder.create(loc, ptrI8Ty, addendum); - builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{toPtr, fromPtr, size, notVolatile}); - auto ptrI8Arr = getByteAddressableType(builder); - auto addBytes = builder.create(loc, ptrI8Arr, addendum); - return builder.create( - loc, ptrI8Ty, addBytes, ArrayRef{size}); - } +static Value populateStringAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, + Value addendum) { + Value size = genStringLength(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} - Value populateDynamicAddendum(Location loc, OpBuilder &builder, Type devArgTy, - Value host, Value sizeSlot, Value addendum, - Value addendumScratch) { - if (isa(devArgTy)) - return populateStringAddendum(loc, builder, host, sizeSlot, addendum); - if (auto vecTy = dyn_cast(devArgTy)) { - auto eleTy = vecTy.getElementType(); - if (cudaq::cc::isDynamicType(eleTy)) { - // Recursive case. Visit each dynamic element, copying it. - Value size = genVectorSize(loc, builder, host); - auto [bytes, count] = genByteSizeAndElementCount(loc, builder, eleTy, - size, host, devArgTy); - size = bytes; - builder.create(loc, size, sizeSlot); - // Convert from bytes to vector length in elements. - // Compute new addendum start. - auto addrTy = getByteAddressableType(builder); - auto castEnd = builder.create(loc, addrTy, addendum); - Value newAddendum = builder.create( - loc, addendum.getType(), castEnd, - ArrayRef{size}); - builder.create(loc, newAddendum, addendumScratch); - auto sizeBlockTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(builder.getI64Type())); - auto ptrI64Ty = cudaq::cc::PointerType::get(builder.getI64Type()); - // In the recursive case, the next block of addendum is a vector of - // sizes in bytes. Each size will be the size of the vector at that - // offset. 
- auto sizeBlock = - builder.create(loc, sizeBlockTy, addendum); - auto ptrPtrBlockTy = cudaq::cc::PointerType::get( - cast( - cast(host.getType()).getElementType()) - .getMember(0)); - // The host argument is a std::vector, so we want to get the address of - // "front" out of the vector (the first pointer in the triple) and step - // over the contiguous range of vectors in the host block. The vector of - // vectors forms a ragged array structure in host memory. - auto hostBeginPtrRef = builder.create( - loc, ptrPtrBlockTy, host, ArrayRef{0}); - auto hostBegin = - builder.create(loc, hostBeginPtrRef); - auto hostEleTy = cast(hostBegin.getType()); - auto hostBlockTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(hostEleTy.getElementType())); - auto hostBlock = - builder.create(loc, hostBlockTy, hostBegin); - // Loop over each vector element in the vector (recursively). - cudaq::opt::factory::createInvariantLoop( - builder, loc, count, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - Value addm = - builder.create(loc, addendumScratch); - auto subSlot = builder.create( - loc, ptrI64Ty, sizeBlock, - ArrayRef{i}); - auto subHost = builder.create( - loc, hostEleTy, hostBlock, - ArrayRef{i}); - Value newAddm = populateDynamicAddendum( - loc, builder, eleTy, subHost, subSlot, addm, addendumScratch); - builder.create(loc, newAddm, addendumScratch); - }); - return builder.create(loc, addendumScratch); - } - return populateVectorAddendum(loc, builder, host, sizeSlot, addendum); +// Simple case when the vector data is known to not hold dynamic data. +static Value populateVectorAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, + Value addendum) { + Value size = genVectorSize(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} + +static Value populateDynamicAddendum(Location loc, OpBuilder &builder, + Type devArgTy, Value host, Value sizeSlot, + Value addendum, Value addendumScratch) { + if (isa(devArgTy)) + return populateStringAddendum(loc, builder, host, sizeSlot, addendum); + if (auto vecTy = dyn_cast(devArgTy)) { + auto eleTy = vecTy.getElementType(); + if (cudaq::cc::isDynamicType(eleTy)) { + // Recursive case. Visit each dynamic element, copying it. + Value size = genVectorSize(loc, builder, host); + auto [bytes, count] = + genByteSizeAndElementCount(loc, builder, eleTy, size, host, devArgTy); + size = bytes; + builder.create(loc, size, sizeSlot); + + // Convert from bytes to vector length in elements. + // Compute new addendum start. 
+ auto addrTy = getByteAddressableType(builder); + auto castEnd = builder.create(loc, addrTy, addendum); + Value newAddendum = builder.create( + loc, addendum.getType(), castEnd, + ArrayRef{size}); + builder.create(loc, newAddendum, addendumScratch); + Type dataTy = cudaq::opt::factory::genArgumentBufferType(eleTy); + auto arrDataTy = cudaq::cc::ArrayType::get(dataTy); + auto sizeBlockTy = cudaq::cc::PointerType::get(arrDataTy); + auto ptrDataTy = cudaq::cc::PointerType::get(dataTy); + + // In the recursive case, the next block of addendum is a vector of + // elements which are either sizes or contain sizes. The sizes are i64 + // and expressed in bytes. Each size will be the size of the span of the + // element (or its subfields) at that offset. + auto sizeBlock = + builder.create(loc, sizeBlockTy, addendum); + auto hostEleTy = + cast(host.getType()).getElementType(); + auto ptrPtrBlockTy = cudaq::cc::PointerType::get( + cast(hostEleTy).getMember(0)); + + // The host argument is a std::vector, so we want to get the address of + // "front" out of the vector (the first pointer in the triple) and step + // over the contiguous range of vectors in the host block. The vector of + // vectors forms a ragged array structure in host memory. + auto hostBeginPtrRef = builder.create( + loc, ptrPtrBlockTy, host, ArrayRef{0}); + auto hostBegin = builder.create(loc, hostBeginPtrRef); + auto hostBeginEleTy = cast(hostBegin.getType()); + auto hostBlockTy = cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(hostBeginEleTy.getElementType())); + auto hostBlock = + builder.create(loc, hostBlockTy, hostBegin); + + // Loop over each vector element in the vector (recursively). + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + Value addm = + builder.create(loc, addendumScratch); + auto subSlot = builder.create( + loc, ptrDataTy, sizeBlock, + ArrayRef{i}); + auto subHost = builder.create( + loc, hostBeginEleTy, hostBlock, + ArrayRef{i}); + Value newAddm = populateDynamicAddendum( + loc, builder, eleTy, subHost, subSlot, addm, addendumScratch); + builder.create(loc, newAddm, addendumScratch); + }); + return builder.create(loc, addendumScratch); } - auto devStrTy = cast(devArgTy); - auto hostStrTy = cast( - cast(sizeSlot.getType()).getElementType()); - assert(devStrTy.getNumMembers() == hostStrTy.getNumMembers()); - for (auto iter : llvm::enumerate(devStrTy.getMembers())) { - std::int32_t iterIdx = iter.index(); - auto hostPtrTy = cast(host.getType()); - auto hostMemTy = cast(hostPtrTy.getElementType()) - .getMember(iterIdx); - auto val = builder.create( - loc, cudaq::cc::PointerType::get(hostMemTy), host, + return populateVectorAddendum(loc, builder, host, sizeSlot, addendum); + } + auto devStrTy = cast(devArgTy); + auto hostStrTy = cast( + cast(sizeSlot.getType()).getElementType()); + assert(devStrTy.getNumMembers() == hostStrTy.getNumMembers()); + for (auto iter : llvm::enumerate(devStrTy.getMembers())) { + std::int32_t iterIdx = iter.index(); + auto hostPtrTy = cast(host.getType()); + auto hostMemTy = cast(hostPtrTy.getElementType()) + .getMember(iterIdx); + auto val = builder.create( + loc, cudaq::cc::PointerType::get(hostMemTy), host, + ArrayRef{iterIdx}); + Type iterTy = iter.value(); + if (cudaq::cc::isDynamicType(iterTy)) { + Value fieldInSlot = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot, ArrayRef{iterIdx}); - Type iterTy = iter.value(); - if 
(cudaq::cc::isDynamicType(iterTy)) { - Value fieldInSlot = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot, - ArrayRef{iterIdx}); - addendum = populateDynamicAddendum( - loc, builder, iterTy, val, fieldInSlot, addendum, addendumScratch); - } else { - Value fieldInSlot = builder.create( - loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, - ArrayRef{iterIdx}); - auto v = builder.create(loc, val); - builder.create(loc, v, fieldInSlot); - } + addendum = populateDynamicAddendum(loc, builder, iterTy, val, fieldInSlot, + addendum, addendumScratch); + } else { + Value fieldInSlot = builder.create( + loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, + ArrayRef{iterIdx}); + auto v = builder.create(loc, val); + builder.create(loc, v, fieldInSlot); } - return addendum; } + return addendum; +} - void populateMessageBuffer(Location loc, OpBuilder &builder, - Value msgBufferBase, - ArrayRef> zippy, - Value addendum = {}, Value addendumScratch = {}) { - auto structTy = cast( - cast(msgBufferBase.getType()).getElementType()); - // Loop over all the arguments and populate the message buffer. - for (auto [idx, arg, devArgTy] : zippy) { - if (cudaq::cc::isDynamicType(devArgTy)) { - assert(addendum && "must have addendum to encode dynamic argument(s)"); - // Get the address of the slot to be filled. - auto memberTy = cast(structTy).getMember(idx); - auto ptrTy = cudaq::cc::PointerType::get(memberTy); - auto slot = builder.create( - loc, ptrTy, msgBufferBase, ArrayRef{idx}); - addendum = populateDynamicAddendum(loc, builder, devArgTy, arg, slot, - addendum, addendumScratch); - continue; - } - - // If the argument is a callable, skip it. - if (isa(devArgTy)) - continue; - // If the argument is an empty struct, skip it. - if (auto strTy = dyn_cast(devArgTy); - strTy && strTy.isEmpty()) - continue; - +static void +populateMessageBuffer(Location loc, OpBuilder &builder, Value msgBufferBase, + ArrayRef> zippy, + Value addendum = {}, Value addendumScratch = {}) { + auto structTy = cast( + cast(msgBufferBase.getType()).getElementType()); + // Loop over all the arguments and populate the message buffer. + for (auto [idx, arg, devArgTy] : zippy) { + if (cudaq::cc::isDynamicType(devArgTy)) { + assert(addendum && "must have addendum to encode dynamic argument(s)"); // Get the address of the slot to be filled. auto memberTy = cast(structTy).getMember(idx); auto ptrTy = cudaq::cc::PointerType::get(memberTy); - Value slot = builder.create( + auto slot = builder.create( loc, ptrTy, msgBufferBase, ArrayRef{idx}); + addendum = populateDynamicAddendum(loc, builder, devArgTy, arg, slot, + addendum, addendumScratch); + continue; + } - // Argument is a packaged kernel. In this case, the argument is some - // unknown kernel that may be called. The packaged argument is coming - // from opaque C++ host code, so we need to identify what kernel it - // references and then pass its name as a span of characters to the - // launch kernel. - if (isa(devArgTy)) { - auto i64Ty = builder.getI64Type(); - auto kernKey = builder.create( - loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); - builder.create(loc, kernKey.getResult(0), slot); - continue; - } + // If the argument is a callable, skip it. + if (isa(devArgTy)) + continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(devArgTy); + strTy && strTy.isEmpty()) + continue; + + // Get the address of the slot to be filled. 
+ auto memberTy = cast(structTy).getMember(idx); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + Value slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{idx}); + + // Argument is a packaged kernel. In this case, the argument is some + // unknown kernel that may be called. The packaged argument is coming + // from opaque C++ host code, so we need to identify what kernel it + // references and then pass its name as a span of characters to the + // launch kernel. + if (isa(devArgTy)) { + auto i64Ty = builder.getI64Type(); + auto kernKey = builder.create( + loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); + builder.create(loc, kernKey.getResult(0), slot); + continue; + } - // Just pass the raw pointer. The buffer is supposed to be pointer-free - // since it may be unpacked in a different address space. However, if this - // is a simulation and things are in the same address space, we pass the - // pointer for convenience. - if (isa(devArgTy)) - arg = builder.create(loc, memberTy, arg); - - if (isa(arg.getType()) && - (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) { - slot = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), slot); - } - builder.create(loc, arg, slot); + // Just pass the raw pointer. The buffer is supposed to be pointer-free + // since it may be unpacked in a different address space. However, if this + // is a simulation and things are in the same address space, we pass the + // pointer for convenience. + if (isa(devArgTy)) + arg = builder.create(loc, memberTy, arg); + + if (isa(arg.getType()) && + (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) { + slot = builder.create( + loc, cudaq::cc::PointerType::get(arg.getType()), slot); } + builder.create(loc, arg, slot); } +} + +/// A kernel function that takes a quantum type argument (also known as a pure +/// device kernel) cannot be called directly from C++ (classical) code. It must +/// be called via other quantum code. +static bool hasLegalType(FunctionType funTy) { + for (auto ty : funTy.getInputs()) + if (quake::isQuantumType(ty)) + return false; + for (auto ty : funTy.getResults()) + if (quake::isQuantumType(ty)) + return false; + return true; +} + +static MutableArrayRef +dropAnyHiddenArguments(MutableArrayRef args, FunctionType funcTy, + bool hasThisPointer) { + const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); + const unsigned count = + cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet); + if (count > 0 && args.size() >= count && + std::all_of(args.begin(), args.begin() + count, [](auto i) { + return isa(i.getType()); + })) + return args.drop_front(count); + return args; +} + +static std::pair +lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module, + func::FuncOp funcOp) { + if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") || + mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) { + // No host entry point needed. + return {false, func::FuncOp{}}; + } + if (auto *decl = module.lookupSymbol(mangledEntryPointName)) + if (auto func = dyn_cast(decl)) { + func.eraseBody(); + return {true, func}; + } + funcOp.emitOpError("could not generate the host-side kernel function (" + + mangledEntryPointName + ")"); + return {true, func::FuncOp{}}; +} + +/// Generate code to initialize the std::vector, \p sret, from an initializer +/// list with data at \p data and length \p size. Use the library helper +/// routine. This function takes two !llvm.ptr arguments. 
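+///
+/// For reference, a rough host-side sketch of the helper being invoked here.
+/// The extern "C" declaration is the one added to qubit_qis.h later in this
+/// patch; the body shown is only an illustrative approximation, not the
+/// runtime's actual implementation:
+///
+///   extern "C" void __nvqpp_initializer_list_to_vector_bool(
+///       std::vector<bool> &sret, char *data, std::size_t size);
+///   // roughly equivalent to: sret.assign(data, data + size);
+///   // i.e., each byte of the initializer list becomes one bool element.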
+static void genStdvecBoolFromInitList(Location loc, OpBuilder &builder, + Value sret, Value data, Value size) { + auto ptrTy = cudaq::cc::PointerType::get(builder.getContext()); + auto castData = builder.create(loc, ptrTy, data); + auto castSret = builder.create(loc, ptrTy, sret); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolCtorFromInitList, + ArrayRef{castSret, castData, size}); +} + +/// Generate a `std::vector` (where `T != bool`) from an initializer list. +/// This is done with the assumption that `std::vector` is implemented as a +/// triple of pointers. The original content of the vector is freed and the new +/// content, which is already on the stack, is moved into the `std::vector`. +static void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret, + Value data, Value tSize, Value vecSize) { + auto i8Ty = builder.getI8Type(); + auto stlVectorTy = + cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty)); + auto ptrTy = cudaq::cc::PointerType::get(i8Ty); + auto castSret = builder.create(loc, stlVectorTy, sret); + auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); + auto sret0 = builder.create( + loc, ptrPtrTy, castSret, SmallVector{0}); + auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); + auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); + auto buffPtr0 = builder.create(loc, ptrTy, data); + builder.create(loc, buffPtr0, sret0); + auto sret1 = builder.create( + loc, ptrPtrTy, castSret, SmallVector{1}); + Value byteLen = builder.create(loc, tSize, vecSize); + auto buffPtr = builder.create(loc, ptrArrTy, data); + auto endPtr = builder.create( + loc, ptrTy, buffPtr, SmallVector{byteLen}); + builder.create(loc, endPtr, sret1); + auto sret2 = builder.create( + loc, ptrPtrTy, castSret, SmallVector{2}); + builder.create(loc, endPtr, sret2); +} + +// Alloca a pointer to a pointer and initialize it to nullptr. +static Value createEmptyHeapTracker(Location loc, OpBuilder &builder) { + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto result = builder.create(loc, ptrI8Ty); + auto zero = builder.create(loc, 0, 64); + auto null = builder.create(loc, ptrI8Ty, zero); + builder.create(loc, null, result); + return result; +} + +// If there are temporaries, call the helper to free them. +static void maybeFreeHeapAllocations(Location loc, OpBuilder &builder, + Value heapTracker) { + auto head = builder.create(loc, heapTracker); + auto zero = builder.create(loc, 0, 64); + auto headAsInt = + builder.create(loc, builder.getI64Type(), head); + auto cmp = builder.create(loc, arith::CmpIPredicate::ne, + headAsInt, zero); + // If there are no std::vector to unpack, then the heapTracker will be + // set to `nullptr` and otherwise unused. That will allow the compiler to DCE + // this call after constant propagation. + builder.create( + loc, TypeRange{}, cmp, + [&](OpBuilder &builder, Location loc, Region ®ion) { + region.push_back(new Block()); + auto &body = region.front(); + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(&body); + builder.create(loc, std::nullopt, + cudaq::stdvecBoolFreeTemporaryLists, + ArrayRef{head}); + builder.create(loc); + }); +} + +/// This pass adds a `.thunk` function and a rewritten C++ host +/// side (mangled) stub to the code for every entry-point kernel in the module. +/// It may also generate a `.argsCreator` function. Finally, it +/// creates registration hooks for the CUDA-Q runtime to be able to find the +/// kernel by name and, as appropriate, the `.argsCreator` +/// function. 
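+
+// Illustration (hypothetical kernel, not generated by this pass): given a
+// kernel such as
+//
+//   __qpu__ void k(int n, std::vector<double> v);
+//
+// the rewritten stub packs the arguments into a pointer-free message buffer
+// laid out roughly as
+//
+//   struct { int n; std::int64_t vSizeInBytes; };  // static prefix
+//   // ... immediately followed by v's data bytes (the "addendum")
+//
+// Trivial values and i64 byte lengths live in the static prefix; dynamic
+// payloads are appended after it, sized by genSizeOfDynamicMessageBuffer and
+// written out by populateMessageBuffer/populateDynamicAddendum above.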
+namespace { +class GenerateKernelExecution + : public cudaq::opt::impl::GenerateKernelExecutionBase< + GenerateKernelExecution> { +public: + using GenerateKernelExecutionBase::GenerateKernelExecutionBase; /// Creates a function that can take a block of pointers to argument values /// and using the compiler's knowledge of a kernel encodes those argument @@ -738,9 +1028,9 @@ class GenerateKernelExecution // Zip the arguments with the device side argument types. Recall that some // of the (left-most) arguments may have been dropped on the floor. const bool hasDynamicSignature = isDynamicSignature(devKernelTy); - SmallVector freeVectorBuffers; + Value heapTracker = createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, pseudoArgs, passedDevArgTys, freeVectorBuffers); + loc, builder, pseudoArgs, passedDevArgTys, heapTracker); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) @@ -774,18 +1064,7 @@ class GenerateKernelExecution populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); } - if (!freeVectorBuffers.empty()) { - // Need to free any temporary vector-like buffers. These arise when - // there is a std::vector argument, which we translate into a - // std::vector to reuse the same code as any other std::vector. - for (auto vecVar : freeVectorBuffers) { - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); - auto ptrPtr = builder.create(loc, ptrPtrTy, vecVar); - Value freeMe = builder.create(loc, ptrPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{freeMe}); - } - } + maybeFreeHeapAllocations(loc, builder, heapTracker); // Return the message buffer and its size in bytes. builder.create(loc, rawMessageBuffer, @@ -1086,82 +1365,6 @@ class GenerateKernelExecution return thunk; } - /// Generate code to initialize the std::vector, \p sret, from an - /// initializer list with data at \p data and length \p size. Use the library - /// helper routine. This function takes two !llvm.ptr arguments. - void genStdvecBoolFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value size) { - auto ptrTy = cudaq::cc::PointerType::get(builder.getContext()); - auto castData = builder.create(loc, ptrTy, data); - auto castSret = builder.create(loc, ptrTy, sret); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolCtorFromInitList, - ArrayRef{castSret, castData, size}); - } - - /// Generate a `std::vector` (where `T != bool`) from an initializer list. - /// This is done with the assumption that `std::vector` is implemented as a - /// triple of pointers. The original content of the vector is freed and the - /// new content, which is already on the stack, is moved into the - /// `std::vector`. 
- void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value tSize, Value vecSize) { - auto i8Ty = builder.getI8Type(); - auto stlVectorTy = - cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty)); - auto ptrTy = cudaq::cc::PointerType::get(i8Ty); - auto castSret = builder.create(loc, stlVectorTy, sret); - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); - auto sret0 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{0}); - auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); - auto buffPtr0 = builder.create(loc, ptrTy, data); - builder.create(loc, buffPtr0, sret0); - auto sret1 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{1}); - Value byteLen = builder.create(loc, tSize, vecSize); - auto buffPtr = builder.create(loc, ptrArrTy, data); - auto endPtr = builder.create( - loc, ptrTy, buffPtr, SmallVector{byteLen}); - builder.create(loc, endPtr, sret1); - auto sret2 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{2}); - builder.create(loc, endPtr, sret2); - } - - static MutableArrayRef - dropAnyHiddenArguments(MutableArrayRef args, - FunctionType funcTy, bool hasThisPointer) { - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); - const unsigned count = - cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet); - if (count > 0 && args.size() >= count && - std::all_of(args.begin(), args.begin() + count, [](auto i) { - return isa(i.getType()); - })) - return args.drop_front(count); - return args; - } - - static std::pair - lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module, - func::FuncOp funcOp) { - if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") || - mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) { - // No host entry point needed. - return {false, func::FuncOp{}}; - } - if (auto *decl = module.lookupSymbol(mangledEntryPointName)) - if (auto func = dyn_cast(decl)) { - func.eraseBody(); - return {true, func}; - } - funcOp.emitOpError("could not generate the host-side kernel function (" + - mangledEntryPointName + ")"); - return {true, func::FuncOp{}}; - } - /// Generate an all new entry point body, calling someLaunchKernel in /// the runtime library. Pass along the thunk, so the runtime can call the /// quantum circuit. These entry points may be `operator()` member functions @@ -1188,9 +1391,9 @@ class GenerateKernelExecution SmallVector blockValues(blockArgs.size()); std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); const bool hasDynamicSignature = isDynamicSignature(devFuncTy); - SmallVector freeVectorBuffers; + Value heapTracker = createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, blockValues, devFuncTy.getInputs(), freeVectorBuffers); + loc, builder, blockValues, devFuncTy.getInputs(), heapTracker); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) @@ -1224,20 +1427,7 @@ class GenerateKernelExecution populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); } - if (!freeVectorBuffers.empty()) { - // Need to free any temporary vector-like buffers. These arise when - // there is a std::vector argument, which we translate into a - // std::vector to reuse the same code as any other std::vector. 
- for (auto vecVar : freeVectorBuffers) { - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); - auto ptrPtr = - builder.create(loc, ptrPtrTy, vecVar); - Value freeMe = builder.create(loc, ptrPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{freeMe}); - } - } - + maybeFreeHeapAllocations(loc, builder, heapTracker); extendedStructSize = messageBufferSize; Value loadThunk = builder.create(loc, thunkTy, thunkFunc.getName()); @@ -1485,19 +1675,6 @@ class GenerateKernelExecution builder.create(loc, results); } - /// A kernel function that takes a quantum type argument (also known as a pure - /// device kernel) cannot be called directly from C++ (classical) code. It - /// must be called via other quantum code. - bool hasLegalType(FunctionType funTy) { - for (auto ty : funTy.getInputs()) - if (quake::isQuantumType(ty)) - return false; - for (auto ty : funTy.getResults()) - if (quake::isQuantumType(ty)) - return false; - return true; - } - /// Generate a function to be executed at load-time which will register the /// kernel with the runtime. LLVM::LLVMFuncOp registerKernelWithRuntimeForExecution( @@ -1618,6 +1795,10 @@ class GenerateKernelExecution irBuilder.loadIntrinsic(module, cudaq::stdvecBoolUnpackToInitList))) return module.emitError(std::string("could not load ") + cudaq::stdvecBoolUnpackToInitList); + if (failed(irBuilder.loadIntrinsic(module, + cudaq::stdvecBoolFreeTemporaryLists))) + return module.emitError(std::string("could not load ") + + cudaq::stdvecBoolFreeTemporaryLists); if (failed(irBuilder.loadIntrinsic(module, cudaq::llvmMemCopyIntrinsic))) return module.emitError(std::string("could not load ") + cudaq::llvmMemCopyIntrinsic); @@ -1628,7 +1809,6 @@ class GenerateKernelExecution return success(); } -public: void runOnOperation() override { auto module = getOperation(); auto *ctx = module.getContext(); diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp index 10ecc3b914..d6cbc3c227 100644 --- a/runtime/cudaq/cudaq.cpp +++ b/runtime/cudaq/cudaq.cpp @@ -470,20 +470,37 @@ void __nvqpp_initializer_list_to_vector_bool(std::vector &result, /// `std::vector` overload. The conversion turns the `std::vector` /// into a mock vector structure that looks like `std::vector`. The /// calling routine must cleanup the buffer allocated by this code. -void __nvqpp_vector_bool_to_initializer_list(void *outData, - const std::vector &inVec) { +/// This helper routine may only be called on the host side. +void __nvqpp_vector_bool_to_initializer_list( + void *outData, const std::vector &inVec, + std::vector **allocations) { // The MockVector must be allocated by the caller. struct MockVector { char *start; char *end; + char *end2; }; MockVector *mockVec = reinterpret_cast(outData); auto outSize = inVec.size(); // The buffer allocated here must be freed by the caller. - mockVec->start = static_cast(malloc(outSize)); - mockVec->end = mockVec->start + outSize; + if (!*allocations) + *allocations = new std::vector; + char *newData = static_cast(malloc(outSize)); + (*allocations)->push_back(newData); + mockVec->start = newData; + mockVec->end2 = mockVec->end = newData + outSize; for (unsigned i = 0; i < outSize; ++i) - (mockVec->start)[i] = static_cast(inVec[i]); + newData[i] = static_cast(inVec[i]); +} + +/// This helper routine deletes the vector that tracks all the temporaries that +/// were created as well as the temporaries themselves. +/// This routine may only be called on the host side. 
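+///
+/// Illustrative calling pattern (hypothetical caller, host side only): the
+/// conversion helper above allocates the tracking vector lazily and records
+/// each malloc'ed buffer, so this routine is called once, at the end, and
+/// only when the tracker is non-null:
+///
+///   std::vector<char *> *allocs = nullptr;
+///   __nvqpp_vector_bool_to_initializer_list(buf0, vec0, &allocs);
+///   __nvqpp_vector_bool_to_initializer_list(buf1, vec1, &allocs);
+///   // ... launch the kernel using the mock vectors ...
+///   if (allocs)
+///     __nvqpp_vector_bool_free_temporary_initlists(allocs);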
+void __nvqpp_vector_bool_free_temporary_initlists(
+    std::vector *allocations) {
+  for (auto *p : *allocations)
+    free(p);
+  delete allocations;
+}
 }
 } // namespace cudaq::support
diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h
index c83dffe844..cb3e2a6a73 100644
--- a/runtime/cudaq/qis/qubit_qis.h
+++ b/runtime/cudaq/qis/qubit_qis.h
@@ -828,11 +828,13 @@ std::vector mz(qubit &q, Qs &&...qs) {
 }
 
 namespace support {
-// Helper to initialize a `vector` data structure.
+// Helpers to deal with the `vector` specialized template type.
 extern "C" {
 void __nvqpp_initializer_list_to_vector_bool(std::vector &, char *,
                                              std::size_t);
-void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector &);
+void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector &,
+                                             std::vector **);
+void __nvqpp_vector_bool_free_temporary_initlists(std::vector *);
 }
 } // namespace support
 
diff --git a/targettests/SeparateCompilation/arith_spans.cpp b/targettests/SeparateCompilation/arith_spans.cpp
new file mode 100644
index 0000000000..67dc8f329e
--- /dev/null
+++ b/targettests/SeparateCompilation/arith_spans.cpp
@@ -0,0 +1,229 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// RUN: if command -v split-file; then \
+// RUN: split-file %s %t && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_dumps.cpp -o %t/span_dumps.o && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_exercise.cpp -o %t/span_exercise.o && \
+// RUN: nvq++ %cpp_std --enable-mlir %t/span_dumps.o %t/span_exercise.o -o %t/spanaroo.out && \
+// RUN: %t/spanaroo.out | FileCheck %s ; else \
+// RUN: echo "skipping" ; fi
+// clang-format on
+
+//--- span_dumps.cpp
+
+#include 
+#include 
+#include 
+
+extern "C" {
+void dump_bool_vector(std::span x) {
+  std::cout << "booleans: ";
+  for (auto i : x)
+    std::cout << i << ' ';
+  std::cout << '\n';
+}
+
+void dump_int_vector(std::span x) {
+  std::cout << "integers: ";
+  for (auto i : x)
+    std::cout << i << ' ';
+  std::cout << '\n';
+}
+
+void dump_double_vector(std::span x) {
+  std::cout << "doubles: ";
+  for (auto d : x)
+    std::cout << d << ' ';
+  std::cout << '\n';
+}
+}
+
+//--- span_exercise.cpp
+
+#include 
+#include 
+
+// Fake host C++ signature that matches.
+extern "C" { +void dump_int_vector(const std::vector &pw); +void dump_bool_vector(const std::vector &pw); +void dump_double_vector(const std::vector &pw); +} + +__qpu__ void kern1(std::vector arg) { dump_int_vector(arg); } + +__qpu__ void kern2(std::vector> arg) { + for (unsigned i = 0; i < arg.size(); ++i) + dump_int_vector(arg[i]); +} + +struct IntVectorPair { + std::vector _0; + std::vector _1; +}; + +__qpu__ void kern3(IntVectorPair ivp) { + dump_int_vector(ivp._0); + dump_int_vector(ivp._1); +} + +__qpu__ void kern4(std::vector vivp) { + for (unsigned i = 0; i < vivp.size(); ++i) { + dump_int_vector(vivp[i]._0); + dump_int_vector(vivp[i]._1); + } +} + +__qpu__ void qern1(std::vector arg) { dump_double_vector(arg); } + +__qpu__ void qern2(std::vector> arg) { + for (unsigned i = 0; i < arg.size(); ++i) + dump_double_vector(arg[i]); +} + +struct DoubleVectorPair { + std::vector _0; + std::vector _1; +}; + +__qpu__ void qern3(DoubleVectorPair ivp) { + dump_double_vector(ivp._0); + dump_double_vector(ivp._1); +} + +__qpu__ void qern4(std::vector vivp) { + for (unsigned i = 0; i < vivp.size(); ++i) { + dump_double_vector(vivp[i]._0); + dump_double_vector(vivp[i]._1); + } +} + +__qpu__ void cern1(std::vector arg) { dump_bool_vector(arg); } + +__qpu__ void cern2(std::vector> arg) { + for (unsigned i = 0; i < arg.size(); ++i) + dump_bool_vector(arg[i]); +} + +struct BoolVectorPair { + std::vector _0; + std::vector _1; +}; + +__qpu__ void cern3(BoolVectorPair ivp) { + dump_bool_vector(ivp._0); + dump_bool_vector(ivp._1); +} + +__qpu__ void cern4(std::vector vivp) { + for (unsigned i = 0; i < vivp.size(); ++i) { + dump_bool_vector(vivp[i]._0); + dump_bool_vector(vivp[i]._1); + } +} + +int main() { + std::vector pw0 = {345, 1, 2}; + std::cout << "---\n"; + kern1(pw0); + std::vector pw1 = {92347, 3, 4}; + std::vector pw2 = {2358, 5, 6}; + std::vector pw3 = {45, 7, 18}; + std::vector> vpw{pw0, pw1, pw2, pw3}; + std::cout << "---\n"; + kern2(vpw); + + IntVectorPair ivp = {{8, 238, 44}, {0, -4, 81, 92745}}; + std::cout << "---\n"; + kern3(ivp); + + IntVectorPair ivp2 = {{5, -87, 43, 1, 76}, {0, 0, 2, 1}}; + IntVectorPair ivp3 = {{1}, {-2, 3}}; + IntVectorPair ivp4 = {{-4, -5, 6}, {-7, -8, -9, 88}}; + std::vector vivp = {ivp, ivp2, ivp3, ivp4}; + std::cout << "---\n"; + // kern4(vivp); + + std::vector dpw0 = {3.45, 1., 2.}; + std::cout << "---\n"; + qern1(dpw0); + std::vector dpw1 = {92.347, 2.3, 4.}; + std::vector dpw2 = {235.8, 5.5, 6.4}; + std::vector dpw3 = {4.5, 77.7, 18.2}; + std::vector> vdpw{dpw0, dpw1, dpw2, dpw3}; + std::cout << "---\n"; + qern2(vdpw); + + DoubleVectorPair dvp = {{8., 2.38, 4.4}, {0., -4.99, 81.5, 92.745}}; + std::cout << "---\n"; + qern3(dvp); + + DoubleVectorPair dvp2 = {{5., -8.7, 4.3, 1., 7.6}, {0., 0., 2., 1.}}; + DoubleVectorPair dvp3 = {{1.}, {-2., 3.}}; + DoubleVectorPair dvp4 = {{-4., -5., 6.}, {-7., -8., -9., .88}}; + std::vector vdvp = {dvp, dvp2, dvp3, dvp4}; + std::cout << "---\n"; + // qern4(vdvp); + + std::vector bpw0 = {true, false}; + std::cout << "---\n"; + cern1(bpw0); + std::vector bpw1 = {false, false, false}; + std::vector bpw2 = {false, true, false, true}; + std::vector bpw3 = {false, false, true, false, true}; + std::vector> vbpw{bpw0, bpw1, bpw2, bpw3}; + std::cout << "---\n"; + cern2(vbpw); + + BoolVectorPair bvp = {{false, false}, {false, true, true, false}}; + std::cout << "---\n"; + cern3(bvp); + + BoolVectorPair bvp2 = {{false, true, true, false, true, false}, + {false, true, true, false, false, false, true, false}}; + BoolVectorPair bvp3 = 
{{false}, {true, true}}; + BoolVectorPair bvp4 = {{true, false, false}, {false, true, false, true}}; + std::vector vbvp = {bvp, bvp2, bvp3, bvp4}; + std::cout << "---\n"; + // cern4(vbvp); + + return 0; +} + +// CHECK: --- +// CHECK: integers: 345 1 2 +// CHECK: --- +// CHECK: integers: 345 1 2 +// CHECK: integers: 92347 3 4 +// CHECK: integers: 2358 5 6 +// CHECK: integers: 45 7 18 +// CHECK: --- +// CHECK: integers: 8 238 44 +// CHECK: integers: 0 -4 81 92745 +// CHECK: --- +// CHECK: doubles: 3.45 1 2 +// CHECK: --- +// CHECK: doubles: 3.45 1 2 +// CHECK: doubles: 92.347 2.3 4 +// CHECK: doubles: 235.8 5.5 6.4 +// CHECK: doubles: 4.5 77.7 18.2 +// CHECK: --- +// CHECK: doubles: 8 2.38 4.4 +// CHECK: doubles: 0 -4.99 81.5 92.745 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK: booleans: 0 0 0 +// CHECK: booleans: 0 1 0 1 +// CHECK: booleans: 0 0 1 0 1 +// CHECK: --- +// CHECK: booleans: 0 0 +// CHECK: booleans: 0 1 1 0 diff --git a/test/AST-Quake/calling_convention.cpp b/test/AST-Quake/calling_convention.cpp index 3d2c6e2e4a..fcf7c26cda 100644 --- a/test/AST-Quake/calling_convention.cpp +++ b/test/AST-Quake/calling_convention.cpp @@ -278,9 +278,7 @@ struct V3 { // CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, // CHECK-SAME: %[[VAL_3:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) // CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, -// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, %[[VAL_2:.*]]: !cc.ptr, !cc.array}>>) // clang-format on //===----------------------------------------------------------------------===// diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 044bf93782..9bae7ecebf 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -123,7 +123,7 @@ module attributes {quake.mangled_name_map = { // ALT: func.func private @malloc(i64) -> !cc.ptr // ALT: func.func private @free(!cc.ptr) // ALT: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// ALT: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// ALT: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // ALT: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // ALT-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { @@ -250,7 +250,7 @@ module attributes {quake.mangled_name_map = { // STREAMLINED: func.func private @malloc(i64) -> !cc.ptr // STREAMLINED: func.func private @free(!cc.ptr) // STREAMLINED: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// STREAMLINED: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// STREAMLINED: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // STREAMLINED: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // STREAMLINED-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { @@ -359,7 +359,7 @@ module attributes {quake.mangled_name_map = { // HYBRID: func.func private @malloc(i64) -> !cc.ptr // HYBRID: func.func private @free(!cc.ptr) // 
HYBRID: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// HYBRID: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// HYBRID: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // HYBRID: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // HYBRID-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index e8be1ab6ac..ebc29811a1 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -131,7 +131,7 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: func.func private @malloc(i64) -> !cc.ptr // CHECK: func.func private @free(!cc.ptr) // CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 0c706ca7b1..bba89bb5dd 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -28,7 +28,7 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec { +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { // CHECK: %[[VAL_1:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 256 : i64 // CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr @@ -37,72 +37,79 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 // CHECK: %[[VAL_4:.*]] = arith.constant 8 : i64 -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_7:.*]] = cc.alloca i64 -// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.alloca !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: cc.store %[[VAL_2]], %[[VAL_12]] : !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.func_ptr %[[VAL_6]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_15:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_16:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// CHECK: %[[VAL_17:.*]] = cc.alloca !cc.array x 1> -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// 
CHECK: cc.store %[[VAL_18]], %[[VAL_19]] : !cc.ptr>> -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> i64 -// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_4]] : i64 -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (i64) -> !cc.ptr> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_22]], %[[VAL_23]] : !cc.ptr>> -// CHECK: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_22]], %[[VAL_24]] : !cc.ptr>> -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_2]], %[[VAL_26]] : !cc.ptr -// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_27]], %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_28:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// CHECK: %[[VAL_29:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_29]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_31:.*]] = call @hybridLaunchKernel(%[[VAL_30]], %[[VAL_13]], %[[VAL_14]], %[[VAL_8]], %[[VAL_15]], %[[VAL_28]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_32:.*]] = cc.extract_value %[[VAL_31]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_32]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_33]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_34]], ^bb1, ^bb2 +// CHECK: %[[VAL_5:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_7:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.alloca i64 +// CHECK: %[[VAL_10:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_11:.*]] = cc.alloca i8{{\[}}%[[VAL_10]] : i64] +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_13:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_14]] : !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_6]] : i64 +// CHECK: cc.if(%[[VAL_17]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_15]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_18:.*]] = cc.func_ptr %[[VAL_5]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_20:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_22:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> +// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_27:.*]] = 
cc.cast %[[VAL_26]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_21]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_29]] : !cc.ptr>> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_31]] : !cc.ptr +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_32]], %[[VAL_30]] : !cc.ptr> +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_34:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_36:.*]] = call @hybridLaunchKernel(%[[VAL_35]], %[[VAL_18]], %[[VAL_19]], %[[VAL_10]], %[[VAL_20]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_37:.*]] = cc.extract_value %[[VAL_36]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_38]], %[[VAL_6]] : i64 +// CHECK: cf.cond_br %[[VAL_39]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_32]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_36]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_40:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_40]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_37]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_38:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_38]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr> -// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_38]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_42:.*]] = cc.load %[[VAL_41]] : !cc.ptr -// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_45:.*]] = cc.cast %[[VAL_40]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_45]], %[[VAL_44]] : !cc.ptr> -// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_43]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_47:.*]] = arith.muli %[[VAL_42]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_40]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_48]]{{\[}}%[[VAL_47]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_49]], %[[VAL_46]] : !cc.ptr> -// CHECK: %[[VAL_50:.*]] = cc.compute_ptr %[[VAL_43]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_49]], %[[VAL_50]] : !cc.ptr> -// CHECK: call @free(%[[VAL_32]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_42:.*]] = cc.compute_ptr %[[VAL_12]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_42]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_43:.*]]: !cc.ptr, i64}>>): +// 
CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_43]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.load %[[VAL_44]] : !cc.ptr> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_43]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.load %[[VAL_46]] : !cc.ptr +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_50]], %[[VAL_49]] : !cc.ptr> +// CHECK: %[[VAL_51:.*]] = cc.compute_ptr %[[VAL_48]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_52:.*]] = arith.muli %[[VAL_47]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_53:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_53]]{{\[}}%[[VAL_52]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_54]], %[[VAL_51]] : !cc.ptr> +// CHECK: %[[VAL_55:.*]] = cc.compute_ptr %[[VAL_48]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_54]], %[[VAL_55]] : !cc.ptr> +// CHECK: call @free(%[[VAL_37]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } @@ -117,10 +124,9 @@ func.func @__nvqpp__mlirgen__test_1(%arg0: i32) -> !cc.stdvec { func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %1: !cc.ptr, %2: i32) { return } -} // CHECK-LABEL: func.func @__nvqpp__mlirgen__test_1( -// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { // CHECK: %[[VAL_1:.*]] = arith.constant 9 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 520 : i64 // CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr @@ -129,73 +135,83 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_5:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_6:.*]] = cc.alloca i64 -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64] -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.ptr -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr -// CHECK: cc.store %[[VAL_2]], %[[VAL_11]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.func_ptr %[[VAL_5]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_15:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// CHECK: %[[VAL_16:.*]] = cc.alloca !cc.array x 1> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>> -// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> i64 -// 
CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr> -// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_15]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_15]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>> -// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr x 1>>) -> !cc.ptr> -// CHECK: %[[VAL_25:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_2]], %[[VAL_25]] : !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_12]], %[[VAL_13]], %[[VAL_7]], %[[VAL_14]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_4]] : i64 -// CHECK: cf.cond_br %[[VAL_33]], ^bb1, ^bb2 +// CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_6:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_7]], %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.alloca i64 +// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] +// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_12:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_13]] : !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_5]] : i64 +// CHECK: cc.if(%[[VAL_16]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_14]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_17:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_20:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_22]], %[[VAL_23]] : !cc.ptr>> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) 
-> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_27]] : !cc.ptr>> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_20]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_30:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_30]] : !cc.ptr +// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_31]], %[[VAL_29]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_33:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_33]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_35:.*]] = call @hybridLaunchKernel(%[[VAL_34]], %[[VAL_17]], %[[VAL_18]], %[[VAL_9]], %[[VAL_19]], %[[VAL_32]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_36:.*]] = cc.extract_value %[[VAL_35]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_37:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_38:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_5]] : i64 +// CHECK: cf.cond_br %[[VAL_38]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_35]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_39]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_40]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_9]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_36]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_37:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_39:.*]] = cc.load %[[VAL_38]] : !cc.ptr> -// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_37]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_41:.*]] = cc.load %[[VAL_40]] : !cc.ptr -// CHECK: %[[VAL_42:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_44]], %[[VAL_43]] : !cc.ptr> -// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_42]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_46:.*]] = arith.muli %[[VAL_41]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_47:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_48:.*]] = cc.compute_ptr %[[VAL_47]]{{\[}}%[[VAL_46]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_48]], %[[VAL_45]] : !cc.ptr> -// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_42]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_48]], %[[VAL_49]] : !cc.ptr> -// CHECK: call @free(%[[VAL_31]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_42:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_42]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.load %[[VAL_43]] : !cc.ptr> 
+// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_42]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_46:.*]] = cc.load %[[VAL_45]] : !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_49]], %[[VAL_48]] : !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.compute_ptr %[[VAL_47]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_51:.*]] = arith.muli %[[VAL_46]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_52:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_53:.*]] = cc.compute_ptr %[[VAL_52]]{{\[}}%[[VAL_51]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_53]], %[[VAL_50]] : !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_47]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_53]], %[[VAL_54]] : !cc.ptr> +// CHECK: call @free(%[[VAL_36]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } + +} + // CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> // CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) // CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} @@ -204,7 +220,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr) // CHECK: func.func private @free(!cc.ptr) // CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) // CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { From 1e69502b725b38e8d4b4458740ac8e0f4aa0d700 Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Wed, 6 Nov 2024 12:43:11 -0800 Subject: [PATCH 05/19] Fix warnings. Signed-off-by: Eric Schweitz --- lib/Optimizer/Transforms/GenKernelExecution.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 7e450c2da7..5dbc4aa5a2 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -580,7 +580,9 @@ static Value descendThroughDynamicType(Location loc, OpBuilder &builder, Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); Value strSize = builder.create(loc, i64Ty, packedTy); - for (auto [i, m] : llvm::enumerate(t.getMembers())) { + for (auto iter : llvm::enumerate(t.getMembers())) { + std::int32_t i = iter.index(); + auto m = iter.value(); if (cudaq::cc::isDynamicType(m)) { auto hostPtrTy = cast(arg.getType()); auto hostStrTy = @@ -763,13 +765,14 @@ populateMessageBuffer(Location loc, OpBuilder &builder, Value msgBufferBase, cast(msgBufferBase.getType()).getElementType()); // Loop over all the arguments and populate the message buffer. for (auto [idx, arg, devArgTy] : zippy) { + std::int32_t i = idx; if (cudaq::cc::isDynamicType(devArgTy)) { assert(addendum && "must have addendum to encode dynamic argument(s)"); // Get the address of the slot to be filled. 
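+        // The tuple index from the zip is a 64-bit value, while the offsets
+        // fed to cc.compute_ptr are ArrayRef<std::int32_t>; narrowing it once
+        // up front silences the conversion warnings this patch fixes. A
+        // sketch of the pattern (op class spelled out here for clarity):
+        //   std::int32_t i = idx;
+        //   builder.create<cudaq::cc::ComputePtrOp>(
+        //       loc, ptrTy, msgBufferBase, ArrayRef<std::int32_t>{i});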
- auto memberTy = cast(structTy).getMember(idx); + auto memberTy = cast(structTy).getMember(i); auto ptrTy = cudaq::cc::PointerType::get(memberTy); auto slot = builder.create( - loc, ptrTy, msgBufferBase, ArrayRef{idx}); + loc, ptrTy, msgBufferBase, ArrayRef{i}); addendum = populateDynamicAddendum(loc, builder, devArgTy, arg, slot, addendum, addendumScratch); continue; @@ -784,10 +787,10 @@ populateMessageBuffer(Location loc, OpBuilder &builder, Value msgBufferBase, continue; // Get the address of the slot to be filled. - auto memberTy = cast(structTy).getMember(idx); + auto memberTy = cast(structTy).getMember(i); auto ptrTy = cudaq::cc::PointerType::get(memberTy); Value slot = builder.create( - loc, ptrTy, msgBufferBase, ArrayRef{idx}); + loc, ptrTy, msgBufferBase, ArrayRef{i}); // Argument is a packaged kernel. In this case, the argument is some // unknown kernel that may be called. The packaged argument is coming From cafa13067ad533a974fbf36bdaafedd89249f36c Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Wed, 6 Nov 2024 16:50:47 -0800 Subject: [PATCH 06/19] Fix python test. Signed-off-by: Eric Schweitz --- lib/Optimizer/Transforms/QuakeSynthesizer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp index 7365c03370..82e6896c06 100644 --- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp +++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp @@ -122,8 +122,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, ATTR arrayAttr, MAKER makeElementValue) { auto *ctx = builder.getContext(); auto argTy = argument.getType(); - assert(isa(argTy)); - auto strTy = cast(argTy); + assert(isa(argTy)); + auto strTy = cast(argTy); auto eleTy = cast(strTy.getElementType()); builder.setInsertionPointToStart(argument.getOwner()); auto argLoc = argument.getLoc(); @@ -566,7 +566,7 @@ class QuakeSynthesizer // If std::vector type, add it to the list of vector info. // These will be processed when we reach the buffer's appendix. - if (auto vecTy = dyn_cast(type)) { + if (auto vecTy = dyn_cast(type)) { auto eleTy = vecTy.getElementType(); if (!isa( eleTy)) { From 2b56f8f177752a898bba2d017597784ad51886e8 Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Thu, 7 Nov 2024 08:27:06 -0800 Subject: [PATCH 07/19] Fix aarch64 bug. Signed-off-by: Eric Schweitz --- lib/Optimizer/Transforms/GenKernelExecution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 5dbc4aa5a2..21c5c9c9ab 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -812,7 +812,7 @@ populateMessageBuffer(Location loc, OpBuilder &builder, Value msgBufferBase, if (isa(devArgTy)) arg = builder.create(loc, memberTy, arg); - if (isa(arg.getType()) && + if (isa(arg.getType()) && (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) { slot = builder.create( loc, cudaq::cc::PointerType::get(arg.getType()), slot); From 25b3e02dcd8bc32ec8b200f4fe98b41d6c1781ec Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Thu, 7 Nov 2024 10:02:13 -0800 Subject: [PATCH 08/19] Remove some of the NYI from tests that are fixed. 
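The NYI macro expands to a plain comment in place of __qpu__, parking a
kernel outside the bridge until support lands. This patch drops it where
the rewritten launcher now handles the signature. A sketch of the kind of
kernel this re-enables (the element type is chosen here purely for
illustration):

    class Qernel7 {
    public:
      std::vector<float> operator()(std::vector<float> v) __qpu__ {
        return v;
      }
    };

Kernels the bridge still rejects, such as those returning a struct with a
std::vector member, keep the NYI marker.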
Signed-off-by: Eric Schweitz --- targettests/Kernel/signature-0.cpp | 5 +---- targettests/Kernel/signature-4.cpp | 12 ++++-------- targettests/Kernel/signature-5.cpp | 21 ++++++++++++++++++--- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/targettests/Kernel/signature-0.cpp b/targettests/Kernel/signature-0.cpp index 882fb24704..0adf9c8779 100644 --- a/targettests/Kernel/signature-0.cpp +++ b/targettests/Kernel/signature-0.cpp @@ -65,12 +65,9 @@ class Qernel6 { } }; -// FIXME: unhandled ctor call -#define NYI /*__qpu__*/ - class Qernel7 { public: - std::vector operator()(std::vector v) NYI { return v; } + std::vector operator()(std::vector v) __qpu__ { return v; } }; int main() { diff --git a/targettests/Kernel/signature-4.cpp b/targettests/Kernel/signature-4.cpp index 14deb5c55f..e1826b1c99 100644 --- a/targettests/Kernel/signature-4.cpp +++ b/targettests/Kernel/signature-4.cpp @@ -14,10 +14,8 @@ // Tests that we can take a small struct, a struct with a vector member, a // vector of small structs, and a large struct as an argument and return the -// same. Currently, DefaultQPU::launchKernel does not handle return values at -// all. +// same. -// FIXME #define NYI /*__qpu__*/ void ok() { std::cout << "ok\n"; } @@ -39,7 +37,7 @@ class QernelS1a { }; struct QernelS1 { - S1 operator()(S1 s) NYI { + S1 operator()(S1 s) __qpu__ { if (s._1 == 4 && s._2 == 8.2) ok(); else @@ -48,7 +46,6 @@ struct QernelS1 { } }; -// struct with vector member not yet supported struct S2 { int _1; std::vector _2; @@ -66,6 +63,7 @@ struct QernelS2a { }; struct QernelS2 { + // kernel result type not supported (bridge) S2 operator()(S2 s) NYI { s._1++; s._2[0] = 0.0; @@ -84,16 +82,14 @@ class QernelS3a { } }; -// ctor in return not supported struct QernelS3 { - std::vector operator()(std::vector s) NYI { + std::vector operator()(std::vector s) __qpu__ { s[0]._1++; s[0]._2 = 0.0; return s; } }; -// bug in bridge std::vector mock_ctor(const std::vector &v) { return v; } struct QernelS4 { diff --git a/targettests/Kernel/signature-5.cpp b/targettests/Kernel/signature-5.cpp index a42b5b8518..a2fa263560 100644 --- a/targettests/Kernel/signature-5.cpp +++ b/targettests/Kernel/signature-5.cpp @@ -15,7 +15,6 @@ // Test kernels can take arguments of tuple or pair as well as return values of // same. -// FIXME: tuple and pair are not handled. 
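+// NYI now remains on the kernels that return a tuple or pair; kernels that
+// only take them as arguments are annotated __qpu__ directly below.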
#define NYI /*__qpu__*/ void ok() { std::cout << "ok\n"; } @@ -24,7 +23,7 @@ void fail() { std::cout << "fail\n"; } using S1 = std::tuple; struct QernelS1a { - void operator()(S1 s) NYI { + void operator()(S1 s) __qpu__ { if (std::get<0>(s) == 1 && std::get<1>(s) == 2 && std::get<2>(s) == 4) ok(); else @@ -38,10 +37,18 @@ struct QernelS1 { } }; +S1 qernel_s1b_helper(S1 s) { + return {std::get<2>(s) + 1, std::get<1>(s) + 1, std::get<0>(s) + 1}; +} + +struct QernelS1b { + S1 operator()(S1 s) NYI { return qernel_s1b_helper(s); } +}; + using S2 = std::tuple>; struct QernelS2a { - void operator()(S2 s) NYI { + void operator()(S2 s) __qpu__ { if (std::get<0>(s) == 8.16 && std::get<1>(s) == 32.64f && std::get<2>(s).size() == 2) ok(); @@ -88,6 +95,13 @@ int main() { ok(); else fail(); + std::cout << "QernelS1b "; + auto updated_s1b = QernelS1b{}(s1); + if (std::get<0>(updated_s1b) == 5 && std::get<1>(updated_s1b) == 3 && + std::get<2>(updated_s1b) == 2) + ok(); + else + fail(); std::vector v = {128, 256}; S2 s2 = {8.16, 32.64f, v}; @@ -117,6 +131,7 @@ int main() { // clang-format off // CHECK-LABEL: QernelS1a ok // CHECK-NEXT: QernelS1 ok +// CHECK-NEXT: QernelS1b ok // CHECK-NEXT: QernelS2a ok // CHECK-NEXT: QernelS2 ok // CHECK-NEXT: ok From 33f78ebb2e8ad8e5841a7b01e8ae9243ab238d71 Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Thu, 7 Nov 2024 15:58:14 -0800 Subject: [PATCH 09/19] Aarch64 fixes. Signed-off-by: Eric Schweitz --- targettests/Kernel/signature-4.cpp | 2 +- test/AST-Quake/calling_convention-aarch64.cpp | 2 +- test/Translate/return_values.qke | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/targettests/Kernel/signature-4.cpp b/targettests/Kernel/signature-4.cpp index e1826b1c99..00e9effc93 100644 --- a/targettests/Kernel/signature-4.cpp +++ b/targettests/Kernel/signature-4.cpp @@ -37,7 +37,7 @@ class QernelS1a { }; struct QernelS1 { - S1 operator()(S1 s) __qpu__ { + S1 operator()(S1 s) NYI { if (s._1 == 4 && s._2 == 8.2) ok(); else diff --git a/test/AST-Quake/calling_convention-aarch64.cpp b/test/AST-Quake/calling_convention-aarch64.cpp index 174aaf3558..22d60856e0 100644 --- a/test/AST-Quake/calling_convention-aarch64.cpp +++ b/test/AST-Quake/calling_convention-aarch64.cpp @@ -271,7 +271,7 @@ struct V3 { // CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( // CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, // CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, -// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.array}>>) // clang-format on //===----------------------------------------------------------------------===// diff --git a/test/Translate/return_values.qke b/test/Translate/return_values.qke index 0fb1365420..35922fc7da 100644 --- a/test/Translate/return_values.qke +++ b/test/Translate/return_values.qke @@ -196,7 +196,7 @@ func.func @test_1(%this: !cc.ptr) -> i16 { // CHECK-LABEL: define i16 @test_1(i8* nocapture readnone // CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { // CHECK: %[[VAL_1:.*]] = alloca [0 x i8*], align 8 -// CHECK: %[[VAL_2:.*]] = alloca i16, align 2 +// CHECK: %[[VAL_2:.*]] = alloca i16 // CHECK: %[[VAL_3:.*]] = alloca { i8**, i8**, i8** }, align 8 // CHECK: %[[VAL_4:.*]] = bitcast i16* %[[VAL_2]] to i8* // CHECK: %[[VAL_5:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_1]], i64 0, i64 0 From b7c0f16d9a4dc4f66b3ff14518d6c5fbd3a8747b Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Fri, 8 Nov 2024 07:03:56 -0800 Subject: [PATCH 10/19] Remove align since it is 
different on aarch64. Signed-off-by: Eric Schweitz --- test/Translate/return_values.qke | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Translate/return_values.qke b/test/Translate/return_values.qke index 35922fc7da..9361f10012 100644 --- a/test/Translate/return_values.qke +++ b/test/Translate/return_values.qke @@ -208,7 +208,7 @@ func.func @test_1(%this: !cc.ptr) -> i16 { // CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_8]], align 8 // CHECK: %[[VAL_9:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_3]] to i8* // CHECK: %[[VAL_10:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_4]], i64 2, i64 0, i8* nonnull %[[VAL_9]]) -// CHECK: %[[VAL_11:.*]] = load i16, i16* %[[VAL_2]], align 2 +// CHECK: %[[VAL_11:.*]] = load i16, i16* %[[VAL_2]] // CHECK: ret i16 %[[VAL_11]] // CHECK: } From 08effd4cdc893890afd0a05a0c90f9f3f766d69c Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Wed, 13 Nov 2024 08:12:21 -0800 Subject: [PATCH 11/19] Updates to make std::string representation depend on the header files in use. Signed-off-by: Eric Schweitz --- include/cudaq/Optimizer/Builder/Factory.h | 10 +- include/cudaq/Optimizer/Builder/Runtime.h | 8 + lib/Frontend/nvqpp/ASTBridge.cpp | 26 ++- lib/Optimizer/Builder/Factory.cpp | 30 +-- lib/Optimizer/Builder/Intrinsics.cpp | 11 ++ .../Transforms/GenKernelExecution.cpp | 182 ++++++++++-------- runtime/cudaq/qis/pauli_word.h | 36 +++- 7 files changed, 195 insertions(+), 108 deletions(-) diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h index dccca9be24..99083de521 100644 --- a/include/cudaq/Optimizer/Builder/Factory.h +++ b/include/cudaq/Optimizer/Builder/Factory.h @@ -128,9 +128,13 @@ inline mlir::Type stateImplType(mlir::Type eleTy) { return cudaq::opt::factory::getPointerType(eleTy.getContext()); } -// Host side types for std::string and std::vector +// Generate host side type for std::string. The result is the type of a block of +// bytes and the length to allocate. This allows for the creation of code to +// allocate a variable, stride across such a variable, etc. The ModuleOp must +// contain the sizeof a pauli_word in its attributes. +cudaq::cc::ArrayType genHostStringType(mlir::ModuleOp module); -cudaq::cc::StructType stlStringType(mlir::MLIRContext *ctx); +// Host side types for std::vector cudaq::cc::StructType stlVectorType(mlir::Type eleTy); //===----------------------------------------------------------------------===// @@ -247,7 +251,7 @@ mlir::FunctionType toHostSideFuncType(mlir::FunctionType funcTy, bool addThisPtr, mlir::ModuleOp module); /// Convert device type, \p ty, to host side type. -mlir::Type convertToHostSideType(mlir::Type ty); +mlir::Type convertToHostSideType(mlir::Type ty, mlir::ModuleOp module); // Return `true` if the given type corresponds to a standard vector type // according to our convention. diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h index e65c05a857..8bcacbdb35 100644 --- a/include/cudaq/Optimizer/Builder/Runtime.h +++ b/include/cudaq/Optimizer/Builder/Runtime.h @@ -52,4 +52,12 @@ static constexpr const char CudaqRegisterKernelName[] = static constexpr const char cudaqAHSPrefixName[] = "__analog_hamiltonian_kernel__"; +// Host-side helper functions for working with `cudaq::pauli_word` or a +// `std::string`. 
+static constexpr const char sizeofStringAttrName[] = "cc.sizeof_string"; +static constexpr const char getPauliWordSize[] = + "_ZNK5cudaq10pauli_word11_nvqpp_sizeEv"; +static constexpr const char getPauliWordData[] = + "_ZNK5cudaq10pauli_word11_nvqpp_dataEv"; + } // namespace cudaq::runtime diff --git a/lib/Frontend/nvqpp/ASTBridge.cpp b/lib/Frontend/nvqpp/ASTBridge.cpp index 806f3c6bde..2e4f1d810f 100644 --- a/lib/Frontend/nvqpp/ASTBridge.cpp +++ b/lib/Frontend/nvqpp/ASTBridge.cpp @@ -153,10 +153,10 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor { using Base = clang::RecursiveASTVisitor; explicit QPUCodeFinder( cudaq::EmittedFunctionsCollection &funcsToEmit, clang::CallGraph &cgb, - clang::ItaniumMangleContext *mangler, + clang::ItaniumMangleContext *mangler, ModuleOp module, std::unordered_map &customOperations) : functionsToEmit(funcsToEmit), callGraphBuilder(cgb), mangler(mangler), - customOperationNames(customOperations) {} + module(module), customOperationNames(customOperations) {} /// Add a kernel to the list of kernels to process. template @@ -332,6 +332,25 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor { tuplesAreReversed = !opt->isZero(); } } + if (cudaq::isInNamespace(x, "cudaq") && + cudaq::isInNamespace(x, "details") && + x->getName().equals("_nvqpp_sizeof")) { + // This constexpr is the sizeof a pauli_word and a std::string. + auto loc = x->getLocation(); + auto opt = x->getAnyInitializer()->getIntegerConstantExpr( + x->getASTContext(), &loc, false); + assert(opt && "must compute the sizeof a cudaq::pauli_word"); + auto sizeofString = opt->getZExtValue(); + auto sizeAttr = module->getAttr(cudaq::runtime::sizeofStringAttrName); + if (sizeAttr) { + assert(sizeofString == cast(sizeAttr).getUInt()); + } else { + auto *ctx = module.getContext(); + auto i64Ty = IntegerType::get(ctx, 64); + module->setAttr(cudaq::runtime::sizeofStringAttrName, + IntegerAttr::get(i64Ty, sizeofString)); + } + } // The check to make sure that quantum data types are only used in kernels // is done here. This checks both variable declarations and parameters. if (quantumTypesNotAllowed) @@ -357,6 +376,7 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor { cudaq::EmittedFunctionsCollection &functionsToEmit; clang::CallGraph &callGraphBuilder; clang::ItaniumMangleContext *mangler; + ModuleOp module; std::unordered_map &customOperationNames; // A class that is being visited. Need to run semantics checks on it if and // only if it has a quantum kernel. @@ -648,7 +668,7 @@ void ASTBridgeAction::ASTBridgeConsumer::HandleTranslationUnit( bool ASTBridgeAction::ASTBridgeConsumer::HandleTopLevelDecl( clang::DeclGroupRef dg) { - QPUCodeFinder finder(functionsToEmit, callGraphBuilder, mangler, + QPUCodeFinder finder(functionsToEmit, callGraphBuilder, mangler, module.get(), customOperationNames); // Loop over all decls, saving the function decls that are quantum kernels. 
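+  // The finder also carries the ModuleOp now: when it encounters the
+  // header-defined constant cudaq::details::_nvqpp_sizeof, it records the
+  // sizeof a std::string / cudaq::pauli_word on the module as the
+  // cc.sizeof_string attribute for later host-side type generation.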
for (const auto *decl : dg) diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index 5c090d4271..dc7f866610 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -7,6 +7,7 @@ ******************************************************************************/ #include "cudaq/Optimizer/Builder/Intrinsics.h" +#include "cudaq/Optimizer/Builder/Runtime.h" #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "cudaq/Optimizer/Dialect/CC/CCOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" @@ -305,12 +306,13 @@ cc::LoopOp factory::createMonotonicLoop( return loop; } -cc::StructType factory::stlStringType(MLIRContext *ctx) { +cc::ArrayType factory::genHostStringType(ModuleOp mod) { + auto *ctx = mod.getContext(); auto i8Ty = IntegerType::get(ctx, 8); - auto ptrI8Ty = cc::PointerType::get(i8Ty); - auto i64Ty = IntegerType::get(ctx, 64); - auto padTy = cc::ArrayType::get(ctx, i8Ty, 16); - return cc::StructType::get(ctx, ArrayRef{ptrI8Ty, i64Ty, padTy}); + auto sizeAttr = mod->getAttr(cudaq::runtime::sizeofStringAttrName); + assert(sizeAttr && "module must have cc.sizeof_string attribute"); + auto size = cast(sizeAttr).getInt(); + return cc::ArrayType::get(ctx, i8Ty, size); } // FIXME: We should get the underlying structure of a std::vector from the @@ -358,18 +360,19 @@ Type factory::getSRetElementType(FunctionType funcTy) { return funcTy.getResult(0); } -Type factory::convertToHostSideType(Type ty) { +Type factory::convertToHostSideType(Type ty, ModuleOp mod) { if (auto memrefTy = dyn_cast(ty)) - return stlHostVectorType(convertToHostSideType(memrefTy.getElementType())); + return stlHostVectorType( + convertToHostSideType(memrefTy.getElementType(), mod)); if (isa(ty)) return cc::PointerType::get(IntegerType::get(ty.getContext(), 8)); - if (isa(ty)) - return factory::stlStringType(ty.getContext()); + if (auto csTy = dyn_cast(ty)) + return genHostStringType(mod); auto *ctx = ty.getContext(); if (auto structTy = dyn_cast(ty)) { SmallVector newMembers; for (auto mem : structTy.getMembers()) - newMembers.push_back(convertToHostSideType(mem)); + newMembers.push_back(convertToHostSideType(mem, mod)); if (structTy.getName()) return cc::StructType::get(ctx, structTy.getName(), newMembers, structTy.getBitSize(), structTy.getAlignment(), @@ -589,7 +592,7 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, // returned via a sret argument in the first position. When this argument // is added, the this pointer becomes the second argument. Both are opaque // pointers at this point. - auto eleTy = convertToHostSideType(getSRetElementType(funcTy)); + auto eleTy = convertToHostSideType(getSRetElementType(funcTy), module); inputTys.push_back(cc::PointerType::get(eleTy)); hasSRet = true; } else { @@ -605,7 +608,7 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, // Add all the explicit (not hidden) arguments after the hidden ones. for (auto kernelTy : funcTy.getInputs()) { - auto hostTy = convertToHostSideType(kernelTy); + auto hostTy = convertToHostSideType(kernelTy, module); if (auto strTy = dyn_cast(hostTy)) { // On x86_64 and aarch64, a struct that is smaller than 128 bits may be // passed in registers as separate arguments. See classifyArgumentType() @@ -646,6 +649,9 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr, } // Pass a struct as a byval pointer. 
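+      // (Sketch of the two pointer cases, assuming a 32-byte std::string
+      // layout: a large user struct arrives as a byval pointer to its struct
+      // type, while the array case added below turns a cudaq::pauli_word
+      // argument into !cc.ptr<!cc.array<i8 x 32>>, a pointer to an opaque
+      // blob of the header-declared size.)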
hostTy = cc::PointerType::get(hostTy); + } else if (isa(hostTy)) { + // Pass a raw data block as a pointer. (It's a struct passed as a blob.) + hostTy = cc::PointerType::get(hostTy); } inputTys.push_back(hostTy); } diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index d0db1bdf82..4dbe616967 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -49,6 +49,17 @@ inline bool operator<(const IntrinsicCode &icode, const IntrinsicCode &jcode) { /// well as prototypes for LLVM intrinsics and C library calls that are used by /// the compiler. The table should be kept in sorted order. static constexpr IntrinsicCode intrinsicTable[] = { + // These following pauli_word helper functions are only available on the + // host-side. They ought not be called in kernel code. + {cudaq::runtime::getPauliWordData, + {}, + "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_dataEv(%pw : " + "!cc.ptr) -> !cc.ptr"}, + {cudaq::runtime::getPauliWordSize, + {cudaq::runtime::getPauliWordData}, + "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_sizeEv(%pw : " + "!cc.ptr) -> i64"}, + // Initialize a (preallocated) buffer (the first parameter) with i64 values // on the semi-open range `[0..n)` where `n` is the second parameter. {cudaq::runtime::getLinkableKernelKey, diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 21c5c9c9ab..4b7a5a5018 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -96,18 +96,15 @@ static Value genStringLength(Location loc, OpBuilder &builder, Value stringArg) { Type stringTy = stringArg.getType(); assert(isa(stringTy) && - isa( + isa( cast(stringTy).getElementType()) && - cast( - cast(stringTy).getElementType()) - .getMember(1) == builder.getI64Type() && "host side string expected"); - auto ptrTy = cast(stringTy); - auto strTy = cast(ptrTy.getElementType()); - auto lenPtr = builder.create( - loc, cudaq::cc::PointerType::get(strTy.getMember(1)), stringArg, - ArrayRef{1}); - return builder.create(loc, lenPtr); + auto callArg = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI8Type()), stringArg); + auto lenRes = builder.create(loc, builder.getI64Type(), + cudaq::runtime::getPauliWordSize, + ValueRange{callArg}); + return lenRes.getResult(0); } /// Generate code that computes the size in bytes of a `std::vector` array @@ -208,8 +205,8 @@ static bool isDynamicSignature(FunctionType devFuncTy) { } static std::pair -genByteSizeAndElementCount(Location loc, OpBuilder &builder, Type eleTy, - Value size, Value arg, Type t) { +genByteSizeAndElementCount(Location loc, OpBuilder &builder, ModuleOp module, + Type eleTy, Value size, Value arg, Type t) { // If this is a vector>, convert the bytes of vector to bytes of // length (i64). if (auto sty = dyn_cast(eleTy)) { @@ -227,8 +224,10 @@ genByteSizeAndElementCount(Location loc, OpBuilder &builder, Type eleTy, // If this is a vector, convert the bytes of string to bytes of length // (i64). 
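+  // Worked example, assuming a 32-byte std::string (four i64 words each): a
+  // host block of 4 strings spans 128 bytes; scaling by the per-string word
+  // count leaves 32 bytes, exactly the room needed to encode the 4 lengths
+  // as i64, and dividing by 8 recovers the element count of 4.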
if (isa(eleTy)) { - auto fore = builder.create(loc, 4, 64); - size = builder.create(loc, size, fore); + auto arrTy = cudaq::opt::factory::genHostStringType(module); + auto words = + builder.create(loc, arrTy.getSize() / 8, 64); + size = builder.create(loc, size, words); auto ate = builder.create(loc, 8, 64); Value count = builder.create(loc, size, ate); return {size, count}; @@ -272,27 +271,27 @@ static bool hasStdVectorBool(Type ty) { // type for a `std::vector`. The former is a unique data type with a size // of 40 bytes. The latter is identical to `std::vector` (which has a size // of 24 bytes). -static Type convertToTransientType(Type ty) { +static Type convertToTransientType(Type ty, ModuleOp mod) { if (isStdVectorBool(ty)) { auto *ctx = ty.getContext(); return cudaq::opt::factory::stlVectorType(IntegerType::get(ctx, 1)); } if (auto sty = dyn_cast(ty)) return cudaq::opt::factory::stlVectorType( - convertToTransientType(sty.getElementType())); + convertToTransientType(sty.getElementType(), mod)); if (auto sty = dyn_cast(ty)) { SmallVector newMems; for (auto mem : sty.getMembers()) - newMems.push_back(convertToTransientType(mem)); + newMems.push_back(convertToTransientType(mem, mod)); auto *ctx = ty.getContext(); return cudaq::cc::StructType::get(ctx, newMems); } - return cudaq::opt::factory::convertToHostSideType(ty); + return cudaq::opt::factory::convertToHostSideType(ty, mod); } static std::pair -convertAllStdVectorBool(Location loc, OpBuilder &builder, Value arg, Type ty, - Value heapTracker, +convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, + Value arg, Type ty, Value heapTracker, std::optional preallocated = std::nullopt) { // If we are here, `ty` must be a `std::vector` or recursively contain a // `std::vector`. @@ -328,13 +327,13 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, Value arg, Type ty, cast(startTy).getElementType()); auto input = builder.create( loc, cudaq::cc::PointerType::get(subArrTy), startInput); - auto transientTy = convertToTransientType(sty); + auto transientTy = convertToTransientType(sty, module); Value tmp = builder.create(loc, transientTy); Value sizeDelta = genVectorSize(loc, builder, arg); auto count = [&]() -> Value { if (cudaq::cc::isDynamicType(seleTy)) { - auto p = genByteSizeAndElementCount(loc, builder, seleTy, sizeDelta, - arg, sty); + auto p = genByteSizeAndElementCount(loc, builder, module, seleTy, + sizeDelta, arg, sty); return p.second; } auto sizeEle = builder.create( @@ -372,7 +371,7 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, Value arg, Type ty, // Loop over each element in the outer vector and initialize it to the inner // vector value. The data may be heap allocated.) - auto transientEleTy = convertToTransientType(seleTy); + auto transientEleTy = convertToTransientType(seleTy, module); auto transientBufferTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(transientEleTy)); auto buffer = @@ -387,15 +386,15 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, Value arg, Type ty, auto currentVector = builder.create( loc, cudaq::cc::PointerType::get(transientEleTy), buffer, ArrayRef{i}); - convertAllStdVectorBool(loc, builder, inp, seleTy, heapTracker, - currentVector); + convertAllStdVectorBool(loc, builder, module, inp, seleTy, + heapTracker, currentVector); }); return {tmp, true}; } // Handle `struct { ... };`. 
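+  // E.g. a device-side struct {i32, std::vector<bool>} is rebuilt member by
+  // member into a transient struct whose vector<bool> member has been
+  // unpacked out of the bit-compressed std::vector<bool> representation into
+  // a plain vector, so it can be packed like any other std::vector from
+  // here on.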
if (auto sty = dyn_cast(ty)) { - auto bufferTy = convertToTransientType(ty); + auto bufferTy = convertToTransientType(ty, module); auto argPtrTy = cast(arg.getType()); auto argStrTy = cast(argPtrTy.getElementType()); @@ -409,23 +408,23 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, Value arg, Type ty, auto fromPtr = builder.create( loc, cudaq::cc::PointerType::get(argStrTy.getMember(i)), arg, ArrayRef{i}); - auto transientTy = convertToTransientType(memTy); + auto transientTy = convertToTransientType(memTy, module); Value toPtr = builder.create( loc, cudaq::cc::PointerType::get(transientTy), buffer, ArrayRef{i}); - convertAllStdVectorBool(loc, builder, fromPtr, memTy, heapTracker, toPtr); + convertAllStdVectorBool(loc, builder, module, fromPtr, memTy, heapTracker, + toPtr); } return {buffer, true}; } return {arg, false}; } -static std::pair unpackAnyStdVectorBool(Location loc, - OpBuilder &builder, - Value arg, Type ty, - Value heapTracker) { +static std::pair +unpackAnyStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, + Value arg, Type ty, Value heapTracker) { if (hasStdVectorBool(ty)) - return convertAllStdVectorBool(loc, builder, arg, ty, heapTracker); + return convertAllStdVectorBool(loc, builder, module, arg, ty, heapTracker); return {arg, false}; } @@ -436,8 +435,9 @@ static std::pair unpackAnyStdVectorBool(Location loc, // cannot be encoded. template static SmallVector> -zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, - TypeRange types, Value heapTracker) { +zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ModuleOp module, + ValueRange args, TypeRange types, + Value heapTracker) { SmallVector> result; if constexpr (argsAreReferences) { // Simple case: the number of args must be equal to the types. @@ -451,7 +451,8 @@ zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, isa(ty))) v = builder.create(loc, v); // Python will pass a std::vector to us here. Unpack it. - auto pear = unpackAnyStdVectorBool(loc, builder, v, ty, heapTracker); + auto pear = + unpackAnyStdVectorBool(loc, builder, module, v, ty, heapTracker); v = pear.first; result.emplace_back(iter.index(), v, ty); } @@ -470,8 +471,8 @@ zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, // std::vector isn't really a std::vector<>. Use the helper // function to unpack it so it looks like any other vector. - auto pear = - unpackAnyStdVectorBool(loc, builder, *argIter, devTy, heapTracker); + auto pear = unpackAnyStdVectorBool(loc, builder, module, *argIter, devTy, + heapTracker); if (pear.second) { result.emplace_back(argPos, pear.first, devTy); continue; @@ -519,8 +520,8 @@ zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ValueRange args, } static Value descendThroughDynamicType(Location loc, OpBuilder &builder, - Type ty, Value addend, Value arg, - Value tmp) { + ModuleOp module, Type ty, Value addend, + Value arg, Value tmp) { auto i64Ty = builder.getI64Type(); Value tySize = TypeSwitch(ty) @@ -538,8 +539,8 @@ static Value descendThroughDynamicType(Location loc, OpBuilder &builder, return size; // Otherwise, we have a recursively dynamic case. 
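+          // E.g. for a std::vector<std::vector<int>>, the outer vector's
+          // bytes collapse to one i64 length per inner vector, and each
+          // inner vector's own payload is accumulated by the recursive
+          // descent below.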
- auto [bytes, count] = - genByteSizeAndElementCount(loc, builder, eleTy, size, arg, t); + auto [bytes, count] = genByteSizeAndElementCount( + loc, builder, module, eleTy, size, arg, t); assert(count && "vector must have elements"); size = bytes; @@ -567,7 +568,7 @@ static Value descendThroughDynamicType(Location loc, OpBuilder &builder, ArrayRef{i}); auto tmpVal = builder.create(loc, tmp); Value innerSize = descendThroughDynamicType( - loc, builder, eleTy, tmpVal, ai, tmp); + loc, builder, module, eleTy, tmpVal, ai, tmp); builder.create(loc, innerSize, tmp); }); return builder.create(loc, tmp); @@ -590,8 +591,8 @@ static Value descendThroughDynamicType(Location loc, OpBuilder &builder, auto pm = cudaq::cc::PointerType::get(hostStrTy.getMember(i)); auto ai = builder.create( loc, pm, arg, ArrayRef{i}); - strSize = descendThroughDynamicType(loc, builder, m, strSize, - ai, tmp); + strSize = descendThroughDynamicType(loc, builder, module, m, + strSize, ai, tmp); } } return strSize; @@ -604,14 +605,17 @@ static Value descendThroughDynamicType(Location loc, OpBuilder &builder, return builder.create(loc, tySize, addend); } -static Value genSizeOfDynamicMessageBuffer( - Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, - ArrayRef> zippy, Value tmp) { +static Value +genSizeOfDynamicMessageBuffer(Location loc, OpBuilder &builder, ModuleOp module, + cudaq::cc::StructType structTy, + ArrayRef> zippy, + Value tmp) { auto i64Ty = builder.getI64Type(); Value initSize = builder.create(loc, i64Ty, structTy); for (auto [_, a, t] : zippy) if (cudaq::cc::isDynamicType(t)) - initSize = descendThroughDynamicType(loc, builder, t, initSize, a, tmp); + initSize = + descendThroughDynamicType(loc, builder, module, t, initSize, a, tmp); return initSize; } @@ -621,13 +625,14 @@ static Value populateStringAddendum(Location loc, OpBuilder &builder, Value size = genStringLength(loc, builder, host); builder.create(loc, size, sizeSlot); auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - auto ptrPtrI8 = getPointerToPointerType(builder); - auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); - auto fromPtr = builder.create(loc, fromPtrPtr); + auto fromPtr = builder.create(loc, ptrI8Ty, host); + auto dataPtr = builder.create( + loc, ptrI8Ty, cudaq::runtime::getPauliWordData, ValueRange{fromPtr}); auto notVolatile = builder.create(loc, 0, 1); auto toPtr = builder.create(loc, ptrI8Ty, addendum); - builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{toPtr, fromPtr, size, notVolatile}); + builder.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, dataPtr.getResult(0), size, notVolatile}); auto ptrI8Arr = getByteAddressableType(builder); auto addBytes = builder.create(loc, ptrI8Arr, addendum); return builder.create( @@ -655,8 +660,9 @@ static Value populateVectorAddendum(Location loc, OpBuilder &builder, } static Value populateDynamicAddendum(Location loc, OpBuilder &builder, - Type devArgTy, Value host, Value sizeSlot, - Value addendum, Value addendumScratch) { + ModuleOp module, Type devArgTy, Value host, + Value sizeSlot, Value addendum, + Value addendumScratch) { if (isa(devArgTy)) return populateStringAddendum(loc, builder, host, sizeSlot, addendum); if (auto vecTy = dyn_cast(devArgTy)) { @@ -664,8 +670,8 @@ static Value populateDynamicAddendum(Location loc, OpBuilder &builder, if (cudaq::cc::isDynamicType(eleTy)) { // Recursive case. Visit each dynamic element, copying it. 
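+      // Layout reminder: the vector's total byte size lands in its prefix
+      // slot here, then each element recursively appends its own lengths and
+      // payload to the addendum; addendumScratch threads the running
+      // addendum pointer through the loop.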
Value size = genVectorSize(loc, builder, host); - auto [bytes, count] = - genByteSizeAndElementCount(loc, builder, eleTy, size, host, devArgTy); + auto [bytes, count] = genByteSizeAndElementCount( + loc, builder, module, eleTy, size, host, devArgTy); size = bytes; builder.create(loc, size, sizeSlot); @@ -719,8 +725,9 @@ static Value populateDynamicAddendum(Location loc, OpBuilder &builder, auto subHost = builder.create( loc, hostBeginEleTy, hostBlock, ArrayRef{i}); - Value newAddm = populateDynamicAddendum( - loc, builder, eleTy, subHost, subSlot, addm, addendumScratch); + Value newAddm = + populateDynamicAddendum(loc, builder, module, eleTy, subHost, + subSlot, addm, addendumScratch); builder.create(loc, newAddm, addendumScratch); }); return builder.create(loc, addendumScratch); @@ -744,8 +751,9 @@ static Value populateDynamicAddendum(Location loc, OpBuilder &builder, Value fieldInSlot = builder.create( loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot, ArrayRef{iterIdx}); - addendum = populateDynamicAddendum(loc, builder, iterTy, val, fieldInSlot, - addendum, addendumScratch); + addendum = + populateDynamicAddendum(loc, builder, module, iterTy, val, + fieldInSlot, addendum, addendumScratch); } else { Value fieldInSlot = builder.create( loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, @@ -758,7 +766,8 @@ static Value populateDynamicAddendum(Location loc, OpBuilder &builder, } static void -populateMessageBuffer(Location loc, OpBuilder &builder, Value msgBufferBase, +populateMessageBuffer(Location loc, OpBuilder &builder, ModuleOp module, + Value msgBufferBase, ArrayRef> zippy, Value addendum = {}, Value addendumScratch = {}) { auto structTy = cast( @@ -773,8 +782,8 @@ populateMessageBuffer(Location loc, OpBuilder &builder, Value msgBufferBase, auto ptrTy = cudaq::cc::PointerType::get(memberTy); auto slot = builder.create( loc, ptrTy, msgBufferBase, ArrayRef{i}); - addendum = populateDynamicAddendum(loc, builder, devArgTy, arg, slot, - addendum, addendumScratch); + addendum = populateDynamicAddendum(loc, builder, module, devArgTy, arg, + slot, addendum, addendumScratch); continue; } @@ -972,6 +981,7 @@ class GenerateKernelExecution /// buffer. (Message buffers are at least the size of \p structTy but may be /// extended.) 
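  /// For example (an illustrative sketch, not normative): for a kernel that
  /// takes (i32, std::vector<double>), \p structTy is roughly
  /// struct<{i32, i64}>, where the i64 slot receives the vector's byte length
  /// and the vector's data bytes are appended to the message buffer as a
  /// trailing addendum.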
func::FuncOp genKernelArgsCreatorFunction(Location loc, OpBuilder &builder, + ModuleOp module, FunctionType devKernelTy, cudaq::cc::StructType msgStructTy, const std::string &classNameStr, @@ -987,7 +997,7 @@ class GenerateKernelExecution SmallVector passedHostArgTys; for (auto ty : passedDevArgTys) { - Type hostTy = cudaq::opt::factory::convertToHostSideType(ty); + Type hostTy = cudaq::opt::factory::convertToHostSideType(ty, module); if (cudaq::cc::isDynamicType(ty)) hostTy = cudaq::cc::PointerType::get(hostTy); passedHostArgTys.push_back(hostTy); @@ -1033,12 +1043,12 @@ class GenerateKernelExecution const bool hasDynamicSignature = isDynamicSignature(devKernelTy); Value heapTracker = createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, pseudoArgs, passedDevArgTys, heapTracker); + loc, builder, module, pseudoArgs, passedDevArgTys, heapTracker); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) - return genSizeOfDynamicMessageBuffer(loc, builder, msgStructTy, zippy, - sizeScratch); + return genSizeOfDynamicMessageBuffer(loc, builder, module, msgStructTy, + zippy, sizeScratch); return builder.create(loc, i64Ty, msgStructTy); }(); @@ -1061,10 +1071,10 @@ class GenerateKernelExecution Value addendumPtr = builder.create( loc, ptrI8Ty, arrMessageBuffer, ArrayRef{prefixSize}); - populateMessageBuffer(loc, builder, msgBufferPrefix, zippy, addendumPtr, - addendumScratch); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy, + addendumPtr, addendumScratch); } else { - populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy); } maybeFreeHeapAllocations(loc, builder, heapTracker); @@ -1372,7 +1382,7 @@ class GenerateKernelExecution /// the runtime library. Pass along the thunk, so the runtime can call the /// quantum circuit. These entry points may be `operator()` member functions /// in a class, so account for the `this` argument here. 
- void genNewHostEntryPoint(Location loc, OpBuilder &builder, + void genNewHostEntryPoint(Location loc, OpBuilder &builder, ModuleOp module, FunctionType devFuncTy, LLVM::GlobalOp kernelNameObj, func::FuncOp hostFunc, bool addThisPtr, cudaq::cc::StructType structTy, @@ -1396,12 +1406,12 @@ class GenerateKernelExecution const bool hasDynamicSignature = isDynamicSignature(devFuncTy); Value heapTracker = createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( - loc, builder, blockValues, devFuncTy.getInputs(), heapTracker); + loc, builder, module, blockValues, devFuncTy.getInputs(), heapTracker); auto sizeScratch = builder.create(loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) - return genSizeOfDynamicMessageBuffer(loc, builder, structTy, zippy, - sizeScratch); + return genSizeOfDynamicMessageBuffer(loc, builder, module, structTy, + zippy, sizeScratch); return builder.create(loc, i64Ty, structTy); }(); @@ -1424,10 +1434,10 @@ class GenerateKernelExecution Value addendumPtr = builder.create( loc, ptrI8Ty, rawMessageBuffer, ArrayRef{prefixSize}); - populateMessageBuffer(loc, builder, msgBufferPrefix, zippy, addendumPtr, - addendumScratch); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy, + addendumPtr, addendumScratch); } else { - populateMessageBuffer(loc, builder, msgBufferPrefix, zippy); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy); } maybeFreeHeapAllocations(loc, builder, heapTracker); @@ -1809,6 +1819,10 @@ class GenerateKernelExecution return module.emitError("could not load __nvqpp_zeroDynamicResult"); if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_createDynamicResult"))) return module.emitError("could not load __nvqpp_createDynamicResult"); + if (failed( + irBuilder.loadIntrinsic(module, cudaq::runtime::getPauliWordSize))) + return module.emitError( + "could not load cudaq::pauli_word::_nvqpp_size or _nvqpp_data"); return success(); } @@ -1897,7 +1911,7 @@ class GenerateKernelExecution // Generate the argsCreator function used by synthesis. if (startingArgIdx == 0) { argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy, classNameStr, hostFuncTy, + loc, builder, module, funcTy, structTy, classNameStr, hostFuncTy, hasThisPtr); } else { // We are operating in a very special case where we want the @@ -1909,7 +1923,7 @@ class GenerateKernelExecution cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx); argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy_argsCreator, classNameStr, + loc, builder, module, funcTy, structTy_argsCreator, classNameStr, hostFuncTy, hasThisPtr); } } @@ -1917,8 +1931,8 @@ class GenerateKernelExecution // Generate a new mangled function on the host side to call the // callback function. if (hostEntryNeeded) - genNewHostEntryPoint(loc, builder, funcTy, kernelNameObj, hostFunc, - hasThisPtr, structTy, thunk); + genNewHostEntryPoint(loc, builder, module, funcTy, kernelNameObj, + hostFunc, hasThisPtr, structTy, thunk); // Generate a function at startup to register this kernel as having // been processed for kernel execution. diff --git a/runtime/cudaq/qis/pauli_word.h b/runtime/cudaq/qis/pauli_word.h index 2265003083..7bdc026ffb 100644 --- a/runtime/cudaq/qis/pauli_word.h +++ b/runtime/cudaq/qis/pauli_word.h @@ -5,28 +5,30 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/
+
 #pragma once
+#include 
 #include 
 namespace cudaq {
+
 /// @brief The `pauli_word` is a thin wrapper around a Pauli tensor product string,
 /// e.g. `XXYZ` on 4 qubits.
 class pauli_word {
-private:
-  std::string term;
-
 public:
   pauli_word() = default;
-  pauli_word(std::string &&t) : term{std::move(t)} {}
-  pauli_word(const std::string &t) : term(t) {}
-  pauli_word(const char *const p) : term{p} {}
+  pauli_word(std::string &&t) : term{std::move(t)} { to_upper_case(); }
+  pauli_word(const std::string &t) : term(t) { to_upper_case(); }
+  pauli_word(const char *const p) : term{p} { to_upper_case(); }
   pauli_word &operator=(const std::string &t) {
     term = t;
+    to_upper_case();
     return *this;
   }
   pauli_word &operator=(const char *const p) {
     term = p;
+    to_upper_case();
     return *this;
   }
@@ -34,5 +36,27 @@ class pauli_word {
   // TODO: Obsolete? Used by KernelWrapper.h only.
   const std::vector data() const { return {term.begin(), term.end()}; }
+
+private:
+  // Convert the string member to upper case at construction/assignment.
+  // TODO: This should probably verify the string contains only letters valid in
+  // this alphabet: I, X, Y, and Z.
+  void to_upper_case() {
+    std::transform(term.begin(), term.end(), term.begin(), ::toupper);
+  }
+
+  // These methods are used by the compiler.
+  __attribute__((used)) const char *_nvqpp_data() const { return term.data(); }
+  __attribute__((used)) std::uint64_t _nvqpp_size() const {
+    return term.size();
+  }
+
+  std::string term; ///< Pauli words are string-like encodings.
 };
+
+namespace details {
static_assert(sizeof(std::string) == sizeof(pauli_word));
+// This constant is used by the compiler.
+static constexpr std::uint64_t _nvqpp_sizeof = sizeof(pauli_word);
+} // namespace details
 } // namespace cudaq

From ef96fb1a254db06477cc453c6631b5c2a532344e Mon Sep 17 00:00:00 2001
From: Eric Schweitz 
Date: Wed, 13 Nov 2024 08:45:00 -0800
Subject: [PATCH 12/19] Get around spelling checks.

Signed-off-by: Eric Schweitz 
---
 runtime/cudaq/qis/pauli_word.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/cudaq/qis/pauli_word.h b/runtime/cudaq/qis/pauli_word.h
index 7bdc026ffb..d18df6d22b 100644
--- a/runtime/cudaq/qis/pauli_word.h
+++ b/runtime/cudaq/qis/pauli_word.h
@@ -51,7 +51,7 @@ class pauli_word {
     return term.size();
   }
-  std::string term; ///< Pauli words are string-like encodings.
+  std::string term; ///< Pauli words are string-like.
 };
 
 namespace details {

From 89243eddbf3ed888723174d19e9488e57425464e Mon Sep 17 00:00:00 2001
From: Eric Schweitz 
Date: Fri, 15 Nov 2024 09:36:04 -0800
Subject: [PATCH 13/19] Adds a portability layer for std::string to helpers
 defined at library/binding build time.

Signed-off-by: Eric Schweitz 
---
 include/cudaq/Optimizer/Builder/Runtime.h |  5 +++-
 lib/Optimizer/Builder/Factory.cpp         |  8 ++++--
 lib/Optimizer/Builder/Intrinsics.cpp      | 12 +++++++-
 .../Transforms/GenKernelExecution.cpp     | 28 +++++++++++--------
 runtime/cudaq/cudaq.cpp                   |  7 +++++
 5 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h
index 8bcacbdb35..339334830c 100644
--- a/include/cudaq/Optimizer/Builder/Runtime.h
+++ b/include/cudaq/Optimizer/Builder/Runtime.h
@@ -53,11 +53,14 @@ static constexpr const char cudaqAHSPrefixName[] =
     "__analog_hamiltonian_kernel__";
 
 // Host-side helper functions for working with `cudaq::pauli_word` or a
-// `std::string`.
These include both fully dynamic and binding time (library +// build time) helper functions. static constexpr const char sizeofStringAttrName[] = "cc.sizeof_string"; static constexpr const char getPauliWordSize[] = "_ZNK5cudaq10pauli_word11_nvqpp_sizeEv"; static constexpr const char getPauliWordData[] = "_ZNK5cudaq10pauli_word11_nvqpp_dataEv"; +static constexpr const char bindingGetStringData[] = "__nvqpp_getStringData"; +static constexpr const char bindingGetStringSize[] = "__nvqpp_getStringSize"; } // namespace cudaq::runtime diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index dc7f866610..5bdd71e67e 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -310,9 +310,11 @@ cc::ArrayType factory::genHostStringType(ModuleOp mod) { auto *ctx = mod.getContext(); auto i8Ty = IntegerType::get(ctx, 8); auto sizeAttr = mod->getAttr(cudaq::runtime::sizeofStringAttrName); - assert(sizeAttr && "module must have cc.sizeof_string attribute"); - auto size = cast(sizeAttr).getInt(); - return cc::ArrayType::get(ctx, i8Ty, size); + if (sizeAttr) { + auto size = cast(sizeAttr).getInt(); + return cc::ArrayType::get(ctx, i8Ty, size); + } + return cc::ArrayType::get(ctx, i8Ty, sizeof(std::string)); } // FIXME: We should get the underlying structure of a std::vector from the diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 4dbe616967..1826241eaa 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -56,7 +56,8 @@ static constexpr IntrinsicCode intrinsicTable[] = { "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_dataEv(%pw : " "!cc.ptr) -> !cc.ptr"}, {cudaq::runtime::getPauliWordSize, - {cudaq::runtime::getPauliWordData}, + {cudaq::runtime::getPauliWordData, cudaq::runtime::bindingGetStringData, + cudaq::runtime::bindingGetStringSize}, "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_sizeEv(%pw : " "!cc.ptr) -> i64"}, @@ -303,6 +304,15 @@ static constexpr IntrinsicCode intrinsicTable[] = { func.func private @__nvqpp_getStateVectorLength_fp64(%p : i64, %o : i64) -> i64 )#"}, + // Quasi-portable entry points for use with non-C++ front ends (Python). + {cudaq::runtime::bindingGetStringData, + {}, + "func.func private @__nvqpp_getStringData(%p: !cc.ptr) -> " + "!cc.ptr"}, + {cudaq::runtime::bindingGetStringSize, + {}, + "func.func private @__nvqpp_getStringSize(%p: !cc.ptr) -> i64"}, + // __nvqpp_initializer_list_to_vector_bool {cudaq::stdvecBoolCtorFromInitList, {}, diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 4b7a5a5018..f7459915a4 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -92,8 +92,8 @@ static FunctionType getThunkType(MLIRContext *ctx) { /// /// /// This implementation does \e not support wide characters. -static Value genStringLength(Location loc, OpBuilder &builder, - Value stringArg) { +static Value genStringLength(Location loc, OpBuilder &builder, Value stringArg, + ModuleOp module) { Type stringTy = stringArg.getType(); assert(isa(stringTy) && isa( @@ -101,9 +101,11 @@ static Value genStringLength(Location loc, OpBuilder &builder, "host side string expected"); auto callArg = builder.create( loc, cudaq::cc::PointerType::get(builder.getI8Type()), stringArg); + StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) + ? 
cudaq::runtime::getPauliWordSize + : cudaq::runtime::bindingGetStringSize; auto lenRes = builder.create(loc, builder.getI64Type(), - cudaq::runtime::getPauliWordSize, - ValueRange{callArg}); + helperName, ValueRange{callArg}); return lenRes.getResult(0); } @@ -528,7 +530,7 @@ static Value descendThroughDynamicType(Location loc, OpBuilder &builder, // A char span is dynamic, but it is not recursively dynamic. Just // read the length of the string out. .Case([&](cudaq::cc::CharspanType t) -> Value { - return genStringLength(loc, builder, arg); + return genStringLength(loc, builder, arg, module); }) // A std::vector is dynamic and may be recursive dynamic as well. .Case([&](cudaq::cc::StdvecType t) -> Value { @@ -620,14 +622,17 @@ genSizeOfDynamicMessageBuffer(Location loc, OpBuilder &builder, ModuleOp module, } static Value populateStringAddendum(Location loc, OpBuilder &builder, - Value host, Value sizeSlot, - Value addendum) { - Value size = genStringLength(loc, builder, host); + Value host, Value sizeSlot, Value addendum, + ModuleOp module) { + Value size = genStringLength(loc, builder, host, module); builder.create(loc, size, sizeSlot); auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); auto fromPtr = builder.create(loc, ptrI8Ty, host); - auto dataPtr = builder.create( - loc, ptrI8Ty, cudaq::runtime::getPauliWordData, ValueRange{fromPtr}); + StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) + ? cudaq::runtime::getPauliWordData + : cudaq::runtime::bindingGetStringData; + auto dataPtr = builder.create(loc, ptrI8Ty, helperName, + ValueRange{fromPtr}); auto notVolatile = builder.create(loc, 0, 1); auto toPtr = builder.create(loc, ptrI8Ty, addendum); builder.create( @@ -664,7 +669,8 @@ static Value populateDynamicAddendum(Location loc, OpBuilder &builder, Value sizeSlot, Value addendum, Value addendumScratch) { if (isa(devArgTy)) - return populateStringAddendum(loc, builder, host, sizeSlot, addendum); + return populateStringAddendum(loc, builder, host, sizeSlot, addendum, + module); if (auto vecTy = dyn_cast(devArgTy)) { auto eleTy = vecTy.getElementType(); if (cudaq::cc::isDynamicType(eleTy)) { diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp index d6cbc3c227..ca84a43121 100644 --- a/runtime/cudaq/cudaq.cpp +++ b/runtime/cudaq/cudaq.cpp @@ -502,5 +502,12 @@ void __nvqpp_vector_bool_free_temporary_initlists( free(p); delete allocations; } + +/// Quasi-portable string helpers for Python (non-C++ frontends). These library +/// helper functions allow non-C++ front-ends to remain portable with the core +/// layer. As these helpers ought to be built along with the bindings, there +/// should not be a compatibility issue. +const char *__nvqpp_getStringData(const std::string &s) { return s.data(); } +std::uint64_t __nvqpp_getStringSize(const std::string &s) { return s.size(); } } } // namespace cudaq::support From 0effc49ae3cb0dcc5f54d3899d1bc53730195603 Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Fri, 15 Nov 2024 13:29:25 -0800 Subject: [PATCH 14/19] Add std header to see if that fixes gcc's problem. 
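The pauli_word helpers added earlier in this series use std::uint64_t, and
gcc's libstdc++ does not reliably provide the fixed-width integer types
through transitive includes. A hypothetical minimal sketch of the fix (the
exact header is an assumption, presumably <cstdint>):

  #include <cstdint> // declares std::uint64_t, e.g. for pauli_word::_nvqpp_size()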
Signed-off-by: Eric Schweitz --- runtime/cudaq/qis/qubit_qis.h | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h index cb3e2a6a73..c05a862bf9 100644 --- a/runtime/cudaq/qis/qubit_qis.h +++ b/runtime/cudaq/qis/qubit_qis.h @@ -17,6 +17,7 @@ #include "cudaq/qis/qreg.h" #include "cudaq/qis/qvector.h" #include "cudaq/spin_op.h" +#include #include #include From 362904579c258dce130f0994a15ab0e2415b063e Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Fri, 15 Nov 2024 14:25:08 -0800 Subject: [PATCH 15/19] Add header to another file. Signed-off-by: Eric Schweitz --- runtime/cudaq/qis/pauli_word.h | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/cudaq/qis/pauli_word.h b/runtime/cudaq/qis/pauli_word.h index d18df6d22b..4a49a706a1 100644 --- a/runtime/cudaq/qis/pauli_word.h +++ b/runtime/cudaq/qis/pauli_word.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include From c2edbf182d2cb54ed513dc580719b72340d5d8f8 Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Thu, 14 Nov 2024 16:49:36 -0800 Subject: [PATCH 16/19] Fix issues with more complicated recursive argument structures. Kernels need to be able to receive recursively dynamic composable types. This includes compositions of surface types of vectors and structs. Extend the test. Signed-off-by: Eric Schweitz --- include/cudaq/Optimizer/Builder/Runtime.h | 10 + lib/Frontend/nvqpp/ConvertStmt.cpp | 4 +- lib/Optimizer/Dialect/CC/CCOps.cpp | 4 + .../Transforms/GenKernelExecution.cpp | 413 ++++++++++-------- .../SeparateCompilation/arith_spans.cpp | 154 ++++++- test/Quake/kernel_exec-1.qke | 24 +- test/Quake/kernel_exec-2.qke | 35 +- test/Quake/return_vector.qke | 28 +- test/Translate/argument.qke | 72 +-- 9 files changed, 465 insertions(+), 279 deletions(-) diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h index 339334830c..4fc9405272 100644 --- a/include/cudaq/Optimizer/Builder/Runtime.h +++ b/include/cudaq/Optimizer/Builder/Runtime.h @@ -10,6 +10,16 @@ #include "cudaq/Optimizer/Builder/Factory.h" +//===----------------------------------------------------------------------===// +// +// Runtime helper functions are functions that will appear in the runtime +// library (implementations are defined in either the headers or libraries in +// the `runtime` directory). These helper functions may never be assumed to +// appear on the device-side, so these helpers should only be used in host-side +// code. +// +//===----------------------------------------------------------------------===// + namespace cudaq::runtime { /// Prefix for all kernel entry functions. 
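As an illustration of the recursively dynamic, composable argument types this
commit enables (a sketch reconstructed from the `Interesting` struct and
`exciting` kernel in the test below; treat the exact type spellings as
assumptions):

  struct Interesting {
    std::vector<std::vector<std::vector<int>>> ragged3d; // dynamic at three levels
    int flags;                                           // statically sized
    std::vector<double> angular;                         // dynamic
  };

  // A kernel may now take std::vector<Interesting>; the flat message buffer
  // must then carry a length for every dynamic level, and the thunk rebuilds
  // the corresponding spans recursively on the device side.
  __qpu__ void exciting(std::vector<Interesting> vi);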
diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp
index 8c67f68283..efc6c889c9 100644
--- a/lib/Frontend/nvqpp/ConvertStmt.cpp
+++ b/lib/Frontend/nvqpp/ConvertStmt.cpp
@@ -331,7 +331,9 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) {
                            ValueRange{heapCopy, dynSize});
       };
       IRBuilder irb(builder);
-      Value tySize = irb.getByteSizeOfType(loc, eleTy);
+      Value tySize;
+      if (!cudaq::cc::isDynamicType(eleTy))
+        tySize = irb.getByteSizeOfType(loc, eleTy);
       if (!tySize) {
         TODO_x(toLocation(x), x, mangler, "unhandled vector element type");
         return false;
diff --git a/lib/Optimizer/Dialect/CC/CCOps.cpp b/lib/Optimizer/Dialect/CC/CCOps.cpp
index 9d539640ac..8221aa5e81 100644
--- a/lib/Optimizer/Dialect/CC/CCOps.cpp
+++ b/lib/Optimizer/Dialect/CC/CCOps.cpp
@@ -106,6 +106,10 @@ Value cudaq::cc::getByteSizeOfType(OpBuilder &builder, Location loc, Type ty,
         return builder.create(loc, builder.getI64Type(), v, scale);
       })
+      .Case([&](cudaq::cc::SpanLikeType) -> Value {
+        // Uniformly on the device side: {ptr, i64}
+        return createInt(16);
+      })
       .Default({});
 }
diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index f7459915a4..8554d6e576 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -19,7 +19,6 @@
 #include "cudaq/Optimizer/Transforms/Passes.h"
 #include "cudaq/Todo.h"
 #include "clang/Basic/Version.h"
-#include "mlir/Analysis/DataLayoutAnalysis.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/Transforms/Passes.h"
@@ -238,11 +237,14 @@ genByteSizeAndElementCount(Location loc, OpBuilder &builder, ModuleOp module,
   // If this is a vector<struct>, convert the bytes of struct to bytes of
   // struct with converted members.
if (isa(eleTy)) { - auto eleTy = cast(arg.getType()).getElementType(); + auto vecTy = cast(arg.getType()).getElementType(); + auto vecEleRefTy = cast(vecTy).getMember(0); + auto vecEleTy = cast(vecEleRefTy).getElementType(); auto i64Ty = builder.getI64Type(); - auto hostStrSize = builder.create(loc, i64Ty, eleTy); + auto hostStrSize = + builder.create(loc, i64Ty, vecEleTy); Value count = builder.create(loc, size, hostStrSize); - Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + Type packedTy = cudaq::opt::factory::genArgumentBufferType(eleTy); auto packSize = builder.create(loc, i64Ty, packedTy); size = builder.create(loc, count, packSize); return {size, count}; @@ -330,7 +332,12 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, auto input = builder.create( loc, cudaq::cc::PointerType::get(subArrTy), startInput); auto transientTy = convertToTransientType(sty, module); - Value tmp = builder.create(loc, transientTy); + auto tmp = [&]() -> Value { + if (preallocated) + return builder.create( + loc, cudaq::cc::PointerType::get(transientTy), *preallocated); + return builder.create(loc, transientTy); + }(); Value sizeDelta = genVectorSize(loc, builder, arg); auto count = [&]() -> Value { if (cudaq::cc::isDynamicType(seleTy)) { @@ -342,8 +349,10 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, loc, builder.getI64Type(), seleTy); return builder.create(loc, sizeDelta, sizeEle); }(); + auto transEleTy = cast(transientTy).getMember(0); + auto dataTy = cast(transEleTy).getElementType(); auto sizeTransientTy = builder.create( - loc, builder.getI64Type(), transientTy); + loc, builder.getI64Type(), dataTy); Value sizeInBytes = builder.create(loc, count, sizeTransientTy); @@ -352,7 +361,6 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, loc, builder.getI8Type(), sizeInBytes); // Initialize the temporary vector. - auto transEleTy = cast(transientTy).getMember(0); auto vecEleTy = cudaq::cc::PointerType::get(transEleTy); auto tmpBegin = builder.create( loc, vecEleTy, tmp, ArrayRef{0}); @@ -400,8 +408,14 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, auto argPtrTy = cast(arg.getType()); auto argStrTy = cast(argPtrTy.getElementType()); - // Create a new struct that we'll store the converted data into. - Value buffer = builder.create(loc, bufferTy); + // If a struct was preallocated, use it. Otherwise, create a new struct that + // we'll store the converted data into. + auto buffer = [&]() -> Value { + if (preallocated) + return builder.create( + loc, cudaq::cc::PointerType::get(bufferTy), *preallocated); + return builder.create(loc, bufferTy); + }(); // Loop over each element. Replace each with the converted value. for (auto iter : llvm::enumerate(sty.getMembers())) { @@ -960,6 +974,211 @@ static void maybeFreeHeapAllocations(Location loc, OpBuilder &builder, }); } +/// Fetch an argument from the comm buffer. Here, the argument is not dynamic so +/// it can be read as is out of the buffer. +static Value fetchInputValue(Location loc, OpBuilder &builder, Type devTy, + Value ptr) { + assert(!cudaq::cc::isDynamicType(devTy) && "must not be a dynamic type"); + if (isa(devTy)) { + // An indirect callable passes a key value which will be used to determine + // the kernel that is being called. 
+    auto key = builder.create(loc, ptr);
+    return builder.create(loc, devTy, key);
+  }
+
+  if (isa(devTy)) {
+    // A direct callable will have already been effectively inlined and this
+    // argument should not be referenced.
+    return builder.create(loc, devTy);
+  }
+
+  auto ptrDevTy = cudaq::cc::PointerType::get(devTy);
+  if (auto strTy = dyn_cast(devTy)) {
+    // Argument is a struct.
+    if (strTy.isEmpty())
+      return builder.create(loc, devTy);
+
+    // Cast to avoid conflicts between layout-compatible, distinct struct types.
+    auto structPtr = builder.create(loc, ptrDevTy, ptr);
+    return builder.create(loc, structPtr);
+  }
+
+  // Default case: argument passed as a value in place.
+  return builder.create(loc, ptr);
+}
+
+/// Helper routine to generate code to increment the trailing data pointer to
+/// the next block of data (if any).
+static Value incrementTrailingDataPointer(Location loc, OpBuilder &builder,
+                                          Value trailingData, Value bytes) {
+  auto i8Ty = builder.getI8Type();
+  auto bufferTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty));
+  auto buffPtr = builder.create(loc, bufferTy, trailingData);
+  auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty);
+  return builder.create(
+      loc, i8PtrTy, buffPtr, ArrayRef{bytes});
+}
+
+/// In the thunk, we need to unpack any `std::vector` objects encoded in the
+/// packet. Since these have dynamic size, they are encoded as trailing bytes
+/// by offset and size. The offset is implicit from the values of the
+/// arguments. All sizes are encoded as `int64_t`.
+///
+/// A vector of vector of ... T is encoded as an int64_t (length). At the
+/// offset of the level `i` vector there will be a sequence of sizes for the
+/// level `i+1` vectors. For the leaf vector level, `n`, the blocks of data
+/// for each vector at level `n` follow immediately, for the branch of the
+/// tree being encoded.
+///
+/// For example, a variable defined and initialized as
+/// ```
+/// vector>> example =
+///    {{{'a'}, {'b', 'c'}, {'z'}}, {{'d', 'e', 'f'}}};
+/// ```
+///
+/// and passed as an argument to a kernel will be encoded as the following
+/// block. The block will have a structure with the declared arguments
+/// followed by an addendum of variable data, where the vector data is
+/// encoded.
+///
+/// ```
+///   arguments: { ..., 1, ... }
+///   addendum: [[3; 1 2 1, a, b c, z] [1; 3, d e f]]
+/// ```
+static std::pair constructDynamicInputValue(Location loc,
+                                                          OpBuilder &builder,
+                                                          Type devTy, Value ptr,
+                                                          Value trailingData) {
+  assert(cudaq::cc::isDynamicType(devTy) && "must be dynamic type");
+  // There are 2 cases.
+  // 1. The dynamic type is a std::span of any legal device argument type.
+  // 2. The dynamic type is a struct containing at least 1 std::span.
+  if (auto spanTy = dyn_cast(devTy)) {
+    // ptr: a pointer to the length of the block in bytes.
+    // trailingData: the block of data to decode.
+    auto eleTy = spanTy.getElementType();
+    auto i64Ty = builder.getI64Type();
+    auto buffEleTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+
+    // Get the size of each element in the vector and compute the vector's
+    // logical length.
+    auto eleSize = builder.create(loc, i64Ty, buffEleTy);
+    Value bytes = builder.create(loc, ptr);
+    auto vecLength = builder.create(loc, bytes, eleSize);
+
+    if (cudaq::cc::isDynamicType(eleTy)) {
+      // The vector is recursively dynamic.
+      // Create a new block in which to place the stdvec/struct data in
+      // device-side format.
+      Value newVecData =
+          builder.create(loc, eleTy, vecLength);
+      // Compute new trailing data, skipping the current vector's data.
+      auto nextTrailingData =
+          incrementTrailingDataPointer(loc, builder, trailingData, bytes);
+
+      // For each element in the vector, convert it to device-side format and
+      // save the result in newVecData.
+      auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
+      auto packTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+      Type packedArrTy =
+          cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(packTy));
+      Type packedEleTy = cudaq::cc::PointerType::get(packTy);
+      auto arrPtr =
+          builder.create(loc, packedArrTy, trailingData);
+      auto trailingDataVar =
+          builder.create(loc, nextTrailingData.getType());
+      builder.create(loc, nextTrailingData,
+                                     trailingDataVar);
+      cudaq::opt::factory::createInvariantLoop(
+          builder, loc, vecLength,
+          [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+            Value i = block.getArgument(0);
+            auto nextTrailingData =
+                builder.create(loc, trailingDataVar);
+            auto vecMemPtr = builder.create(
+                loc, packedEleTy, arrPtr,
+                ArrayRef{i});
+            auto r = constructDynamicInputValue(loc, builder, eleTy, vecMemPtr,
+                                                nextTrailingData);
+            auto newVecPtr = builder.create(
+                loc, elePtrTy, newVecData,
+                ArrayRef{i});
+            builder.create(loc, r.first, newVecPtr);
+            builder.create(loc, r.second, trailingDataVar);
+          });
+
+      // Create the new outer stdvec span as the result.
+      Value stdvecResult = builder.create(
+          loc, spanTy, newVecData, vecLength);
+      nextTrailingData =
+          builder.create(loc, trailingDataVar);
+      return {stdvecResult, nextTrailingData};
+    }
+
+    // This vector has constant data, so just use the data in place and
+    // construct the stdvec span with it.
+    auto castTrailingData = builder.create(
+        loc, cudaq::cc::PointerType::get(eleTy), trailingData);
+    Value stdvecResult = builder.create(
+        loc, spanTy, castTrailingData, vecLength);
+    auto nextTrailingData =
+        incrementTrailingDataPointer(loc, builder, trailingData, bytes);
+    return {stdvecResult, nextTrailingData};
+  }
+
+  // Argument must be a struct.
+  // The struct contains dynamic components. Extract them and build up the
+  // struct value to be passed as an argument.
+  // ptr: pointer to the first element of the struct or a vector length.
+  // trailingData: the block of data for the first dynamic type field.
+  auto strTy = cast(devTy);
+  auto ptrEleTy = cast(ptr.getType()).getElementType();
+  auto packedTy = cast(ptrEleTy);
+  Value result = builder.create(loc, strTy);
+  assert(strTy.getNumMembers() == packedTy.getNumMembers());
+  for (auto iter :
+       llvm::enumerate(llvm::zip(strTy.getMembers(), packedTy.getMembers()))) {
+    auto devMemTy = std::get<0>(iter.value());
+    std::int32_t off = iter.index();
+    auto packedMemTy = std::get<1>(iter.value());
+    auto dataPtr = builder.create(
+        loc, cudaq::cc::PointerType::get(packedMemTy), ptr,
+        ArrayRef{off});
+    if (cudaq::cc::isDynamicType(devMemTy)) {
+      auto r = constructDynamicInputValue(loc, builder, devMemTy, dataPtr,
+                                          trailingData);
+      result = builder.create(loc, strTy, result,
+                                                    r.first, off);
+      trailingData = r.second;
+      continue;
+    }
+    auto val = fetchInputValue(loc, builder, devMemTy, dataPtr);
+    result =
+        builder.create(loc, strTy, result, val, off);
+  }
+  return {result, trailingData};
+}
+
+/// Translate the buffer data to a sequence of arguments suitable for the
+/// actual kernel call.
+///
+/// \param inTy The actual expected type of the argument.
+/// \param structTy The modified buffer type over all the arguments at the +/// current level. +static std::pair +processInputValue(Location loc, OpBuilder &builder, Value trailingData, + Value ptrPackedStruct, Type inTy, std::int64_t off, + cudaq::cc::StructType packedStructTy) { + auto packedPtr = builder.create( + loc, cudaq::cc::PointerType::get(packedStructTy.getMember(off)), + ptrPackedStruct, ArrayRef{off}); + if (cudaq::cc::isDynamicType(inTy)) + return constructDynamicInputValue(loc, builder, inTy, packedPtr, + trailingData); + auto val = fetchInputValue(loc, builder, inTy, packedPtr); + return {val, trailingData}; +} + /// This pass adds a `.thunk` function and a rewritten C++ host /// side (mangled) stub to the code for every entry-point kernel in the module. /// It may also generate a `.argsCreator` function. Finally, it @@ -1096,174 +1315,6 @@ class GenerateKernelExecution return argsCreatorFunc; } - /// In the thunk, we need to unpack any `std::vector` objects encoded in the - /// packet. Since these have dynamic size, they are encoded as trailing bytes - /// by offset and size. The offset is implicit from the values of the - /// arguments. All sizes are encoded as `int64_t`. - /// - /// A vector of vector of ... T is encoded as a int64_t (length). At the - /// offset of the level `i` vector will be a sequence of sizes for the level - /// `i+1` vectors. For the leaf vector level, `n`, the blocks of data for each - /// vector will be immediately following for each vector at level `n` for the - /// branch of the tree being encoded. - /// - /// For example, a variable defined and initialized as - /// ``` - /// vector>> example = - /// {{{'a'}, {'b', 'c'}, {'z'}}, {{'d' 'e', 'f'}}}; - /// ``` - /// - /// and passed as an argument to a kernel will be encoded as the following - /// block. The block will have a structure with the declared arguments - /// followed by an addendum of variable data, where the vector data is - /// encoded. - /// - /// ``` - /// arguments: { ..., 1, ... } - /// addendum: [[3; 1 2 1, a, b c, z] [1; 3, d e f]] - /// ``` - std::pair unpackStdVector(Location loc, OpBuilder &builder, - cudaq::cc::SpanLikeType stdvecTy, - Value vecSize, Value trailingData) { - // Convert the pointer-free std::vector to a span structure to be - // passed. A span structure is a pointer and a size (in element - // units). Note that this structure may be recursive. - auto i8Ty = builder.getI8Type(); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto bytesTy = getByteAddressableType(builder); - Type eleTy = stdvecTy.getElementType(); - auto innerStdvecTy = dyn_cast(eleTy); - std::size_t eleSize = - innerStdvecTy ? /*(i64Type/8)*/ 8 : dataLayout->getTypeSize(eleTy); - auto eleSizeVal = [&]() -> Value { - if (eleSize) - return builder.create(loc, eleSize, 64); - assert(isa(eleTy) || - (isa(eleTy) && - !cast(eleTy).isUnknownSize())); - auto i64Ty = builder.getI64Type(); - return builder.create(loc, i64Ty, eleTy); - }(); - auto vecLength = builder.create(loc, vecSize, eleSizeVal); - if (innerStdvecTy) { - // Recursive case: std::vector> - // TODO: Uses stack allocation, however it may be better to use heap - // allocation. It's not clear the QPU has heap memory allocation. If this - // uses heap allocation, then the thunk must free that memory *after* the - // kernel proper returns. 
- auto vecTmp = builder.create(loc, eleTy, vecLength); - auto currentEnd = builder.create(loc, ptrI8Ty); - auto i64Ty = builder.getI64Type(); - auto arrI64Ty = cudaq::cc::ArrayType::get(i64Ty); - auto arrTy = cudaq::cc::PointerType::get(arrI64Ty); - auto innerVec = - builder.create(loc, arrTy, trailingData); - auto trailingBytes = - builder.create(loc, bytesTy, trailingData); - trailingData = builder.create( - loc, ptrI8Ty, trailingBytes, vecSize); - builder.create(loc, trailingData, currentEnd); - // Loop over each subvector in the vector and recursively unpack it into - // the vecTmp variable. Leaf vectors do not need a fresh variable. This - // effectively translates all the size/offset information for all the - // subvectors into temps. - Value vecLengthIndex = builder.create( - loc, builder.getI64Type(), vecLength, - cudaq::cc::CastOpMode::Unsigned); - cudaq::opt::factory::createInvariantLoop( - builder, loc, vecLengthIndex, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - auto innerPtr = builder.create( - loc, cudaq::cc::PointerType::get(i64Ty), innerVec, - SmallVector{i}); - Value innerVecSize = - builder.create(loc, innerPtr); - Value tmp = builder.create(loc, currentEnd); - auto unpackPair = - unpackStdVector(loc, builder, innerStdvecTy, innerVecSize, tmp); - auto ptrInnerTy = cudaq::cc::PointerType::get(innerStdvecTy); - auto subVecPtr = builder.create( - loc, ptrInnerTy, vecTmp, - SmallVector{i}); - builder.create(loc, unpackPair.first, - subVecPtr); - builder.create(loc, unpackPair.second, - currentEnd); - }); - auto coerceResult = builder.create( - loc, cudaq::cc::PointerType::get(stdvecTy), vecTmp); - trailingData = builder.create(loc, currentEnd); - Value result = builder.create( - loc, stdvecTy, coerceResult, vecLength); - return {result, trailingData}; - } - // Must divide by byte, 8 bits. - // The data is at trailingData and is valid for vecLength of eleTy. - auto castData = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), trailingData); - Value stdVecResult = builder.create( - loc, stdvecTy, castData, vecLength); - auto arrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)); - Value casted = builder.create(loc, arrTy, trailingData); - trailingData = - builder.create(loc, ptrI8Ty, casted, vecSize); - return {stdVecResult, trailingData}; - } - - /// Translate the buffer data to a sequence of arguments suitable to the - /// actual kernel call. - /// - /// \param inTy The actual expected type of the argument. - /// \param structTy The modified buffer type over all the arguments at the - /// current level. - std::pair processInputValue(Location loc, OpBuilder &builder, - Value trailingData, Value val, - Type inTy, std::int64_t off, - cudaq::cc::StructType structTy) { - if (isa(inTy)) { - auto i64Ty = builder.getI64Type(); - auto key = - builder.create(loc, i64Ty, val, off); - return {builder.create(loc, inTy, key), trailingData}; - } - if (isa(inTy)) - return {builder.create(loc, inTy), trailingData}; - if (auto stdVecTy = dyn_cast(inTy)) { - Value vecSize = builder.create( - loc, builder.getI64Type(), val, off); - return unpackStdVector(loc, builder, stdVecTy, vecSize, trailingData); - } - if (auto strTy = dyn_cast(inTy)) { - if (!cudaq::cc::isDynamicType(strTy)) { - if (strTy.isEmpty()) - return {builder.create(loc, inTy), trailingData}; - return {builder.create(loc, inTy, val, off), - trailingData}; - } - // The struct contains dynamic components. 
Extract them and build up the - // struct value to be passed as an argument. - Type buffMemTy = structTy.getMember(off); - Value strVal = builder.create(loc, inTy); - Value subVal = - builder.create(loc, buffMemTy, val, off); - // Convert the argument type, strTy, to a buffer type. - auto memberArgTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - for (auto iter : llvm::enumerate(strTy.getMembers())) { - auto [a, t] = - processInputValue(loc, builder, trailingData, subVal, iter.value(), - iter.index(), memberArgTy); - trailingData = t; - strVal = builder.create(loc, inTy, strVal, a, - iter.index()); - } - return {strVal, trailingData}; - } - return {builder.create(loc, inTy, val, off), - trailingData}; - } - /// Generate the thunk function. This function is called by the library /// callback function to "unpack" the arguments and pass them to the kernel /// function on the QPU side. The thunk will also save any return values to @@ -1285,7 +1336,6 @@ class GenerateKernelExecution auto castOp = builder.create(loc, structPtrTy, thunkEntry->getArgument(0)); auto isClientServer = thunkEntry->getArgument(1); - Value val = builder.create(loc, castOp); auto i64Ty = builder.getI64Type(); // Compute the struct size without the trailing bytes, structSize. @@ -1306,7 +1356,7 @@ class GenerateKernelExecution SmallVector args; const std::int32_t offset = funcTy.getNumInputs(); for (auto inp : llvm::enumerate(funcTy.getInputs())) { - auto [a, t] = processInputValue(loc, builder, trailingData, val, + auto [a, t] = processInputValue(loc, builder, trailingData, castOp, inp.value(), inp.index(), structTy); trailingData = t; args.push_back(a); @@ -1838,8 +1888,6 @@ class GenerateKernelExecution auto builder = OpBuilder::atBlockEnd(module.getBody()); auto mangledNameMap = module->getAttrOfType(cudaq::runtime::mangledNameMap); - DataLayoutAnalysis dla(module); // caches module's data layout information. - dataLayout = &dla.getAtOrAbove(module); std::error_code ec; llvm::ToolOutputFile out(outputFilename, ec, llvm::sys::fs::OF_None); if (ec) { @@ -1956,8 +2004,5 @@ class GenerateKernelExecution } out.keep(); } - -private: - const DataLayout *dataLayout = nullptr; }; } // namespace diff --git a/targettests/SeparateCompilation/arith_spans.cpp b/targettests/SeparateCompilation/arith_spans.cpp index 67dc8f329e..4de3979ed1 100644 --- a/targettests/SeparateCompilation/arith_spans.cpp +++ b/targettests/SeparateCompilation/arith_spans.cpp @@ -37,6 +37,19 @@ void dump_int_vector(std::span x) { std::cout << '\n'; } +void dump_2d_int_vector(std::span> x) { + std::cout << "integer matrix: {\n"; + for (auto s : x) { + std::cout << " "; + for (auto i : s) + std::cout << i << " "; + std::cout << '\n'; + } + std::cout << "}\n"; +} + +void dump_int_scalar(int x) { std::cout << "scalar integer: " << x << '\n'; } + void dump_double_vector(std::span x) { std::cout << "doubles: "; for (auto d : x) @@ -53,8 +66,10 @@ void dump_double_vector(std::span x) { // Fake host C++ signature that matches. 
extern "C" { void dump_int_vector(const std::vector &pw); +void dump_int_scalar(int v); void dump_bool_vector(const std::vector &pw); void dump_double_vector(const std::vector &pw); +void dump_2d_int_vector(const std::vector> &pw); } __qpu__ void kern1(std::vector arg) { dump_int_vector(arg); } @@ -129,6 +144,21 @@ __qpu__ void cern4(std::vector vivp) { } } +struct Interesting { + std::vector>> ragged3d; + int flags; + std::vector angular; +}; + +__qpu__ void exciting(std::vector vi) { + for (unsigned i = 0; i < vi.size(); ++i) { + for (unsigned j = 0; j < vi[i].ragged3d.size(); ++j) + dump_2d_int_vector(vi[i].ragged3d[j]); + dump_int_scalar(vi[i].flags); + dump_double_vector(vi[i].angular); + } +} + int main() { std::vector pw0 = {345, 1, 2}; std::cout << "---\n"; @@ -149,7 +179,7 @@ int main() { IntVectorPair ivp4 = {{-4, -5, 6}, {-7, -8, -9, 88}}; std::vector vivp = {ivp, ivp2, ivp3, ivp4}; std::cout << "---\n"; - // kern4(vivp); + kern4(vivp); std::vector dpw0 = {3.45, 1., 2.}; std::cout << "---\n"; @@ -170,7 +200,7 @@ int main() { DoubleVectorPair dvp4 = {{-4., -5., 6.}, {-7., -8., -9., .88}}; std::vector vdvp = {dvp, dvp2, dvp3, dvp4}; std::cout << "---\n"; - // qern4(vdvp); + qern4(vdvp); std::vector bpw0 = {true, false}; std::cout << "---\n"; @@ -192,7 +222,25 @@ int main() { BoolVectorPair bvp4 = {{true, false, false}, {false, true, false, true}}; std::vector vbvp = {bvp, bvp2, bvp3, bvp4}; std::cout << "---\n"; - // cern4(vbvp); + cern4(vbvp); + + std::vector> ix0 = {pw0, pw0}; + std::vector> ix1 = {pw1, pw0}; + std::vector> ix2 = {pw2, pw3, pw3}; + std::vector> ix3 = {{404}, {101, 202}}; + std::vector>> i3d0 = {ix0, ix1}; + std::vector>> i3d1 = {ix1}; + std::vector>> i3d2 = {ix2, ix3}; + std::vector>> i3d3 = {ix3}; + std::vector>> i3d4 = {ix2, ix0, ix0}; + Interesting in0 = {i3d0, 66, {2.0, 4.0}}; + Interesting in1 = {i3d1, 123, {3.0, 6.0}}; + Interesting in2 = {i3d2, 561, {4.0, 8.0}}; + Interesting in3 = {i3d3, 72341, {5.0, 10.0}}; + Interesting in4 = {i3d4, -2348, {12.0, 5280.1}}; + std::vector ving = {in0, in1, in2, in3, in4}; + std::cout << "===\n"; + exciting(ving); return 0; } @@ -201,29 +249,105 @@ int main() { // CHECK: integers: 345 1 2 // CHECK: --- // CHECK: integers: 345 1 2 -// CHECK: integers: 92347 3 4 -// CHECK: integers: 2358 5 6 -// CHECK: integers: 45 7 18 +// CHECK-NEXT: integers: 92347 3 4 +// CHECK-NEXT: integers: 2358 5 6 +// CHECK-NEXT: integers: 45 7 18 +// CHECK: --- +// CHECK: integers: 8 238 44 +// CHECK-NEXT: integers: 0 -4 81 92745 // CHECK: --- // CHECK: integers: 8 238 44 -// CHECK: integers: 0 -4 81 92745 +// CHECK-NEXT: integers: 0 -4 81 92745 +// CHECK-NEXT: integers: 5 -87 43 1 76 +// CHECK-NEXT: integers: 0 0 2 1 +// CHECK-NEXT: integers: 1 +// CHECK-NEXT: integers: -2 3 +// CHECK-NEXT: integers: -4 -5 6 +// CHECK-NEXT: integers: -7 -8 -9 88 // CHECK: --- // CHECK: doubles: 3.45 1 2 // CHECK: --- // CHECK: doubles: 3.45 1 2 -// CHECK: doubles: 92.347 2.3 4 -// CHECK: doubles: 235.8 5.5 6.4 -// CHECK: doubles: 4.5 77.7 18.2 +// CHECK-NEXT: doubles: 92.347 2.3 4 +// CHECK-NEXT: doubles: 235.8 5.5 6.4 +// CHECK-NEXT: doubles: 4.5 77.7 18.2 // CHECK: --- // CHECK: doubles: 8 2.38 4.4 -// CHECK: doubles: 0 -4.99 81.5 92.745 +// CHECK-NEXT: doubles: 0 -4.99 81.5 92.745 +// CHECK: --- +// CHECK: doubles: 8 2.38 4.4 +// CHECK-NEXT: doubles: 0 -4.99 81.5 92.745 +// CHECK-NEXT: doubles: 5 -8.7 4.3 1 7.6 +// CHECK-NEXT: doubles: 0 0 2 1 +// CHECK-NEXT: doubles: 1 +// CHECK-NEXT: doubles: -2 3 +// CHECK-NEXT: doubles: -4 -5 6 +// CHECK-NEXT: doubles: -7 
-8 -9 0.88 // CHECK: --- // CHECK: booleans: 1 0 // CHECK: --- // CHECK: booleans: 1 0 -// CHECK: booleans: 0 0 0 -// CHECK: booleans: 0 1 0 1 -// CHECK: booleans: 0 0 1 0 1 +// CHECK-NEXT: booleans: 0 0 0 +// CHECK-NEXT: booleans: 0 1 0 1 +// CHECK-NEXT: booleans: 0 0 1 0 1 +// CHECK: --- +// CHECK: booleans: 0 0 +// CHECK-NEXT: booleans: 0 1 1 0 // CHECK: --- // CHECK: booleans: 0 0 -// CHECK: booleans: 0 1 1 0 +// CHECK-NEXT: booleans: 0 1 1 0 +// CHECK-NEXT: booleans: 0 1 1 0 1 0 +// CHECK-NEXT: booleans: 0 1 1 0 0 0 1 0 +// CHECK-NEXT: booleans: 0 +// CHECK-NEXT: booleans: 1 1 +// CHECK-NEXT: booleans: 1 0 0 +// CHECK-NEXT: booleans: 0 1 0 1 +// CHECK: === +// CHECK: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 92347 3 4 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 66 +// CHECK-NEXT: doubles: 2 4 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 92347 3 4 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 123 +// CHECK-NEXT: doubles: 3 6 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 2358 5 6 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 404 +// CHECK-NEXT: 101 202 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 561 +// CHECK-NEXT: doubles: 4 8 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 404 +// CHECK-NEXT: 101 202 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 72341 +// CHECK-NEXT: doubles: 5 10 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 2358 5 6 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: -2348 +// CHECK-NEXT: doubles: 12 5280.1 diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 9bae7ecebf..cd079998ae 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -170,11 +170,11 @@ module attributes {quake.mangled_name_map = { // ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, // ALT-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // ALT: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// ALT: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// ALT: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]]{{\[}}%[[VAL_4]]] : (!cc.ptr>, i64) -> !cc.ptr -// ALT: %[[VAL_7:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, f64}>) -> i32 +// ALT: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr // ALT: %[[VAL_8:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 // ALT: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr // ALT: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr @@ -403,14 +403,14 @@ module attributes {quake.mangled_name_map = { // HYBRID: } // HYBRID-LABEL: func.func @ghz.thunk( -// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, -// HYBRID-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID-SAME: 
%[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // HYBRID: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// HYBRID: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// HYBRID: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]]{{\[}}%[[VAL_4]]] : (!cc.ptr>, i64) -> !cc.ptr -// HYBRID: %[[VAL_7:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, f64}>) -> i32 +// HYBRID: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr // HYBRID: %[[VAL_8:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 // HYBRID: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr // HYBRID: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index ebc29811a1..b94412cb11 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -172,24 +172,25 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: } // CHECK-LABEL: func.func @function_hawaiian.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]]{{\[}}%[[VAL_4]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i1, i64}>) -> i1 -// CHECK: %[[VAL_8:.*]] = cc.extract_value %[[VAL_3]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_9:.*]] = arith.constant 4 : i64 -// CHECK: %[[VAL_10:.*]] = arith.divsi %[[VAL_8]], %[[VAL_9]] : i64 -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_11]], %[[VAL_10]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_13]]{{\[}}%[[VAL_8]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_7]], %[[VAL_12]]) : (i1, !cc.stdvec) -> () -// CHECK: %[[VAL_15:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.sizeof i32 : i64 +// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_8]] : !cc.ptr +// CHECK: %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_9]] : i64 
+// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr
+// CHECK: %[[VAL_13:.*]] = cc.stdvec_init %[[VAL_12]], %[[VAL_11]] : (!cc.ptr, i64) -> !cc.stdvec
+// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr>
+// CHECK: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_14]]{{\[}}%[[VAL_10]]] : (!cc.ptr>, i64) -> !cc.ptr
+// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_7]], %[[VAL_13]]) : (i1, !cc.stdvec) -> ()
+// CHECK: %[[VAL_16:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: return %[[VAL_16]] : !cc.struct<{!cc.ptr, i64}>
 // CHECK: }
 // CHECK-LABEL: func.func @function_hawaiian.argsCreator(
diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke
index bba89bb5dd..58bcd2f089 100644
--- a/test/Quake/return_vector.qke
+++ b/test/Quake/return_vector.qke
@@ -264,12 +264,12 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr,
-// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
+// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr,
+// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
 // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
+// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr
+// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr
 // CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_5]]) : (i32) -> !cc.stdvec
 // CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr>
@@ -278,7 +278,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
-// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_3]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
 // CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}>
 // CHECK: ^bb2:
 // CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
@@ -319,12 +319,12 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr,
-// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
+// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr,
+// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> {
 // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
-// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32
+// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64
+// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr
+// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr
 // CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_5]]) : (i32) -> !cc.stdvec
 // CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr>
@@ -333,7 +333,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>>) -> !cc.ptr, i64}>>
 // CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64
-// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_3]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}>
 // CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}>
 // CHECK: ^bb2:
 // CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}>
@@ -341,8 +341,8 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr>,
-// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 {
+// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>,
+// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 {
 // CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr>
 // CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr) -> !cc.ptr
 // CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr
diff --git a/test/Translate/argument.qke b/test/Translate/argument.qke
index 82c7179b1f..865a622a55 100644
--- a/test/Translate/argument.qke
+++ b/test/Translate/argument.qke
@@ -297,12 +297,12 @@ func.func @test_3(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr

From: Eric Schweitz
Date: Fri, 15 Nov 2024 14:34:20 -0800
Subject: [PATCH 17/19] clang-format

Signed-off-by: Eric Schweitz
---
 lib/Optimizer/Transforms/GenKernelExecution.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index 8554d6e576..b542ab49cc 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -351,8 +351,8 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module,
   }();
   auto transEleTy = cast(transientTy).getMember(0);
   auto dataTy = cast(transEleTy).getElementType();
-  auto sizeTransientTy = builder.create(
-      loc, builder.getI64Type(), dataTy);
+  auto sizeTransientTy =
+      builder.create(loc, builder.getI64Type(), dataTy);
   Value sizeInBytes = builder.create(loc, count, sizeTransientTy);

From d3a9b1c14ea8b4eec1b585fadb45556d34c32709 Mon Sep 17 00:00:00 2001
From: Eric Schweitz
Date: Fri, 15 Nov 2024 14:57:44 -0800
Subject: [PATCH 18/19] Fix argument type.

Signed-off-by: Eric Schweitz
---
 lib/Optimizer/Transforms/GenKernelExecution.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp
index b542ab49cc..69f4e93681 100644
--- a/lib/Optimizer/Transforms/GenKernelExecution.cpp
+++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp
@@ -1167,7 +1167,7 @@ static std::pair constructDynamicInputValue(Location loc,
 /// current level.
 static std::pair
 processInputValue(Location loc, OpBuilder &builder, Value trailingData,
-                  Value ptrPackedStruct, Type inTy, std::int64_t off,
+                  Value ptrPackedStruct, Type inTy, std::int32_t off,
                   cudaq::cc::StructType packedStructTy) {
   auto packedPtr = builder.create(
       loc, cudaq::cc::PointerType::get(packedStructTy.getMember(off)),

From 7338095a6fffc5f004482efb010659da02af500d Mon Sep 17 00:00:00 2001
From: Eric Schweitz
Date: Tue, 19 Nov 2024 15:30:31 -0800
Subject: [PATCH 19/19] Review comments.

Signed-off-by: Eric Schweitz
---
 include/cudaq/Optimizer/Builder/Factory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
index 99083de521..596381ee17 100644
--- a/include/cudaq/Optimizer/Builder/Factory.h
+++ b/include/cudaq/Optimizer/Builder/Factory.h
@@ -131,7 +131,7 @@ inline mlir::Type stateImplType(mlir::Type eleTy) {
 // Generate host side type for std::string. The result is the type of a block of
 // bytes and the length to allocate. This allows for the creation of code to
 // allocate a variable, stride across such a variable, etc. The ModuleOp must
-// contain the sizeof a pauli_word in its attributes.
+// contain the size of a pauli_word in its attributes.
 cudaq::cc::ArrayType genHostStringType(mlir::ModuleOp module);

 // Host side types for std::vector
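
The Factory.h comment above encodes a host-side convention: a std::string (and
thus a pauli_word) argument is handled as an opaque block of bytes whose size
is recorded once in the ModuleOp's attributes, so generated code can allocate
such a block and stride across an array of these objects without knowing the
string's layout. The standalone C++ sketch below is illustrative only, not
code from this patch; all names are hypothetical and sizeof stands in for the
recorded module attribute.

// Illustrative sketch only (not part of this patch): models the "block of
// bytes plus recorded size" convention described above.
#include <cstddef>
#include <memory>
#include <new>
#include <string>

int main() {
  constexpr std::size_t stringSize = sizeof(std::string); // the recorded size
  constexpr std::size_t n = 3;                            // number of strings

  // Raw storage for n strings; generated code strides by stringSize.
  alignas(std::string) unsigned char block[n * stringSize];
  for (std::size_t i = 0; i < n; ++i)
    new (block + i * stringSize) std::string("XYZI"); // construct in place

  // A launcher can now treat `block` as an array of n strings.
  auto *words = std::launder(reinterpret_cast<std::string *>(block));
  std::size_t total = 0;
  for (std::size_t i = 0; i < n; ++i)
    total += words[i].size();

  for (std::size_t i = 0; i < n; ++i)
    std::destroy_at(words + i); // destroy in place
  return total == n * 4 ? 0 : 1;
}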