diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
index 24e933117a..596381ee17 100644
--- a/include/cudaq/Optimizer/Builder/Factory.h
+++ b/include/cudaq/Optimizer/Builder/Factory.h
@@ -128,9 +128,13 @@ inline mlir::Type stateImplType(mlir::Type eleTy) {
   return cudaq::opt::factory::getPointerType(eleTy.getContext());
 }
-// Host side types for std::string and std::vector
+// Generate the host-side type for std::string. The result is the type of a
+// block of bytes and the length to allocate. This allows for the creation of
+// code to allocate a variable, stride across such a variable, etc. The
+// ModuleOp must contain the size of a pauli_word in its attributes.
+cudaq::cc::ArrayType genHostStringType(mlir::ModuleOp module);
-cudaq::cc::StructType stlStringType(mlir::MLIRContext *ctx);
+// Host side types for std::vector
 cudaq::cc::StructType stlVectorType(mlir::Type eleTy);
 //===----------------------------------------------------------------------===//
@@ -246,6 +250,9 @@ bool hasSRet(mlir::func::FuncOp funcOp);
 mlir::FunctionType toHostSideFuncType(mlir::FunctionType funcTy,
                                       bool addThisPtr, mlir::ModuleOp module);
+/// Convert device type, \p ty, to host side type.
+mlir::Type convertToHostSideType(mlir::Type ty, mlir::ModuleOp module);
+
 // Return `true` if the given type corresponds to a standard vector type
 // according to our convention.
 // The convention is a `ptr<struct<ptr<T>, ptr<T>, ptr<T>>>`.
diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index fa9ce53097..5884dbb39e 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -36,11 +36,16 @@ static constexpr const char getCudaqSizeFromTriple[] =
 // typically specialized to be bit packed).
 static constexpr const char stdvecBoolCtorFromInitList[] =
     "__nvqpp_initializer_list_to_vector_bool";
+
 // Convert a (likely packed) std::vector<bool> into a sequence of bytes, each
 // holding a boolean value.
 static constexpr const char stdvecBoolUnpackToInitList[] =
     "__nvqpp_vector_bool_to_initializer_list";
+// Free any temporary buffers used to hold std::vector<bool> data.
+static constexpr const char stdvecBoolFreeTemporaryLists[] =
+    "__nvqpp_vector_bool_free_temporary_initlists";
+
 // The internal data of the cudaq::state object must be `2**n` in length. This
 // function returns the value `n`.
 static constexpr const char getNumQubitsFromCudaqState[] =
diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h
index e65c05a857..4fc9405272 100644
--- a/include/cudaq/Optimizer/Builder/Runtime.h
+++ b/include/cudaq/Optimizer/Builder/Runtime.h
@@ -10,6 +10,16 @@
 #include "cudaq/Optimizer/Builder/Factory.h"
+//===----------------------------------------------------------------------===//
+//
+// Runtime helper functions are functions that will appear in the runtime
+// library (implementations are defined in either the headers or libraries in
+// the `runtime` directory). These helper functions must never be assumed to
+// be present on the device side, so they should only be used in host-side
+// code.
+//
+//===----------------------------------------------------------------------===//
+
 namespace cudaq::runtime {
 /// Prefix for all kernel entry functions.
@@ -52,4 +62,15 @@ static constexpr const char CudaqRegisterKernelName[] =
 static constexpr const char cudaqAHSPrefixName[] =
     "__analog_hamiltonian_kernel__";
+// Host-side helper functions for working with `cudaq::pauli_word` or a
+// `std::string`. These include both fully dynamic and binding-time (library
+// build time) helper functions.
+static constexpr const char sizeofStringAttrName[] = "cc.sizeof_string";
+static constexpr const char getPauliWordSize[] =
+    "_ZNK5cudaq10pauli_word11_nvqpp_sizeEv";
+static constexpr const char getPauliWordData[] =
+    "_ZNK5cudaq10pauli_word11_nvqpp_dataEv";
+static constexpr const char bindingGetStringData[] = "__nvqpp_getStringData";
+static constexpr const char bindingGetStringSize[] = "__nvqpp_getStringSize";
+
 } // namespace cudaq::runtime
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
index d8a5820abe..aa03aedc07 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
@@ -119,7 +119,12 @@ def cc_StructType : CCType<"Struct", "struct",
   ];
   let extraClassDeclaration = [{
+    // O(1)
     bool isEmpty() const { return getMembers().empty(); }
+
+    // O(n)
+    std::size_t getNumMembers() const { return getMembers().size(); }
+
     Type getMember(unsigned position) { return getMembers()[position]; }
   }];
 }
diff --git a/lib/Frontend/nvqpp/ASTBridge.cpp b/lib/Frontend/nvqpp/ASTBridge.cpp
index 806f3c6bde..2e4f1d810f 100644
--- a/lib/Frontend/nvqpp/ASTBridge.cpp
+++ b/lib/Frontend/nvqpp/ASTBridge.cpp
@@ -153,10 +153,10 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor<QPUCodeFinder> {
   using Base = clang::RecursiveASTVisitor<QPUCodeFinder>;
   explicit QPUCodeFinder(
       cudaq::EmittedFunctionsCollection &funcsToEmit, clang::CallGraph &cgb,
-      clang::ItaniumMangleContext *mangler,
+      clang::ItaniumMangleContext *mangler, ModuleOp module,
       std::unordered_map &customOperations)
       : functionsToEmit(funcsToEmit), callGraphBuilder(cgb), mangler(mangler),
-        customOperationNames(customOperations) {}
+        module(module), customOperationNames(customOperations) {}
   /// Add a kernel to the list of kernels to process.
   template
@@ -332,6 +332,25 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor<QPUCodeFinder> {
         tuplesAreReversed = !opt->isZero();
       }
     }
+    if (cudaq::isInNamespace(x, "cudaq") &&
+        cudaq::isInNamespace(x, "details") &&
+        x->getName().equals("_nvqpp_sizeof")) {
+      // This constexpr is the sizeof a pauli_word and a std::string.
+      auto loc = x->getLocation();
+      auto opt = x->getAnyInitializer()->getIntegerConstantExpr(
+          x->getASTContext(), &loc, false);
+      assert(opt && "must compute the sizeof a cudaq::pauli_word");
+      auto sizeofString = opt->getZExtValue();
+      auto sizeAttr = module->getAttr(cudaq::runtime::sizeofStringAttrName);
+      if (sizeAttr) {
+        assert(sizeofString == cast<IntegerAttr>(sizeAttr).getUInt());
+      } else {
+        auto *ctx = module.getContext();
+        auto i64Ty = IntegerType::get(ctx, 64);
+        module->setAttr(cudaq::runtime::sizeofStringAttrName,
+                        IntegerAttr::get(i64Ty, sizeofString));
+      }
+    }
     // The check to make sure that quantum data types are only used in kernels
     // is done here. This checks both variable declarations and parameters.
     if (quantumTypesNotAllowed)
@@ -357,6 +376,7 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor<QPUCodeFinder> {
   cudaq::EmittedFunctionsCollection &functionsToEmit;
   clang::CallGraph &callGraphBuilder;
   clang::ItaniumMangleContext *mangler;
+  ModuleOp module;
   std::unordered_map &customOperationNames;
   // A class that is being visited.
Need to run semantics checks on it if and
  // only if it has a quantum kernel.
@@ -648,7 +668,7 @@ void ASTBridgeAction::ASTBridgeConsumer::HandleTranslationUnit(
 bool ASTBridgeAction::ASTBridgeConsumer::HandleTopLevelDecl(
     clang::DeclGroupRef dg) {
-  QPUCodeFinder finder(functionsToEmit, callGraphBuilder, mangler,
+  QPUCodeFinder finder(functionsToEmit, callGraphBuilder, mangler, module.get(),
                        customOperationNames);
   // Loop over all decls, saving the function decls that are quantum kernels.
   for (const auto *decl : dg)
diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp
index 8c67f68283..efc6c889c9 100644
--- a/lib/Frontend/nvqpp/ConvertStmt.cpp
+++ b/lib/Frontend/nvqpp/ConvertStmt.cpp
@@ -331,7 +331,9 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) {
           ValueRange{heapCopy, dynSize});
     };
     IRBuilder irb(builder);
-    Value tySize = irb.getByteSizeOfType(loc, eleTy);
+    Value tySize;
+    if (!cudaq::cc::isDynamicType(eleTy))
+      tySize = irb.getByteSizeOfType(loc, eleTy);
     if (!tySize) {
       TODO_x(toLocation(x), x, mangler, "unhandled vector element type");
       return false;
diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp
index 421943c6c9..5bdd71e67e 100644
--- a/lib/Optimizer/Builder/Factory.cpp
+++ b/lib/Optimizer/Builder/Factory.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
@@ -305,12 +306,15 @@ cc::LoopOp factory::createMonotonicLoop(
   return loop;
 }
-cc::StructType factory::stlStringType(MLIRContext *ctx) {
+cc::ArrayType factory::genHostStringType(ModuleOp mod) {
+  auto *ctx = mod.getContext();
   auto i8Ty = IntegerType::get(ctx, 8);
-  auto ptrI8Ty = cc::PointerType::get(i8Ty);
-  auto i64Ty = IntegerType::get(ctx, 64);
-  auto padTy = cc::ArrayType::get(ctx, i8Ty, 16);
-  return cc::StructType::get(ctx, ArrayRef<Type>{ptrI8Ty, i64Ty, padTy});
+  auto sizeAttr = mod->getAttr(cudaq::runtime::sizeofStringAttrName);
+  if (sizeAttr) {
+    auto size = cast<IntegerAttr>(sizeAttr).getInt();
+    return cc::ArrayType::get(ctx, i8Ty, size);
+  }
+  return cc::ArrayType::get(ctx, i8Ty, sizeof(std::string));
 }
 // FIXME: We should get the underlying structure of a std::vector from the
@@ -321,6 +325,22 @@ cc::StructType factory::stlVectorType(Type eleTy) {
   return cc::StructType::get(ctx, ArrayRef<Type>{ptrTy, ptrTy, ptrTy});
 }
+// Note that this is the raw host type, where std::vector<bool> is distinct.
+// When converting to the device side, the distinction is deliberately removed,
+// making std::vector<bool> the same format as std::vector<T>.
+static cc::StructType stlHostVectorType(Type eleTy) {
+  MLIRContext *ctx = eleTy.getContext();
+  if (eleTy != IntegerType::get(ctx, 1)) {
+    // std::vector<T> where T != bool.
+    return factory::stlVectorType(eleTy);
+  }
+  // std::vector<bool> is a different type than std::vector<T>.
+  auto ptrTy = cc::PointerType::get(eleTy);
+  auto i8Ty = IntegerType::get(ctx, 8);
+  auto padout = cc::ArrayType::get(ctx, i8Ty, 32);
+  return cc::StructType::get(ctx, ArrayRef<Type>{ptrTy, padout});
+}
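[Reviewer note] For reference, the host-side layouts that stlVectorType and stlHostVectorType above model, restated as plain C++. This is an illustrative sketch, not code from this patch: the type names are invented, and the sizes assume a libstdc++-style ABI on a 64-bit target (which is exactly why sizeof(std::string) is captured as a module attribute rather than hard-coded).

#include <vector>

namespace sketch {
// std::vector<T> (T != bool) is modeled as a triple of pointers: start of
// data, end of data, and end of the allocation.
template <typename T>
struct HostVec {
  T *begin;
  T *end;
  T *allocEnd;
};
// std::vector<bool> is a distinct type: a pointer into a packed bitmap plus
// implementation bookkeeping, modeled here as a 32-byte pad (8 + 32 = 40).
struct HostVecBool {
  bool *data;
  char pad[32];
};
} // namespace sketch

// Holds for libstdc++ on x86-64; other ABIs may differ.
static_assert(sizeof(sketch::HostVec<int>) == sizeof(std::vector<int>), "");
static_assert(sizeof(sketch::HostVecBool) == sizeof(std::vector<bool>), "");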
// FIXME: Give these front-end names so we can disambiguate more types.
cc::StructType factory::getDynamicBufferType(MLIRContext *ctx) {
  auto ptrTy = cc::PointerType::get(IntegerType::get(ctx, 8));
@@ -342,24 +362,19 @@ Type factory::getSRetElementType(FunctionType funcTy) {
   return funcTy.getResult(0);
 }
-static Type convertToHostSideType(Type ty) {
+Type factory::convertToHostSideType(Type ty, ModuleOp mod) {
   if (auto memrefTy = dyn_cast(ty))
-    return convertToHostSideType(
-        factory::stlVectorType(memrefTy.getElementType()));
+    return stlHostVectorType(
+        convertToHostSideType(memrefTy.getElementType(), mod));
   if (isa(ty))
     return cc::PointerType::get(IntegerType::get(ty.getContext(), 8));
-  if (auto memrefTy = dyn_cast(ty)) {
-    // `pauli_word` is an object with a std::vector in the header files at
-    // present. This data type *must* be updated if it becomes a std::string
-    // once again.
-    return convertToHostSideType(
-        factory::stlVectorType(IntegerType::get(ty.getContext(), 8)));
-  }
+  if (auto csTy = dyn_cast<cc::CharspanType>(ty))
+    return genHostStringType(mod);
   auto *ctx = ty.getContext();
   if (auto structTy = dyn_cast<cc::StructType>(ty)) {
     SmallVector<Type> newMembers;
     for (auto mem : structTy.getMembers())
-      newMembers.push_back(convertToHostSideType(mem));
+      newMembers.push_back(convertToHostSideType(mem, mod));
     if (structTy.getName())
       return cc::StructType::get(ctx, structTy.getName(), newMembers,
                                  structTy.getBitSize(), structTy.getAlignment(),
@@ -579,7 +594,7 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
     // returned via a sret argument in the first position. When this argument
     // is added, the this pointer becomes the second argument. Both are opaque
     // pointers at this point.
-    auto eleTy = convertToHostSideType(getSRetElementType(funcTy));
+    auto eleTy = convertToHostSideType(getSRetElementType(funcTy), module);
     inputTys.push_back(cc::PointerType::get(eleTy));
     hasSRet = true;
   } else {
@@ -595,7 +610,7 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
   // Add all the explicit (not hidden) arguments after the hidden ones.
   for (auto kernelTy : funcTy.getInputs()) {
-    auto hostTy = convertToHostSideType(kernelTy);
+    auto hostTy = convertToHostSideType(kernelTy, module);
     if (auto strTy = dyn_cast<cc::StructType>(hostTy)) {
       // On x86_64 and aarch64, a struct that is smaller than 128 bits may be
       // passed in registers as separate arguments. See classifyArgumentType()
@@ -636,6 +651,9 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
       }
       // Pass a struct as a byval pointer.
      hostTy = cc::PointerType::get(hostTy);
+    } else if (isa<cc::ArrayType>(hostTy)) {
+      // Pass a raw data block as a pointer. (It's a struct passed as a blob.)
+      hostTy = cc::PointerType::get(hostTy);
     }
     inputTys.push_back(hostTy);
   }
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index 1774475b1b..1826241eaa 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -49,6 +49,18 @@ inline bool operator<(const IntrinsicCode &icode, const IntrinsicCode &jcode) {
 /// well as prototypes for LLVM intrinsics and C library calls that are used by
 /// the compiler. The table should be kept in sorted order.
 static constexpr IntrinsicCode intrinsicTable[] = {
+    // The following pauli_word helper functions are only available on the
+    // host side. They ought not be called in kernel code.
+    {cudaq::runtime::getPauliWordData,
+     {},
+     "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_dataEv(%pw : "
+     "!cc.ptr) -> !cc.ptr"},
+    {cudaq::runtime::getPauliWordSize,
+     {cudaq::runtime::getPauliWordData, cudaq::runtime::bindingGetStringData,
+      cudaq::runtime::bindingGetStringSize},
+     "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_sizeEv(%pw : "
+     "!cc.ptr) -> i64"},
+
    // Initialize a (preallocated) buffer (the first parameter) with i64 values
    // on the semi-open range `[0..n)` where `n` is the second parameter.
    {cudaq::runtime::getLinkableKernelKey,
@@ -292,6 +304,15 @@ static constexpr IntrinsicCode intrinsicTable[] = {
  func.func private @__nvqpp_getStateVectorLength_fp64(%p : i64, %o : i64) -> i64
)#"},
+    // Quasi-portable entry points for use with non-C++ front ends (Python).
+    {cudaq::runtime::bindingGetStringData,
+     {},
+     "func.func private @__nvqpp_getStringData(%p: !cc.ptr) -> "
+     "!cc.ptr"},
+    {cudaq::runtime::bindingGetStringSize,
+     {},
+     "func.func private @__nvqpp_getStringSize(%p: !cc.ptr) -> i64"},
+
    // __nvqpp_initializer_list_to_vector_bool
    {cudaq::stdvecBoolCtorFromInitList,
     {},
     R"#(
@@ -307,11 +328,17 @@ static constexpr IntrinsicCode intrinsicTable[] = {
    return %0 : !cc.ptr
  })#"},
+    // __nvqpp_vector_bool_free_temporary_initlists
+    {cudaq::stdvecBoolFreeTemporaryLists,
+     {},
+     R"#(
+  func.func private @__nvqpp_vector_bool_free_temporary_initlists(!cc.ptr) -> ())#"},
+
    // __nvqpp_vector_bool_to_initializer_list
    {cudaq::stdvecBoolUnpackToInitList,
     {},
     R"#(
-  func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) -> ())#"},
+  func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) -> ())#"},
    {"__nvqpp_zeroDynamicResult", {}, R"#(
  func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> {
diff --git a/lib/Optimizer/Dialect/CC/CCOps.cpp b/lib/Optimizer/Dialect/CC/CCOps.cpp
index 9d539640ac..8221aa5e81 100644
--- a/lib/Optimizer/Dialect/CC/CCOps.cpp
+++ b/lib/Optimizer/Dialect/CC/CCOps.cpp
@@ -106,6 +106,10 @@ Value cudaq::cc::getByteSizeOfType(OpBuilder &builder, Location loc, Type ty,
            return builder.create(loc, builder.getI64Type(), v, scale);
          })
+          .Case([&](cudaq::cc::SpanLikeType) -> Value {
+            // Uniformly on the device side: {ptr, i64}
+            return createInt(16);
+          })
          .Default({});
}
diff --git a/lib/Optimizer/Dialect/CC/CCTypes.cpp b/lib/Optimizer/Dialect/CC/CCTypes.cpp
index 816695e173..0543a12a51 100644
--- a/lib/Optimizer/Dialect/CC/CCTypes.cpp
+++ b/lib/Optimizer/Dialect/CC/CCTypes.cpp
@@ -158,7 +158,7 @@ Type cc::SpanLikeType::getElementType() const {
}
bool isDynamicType(Type ty) {
-  if (isa<cc::StdvecType>(ty))
+  if (isa<cc::SpanLikeType>(ty))
    return true;
  if (auto strTy = dyn_cast<cc::StructType>(ty)) {
    for (auto memTy : strTy.getMembers())
diff --git a/lib/Optimizer/Transforms/DecompositionPatterns.cpp b/lib/Optimizer/Transforms/DecompositionPatterns.cpp
index de32b86e45..bdf8e9244c 100644
--- a/lib/Optimizer/Transforms/DecompositionPatterns.cpp
+++ b/lib/Optimizer/Transforms/DecompositionPatterns.cpp
@@ -362,6 +362,9 @@ struct ExpPauliDecomposition : public OpRewritePattern {
        auto strAttr = cast<StringAttr>(attr.value());
        optPauliWordStr = strAttr.getValue();
      }
+    } else if (auto lit = addrOp.getDefiningOp<
+                   cudaq::cc::CreateStringLiteralOp>()) {
+      optPauliWordStr = lit.getStringLiteral();
    }
  }
}
@@ -369,7 +372,7 @@ struct ExpPauliDecomposition : public OpRewritePattern {
  // Assert that we have a constant known pauli word
  if (!optPauliWordStr.has_value())
-    return failure();
+
return expPauliOp.emitOpError("cannot determine pauli word string"); auto pauliWordStr = optPauliWordStr.value(); diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 2e45c8df96..69f4e93681 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -15,10 +15,10 @@ #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "clang/Basic/Version.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ToolOutputFile.h" -#include "mlir/Analysis/DataLayoutAnalysis.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/IR/Diagnostics.h" #include "mlir/Transforms/Passes.h" @@ -48,283 +48,1149 @@ static bool isCodegenArgumentGather(std::size_t kind) { return kind == 0 || kind == 2; } -/// This pass adds a `.thunk` function and a rewritten C++ host -/// side (mangled) stub to the code for every entry-point kernel in the module. -/// It may also generate a `.argsCreator` function. Finally, it -/// creates registration hooks for the CUDA-Q runtime to be able to find the -/// kernel by name and, as appropriate, the `.argsCreator` -/// function. -namespace { -class GenerateKernelExecution - : public cudaq::opt::impl::GenerateKernelExecutionBase< - GenerateKernelExecution> { -public: - using GenerateKernelExecutionBase::GenerateKernelExecutionBase; +static bool isStateType(Type ty) { + if (auto ptrTy = dyn_cast(ty)) + return isa(ptrTy.getElementType()); + return false; +} - /// Creates the function signature for a thunk function. The signature is - /// always the same for all thunk functions. - /// - /// Every thunk function has an identical signature, making it callable from a - /// generic "kernel launcher" in the CUDA-Q runtime. - /// - /// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. - /// - /// The first argument is a pointer to a data buffer that encodes all the - /// arguments (and static return) values to (and from) the kernel in the - /// pointer-free encoding. The second argument indicates if this call is to a - /// remote process (if true). The result is a pointer and size (span) if the - /// kernel returns a dynamically sized result, otherwise it will be - /// `{nullptr, 0}`. It is the responsibility of calling code to free any - /// dynamic result buffer(s) and convert those to `std::vector` objects. - FunctionType getThunkType(MLIRContext *ctx) { - auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); - return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, - {cudaq::opt::factory::getDynamicBufferType(ctx)}); +/// Creates the function signature for a thunk function. The signature is always +/// the same for all thunk functions. +/// +/// Every thunk function has an identical signature, making it callable from a +/// generic "kernel launcher" in the CUDA-Q runtime. +/// +/// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. +/// +/// The first argument is a pointer to a data buffer that encodes all the +/// arguments (and static return) values to (and from) the kernel in the +/// pointer-free encoding. The second argument indicates if this call is to a +/// remote process (if true). The result is a pointer and size (span) if the +/// kernel returns a dynamically sized result, otherwise it will be +/// `{nullptr, 0}`. 
It is the responsibility of calling code to free any +/// dynamic result buffer(s) and convert those to `std::vector` objects. +static FunctionType getThunkType(MLIRContext *ctx) { + auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); + return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, + {cudaq::opt::factory::getDynamicBufferType(ctx)}); +} + +/// Generate code to read the length from a host-side string object. (On the +/// device side, a string is encoded as a span.) The length of a string is the +/// number of bytes of data. +/// +/// In order to handle a std::string value it is assumed to be laid out in +/// memory as the following structure. +/// +/// +/// struct vector { +/// i8* data; +/// i64 length; +/// [i8 x 16] inlinedata; +/// }; +/// +/// +/// This implementation does \e not support wide characters. +static Value genStringLength(Location loc, OpBuilder &builder, Value stringArg, + ModuleOp module) { + Type stringTy = stringArg.getType(); + assert(isa(stringTy) && + isa( + cast(stringTy).getElementType()) && + "host side string expected"); + auto callArg = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI8Type()), stringArg); + StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) + ? cudaq::runtime::getPauliWordSize + : cudaq::runtime::bindingGetStringSize; + auto lenRes = builder.create(loc, builder.getI64Type(), + helperName, ValueRange{callArg}); + return lenRes.getResult(0); +} + +/// Generate code that computes the size in bytes of a `std::vector` array +/// in the same way as a `std::vector::size()`. This assumes the vector is +/// laid out in memory as the following structure. +/// +/// +/// struct vector { +/// T* begin; +/// T* end; +/// T* allocated_end; +/// }; +/// +/// +/// The first two elements are pointers to the beginning and end of the data +/// in the vector, respectively. This data is kept in a contiguous memory +/// range. The following implementation follows what Clang CodeGen produces +/// for `std::vector::size()` without the final `sdiv` op that divides the +/// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required +/// memory size for the vector data itself in \e bytes. +static Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { + auto vecTy = cast(vecArg.getType()); + auto vecStructTy = cast(vecTy.getElementType()); + assert(vecStructTy.getNumMembers() == 3 && + vecStructTy.getMember(0) == vecStructTy.getMember(1) && + vecStructTy.getMember(0) == vecStructTy.getMember(2) && + "host side vector expected"); + auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); + + // Get the pointer to the pointer of the end of the array + Value endPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{1}); + + // Get the pointer to the pointer of the beginning of the array + Value beginPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{0}); + + // Load to a T* + endPtr = builder.create(loc, endPtr); + beginPtr = builder.create(loc, beginPtr); + + // Map those pointers to integers + Type i64Ty = builder.getI64Type(); + Value endInt = builder.create(loc, i64Ty, endPtr); + Value beginInt = builder.create(loc, i64Ty, beginPtr); + + // Subtracting these will give us the size in bytes. 
+  return builder.create(loc, endInt, beginInt);
+}
+
+static Value genComputeReturnOffset(Location loc, OpBuilder &builder,
+                                    FunctionType funcTy,
+                                    cudaq::cc::StructType msgStructTy) {
+  if (funcTy.getNumResults() == 0)
+    return builder.create(loc, NoResultOffset, 64);
+  std::int32_t numKernelArgs = funcTy.getNumInputs();
+  auto i64Ty = builder.getI64Type();
+  return builder.create(
+      loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs});
+}
+
+/// Create a function that determines the return value offset in the message
+/// buffer.
+static void genReturnOffsetFunction(Location loc, OpBuilder &builder,
+                                    FunctionType devKernelTy,
+                                    cudaq::cc::StructType msgStructTy,
+                                    const std::string &classNameStr) {
+  auto *ctx = builder.getContext();
+  auto i64Ty = builder.getI64Type();
+  auto funcTy = FunctionType::get(ctx, {}, {i64Ty});
+  auto returnOffsetFunc =
+      builder.create(loc, classNameStr + ".returnOffset", funcTy);
+  OpBuilder::InsertionGuard guard(builder);
+  auto *entry = returnOffsetFunc.addEntryBlock();
+  builder.setInsertionPointToStart(entry);
+  auto result = genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy);
+  builder.create(loc, result);
+}
+
+static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) {
+  return cudaq::cc::PointerType::get(
+      cudaq::cc::ArrayType::get(builder.getI8Type()));
+}
+
+static cudaq::cc::PointerType getPointerToPointerType(OpBuilder &builder) {
+  return cudaq::cc::PointerType::get(
+      cudaq::cc::PointerType::get(builder.getI8Type()));
+}
+
+static bool isDynamicSignature(FunctionType devFuncTy) {
+  for (auto t : devFuncTy.getInputs())
+    if (cudaq::cc::isDynamicType(t))
+      return true;
+  for (auto t : devFuncTy.getResults())
+    if (cudaq::cc::isDynamicType(t))
+      return true;
+  return false;
+}
+
+static std::pair<Value, Value>
+genByteSizeAndElementCount(Location loc, OpBuilder &builder, ModuleOp module,
+                           Type eleTy, Value size, Value arg, Type t) {
+  // If this is a vector<vector<...>>, convert the bytes of vector to bytes of
+  // length (i64).
+  if (auto sty = dyn_cast<cudaq::cc::StdvecType>(eleTy)) {
+    auto eTy = cast<cudaq::cc::PointerType>(arg.getType()).getElementType();
+    auto fTy = cast<cudaq::cc::StructType>(eTy).getMember(0);
+    auto tTy = cast<cudaq::cc::PointerType>(fTy).getElementType();
+    auto i64Ty = builder.getI64Type();
+    auto eleSize = builder.create(loc, i64Ty, tTy);
+    Value count = builder.create(loc, size, eleSize);
+    auto ate = builder.create(loc, 8, 64);
+    size = builder.create(loc, count, ate);
+    return {size, count};
  }
-  /// Add LLVM code with the OpBuilder that computes the size in bytes
-  /// of a `std::vector<T>` array in the same way as a `std::vector<T>::size()`.
-  /// This assumes the vector is laid out in memory as the following structure.
-  ///
-  ///   struct vector {
-  ///     T* begin;
-  ///     T* end;
-  ///     T* allocated_end;
-  ///   };
-  ///
-  /// The first two elements are pointers to the beginning and end of the data
-  /// in the vector, respectively. This data is kept in a contiguous memory
-  /// range. The following implementation follows what Clang CodeGen produces
-  /// for `std::vector<T>::size()` without the final `sdiv` op that divides the
-  /// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required
-  /// memory size for the vector data itself in \e bytes.
-  ///
-  /// In order to handle a std::string value it is assumed to be laid out in
-  /// memory as the following structure.
-  ///
-  ///   struct vector {
-  ///     i8* data;
-  ///     i64 length;
-  ///     [i8 x 16] inlinedata;
-  ///   };
-  ///
-  /// In the string case, the size can just be read from the data structure.
-  Value getVectorSize(Location loc, OpBuilder &builder,
-                      cudaq::cc::PointerType ptrTy, Value arg) {
-    // Create the i64 type
-    Type i64Ty = builder.getI64Type();
+  // If this is a vector<string>, convert the bytes of string to bytes of
+  // length (i64).
+  if (isa<cudaq::cc::CharspanType>(eleTy)) {
+    auto arrTy = cudaq::opt::factory::genHostStringType(module);
+    auto words =
+        builder.create(loc, arrTy.getSize() / 8, 64);
+    size = builder.create(loc, size, words);
+    auto ate = builder.create(loc, 8, 64);
+    Value count = builder.create(loc, size, ate);
+    return {size, count};
+  }
-    // We're given ptr<vector<T>>, get that struct type (struct<T*, T*, T*>)
-    auto inpStructTy = cast<cudaq::cc::StructType>(ptrTy.getElementType());
-    if (inpStructTy.getMember(1) == i64Ty) {
-      // This is a string, so just read the length out.
-      auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty);
-      auto lenPtr = builder.create(
-          loc, ptrI64Ty, arg, SmallVector{1});
-      return builder.create(loc, lenPtr);
-    }
+  // If this is a vector<struct<...>>, convert the bytes of struct to bytes of
+  // struct with converted members.
+  if (isa<cudaq::cc::StructType>(eleTy)) {
+    auto vecTy = cast<cudaq::cc::PointerType>(arg.getType()).getElementType();
+    auto vecEleRefTy = cast<cudaq::cc::StructType>(vecTy).getMember(0);
+    auto vecEleTy = cast<cudaq::cc::PointerType>(vecEleRefTy).getElementType();
+    auto i64Ty = builder.getI64Type();
+    auto hostStrSize =
+        builder.create(loc, i64Ty, vecEleTy);
+    Value count = builder.create(loc, size, hostStrSize);
+    Type packedTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+    auto packSize = builder.create(loc, i64Ty, packedTy);
+    size = builder.create(loc, count, packSize);
+    return {size, count};
+  }
+  return {};
+}
-    // For the following GEP calls, we'll expect them to return T**
-    auto ptrTtype = cudaq::cc::PointerType::get(inpStructTy.getMember(0));
+static bool isStdVectorBool(Type ty) {
+  auto stdvecTy = dyn_cast<cudaq::cc::StdvecType>(ty);
+  return stdvecTy &&
+         (stdvecTy.getElementType() == IntegerType::get(ty.getContext(), 1));
+}
-    // Get the pointer to the pointer of the end of the array
-    Value endPtr = builder.create(
-        loc, ptrTtype, arg, SmallVector{1});
+/// Recursively check if \p ty contains a `std::vector<bool>`.
+static bool hasStdVectorBool(Type ty) {
+  if (isStdVectorBool(ty))
+    return true;
+  if (auto sty = dyn_cast<cudaq::cc::SpanLikeType>(ty))
+    return hasStdVectorBool(sty.getElementType());
+  if (auto sty = dyn_cast<cudaq::cc::StructType>(ty))
+    for (auto mem : sty.getMembers())
+      if (hasStdVectorBool(mem))
+        return true;
+  return false;
+}
-    // Get the pointer to the pointer of the beginning of the array
-    Value beginPtr = builder.create(
-        loc, ptrTtype, arg, SmallVector{0});
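[Reviewer note] A worked example of the size/count conversion that genByteSizeAndElementCount above performs, under the same 64-bit layout assumptions; the constants here are illustrative, not taken from the patch.

#include <cstddef>
#include <cstdint>

// For a host std::vector<std::vector<int>> holding 3 inner vectors, the outer
// byte span is end - begin = 3 * sizeof(std::vector<int>) = 72 bytes.
constexpr std::size_t hostVecBytes = 24; // triple-pointer vector, 64-bit ABI
constexpr std::size_t outerSpanBytes = 3 * hostVecBytes;
// Dividing by the host element size recovers the number of inner vectors...
constexpr std::size_t count = outerSpanBytes / hostVecBytes; // == 3
// ...and the message buffer then reserves one i64 length slot per element.
constexpr std::size_t lengthSlotBytes = count * sizeof(std::int64_t); // == 24
static_assert(count == 3 && lengthSlotBytes == 24, "");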
+// The host-side type of a `std::vector<bool>` is distinct from the transient
+// type for a `std::vector<bool>`. The former is a unique data type with a size
+// of 40 bytes. The latter is identical to `std::vector<T>` (which has a size
+// of 24 bytes).
+static Type convertToTransientType(Type ty, ModuleOp mod) {
+  if (isStdVectorBool(ty)) {
+    auto *ctx = ty.getContext();
+    return cudaq::opt::factory::stlVectorType(IntegerType::get(ctx, 1));
+  }
+  if (auto sty = dyn_cast<cudaq::cc::StdvecType>(ty))
+    return cudaq::opt::factory::stlVectorType(
+        convertToTransientType(sty.getElementType(), mod));
+  if (auto sty = dyn_cast<cudaq::cc::StructType>(ty)) {
+    SmallVector<Type> newMems;
+    for (auto mem : sty.getMembers())
+      newMems.push_back(convertToTransientType(mem, mod));
+    auto *ctx = ty.getContext();
+    return cudaq::cc::StructType::get(ctx, newMems);
+  }
+  return cudaq::opt::factory::convertToHostSideType(ty, mod);
+}
+
+static std::pair<Value, bool>
+convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module,
+                        Value arg, Type ty, Value heapTracker,
+                        std::optional<Value> preallocated = std::nullopt) {
+  // If we are here, `ty` must be a `std::vector<bool>` or recursively contain
+  // a `std::vector<bool>`.
+
+  // Handle `std::vector<bool>`.
+  if (isStdVectorBool(ty)) {
+    auto stdvecTy = cast<cudaq::cc::StdvecType>(ty);
+    Type stdvecHostTy =
+        cudaq::opt::factory::stlVectorType(stdvecTy.getElementType());
+    Value tmp = preallocated.has_value()
+                    ? *preallocated
+                    : builder.create(loc, stdvecHostTy);
+    builder.create(loc, std::nullopt,
+                   cudaq::stdvecBoolUnpackToInitList,
+                   ArrayRef<Value>{tmp, arg, heapTracker});
+    return {tmp, true};
+  }
+
+  // Handle `std::vector<T>` where `T != bool`.
+  if (auto sty = dyn_cast<cudaq::cc::StdvecType>(ty)) {
+    // arg is a std::vector<T>.
+    // Its type must be ptr<struct<ptr<T>, ptr<T>, ptr<T>>>.
+    auto seleTy = sty.getElementType();
+    auto ptrArgTy = cast<cudaq::cc::PointerType>(arg.getType());
+    auto argVecTy = cast<cudaq::cc::StructType>(ptrArgTy.getElementType());
+    auto subVecPtrTy = cudaq::cc::PointerType::get(argVecTy.getMember(0));
+    // Compute the pointer to the pointer to the first T element.
+    auto inputRef = builder.create(
+        loc, subVecPtrTy, arg, ArrayRef{0});
+    auto startInput = builder.create(loc, inputRef);
+    auto startTy = startInput.getType();
+    auto subArrTy = cudaq::cc::ArrayType::get(
+        cast<cudaq::cc::PointerType>(startTy).getElementType());
+    auto input = builder.create(
+        loc, cudaq::cc::PointerType::get(subArrTy), startInput);
+    auto transientTy = convertToTransientType(sty, module);
+    auto tmp = [&]() -> Value {
+      if (preallocated)
+        return builder.create(
+            loc, cudaq::cc::PointerType::get(transientTy), *preallocated);
+      return builder.create(loc, transientTy);
+    }();
+    Value sizeDelta = genVectorSize(loc, builder, arg);
+    auto count = [&]() -> Value {
+      if (cudaq::cc::isDynamicType(seleTy)) {
+        auto p = genByteSizeAndElementCount(loc, builder, module, seleTy,
+                                            sizeDelta, arg, sty);
+        return p.second;
+      }
+      auto sizeEle = builder.create(
+          loc, builder.getI64Type(), seleTy);
+      return builder.create(loc, sizeDelta, sizeEle);
+    }();
+    auto transEleTy = cast<cudaq::cc::StructType>(transientTy).getMember(0);
+    auto dataTy = cast<cudaq::cc::PointerType>(transEleTy).getElementType();
+    auto sizeTransientTy =
+        builder.create(loc, builder.getI64Type(), dataTy);
+    Value sizeInBytes =
+        builder.create(loc, count, sizeTransientTy);
+
+    // Create a new vector that we'll store the converted data into.
+    Value byteBuffer = builder.create(
+        loc, builder.getI8Type(), sizeInBytes);
+
+    // Initialize the temporary vector.
+    auto vecEleTy = cudaq::cc::PointerType::get(transEleTy);
+    auto tmpBegin = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{0});
+    auto bufferBegin =
+        builder.create(loc, transEleTy, byteBuffer);
+    builder.create(loc, bufferBegin, tmpBegin);
+    auto tmpEnd = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{1});
+    auto byteBufferEnd = builder.create(
+        loc, cudaq::cc::PointerType::get(builder.getI8Type()), byteBuffer,
+        ArrayRef{sizeInBytes});
+    auto bufferEnd =
+        builder.create(loc, transEleTy, byteBufferEnd);
+    builder.create(loc, bufferEnd, tmpEnd);
+    auto tmpEnd2 = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{2});
+    builder.create(loc, bufferEnd, tmpEnd2);
+
+    // Loop over each element in the outer vector and initialize it to the
+    // inner vector value. (The data may be heap allocated.)
+    auto transientEleTy = convertToTransientType(seleTy, module);
+    auto transientBufferTy =
+        cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(transientEleTy));
+    auto buffer =
+        builder.create(loc, transientBufferTy, byteBuffer);
-    // Load to a T*
-    endPtr = builder.create(loc, endPtr);
-    beginPtr = builder.create(loc, beginPtr);
+    cudaq::opt::factory::createInvariantLoop(
+        builder, loc, count,
+        [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+          Value i = block.getArgument(0);
+          Value inp = builder.create(
+              loc, startTy, input, ArrayRef{i});
+          auto currentVector = builder.create(
+              loc, cudaq::cc::PointerType::get(transientEleTy), buffer,
+              ArrayRef{i});
+          convertAllStdVectorBool(loc, builder, module, inp, seleTy,
+                                  heapTracker, currentVector);
+        });
+    return {tmp, true};
+  }
-    // Map those pointers to integers
-    Value endInt = builder.create(loc, i64Ty, endPtr);
-    Value beginInt = builder.create(loc, i64Ty, beginPtr);
+  // Handle `struct { ... };`.
+  if (auto sty = dyn_cast<cudaq::cc::StructType>(ty)) {
+    auto bufferTy = convertToTransientType(ty, module);
+    auto argPtrTy = cast<cudaq::cc::PointerType>(arg.getType());
+    auto argStrTy = cast<cudaq::cc::StructType>(argPtrTy.getElementType());
+
+    // If a struct was preallocated, use it. Otherwise, create a new struct
+    // that we'll store the converted data into.
+    auto buffer = [&]() -> Value {
+      if (preallocated)
+        return builder.create(
+            loc, cudaq::cc::PointerType::get(bufferTy), *preallocated);
+      return builder.create(loc, bufferTy);
+    }();
-    // Subtracting these will give us the size in bytes.
-    return builder.create(loc, endInt, beginInt);
+    // Loop over each element. Replace each with the converted value.
+    for (auto iter : llvm::enumerate(sty.getMembers())) {
+      std::int32_t i = iter.index();
+      Type memTy = iter.value();
+      auto fromPtr = builder.create(
+          loc, cudaq::cc::PointerType::get(argStrTy.getMember(i)), arg,
+          ArrayRef{i});
+      auto transientTy = convertToTransientType(memTy, module);
+      Value toPtr = builder.create(
+          loc, cudaq::cc::PointerType::get(transientTy), buffer,
+          ArrayRef{i});
+      convertAllStdVectorBool(loc, builder, module, fromPtr, memTy, heapTracker,
+                              toPtr);
+    }
+    return {buffer, true};
  }
+  return {arg, false};
+}
+
+static std::pair<Value, bool>
+unpackAnyStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module,
+                       Value arg, Type ty, Value heapTracker) {
+  if (hasStdVectorBool(ty))
+    return convertAllStdVectorBool(loc, builder, module, arg, ty, heapTracker);
+  return {arg, false};
+}
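[Reviewer note] What the unpacking helper is expected to do at runtime, sketched in C++. This is a plausible analogue of __nvqpp_vector_bool_to_initializer_list, not the actual runtime implementation: each packed bit becomes one byte so a std::vector<bool> can be marshaled like any other vector, and the temporary is recorded so __nvqpp_vector_bool_free_temporary_initlists can release it later.

#include <cstddef>
#include <vector>

struct FakeVec { // the {begin, end, allocEnd} triple the compiler expects
  char *begin, *end, *allocEnd;
};

void unpackBools(FakeVec *out, const std::vector<bool> &in, void **tracker) {
  std::size_t n = in.size();
  char *bytes = new char[n]; // temporary; real code would chain it on *tracker
  for (std::size_t i = 0; i < n; ++i)
    bytes[i] = in[i] ? 1 : 0; // one byte per packed bit
  out->begin = bytes;
  out->end = out->allocEnd = bytes + n;
  (void)tracker; // tracking elided in this sketch
}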
-  /// Helper that converts a byte length to a length of i64.
-  Value convertLengthBytesToLengthI64(Location loc, OpBuilder &builder,
-                                      Value length) {
-    auto eight = builder.create(loc, 8, 64);
-    return builder.create(loc, length, eight);
+// Take the list of host-side arguments and device side argument types and zip
+// them together logically with the position. Generates any fixup code that's
+// needed, like when the device side uses a pair of arguments for a single
+// logical device side argument. May drop some arguments on the floor if they
+// cannot be encoded.
+template <bool argsAreReferences>
+static SmallVector<std::tuple<unsigned, Value, Type>>
+zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ModuleOp module,
+                            ValueRange args, TypeRange types,
+                            Value heapTracker) {
+  SmallVector<std::tuple<unsigned, Value, Type>> result;
+  if constexpr (argsAreReferences) {
+    // Simple case: the number of args must be equal to the types.
+    assert(args.size() == types.size() &&
+           "arguments and types must have same size");
+    for (auto iter : llvm::enumerate(llvm::zip(args, types))) {
+      // Remove the reference.
+      Value v = std::get<0>(iter.value());
+      Type ty = std::get<1>(iter.value());
+      if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) ||
+            isa(ty)))
+        v = builder.create(loc, v);
+      // Python will pass a std::vector<bool> to us here. Unpack it.
+      auto pear =
+          unpackAnyStdVectorBool(loc, builder, module, v, ty, heapTracker);
+      v = pear.first;
+      result.emplace_back(iter.index(), v, ty);
+    }
+  } else /*constexpr*/ {
+    // In this case, we *may* have logical arguments that are passed in pairs.
+    auto *ctx = builder.getContext();
+    auto *parent = builder.getBlock()->getParentOp();
+    auto module = parent->getParentOfType<ModuleOp>();
+    auto lastArg = args.end();
+    auto tyIter = types.begin();
+    unsigned argPos = 0;
+    for (auto argIter = args.begin(); argIter != lastArg;
+         ++argIter, ++tyIter, ++argPos) {
+      assert(tyIter != types.end());
+      Type devTy = *tyIter;
+
+      // std::vector<bool> isn't really a std::vector<>. Use the helper
+      // function to unpack it so it looks like any other vector.
+      auto pear = unpackAnyStdVectorBool(loc, builder, module, *argIter, devTy,
+                                         heapTracker);
+      if (pear.second) {
+        result.emplace_back(argPos, pear.first, devTy);
+        continue;
+      }
+
+      // Check for a struct passed in a pair of arguments.
+      if (isa<cudaq::cc::StructType>(devTy) &&
+          !isa<cudaq::cc::PointerType>((*argIter).getType()) &&
+          cudaq::opt::factory::isX86_64(module) &&
+          cudaq::opt::factory::structUsesTwoArguments(devTy)) {
+        auto first = *argIter++;
+        auto second = *argIter;
+        // TODO: Investigate if it's correct to assume the register layout
+        // will match the memory layout of the small struct.
+        auto pairTy = cudaq::cc::StructType::get(
+            ctx, ArrayRef<Type>{first.getType(), second.getType()});
+        auto tmp = builder.create(loc, pairTy);
+        auto tmp1 = builder.create(
+            loc, cudaq::cc::PointerType::get(first.getType()), tmp);
+        builder.create(loc, first, tmp1);
+        auto tmp2 = builder.create(
+            loc, cudaq::cc::PointerType::get(second.getType()), tmp,
+            ArrayRef{1});
+        builder.create(loc, second, tmp2);
+        auto devPtrTy = cudaq::cc::PointerType::get(devTy);
+        Value devVal = builder.create(loc, devPtrTy, tmp);
+        if (!cudaq::cc::isDynamicType(devTy))
+          devVal = builder.create(loc, devVal);
+        result.emplace_back(argPos, devVal, devTy);
+        continue;
+      }
+
+      // Is this a static struct passed as a byval pointer?
+ if (isa(devTy) && + isa((*argIter).getType()) && + !cudaq::cc::isDynamicType(devTy)) { + Value devVal = builder.create(loc, *argIter); + result.emplace_back(argPos, devVal, devTy); + continue; + } + result.emplace_back(argPos, *argIter, devTy); + } } + return result; +} + +static Value descendThroughDynamicType(Location loc, OpBuilder &builder, + ModuleOp module, Type ty, Value addend, + Value arg, Value tmp) { + auto i64Ty = builder.getI64Type(); + Value tySize = + TypeSwitch(ty) + // A char span is dynamic, but it is not recursively dynamic. Just + // read the length of the string out. + .Case([&](cudaq::cc::CharspanType t) -> Value { + return genStringLength(loc, builder, arg, module); + }) + // A std::vector is dynamic and may be recursive dynamic as well. + .Case([&](cudaq::cc::StdvecType t) -> Value { + // Compute the byte span of the vector. + Value size = genVectorSize(loc, builder, arg); + auto eleTy = t.getElementType(); + if (!cudaq::cc::isDynamicType(eleTy)) + return size; + + // Otherwise, we have a recursively dynamic case. + auto [bytes, count] = genByteSizeAndElementCount( + loc, builder, module, eleTy, size, arg, t); + assert(count && "vector must have elements"); + size = bytes; + + // At this point, arg is a known vector of elements of dynamic + // type, so walk over the vector and recurse on each element. + // `size` is already the proper size of the lengths of each of the + // elements in turn. + builder.create(loc, size, tmp); + auto ptrTy = cast(arg.getType()); + auto strTy = cast(ptrTy.getElementType()); + auto memTy = cast(strTy.getMember(0)); + auto arrTy = + cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(memTy.getElementType()))); + auto castPtr = builder.create(loc, arrTy, arg); + auto castArg = builder.create(loc, castPtr); + auto castPtrTy = + cudaq::cc::PointerType::get(memTy.getElementType()); + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + auto ai = builder.create( + loc, castPtrTy, castArg, + ArrayRef{i}); + auto tmpVal = builder.create(loc, tmp); + Value innerSize = descendThroughDynamicType( + loc, builder, module, eleTy, tmpVal, ai, tmp); + builder.create(loc, innerSize, tmp); + }); + return builder.create(loc, tmp); + }) + // A struct can be dynamic if it contains dynamic members. Get the + // static portion of the struct first, which will have length slots. + // Then get the dynamic sizes for the dynamic members. + .Case([&](cudaq::cc::StructType t) -> Value { + if (cudaq::cc::isDynamicType(t)) { + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + Value strSize = + builder.create(loc, i64Ty, packedTy); + for (auto iter : llvm::enumerate(t.getMembers())) { + std::int32_t i = iter.index(); + auto m = iter.value(); + if (cudaq::cc::isDynamicType(m)) { + auto hostPtrTy = cast(arg.getType()); + auto hostStrTy = + cast(hostPtrTy.getElementType()); + auto pm = cudaq::cc::PointerType::get(hostStrTy.getMember(i)); + auto ai = builder.create( + loc, pm, arg, ArrayRef{i}); + strSize = descendThroughDynamicType(loc, builder, module, m, + strSize, ai, tmp); + } + } + return strSize; + } + return builder.create(loc, i64Ty, t); + }) + .Default([&](Type t) -> Value { + return builder.create(loc, i64Ty, t); + }); + return builder.create(loc, tySize, addend); +} - /// This computes a vector's size and handles recursive vector types. 
This - /// first value returned is the size of the top level (outermost) vector in - /// bytes. The second value is the recursive size of all the vectors within - /// the outer vector. - std::pair - computeRecursiveVectorSize(Location loc, OpBuilder &builder, Value hostArg, - cudaq::cc::PointerType hostVecTy, - cudaq::cc::SpanLikeType stdvecTy) { - Value topLevelSize; - Value recursiveSize; - auto eleTy = stdvecTy.getElementType(); - if (auto sTy = dyn_cast(eleTy)) { - // This is the recursive case. vector>. Convert size of - // vectors to i64s. - topLevelSize = computeHostVectorLengthInBytes( - loc, builder, hostArg, stdvecTy.getElementType(), hostVecTy); - auto nested = fetchHostVectorFront(loc, builder, hostArg, hostVecTy); - auto tmp = builder.create(loc, builder.getI64Type()); - builder.create(loc, topLevelSize, tmp); - // Convert bytes to units of i64. (Divide by 8) - auto topLevelCount = - convertLengthBytesToLengthI64(loc, builder, topLevelSize); - // Now walk the vectors recursively. - auto topLevelIndex = builder.create( - loc, builder.getI64Type(), topLevelCount, - cudaq::cc::CastOpMode::Unsigned); +static Value +genSizeOfDynamicMessageBuffer(Location loc, OpBuilder &builder, ModuleOp module, + cudaq::cc::StructType structTy, + ArrayRef> zippy, + Value tmp) { + auto i64Ty = builder.getI64Type(); + Value initSize = builder.create(loc, i64Ty, structTy); + for (auto [_, a, t] : zippy) + if (cudaq::cc::isDynamicType(t)) + initSize = + descendThroughDynamicType(loc, builder, module, t, initSize, a, tmp); + return initSize; +} + +static Value populateStringAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, Value addendum, + ModuleOp module) { + Value size = genStringLength(loc, builder, host, module); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto fromPtr = builder.create(loc, ptrI8Ty, host); + StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) + ? cudaq::runtime::getPauliWordData + : cudaq::runtime::bindingGetStringData; + auto dataPtr = builder.create(loc, ptrI8Ty, helperName, + ValueRange{fromPtr}); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, dataPtr.getResult(0), size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} + +// Simple case when the vector data is known to not hold dynamic data. 
+static Value populateVectorAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, + Value addendum) { + Value size = genVectorSize(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} + +static Value populateDynamicAddendum(Location loc, OpBuilder &builder, + ModuleOp module, Type devArgTy, Value host, + Value sizeSlot, Value addendum, + Value addendumScratch) { + if (isa(devArgTy)) + return populateStringAddendum(loc, builder, host, sizeSlot, addendum, + module); + if (auto vecTy = dyn_cast(devArgTy)) { + auto eleTy = vecTy.getElementType(); + if (cudaq::cc::isDynamicType(eleTy)) { + // Recursive case. Visit each dynamic element, copying it. + Value size = genVectorSize(loc, builder, host); + auto [bytes, count] = genByteSizeAndElementCount( + loc, builder, module, eleTy, size, host, devArgTy); + size = bytes; + builder.create(loc, size, sizeSlot); + + // Convert from bytes to vector length in elements. + // Compute new addendum start. + auto addrTy = getByteAddressableType(builder); + auto castEnd = builder.create(loc, addrTy, addendum); + Value newAddendum = builder.create( + loc, addendum.getType(), castEnd, + ArrayRef{size}); + builder.create(loc, newAddendum, addendumScratch); + Type dataTy = cudaq::opt::factory::genArgumentBufferType(eleTy); + auto arrDataTy = cudaq::cc::ArrayType::get(dataTy); + auto sizeBlockTy = cudaq::cc::PointerType::get(arrDataTy); + auto ptrDataTy = cudaq::cc::PointerType::get(dataTy); + + // In the recursive case, the next block of addendum is a vector of + // elements which are either sizes or contain sizes. The sizes are i64 + // and expressed in bytes. Each size will be the size of the span of the + // element (or its subfields) at that offset. + auto sizeBlock = + builder.create(loc, sizeBlockTy, addendum); + auto hostEleTy = + cast(host.getType()).getElementType(); + auto ptrPtrBlockTy = cudaq::cc::PointerType::get( + cast(hostEleTy).getMember(0)); + + // The host argument is a std::vector, so we want to get the address of + // "front" out of the vector (the first pointer in the triple) and step + // over the contiguous range of vectors in the host block. The vector of + // vectors forms a ragged array structure in host memory. + auto hostBeginPtrRef = builder.create( + loc, ptrPtrBlockTy, host, ArrayRef{0}); + auto hostBegin = builder.create(loc, hostBeginPtrRef); + auto hostBeginEleTy = cast(hostBegin.getType()); + auto hostBlockTy = cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(hostBeginEleTy.getElementType())); + auto hostBlock = + builder.create(loc, hostBlockTy, hostBegin); + + // Loop over each vector element in the vector (recursively). 
    cudaq::opt::factory::createInvariantLoop(
-        builder, loc, topLevelIndex,
+        builder, loc, count,
        [&](OpBuilder &builder, Location loc, Region &, Block &block) {
          Value i = block.getArgument(0);
-          auto sub = builder.create(loc, hostVecTy,
-                                    nested, i);
-          auto p =
-              computeRecursiveVectorSize(loc, builder, sub, hostVecTy, sTy);
-          auto subSz = builder.create(loc, tmp);
-          auto sum = builder.create(loc, p.second, subSz);
-          builder.create(loc, sum, tmp);
+          Value addm =
+              builder.create(loc, addendumScratch);
+          auto subSlot = builder.create(
+              loc, ptrDataTy, sizeBlock,
+              ArrayRef{i});
+          auto subHost = builder.create(
+              loc, hostBeginEleTy, hostBlock,
+              ArrayRef{i});
+          Value newAddm =
+              populateDynamicAddendum(loc, builder, module, eleTy, subHost,
+                                      subSlot, addm, addendumScratch);
+          builder.create(loc, newAddm, addendumScratch);
        });
-    recursiveSize = builder.create(loc, tmp);
+    return builder.create(loc, addendumScratch);
+  }
+  return populateVectorAddendum(loc, builder, host, sizeSlot, addendum);
+  }
+  auto devStrTy = cast<cudaq::cc::StructType>(devArgTy);
+  auto hostStrTy = cast<cudaq::cc::StructType>(
+      cast<cudaq::cc::PointerType>(sizeSlot.getType()).getElementType());
+  assert(devStrTy.getNumMembers() == hostStrTy.getNumMembers());
+  for (auto iter : llvm::enumerate(devStrTy.getMembers())) {
+    std::int32_t iterIdx = iter.index();
+    auto hostPtrTy = cast<cudaq::cc::PointerType>(host.getType());
+    auto hostMemTy = cast<cudaq::cc::StructType>(hostPtrTy.getElementType())
+                         .getMember(iterIdx);
+    auto val = builder.create(
+        loc, cudaq::cc::PointerType::get(hostMemTy), host,
+        ArrayRef{iterIdx});
+    Type iterTy = iter.value();
+    if (cudaq::cc::isDynamicType(iterTy)) {
+      Value fieldInSlot = builder.create(
+          loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot,
+          ArrayRef{iterIdx});
+      addendum =
+          populateDynamicAddendum(loc, builder, module, iterTy, val,
+                                  fieldInSlot, addendum, addendumScratch);
    } else {
-      // Non-recursive case. Just compute the size of the top-level vector.
-      topLevelSize = getVectorSize(loc, builder, hostVecTy, hostArg);
-      recursiveSize = topLevelSize;
+      Value fieldInSlot = builder.create(
+          loc, cudaq::cc::PointerType::get(iterTy), sizeSlot,
+          ArrayRef{iterIdx});
+      auto v = builder.create(loc, val);
+      builder.create(loc, v, fieldInSlot);
    }
-    return {topLevelSize, recursiveSize};
  }
+  return addendum;
+}
-  /// This computes a dynamic struct's size and handles recursive dynamic types.
-  /// The first value returned is the initial value of the top level
-  /// (outermost) struct to be saved in the buffer. More specifically, any
-  /// (recursive) member that is a vector is replaced by an i64 byte size. The
-  /// offset of the trailing data is, as always, implicit. The second value is
-  /// the recursive size of all the dynamic components within the outer struct.
- std::pair computeRecursiveDynamicStructSize( - Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, - Value arg, Value totalSize, cudaq::cc::StructType genTy) { - Value retval = builder.create(loc, genTy); - auto argTy = cast(arg.getType()); - for (auto iter : llvm::enumerate(structTy.getMembers())) { - auto memTy = iter.value(); - std::int32_t off = iter.index(); - auto structMemTy = - cast(argTy.getElementType()).getMember(off); - auto structMemPtrTy = cudaq::cc::PointerType::get(structMemTy); - auto memPtrVal = builder.create( - loc, structMemPtrTy, arg, ArrayRef{off}); - if (cudaq::cc::isDynamicType(memTy)) { - if (auto sTy = dyn_cast(memTy)) { - auto gTy = cast(structMemTy); - auto pr = computeRecursiveDynamicStructSize( - loc, builder, sTy, memPtrVal, totalSize, gTy); - retval = builder.create( - loc, retval.getType(), retval, pr.first, off); - totalSize = builder.create(loc, totalSize, pr.second); - continue; - } - auto memStdVecTy = cast(memTy); - Type eTy = memStdVecTy.getElementType(); - auto stlVecTy = cudaq::opt::factory::stlVectorType(eTy); - auto ptrMemTy = cudaq::cc::PointerType::get(stlVecTy); - auto pr = computeRecursiveVectorSize(loc, builder, memPtrVal, ptrMemTy, - memStdVecTy); - retval = builder.create( - loc, retval.getType(), retval, pr.second, off); - totalSize = builder.create(loc, totalSize, pr.first); - continue; - } - auto memVal = builder.create(loc, memPtrVal); - retval = builder.create(loc, retval.getType(), - retval, memVal, off); +static void +populateMessageBuffer(Location loc, OpBuilder &builder, ModuleOp module, + Value msgBufferBase, + ArrayRef> zippy, + Value addendum = {}, Value addendumScratch = {}) { + auto structTy = cast( + cast(msgBufferBase.getType()).getElementType()); + // Loop over all the arguments and populate the message buffer. + for (auto [idx, arg, devArgTy] : zippy) { + std::int32_t i = idx; + if (cudaq::cc::isDynamicType(devArgTy)) { + assert(addendum && "must have addendum to encode dynamic argument(s)"); + // Get the address of the slot to be filled. + auto memberTy = cast(structTy).getMember(i); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + auto slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{i}); + addendum = populateDynamicAddendum(loc, builder, module, devArgTy, arg, + slot, addendum, addendumScratch); + continue; + } + + // If the argument is a callable, skip it. + if (isa(devArgTy)) + continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(devArgTy); + strTy && strTy.isEmpty()) + continue; + + // Get the address of the slot to be filled. + auto memberTy = cast(structTy).getMember(i); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + Value slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{i}); + + // Argument is a packaged kernel. In this case, the argument is some + // unknown kernel that may be called. The packaged argument is coming + // from opaque C++ host code, so we need to identify what kernel it + // references and then pass its name as a span of characters to the + // launch kernel. + if (isa(devArgTy)) { + auto i64Ty = builder.getI64Type(); + auto kernKey = builder.create( + loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); + builder.create(loc, kernKey.getResult(0), slot); + continue; } - return {retval, totalSize}; + + // Just pass the raw pointer. The buffer is supposed to be pointer-free + // since it may be unpacked in a different address space. 
However, if this
+    // is a simulation and things are in the same address space, we pass the
+    // pointer for convenience.
+    if (isa(devArgTy))
+      arg = builder.create(loc, memberTy, arg);
+
+    if (isa(arg.getType()) &&
+        (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) {
+      slot = builder.create(
+          loc, cudaq::cc::PointerType::get(arg.getType()), slot);
+    }
+    builder.create(loc, arg, slot);
+  }
+}
+
+/// A kernel function that takes a quantum type argument (also known as a pure
+/// device kernel) cannot be called directly from C++ (classical) code. It must
+/// be called via other quantum code.
+static bool hasLegalType(FunctionType funTy) {
+  for (auto ty : funTy.getInputs())
+    if (quake::isQuantumType(ty))
+      return false;
+  for (auto ty : funTy.getResults())
+    if (quake::isQuantumType(ty))
+      return false;
+  return true;
+}
+
+static MutableArrayRef<BlockArgument>
+dropAnyHiddenArguments(MutableArrayRef<BlockArgument> args, FunctionType funcTy,
+                       bool hasThisPointer) {
+  const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy);
+  const unsigned count =
+      cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet);
+  if (count > 0 && args.size() >= count &&
+      std::all_of(args.begin(), args.begin() + count, [](auto i) {
+        return isa<cudaq::cc::PointerType>(i.getType());
+      }))
+    return args.drop_front(count);
+  return args;
+}
+
+static std::pair<bool, func::FuncOp>
+lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module,
+                         func::FuncOp funcOp) {
+  if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") ||
+      mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) {
+    // No host entry point needed.
+    return {false, func::FuncOp{}};
+  }
+  if (auto *decl = module.lookupSymbol(mangledEntryPointName))
+    if (auto func = dyn_cast<func::FuncOp>(decl)) {
+      func.eraseBody();
+      return {true, func};
+    }
+  funcOp.emitOpError("could not generate the host-side kernel function (" +
+                     mangledEntryPointName + ")");
+  return {true, func::FuncOp{}};
+}
+
+/// Generate code to initialize the std::vector<bool>, \p sret, from an
+/// initializer list with data at \p data and length \p size. Use the library
+/// helper routine. This function takes two !llvm.ptr arguments.
+static void genStdvecBoolFromInitList(Location loc, OpBuilder &builder,
+                                      Value sret, Value data, Value size) {
+  auto ptrTy = cudaq::cc::PointerType::get(builder.getContext());
+  auto castData = builder.create(loc, ptrTy, data);
+  auto castSret = builder.create(loc, ptrTy, sret);
+  builder.create(loc, std::nullopt,
+                 cudaq::stdvecBoolCtorFromInitList,
+                 ArrayRef{castSret, castData, size});
+}
+
+/// Generate a `std::vector<T>` (where `T != bool`) from an initializer list.
+/// This is done with the assumption that `std::vector<T>` is implemented as a
+/// triple of pointers. The original content of the vector is freed and the new
+/// content, which is already on the stack, is moved into the `std::vector<T>`.
+static void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret,
+                                   Value data, Value tSize, Value vecSize) {
+  auto i8Ty = builder.getI8Type();
+  auto stlVectorTy =
+      cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty));
+  auto ptrTy = cudaq::cc::PointerType::get(i8Ty);
+  auto castSret = builder.create(loc, stlVectorTy, sret);
+  auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy);
+  auto sret0 = builder.create(
+      loc, ptrPtrTy, castSret, SmallVector{0});
+  auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty);
+  auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty);
+  auto buffPtr0 = builder.create(loc, ptrTy, data);
+  builder.create(loc, buffPtr0, sret0);
+  auto sret1 = builder.create(
+      loc, ptrPtrTy, castSret, SmallVector{1});
+  Value byteLen = builder.create(loc, tSize, vecSize);
+  auto buffPtr = builder.create(loc, ptrArrTy, data);
+  auto endPtr = builder.create(
+      loc, ptrTy, buffPtr, SmallVector{byteLen});
+  builder.create(loc, endPtr, sret1);
+  auto sret2 = builder.create(
+      loc, ptrPtrTy, castSret, SmallVector{2});
+  builder.create(loc, endPtr, sret2);
+}
+
+// Alloca a pointer to a pointer and initialize it to nullptr.
+static Value createEmptyHeapTracker(Location loc, OpBuilder &builder) {
+  auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type());
+  auto result = builder.create(loc, ptrI8Ty);
+  auto zero = builder.create(loc, 0, 64);
+  auto null = builder.create(loc, ptrI8Ty, zero);
+  builder.create(loc, null, result);
+  return result;
+}
+
+// If there are temporaries, call the helper to free them.
+static void maybeFreeHeapAllocations(Location loc, OpBuilder &builder,
+                                     Value heapTracker) {
+  auto head = builder.create(loc, heapTracker);
+  auto zero = builder.create(loc, 0, 64);
+  auto headAsInt =
+      builder.create(loc, builder.getI64Type(), head);
+  auto cmp = builder.create(loc, arith::CmpIPredicate::ne,
+                            headAsInt, zero);
+  // If there are no `std::vector<bool>` to unpack, then the heapTracker will
+  // be set to `nullptr` and otherwise unused. That will allow the compiler to
+  // DCE this call after constant propagation.
+  builder.create(
+      loc, TypeRange{}, cmp,
+      [&](OpBuilder &builder, Location loc, Region &region) {
+        region.push_back(new Block());
+        auto &body = region.front();
+        OpBuilder::InsertionGuard guard(builder);
+        builder.setInsertionPointToStart(&body);
+        builder.create(loc, std::nullopt,
+                       cudaq::stdvecBoolFreeTemporaryLists,
+                       ArrayRef{head});
+        builder.create(loc);
+      });
+}
-
-  /// Given that \p arg is a SpanLikeType value, compute its extent size (the
-  /// number of elements in the outermost vector times `sizeof(int64_t)`) and
-  /// total recursive size (both values are in bytes). We add the extent size
-  /// into the message buffer field and increase the size of the addend by the
-  /// total recursive size.
-  std::pair insertVectorSizeAndIncrementExtraBytes(
-      Location loc, OpBuilder &builder, Value arg,
-      cudaq::cc::PointerType ptrInTy, cudaq::cc::SpanLikeType stdvecTy,
-      Value stVal, std::int32_t idx, Value extraBytes) {
-    auto [extentSize, recursiveSize] =
-        computeRecursiveVectorSize(loc, builder, arg, ptrInTy, stdvecTy);
-    stVal = builder.create(loc, stVal.getType(),
-                           stVal, extentSize, idx);
-    extraBytes = builder.create(loc, extraBytes, recursiveSize);
-    return {stVal, extraBytes};
+/// Fetch an argument from the comm buffer. Here, the argument is not dynamic,
+/// so it can be read as-is out of the buffer.
+static Value fetchInputValue(Location loc, OpBuilder &builder, Type devTy,
+                             Value ptr) {
+  assert(!cudaq::cc::isDynamicType(devTy) && "must not be a dynamic type");
+  if (isa(devTy)) {
+    // An indirect callable passes a key value which will be used to determine
+    // the kernel that is being called.
+    auto key = builder.create(loc, ptr);
+    return builder.create(loc, devTy, key);
   }

-  Value genComputeReturnOffset(Location loc, OpBuilder &builder,
-                               FunctionType funcTy,
-                               cudaq::cc::StructType msgStructTy) {
-    if (funcTy.getNumResults() == 0)
-      return builder.create(loc, NoResultOffset, 64);
-    std::int32_t numKernelArgs = funcTy.getNumInputs();
-    auto i64Ty = builder.getI64Type();
-    return builder.create(
-        loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs});
+  if (isa(devTy)) {
+    // A direct callable will have already been effectively inlined and this
+    // argument should not be referenced.
+    return builder.create(loc, devTy);
   }

-  /// Create a function that determines the return value offset in the message
-  /// buffer.
-  void genReturnOffsetFunction(Location loc, OpBuilder &builder,
-                               FunctionType devKernelTy,
-                               cudaq::cc::StructType msgStructTy,
-                               const std::string &classNameStr) {
-    auto *ctx = builder.getContext();
+  auto ptrDevTy = cudaq::cc::PointerType::get(devTy);
+  if (auto strTy = dyn_cast(devTy)) {
+    // Argument is a struct.
+    if (strTy.isEmpty())
+      return builder.create(loc, devTy);
+
+    // Cast to avoid conflicts between layout-compatible, distinct struct
+    // types.
+    auto structPtr = builder.create(loc, ptrDevTy, ptr);
+    return builder.create(loc, structPtr);
+  }
+
+  // Default case: the argument is passed as a value in place.
+  return builder.create(loc, ptr);
+}
+
+/// Helper routine to generate code to increment the trailing data pointer to
+/// the next block of data (if any).
+static Value incrementTrailingDataPointer(Location loc, OpBuilder &builder,
+                                          Value trailingData, Value bytes) {
+  auto i8Ty = builder.getI8Type();
+  auto bufferTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty));
+  auto buffPtr = builder.create(loc, bufferTy, trailingData);
+  auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty);
+  return builder.create(
+      loc, i8PtrTy, buffPtr, ArrayRef{bytes});
+}
+
+/// In the thunk, we need to unpack any `std::vector` objects encoded in the
+/// packet. Since these have dynamic size, they are encoded as trailing bytes
+/// by offset and size. The offset is implicit from the values of the
+/// arguments. All sizes are encoded as `int64_t`.
+///
+/// A vector of vector of ... T is encoded as an int64_t (length). At the
+/// offset of the level `i` vector will be a sequence of sizes for the level
+/// `i+1` vectors. For the leaf vector level, `n`, the blocks of data for each
+/// vector will be immediately following for each vector at level `n` for the
+/// branch of the tree being encoded.
+///
+/// For example, a variable defined and initialized as
+/// ```
+/// vector<vector<vector<char>>> example =
+///     {{{'a'}, {'b', 'c'}, {'z'}}, {{'d', 'e', 'f'}}};
+/// ```
+///
+/// and passed as an argument to a kernel will be encoded as the following
+/// block. The block will have a structure with the declared arguments
+/// followed by an addendum of variable data, where the vector data is
+/// encoded.
+///
+/// ```
+/// arguments: { ..., 1, ... }
+/// addendum: [[3; 1 2 1, a, b c, z] [1; 3, d e f]]
+/// ```
+static std::pair constructDynamicInputValue(Location loc,
+                                            OpBuilder &builder,
+                                            Type devTy, Value ptr,
+                                            Value trailingData) {
+  assert(cudaq::cc::isDynamicType(devTy) && "must be dynamic type");
+  // There are 2 cases.
+  // 1. The dynamic type is a std::span of any legal device argument type.
+  // 2. The dynamic type is a struct containing at least 1 std::span.
+  if (auto spanTy = dyn_cast(devTy)) {
+    // ptr: a pointer to the length of the block in bytes.
+    // trailingData: the block of data to decode.
+    auto eleTy = spanTy.getElementType();
    auto i64Ty = builder.getI64Type();
-    auto funcTy = FunctionType::get(ctx, {}, {i64Ty});
-    auto returnOffsetFunc = builder.create(
-        loc, classNameStr + ".returnOffset", funcTy);
-    OpBuilder::InsertionGuard guard(builder);
-    auto *entry = returnOffsetFunc.addEntryBlock();
-    builder.setInsertionPointToStart(entry);
-    auto result =
-        genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy);
-    builder.create(loc, result);
+    auto buffEleTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+
+    // Get the size of each element in the vector and compute the vector's
+    // logical length.
+    auto eleSize = builder.create(loc, i64Ty, buffEleTy);
+    Value bytes = builder.create(loc, ptr);
+    auto vecLength = builder.create(loc, bytes, eleSize);
+
+    if (cudaq::cc::isDynamicType(eleTy)) {
+      // The vector is recursively dynamic.
+      // Create a new block in which to place the stdvec/struct data in
+      // device-side format.
+      Value newVecData =
+          builder.create(loc, eleTy, vecLength);
+      // Compute new trailing data, skipping the current vector's data.
+      auto nextTrailingData =
+          incrementTrailingDataPointer(loc, builder, trailingData, bytes);
+
+      // For each element in the vector, convert it to device-side format and
+      // save the result in newVecData.
+      auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
+      auto packTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+      Type packedArrTy =
+          cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(packTy));
+      Type packedEleTy = cudaq::cc::PointerType::get(packTy);
+      auto arrPtr =
+          builder.create(loc, packedArrTy, trailingData);
+      auto trailingDataVar =
+          builder.create(loc, nextTrailingData.getType());
+      builder.create(loc, nextTrailingData,
+                     trailingDataVar);
+      cudaq::opt::factory::createInvariantLoop(
+          builder, loc, vecLength,
+          [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+            Value i = block.getArgument(0);
+            auto nextTrailingData =
+                builder.create(loc, trailingDataVar);
+            auto vecMemPtr = builder.create(
+                loc, packedEleTy, arrPtr,
+                ArrayRef{i});
+            auto r = constructDynamicInputValue(loc, builder, eleTy, vecMemPtr,
+                                                nextTrailingData);
+            auto newVecPtr = builder.create(
+                loc, elePtrTy, newVecData,
+                ArrayRef{i});
+            builder.create(loc, r.first, newVecPtr);
+            builder.create(loc, r.second, trailingDataVar);
+          });
+
+      // Create the new outer stdvec span as the result.
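+      // (For the `example` value in the comment above, this outermost span
+      // covers the two freshly built inner spans, while each leaf span
+      // points directly at its character payload in the trailing data.)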
+      Value stdvecResult = builder.create(
+          loc, spanTy, newVecData, vecLength);
+      nextTrailingData =
+          builder.create(loc, trailingDataVar);
+      return {stdvecResult, nextTrailingData};
+    }
+
+    // This vector has constant data, so just use the data in-place and
+    // construct the stdvec span with it.
+    auto castTrailingData = builder.create(
+        loc, cudaq::cc::PointerType::get(eleTy), trailingData);
+    Value stdvecResult = builder.create(
+        loc, spanTy, castTrailingData, vecLength);
+    auto nextTrailingData =
+        incrementTrailingDataPointer(loc, builder, trailingData, bytes);
+    return {stdvecResult, nextTrailingData};
+  }
+
+  // Argument must be a struct.
+  // The struct contains dynamic components. Extract them and build up the
+  // struct value to be passed as an argument.
+  // ptr: pointer to the first element of the struct or a vector length.
+  // trailingData: the block of data for the first dynamic type field.
+  auto strTy = cast(devTy);
+  auto ptrEleTy = cast(ptr.getType()).getElementType();
+  auto packedTy = cast(ptrEleTy);
+  Value result = builder.create(loc, strTy);
+  assert(strTy.getNumMembers() == packedTy.getNumMembers());
+  for (auto iter :
+       llvm::enumerate(llvm::zip(strTy.getMembers(), packedTy.getMembers()))) {
+    auto devMemTy = std::get<0>(iter.value());
+    std::int32_t off = iter.index();
+    auto packedMemTy = std::get<1>(iter.value());
+    auto dataPtr = builder.create(
+        loc, cudaq::cc::PointerType::get(packedMemTy), ptr,
+        ArrayRef{off});
+    if (cudaq::cc::isDynamicType(devMemTy)) {
+      auto r = constructDynamicInputValue(loc, builder, devMemTy, dataPtr,
+                                          trailingData);
+      result = builder.create(loc, strTy, result,
+                              r.first, off);
+      trailingData = r.second;
+      continue;
+    }
+    auto val = fetchInputValue(loc, builder, devMemTy, dataPtr);
+    result =
+        builder.create(loc, strTy, result, val, off);
  }
+  return {result, trailingData};
+}
+
+/// Translate the buffer data to a sequence of arguments suitable for the
+/// actual kernel call.
+///
+/// \param inTy The actual expected type of the argument.
+/// \param structTy The modified buffer type over all the arguments at the
+/// current level.
+static std::pair
+processInputValue(Location loc, OpBuilder &builder, Value trailingData,
+                  Value ptrPackedStruct, Type inTy, std::int32_t off,
+                  cudaq::cc::StructType packedStructTy) {
+  auto packedPtr = builder.create(
+      loc, cudaq::cc::PointerType::get(packedStructTy.getMember(off)),
+      ptrPackedStruct, ArrayRef{off});
+  if (cudaq::cc::isDynamicType(inTy))
+    return constructDynamicInputValue(loc, builder, inTy, packedPtr,
+                                      trailingData);
+  auto val = fetchInputValue(loc, builder, inTy, packedPtr);
+  return {val, trailingData};
+}
+
+/// This pass adds a `<kernel name>.thunk` function and a rewritten C++ host
+/// side (mangled) stub to the code for every entry-point kernel in the module.
+/// It may also generate a `<kernel name>.argsCreator` function. Finally, it
+/// creates registration hooks for the CUDA-Q runtime to be able to find the
+/// kernel by name and, as appropriate, the `<kernel name>.argsCreator`
+/// function.
+namespace {
+class GenerateKernelExecution
+    : public cudaq::opt::impl::GenerateKernelExecutionBase<
+          GenerateKernelExecution> {
+public:
+  using GenerateKernelExecutionBase::GenerateKernelExecutionBase;

  /// Creates a function that can take a block of pointers to argument values
  /// and using the compiler's knowledge of a kernel encodes those argument
@@ -340,6 +1206,7 @@ class GenerateKernelExecution
  /// buffer. (Message buffers are at least the size of \p structTy but may be
  /// extended.)
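  ///
  /// A hypothetical host-side use of the generated function (the names here
  /// are illustrative only, not emitted symbols):
  /// ```
  /// void *rawArgs[] = {&arg0, &arg1};
  /// void *msgBuffer = nullptr;
  /// std::uint64_t size = kernel_argsCreator(rawArgs, &msgBuffer);
  /// // ... pass msgBuffer to the launch machinery, which must free it ...
  /// ```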
func::FuncOp genKernelArgsCreatorFunction(Location loc, OpBuilder &builder, + ModuleOp module, FunctionType devKernelTy, cudaq::cc::StructType msgStructTy, const std::string &classNameStr, @@ -348,14 +1215,18 @@ class GenerateKernelExecution auto *ctx = builder.getContext(); Type i8Ty = builder.getI8Type(); Type ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto ptrPtrType = cudaq::cc::PointerType::get(ptrI8Ty); + auto ptrPtrType = getPointerToPointerType(builder); Type i64Ty = builder.getI64Type(); auto structPtrTy = cudaq::cc::PointerType::get(msgStructTy); - auto getHostArgType = [&](unsigned idx) { - bool hasSRet = cudaq::opt::factory::hasHiddenSRet(hostFuncTy); - unsigned count = cudaq::cc::numberOfHiddenArgs(hasThisPtr, hasSRet); - return hostFuncTy.getInput(count + idx); - }; + auto passedDevArgTys = devKernelTy.getInputs().drop_front(startingArgIdx); + + SmallVector passedHostArgTys; + for (auto ty : passedDevArgTys) { + Type hostTy = cudaq::opt::factory::convertToHostSideType(ty, module); + if (cudaq::cc::isDynamicType(ty)) + hostTy = cudaq::cc::PointerType::get(hostTy); + passedHostArgTys.push_back(hostTy); + } // Create the function that we'll fill. auto funcType = FunctionType::get(ctx, {ptrPtrType, ptrPtrType}, {i64Ty}); @@ -365,365 +1236,83 @@ class GenerateKernelExecution auto *entry = argsCreatorFunc.addEntryBlock(); builder.setInsertionPointToStart(entry); - // Get the original function args - auto kernelArgTypes = devKernelTy.getInputs().drop_front(startingArgIdx); + // Convert all the arguments passed in the array of void* to appear as if + // they had been naturally passed as C++ arguments. + // This means, casting to the correct type (host-side) and removing the + // outer pointer by a dereference. Each argument must be a valid reference + // at this point, so if the dereference fails (say it is a nullptr), it is a + // bug in the code that is calling this argsCreator. - // Init the struct - Value stVal = builder.create(loc, msgStructTy); - - // Get the variadic void* args - auto variadicArgs = builder.create( + // Get the array of void* args. + auto argsArray = builder.create( loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(ptrI8Ty)), entry->getArgument(0)); - // Initialize the counter for extra size. - Value zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - - // Process all the arguments for the original call by looping over the - // kernel's arguments. - bool hasTrailingData = false; - DenseMap replacementArgs; - for (auto kaIter : llvm::enumerate(kernelArgTypes)) { - std::int32_t idx = kaIter.index(); - - // The current cudaq kernel arg and message buffer element type. - Type currArgTy = kaIter.value(); - Type currEleTy = msgStructTy.getMember(idx); - - // Skip any elements that are callables or empty structures. - if (isa(currEleTy)) - continue; - if (auto strTy = dyn_cast(currEleTy)) - if (strTy.isEmpty()) - continue; - - // Get the pointer to the argument from out of the block of pointers, - // which are the variadic args. - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - SmallVector{idx}); - Value argPtr = builder.create(loc, ptrI8Ty, argPtrPtr); - - if (auto stdvecTy = dyn_cast(currArgTy)) { - // If this is a vector argument, then we will add data to the message - // buffer's addendum (unless the vector is length 0). 
- auto ptrInTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType())); - - Value arg = builder.create(loc, ptrInTy, argPtr); - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value temp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{temp, arg}); - replacementArgs[idx] = temp; - arg = temp; - } - - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } + // Loop over the array and cast the void* to the host-side type. + SmallVector pseudoArgs; + for (auto iter : llvm::enumerate(passedHostArgTys)) { + std::int32_t i = iter.index(); + auto parg = builder.create( + loc, ptrPtrType, argsArray, ArrayRef{i}); + Type ty = iter.value(); + // parg is a pointer to a pointer as it is an element of an array of + // pointers. Always dereference the first layer here. + Value deref = builder.create(loc, parg); + if (!isa(ty)) + ty = cudaq::cc::PointerType::get(ty); + pseudoArgs.push_back(builder.create(loc, ty, deref)); + } - if (auto strTy = dyn_cast(currArgTy)) { - Value v = argPtr; - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - v = builder.create( - loc, cudaq::cc::PointerType::get(currEleTy), v); - v = builder.create(loc, v); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); - continue; - } - auto genTy = cast(currEleTy); - Value zero = builder.create(loc, 0, 64); - Type hostArgTy = getHostArgType(idx); - v = builder.create(loc, hostArgTy, v); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, v, zero, genTy); - stVal = builder.create(loc, stVal.getType(), - stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; - } - if (auto ptrTy = dyn_cast(currEleTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just pass - // the pointer. We can do that in this case because the synthesis step - // (which will receive the argument data) is assumed to run in the - // same memory space. - argPtr = builder.create(loc, currEleTy, argPtr); - stVal = builder.create(loc, stVal.getType(), - stVal, argPtr, idx); - } - continue; - } + // Zip the arguments with the device side argument types. Recall that some + // of the (left-most) arguments may have been dropped on the floor. + const bool hasDynamicSignature = isDynamicSignature(devKernelTy); + Value heapTracker = createEmptyHeapTracker(loc, builder); + auto zippy = zipArgumentsWithDeviceTypes( + loc, builder, module, pseudoArgs, passedDevArgTys, heapTracker); + auto sizeScratch = builder.create(loc, i64Ty); + auto messageBufferSize = [&]() -> Value { + if (hasDynamicSignature) + return genSizeOfDynamicMessageBuffer(loc, builder, module, msgStructTy, + zippy, sizeScratch); + return builder.create(loc, i64Ty, msgStructTy); + }(); - // cast to the struct element type, void* -> TYPE * - argPtr = builder.create( - loc, cudaq::cc::PointerType::get(currEleTy), argPtr); - Value loadedVal = - builder.create(loc, currEleTy, argPtr); - stVal = builder.create(loc, stVal.getType(), - stVal, loadedVal, idx); + // Allocate the message buffer on the heap. It must outlive this call. 
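+    // (Ownership of this allocation transfers to the caller: the buffer's
+    // address is stored through the second function argument below, and the
+    // caller is expected to free it once the launch has consumed it.)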
+ auto buff = builder.create(loc, ptrI8Ty, "malloc", + ValueRange(messageBufferSize)); + Value rawMessageBuffer = buff.getResult(0); + Value msgBufferPrefix = + builder.create(loc, structPtrTy, rawMessageBuffer); + + // Populate the message buffer with the pointer-free argument values. + if (hasDynamicSignature) { + auto addendumScratch = builder.create(loc, ptrI8Ty); + Value prefixSize = + builder.create(loc, i64Ty, msgStructTy); + auto arrMessageBuffer = builder.create( + loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)), + rawMessageBuffer); + // Compute the position of the addendum. + Value addendumPtr = builder.create( + loc, ptrI8Ty, arrMessageBuffer, + ArrayRef{prefixSize}); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy, + addendumPtr, addendumScratch); + } else { + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy); } - // Compute the struct size - Value structSize = - builder.create(loc, i64Ty, msgStructTy); - - // Here we do have vector args - Value extendedStructSize = - hasTrailingData - ? builder.create(loc, structSize, extraBytes) - : structSize; - // If no vector args, handle this simple case and drop out - Value buff = builder - .create(loc, ptrI8Ty, "malloc", - ValueRange(extendedStructSize)) - .getResult(0); - - Value casted = builder.create(loc, structPtrTy, buff); - builder.create(loc, stVal, casted); - if (hasTrailingData) { - auto arrTy = cudaq::cc::ArrayType::get(i8Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrTy); - auto cast1 = builder.create(loc, ptrArrTy, buff); - Value vecToBuffer = builder.create( - loc, ptrI8Ty, cast1, SmallVector{structSize}); - for (auto iter : llvm::enumerate(msgStructTy.getMembers())) { - std::int32_t idx = iter.index(); - if (idx == static_cast(kernelArgTypes.size())) - break; - // Get the corresponding cudaq kernel arg type - auto currArgTy = kernelArgTypes[idx]; - if (auto stdvecTy = dyn_cast(currArgTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - ArrayRef{idx}); - auto ptrInTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType())); - Value arg = - builder.create(loc, ptrI8Ty, argPtrPtr); - arg = builder.create(loc, ptrInTy, arg); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - assert(replacementArgs.count(idx) && "must be in map"); - auto arg = replacementArgs[idx]; - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = builder.create(loc, heapPtr); - auto i8Ty = builder.getI8Type(); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - } else if (auto strTy = dyn_cast(currArgTy)) { - if (cudaq::cc::isDynamicType(strTy)) { - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - ArrayRef{idx}); - Value arg = - builder.create(loc, ptrI8Ty, argPtrPtr); - Type hostArgTy = getHostArgType(idx); - arg = builder.create(loc, hostArgTy, arg); - auto structPtrArrTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(msgStructTy)); - auto temp = - builder.create(loc, structPtrArrTy, buff); - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); - } - 
} - } - } - builder.create(loc, buff, entry->getArgument(1)); - builder.create(loc, ValueRange{extendedStructSize}); - return argsCreatorFunc; - } + maybeFreeHeapAllocations(loc, builder, heapTracker); - /// In the thunk, we need to unpack any `std::vector` objects encoded in the - /// packet. Since these have dynamic size, they are encoded as trailing bytes - /// by offset and size. The offset is implicit from the values of the - /// arguments. All sizes are encoded as `int64_t`. - /// - /// A vector of vector of ... T is encoded as a int64_t (length). At the - /// offset of the level `i` vector will be a sequence of sizes for the level - /// `i+1` vectors. For the leaf vector level, `n`, the blocks of data for each - /// vector will be immediately following for each vector at level `n` for the - /// branch of the tree being encoded. - /// - /// For example, a variable defined and initialized as - /// ``` - /// vector>> example = - /// {{{'a'}, {'b', 'c'}, {'z'}}, {{'d' 'e', 'f'}}}; - /// ``` - /// - /// and passed as an argument to a kernel will be encoded as the following - /// block. The block will have a structure with the declared arguments - /// followed by an addendum of variable data, where the vector data is - /// encoded. - /// - /// ``` - /// arguments: { ..., 1, ... } - /// addendum: [[3; 1 2 1, a, b c, z] [1; 3, d e f]] - /// ``` - std::pair unpackStdVector(Location loc, OpBuilder &builder, - cudaq::cc::SpanLikeType stdvecTy, - Value vecSize, Value trailingData) { - // Convert the pointer-free std::vector to a span structure to be - // passed. A span structure is a pointer and a size (in element - // units). Note that this structure may be recursive. - auto i8Ty = builder.getI8Type(); - auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto bytesTy = cudaq::cc::PointerType::get(arrI8Ty); - Type eleTy = stdvecTy.getElementType(); - auto innerStdvecTy = dyn_cast(eleTy); - std::size_t eleSize = - innerStdvecTy ? /*(i64Type/8)*/ 8 : dataLayout->getTypeSize(eleTy); - auto eleSizeVal = [&]() -> Value { - if (eleSize) - return builder.create(loc, eleSize, 64); - assert(isa(eleTy) || - (isa(eleTy) && - !cast(eleTy).isUnknownSize())); - auto i64Ty = builder.getI64Type(); - return builder.create(loc, i64Ty, eleTy); - }(); - auto vecLength = builder.create(loc, vecSize, eleSizeVal); - if (innerStdvecTy) { - // Recursive case: std::vector> - // TODO: Uses stack allocation, however it may be better to use heap - // allocation. It's not clear the QPU has heap memory allocation. If this - // uses heap allocation, then the thunk must free that memory *after* the - // kernel proper returns. - auto vecTmp = builder.create(loc, eleTy, vecLength); - auto currentEnd = builder.create(loc, ptrI8Ty); - auto i64Ty = builder.getI64Type(); - auto arrI64Ty = cudaq::cc::ArrayType::get(i64Ty); - auto arrTy = cudaq::cc::PointerType::get(arrI64Ty); - auto innerVec = - builder.create(loc, arrTy, trailingData); - auto trailingBytes = - builder.create(loc, bytesTy, trailingData); - trailingData = builder.create( - loc, ptrI8Ty, trailingBytes, vecSize); - builder.create(loc, trailingData, currentEnd); - // Loop over each subvector in the vector and recursively unpack it into - // the vecTmp variable. Leaf vectors do not need a fresh variable. This - // effectively translates all the size/offset information for all the - // subvectors into temps. 
- Value vecLengthIndex = builder.create( - loc, builder.getI64Type(), vecLength, - cudaq::cc::CastOpMode::Unsigned); - cudaq::opt::factory::createInvariantLoop( - builder, loc, vecLengthIndex, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - auto innerPtr = builder.create( - loc, cudaq::cc::PointerType::get(i64Ty), innerVec, - SmallVector{i}); - Value innerVecSize = - builder.create(loc, innerPtr); - Value tmp = builder.create(loc, currentEnd); - auto unpackPair = - unpackStdVector(loc, builder, innerStdvecTy, innerVecSize, tmp); - auto ptrInnerTy = cudaq::cc::PointerType::get(innerStdvecTy); - auto subVecPtr = builder.create( - loc, ptrInnerTy, vecTmp, - SmallVector{i}); - builder.create(loc, unpackPair.first, - subVecPtr); - builder.create(loc, unpackPair.second, - currentEnd); - }); - auto coerceResult = builder.create( - loc, cudaq::cc::PointerType::get(stdvecTy), vecTmp); - trailingData = builder.create(loc, currentEnd); - Value result = builder.create( - loc, stdvecTy, coerceResult, vecLength); - return {result, trailingData}; - } - // Must divide by byte, 8 bits. - // The data is at trailingData and is valid for vecLength of eleTy. - auto castData = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), trailingData); - Value stdVecResult = builder.create( - loc, stdvecTy, castData, vecLength); - auto arrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)); - Value casted = builder.create(loc, arrTy, trailingData); - trailingData = - builder.create(loc, ptrI8Ty, casted, vecSize); - return {stdVecResult, trailingData}; - } + // Return the message buffer and its size in bytes. + builder.create(loc, rawMessageBuffer, + entry->getArgument(1)); + builder.create(loc, ValueRange{messageBufferSize}); - /// Translate the buffer data to a sequence of arguments suitable to the - /// actual kernel call. - /// - /// \param inTy The actual expected type of the argument. - /// \param structTy The modified buffer type over all the arguments at the - /// current level. - std::pair processInputValue(Location loc, OpBuilder &builder, - Value trailingData, Value val, - Type inTy, std::int64_t off, - cudaq::cc::StructType structTy) { - if (isa(inTy)) { - auto i64Ty = builder.getI64Type(); - auto key = - builder.create(loc, i64Ty, val, off); - return {builder.create(loc, inTy, key), trailingData}; - } - if (isa(inTy)) - return {builder.create(loc, inTy), trailingData}; - if (auto stdVecTy = dyn_cast(inTy)) { - Value vecSize = builder.create( - loc, builder.getI64Type(), val, off); - return unpackStdVector(loc, builder, stdVecTy, vecSize, trailingData); - } - if (auto strTy = dyn_cast(inTy)) { - if (!cudaq::cc::isDynamicType(strTy)) { - if (strTy.isEmpty()) - return {builder.create(loc, inTy), trailingData}; - return {builder.create(loc, inTy, val, off), - trailingData}; - } - // The struct contains dynamic components. Extract them and build up the - // struct value to be passed as an argument. - Type buffMemTy = structTy.getMember(off); - Value strVal = builder.create(loc, inTy); - Value subVal = - builder.create(loc, buffMemTy, val, off); - // Convert the argument type, strTy, to a buffer type. 
- auto memberArgTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - for (auto iter : llvm::enumerate(strTy.getMembers())) { - auto [a, t] = - processInputValue(loc, builder, trailingData, subVal, iter.value(), - iter.index(), memberArgTy); - trailingData = t; - strVal = builder.create(loc, inTy, strVal, a, - iter.index()); - } - return {strVal, trailingData}; - } - return {builder.create(loc, inTy, val, off), - trailingData}; + // Note: the .argsCreator will have allocated space for a static result in + // the message buffer. If the kernel returns a dynamic result, the launch + // kernel code will have to properly return it in the appropriate context. + return argsCreatorFunc; } /// Generate the thunk function. This function is called by the library @@ -747,7 +1336,6 @@ class GenerateKernelExecution auto castOp = builder.create(loc, structPtrTy, thunkEntry->getArgument(0)); auto isClientServer = thunkEntry->getArgument(1); - Value val = builder.create(loc, castOp); auto i64Ty = builder.getI64Type(); // Compute the struct size without the trailing bytes, structSize. @@ -768,7 +1356,7 @@ class GenerateKernelExecution SmallVector args; const std::int32_t offset = funcTy.getNumInputs(); for (auto inp : llvm::enumerate(funcTy.getInputs())) { - auto [a, t] = processInputValue(loc, builder, trailingData, val, + auto [a, t] = processInputValue(loc, builder, trailingData, castOp, inp.value(), inp.index(), structTy); trailingData = t; args.push_back(a); @@ -846,438 +1434,76 @@ class GenerateKernelExecution return thunk; } - /// Generate code to initialize the std::vector, \p sret, from an - /// initializer list with data at \p data and length \p size. Use the library - /// helper routine. This function takes two !llvm.ptr arguments. - void genStdvecBoolFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value size) { - auto ptrTy = cudaq::cc::PointerType::get(builder.getContext()); - auto castData = builder.create(loc, ptrTy, data); - auto castSret = builder.create(loc, ptrTy, sret); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolCtorFromInitList, - ArrayRef{castSret, castData, size}); - } - - /// Generate a `std::vector` (where `T != bool`) from an initializer list. - /// This is done with the assumption that `std::vector` is implemented as a - /// triple of pointers. The original content of the vector is freed and the - /// new content, which is already on the stack, is moved into the - /// `std::vector`. 
- void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value tSize, Value vecSize) { - auto i8Ty = builder.getI8Type(); - auto stlVectorTy = - cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty)); - auto ptrTy = cudaq::cc::PointerType::get(i8Ty); - auto castSret = builder.create(loc, stlVectorTy, sret); - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); - auto sret0 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{0}); - auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); - auto buffPtr0 = builder.create(loc, ptrTy, data); - builder.create(loc, buffPtr0, sret0); - auto sret1 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{1}); - Value byteLen = builder.create(loc, tSize, vecSize); - auto buffPtr = builder.create(loc, ptrArrTy, data); - auto endPtr = builder.create( - loc, ptrTy, buffPtr, SmallVector{byteLen}); - builder.create(loc, endPtr, sret1); - auto sret2 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{2}); - builder.create(loc, endPtr, sret2); - } - - static MutableArrayRef - dropAnyHiddenArguments(MutableArrayRef args, - FunctionType funcTy, bool hasThisPointer) { - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); - const unsigned count = - cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet); - if (count > 0 && args.size() >= count && - std::all_of(args.begin(), args.begin() + count, [](auto i) { - return isa(i.getType()); - })) - return args.drop_front(count); - return args; - } - - // Return the vector's length, computed on the CPU side, in bytes. - Value computeHostVectorLengthInBytes(Location loc, OpBuilder &builder, - Value hostArg, Type eleTy, - cudaq::cc::PointerType hostVecTy) { - auto rawSize = getVectorSize(loc, builder, hostVecTy, hostArg); - if (isa(eleTy)) { - auto three = builder.create(loc, 3, 64); - return builder.create(loc, rawSize, three); - } - return rawSize; - } - - Value fetchHostVectorFront(Location loc, OpBuilder &builder, Value hostArg, - cudaq::cc::PointerType hostVecTy) { - auto inpStructTy = cast(hostVecTy.getElementType()); - auto ptrTtype = cudaq::cc::PointerType::get(inpStructTy.getMember(0)); - auto beginPtr = builder.create( - loc, ptrTtype, hostArg, SmallVector{0}); - auto ptrArrSTy = cudaq::opt::factory::getIndexedObjectType(inpStructTy); - auto vecPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrArrSTy), beginPtr); - return builder.create(loc, vecPtr); - } - - Value recursiveVectorDataCopy(Location loc, OpBuilder &builder, Value hostArg, - Value buffPtr, cudaq::cc::SpanLikeType stdvecTy, - cudaq::cc::PointerType hostVecTy) { - auto vecLen = computeHostVectorLengthInBytes(loc, builder, hostArg, - stdvecTy, hostVecTy); - auto nested = fetchHostVectorFront(loc, builder, hostArg, hostVecTy); - auto vecLogicalLen = convertLengthBytesToLengthI64(loc, builder, vecLen); - auto vecLenIndex = builder.create( - loc, builder.getI64Type(), vecLogicalLen, - cudaq::cc::CastOpMode::Unsigned); - auto buffPtrTy = cast(buffPtr.getType()); - auto tmp = builder.create(loc, buffPtrTy); - auto buffArrTy = cudaq::cc::ArrayType::get(buffPtrTy.getElementType()); - auto castPtr = builder.create( - loc, cudaq::cc::PointerType::get(buffArrTy), buffPtr); - auto newEnd = builder.create( - loc, buffPtrTy, castPtr, SmallVector{vecLen}); - builder.create(loc, newEnd, tmp); - auto i64Ty = builder.getI64Type(); - auto arrI64Ty = cudaq::cc::ArrayType::get(i64Ty); - auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); 
- auto ptrArrTy = cudaq::cc::PointerType::get(arrI64Ty); - auto vecBasePtr = builder.create(loc, ptrArrTy, buffPtr); - auto nestedArr = builder.create(loc, hostVecTy, nested); - auto hostArrVecTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(hostVecTy.getElementType())); - cudaq::opt::factory::createInvariantLoop( - builder, loc, vecLenIndex, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - auto currBuffPtr = builder.create( - loc, ptrI64Ty, vecBasePtr, ArrayRef{i}); - auto upCast = - builder.create(loc, hostArrVecTy, nestedArr); - auto hostSubVec = builder.create( - loc, hostVecTy, upCast, ArrayRef{i}); - Value buff = builder.create(loc, tmp); - // Compute and save the byte size. - auto vecSz = computeHostVectorLengthInBytes( - loc, builder, hostSubVec, stdvecTy.getElementType(), hostVecTy); - builder.create(loc, vecSz, currBuffPtr); - // Recursively copy vector data. - auto endBuff = encodeVectorData(loc, builder, vecSz, stdvecTy, - hostSubVec, buff, hostVecTy); - builder.create(loc, endBuff, tmp); - }); - return builder.create(loc, tmp); - } - - /// Recursively encode a `std::vector` into a buffer's addendum. The data is - /// read from \p hostArg. The data is \p bytes size long if this is a leaf - /// vector, otherwise the size is computed on-the-fly during the encoding of - /// the ragged array. - /// \return The new pointer to the end of the addendum block. - Value encodeVectorData(Location loc, OpBuilder &builder, Value bytes, - cudaq::cc::SpanLikeType stdvecTy, Value hostArg, - Value bufferAddendum, cudaq::cc::PointerType ptrInTy) { - auto eleTy = stdvecTy.getElementType(); - if (auto subVecTy = dyn_cast(eleTy)) - return recursiveVectorDataCopy(loc, builder, hostArg, bufferAddendum, - subVecTy, ptrInTy); - return copyVectorData(loc, builder, bytes, hostArg, bufferAddendum); - } - - /// Recursively encode a struct which has dynamically sized members (such as - /// vectors). The vector members are encoded as i64 sizes with the data - /// attached to the buffer addendum. - /// \return The new pointer to the end of the addendum block. - Value encodeDynamicStructData(Location loc, OpBuilder &builder, - cudaq::cc::StructType deviceTy, Value hostArg, - Value bufferArg, Value bufferAddendum) { - for (auto iter : llvm::enumerate(deviceTy.getMembers())) { - auto memTy = iter.value(); - if (auto vecTy = dyn_cast(memTy)) { - Type eTy = vecTy.getElementType(); - auto hostTy = cudaq::opt::factory::stlVectorType(eTy); - auto ptrHostTy = cudaq::cc::PointerType::get(hostTy); - auto ptrI64Ty = cudaq::cc::PointerType::get(builder.getI64Type()); - std::int32_t offset = iter.index(); - auto sizeAddr = builder.create( - loc, ptrI64Ty, bufferArg, - ArrayRef{0, 0, offset}); - auto size = builder.create(loc, sizeAddr); - auto vecAddr = builder.create( - loc, ptrHostTy, hostArg, - ArrayRef{offset}); - bufferAddendum = encodeVectorData(loc, builder, size, vecTy, vecAddr, - bufferAddendum, ptrHostTy); - } else if (auto strTy = dyn_cast(memTy)) { - if (cudaq::cc::isDynamicType(strTy)) { - auto ptrStrTy = cudaq::cc::PointerType::get(strTy); - std::int32_t idx = iter.index(); - auto strAddr = builder.create( - loc, ptrStrTy, bufferArg, - ArrayRef{idx}); - bufferAddendum = encodeDynamicStructData(loc, builder, strTy, strAddr, - bufferArg, bufferAddendum); - } - } else if (auto arrTy = dyn_cast(memTy)) { - // This is like vector type if the array has dynamic size. If it has a - // constant size, it is like a struct with n identical members. 
- TODO_loc(loc, "array type"); - } - } - return bufferAddendum; - } - - static std::pair - lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module, - func::FuncOp funcOp) { - if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") || - mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) { - // No host entry point needed. - return {false, func::FuncOp{}}; - } - if (auto *decl = module.lookupSymbol(mangledEntryPointName)) - if (auto func = dyn_cast(decl)) { - func.eraseBody(); - return {true, func}; - } - funcOp.emitOpError("could not generate the host-side kernel function (" + - mangledEntryPointName + ")"); - return {true, func::FuncOp{}}; - } - - /// Generate an all new entry point body, calling launchKernel in the runtime - /// library. Pass along the thunk, so the runtime can call the quantum - /// circuit. These entry points are `operator()` member functions in a class, - /// so account for the `this` argument here. - void genNewHostEntryPoint(Location loc, OpBuilder &builder, + /// Generate an all new entry point body, calling someLaunchKernel in + /// the runtime library. Pass along the thunk, so the runtime can call the + /// quantum circuit. These entry points may be `operator()` member functions + /// in a class, so account for the `this` argument here. + void genNewHostEntryPoint(Location loc, OpBuilder &builder, ModuleOp module, FunctionType devFuncTy, LLVM::GlobalOp kernelNameObj, func::FuncOp hostFunc, bool addThisPtr, cudaq::cc::StructType structTy, func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - std::int32_t offset = devFuncTy.getNumInputs(); + auto i8Ty = builder.getI8Type(); + auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); - Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); + const std::int32_t offset = devFuncTy.getNumInputs(); + Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(hostFuncEntryBlock); - auto i8Ty = builder.getI8Type(); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - Value temp; + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + SmallVector blockValues(blockArgs.size()); + std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); + const bool hasDynamicSignature = isDynamicSignature(devFuncTy); + Value heapTracker = createEmptyHeapTracker(loc, builder); + auto zippy = zipArgumentsWithDeviceTypes( + loc, builder, module, blockValues, devFuncTy.getInputs(), heapTracker); + auto sizeScratch = builder.create(loc, i64Ty); + auto messageBufferSize = [&]() -> Value { + if (hasDynamicSignature) + return genSizeOfDynamicMessageBuffer(loc, builder, module, structTy, + zippy, sizeScratch); + return builder.create(loc, i64Ty, structTy); + }(); + + Value msgBufferPrefix; Value castTemp; Value resultOffset; Value castLoadThunk; Value extendedStructSize; if (isCodegenPackedData(codegenKind)) { - Value stVal = builder.create(loc, structTy); - - // Process all the arguments for the original call, ignoring any hidden - // arguments (such as the `this` pointer). 
- auto zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - bool hasTrailingData = false; - SmallVector blockArgs{dropAnyHiddenArguments( - hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; - std::int32_t idx = 0; - SmallVector blockValues(blockArgs.size()); - std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); - for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; - ++iter, ++idx) { - Value arg = *iter; - Type inTy = arg.getType(); - Type quakeTy = devFuncTy.getInput(idx); - // If the argument is a callable, skip it. - if (isa(quakeTy)) - continue; - - // Argument is a packaged kernel. In this case, the argument is some - // unknown kernel that may be called. The packaged argument is coming - // from opaque C++ host code, so we need to identify what kernel it - // references and then pass its name as a span of characters to the - // launch kernel. - if (isa(quakeTy)) { - auto kernKey = builder.create( - loc, i64Ty, cudaq::runtime::getLinkableKernelKey, - ValueRange{arg}); - stVal = builder.create( - loc, stVal.getType(), stVal, kernKey.getResult(0), idx); - continue; - } - - // If the argument is an empty struct, skip it. - if (auto strTy = dyn_cast(quakeTy)) - if (strTy.isEmpty()) - continue; - - if (auto stdvecTy = dyn_cast(quakeTy)) { - // Per the CUDA-Q spec, an entry point kernel must take a `[const] - // std::vector` value argument. - // Should the spec stipulate that pure device kernels must pass by - // read-only reference, i.e., take `const std::vector &` arguments? - auto ptrInTy = cast(inTy); - // If this is a std::vector, unpack it. - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value tmp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ValueRange{tmp, arg}); - arg = blockValues[idx] = tmp; - } - // FIXME: call the `size` member function. For expediency, assume this - // is an std::vector and the size is the scaled delta between the - // first two pointers. Use the unscaled size for now. - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (!isa(arg.getType())) { - // If argument is not a pointer, then struct was promoted into a - // register. - auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto tmp = builder.create(loc, quakeTy); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), tmp); - if (cudaq::opt::factory::isX86_64(module)) { - builder.create(loc, arg, cast); - if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { - auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arrTy), tmp); - auto hiPtr = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, - cudaq::cc::ComputePtrArg{8}); - ++iter; - Value nextArg = *iter; - auto cast2 = builder.create( - loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); - builder.create(loc, nextArg, cast2); - } - } else { - builder.create(loc, arg, cast); - } - // Load the assembled (sub-)struct and insert into the buffer value. 
- Value v = builder.create(loc, tmp); - stVal = builder.create( - loc, stVal.getType(), stVal, v, idx); - continue; - } - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - Value v = builder.create(loc, arg); - stVal = builder.create( - loc, stVal.getType(), stVal, v, idx); - continue; - } - auto genTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - Value zero = builder.create(loc, 0, 64); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, arg, zero, genTy); - stVal = builder.create( - loc, stVal.getType(), stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; - } - if (auto ptrTy = dyn_cast(inTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just - // pass the pointer. We can do that in this case because the - // synthesis step (which will receive the argument data) is assumed - // to run in the same memory space. - Value argPtr = builder.create(loc, inTy, arg); - stVal = builder.create( - loc, stVal.getType(), stVal, argPtr, idx); - } - continue; - } - - stVal = builder.create(loc, stVal.getType(), - stVal, arg, idx); + auto rawMessageBuffer = + builder.create(loc, i8Ty, messageBufferSize); + msgBufferPrefix = + builder.create(loc, structPtrTy, rawMessageBuffer); + + if (hasDynamicSignature) { + auto addendumScratch = + builder.create(loc, ptrI8Ty); + Value prefixSize = + builder.create(loc, i64Ty, structTy); + Value addendumPtr = builder.create( + loc, ptrI8Ty, rawMessageBuffer, + ArrayRef{prefixSize}); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy, + addendumPtr, addendumScratch); + } else { + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy); } - // Compute the struct size without the trailing bytes, structSize, and - // with the trailing bytes, extendedStructSize. - Value structSize = - builder.create(loc, i64Ty, structTy); - extendedStructSize = - builder.create(loc, structSize, extraBytes); - - // Allocate our struct to save the argument to. - auto buff = - builder.create(loc, i8Ty, extendedStructSize); - - temp = builder.create(loc, structPtrTy, buff); - - // Store the arguments to the argument section. - builder.create(loc, stVal, temp); - - auto structPtrArrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); - temp = builder.create(loc, structPtrArrTy, buff); - - // Append the vector data to the end of the struct. - if (hasTrailingData) { - Value vecToBuffer = builder.create( - loc, ptrI8Ty, buff, SmallVector{structSize}); - // Ignore any hidden `this` argument. 
- for (auto inp : llvm::enumerate(blockValues)) { - Value arg = inp.value(); - Type inTy = arg.getType(); - std::int32_t idx = inp.index(); - Type quakeTy = devFuncTy.getInput(idx); - if (auto stdvecTy = dyn_cast(quakeTy)) { - auto bytes = builder.create(loc, i64Ty, - stVal, idx); - assert(stdvecTy == devFuncTy.getInput(idx)); - auto ptrInTy = cast(inTy); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = - builder.create(loc, heapPtr); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (cudaq::cc::isDynamicType(strTy)) - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); - } - } - } + maybeFreeHeapAllocations(loc, builder, heapTracker); + extendedStructSize = messageBufferSize; Value loadThunk = builder.create(loc, thunkTy, thunkFunc.getName()); castLoadThunk = builder.create(loc, ptrI8Ty, loadThunk); - castTemp = builder.create(loc, ptrI8Ty, temp); + castTemp = + builder.create(loc, ptrI8Ty, msgBufferPrefix); resultOffset = genComputeReturnOffset(loc, builder, devFuncTy, structTy); } @@ -1397,7 +1623,8 @@ class GenerateKernelExecution builder.setInsertionPointToEnd(elseBlock); // span was returned in the original buffer. Value mRes = builder.create( - loc, ptrResTy, temp, ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); builder.create(loc, endifBlock, ArrayRef{mRes}); builder.setInsertionPointToEnd(endifBlock); launchResult = endifBlock->getArgument(0); @@ -1454,7 +1681,8 @@ class GenerateKernelExecution if (resultVal) { // Static values. std::vector are necessarily sret, see below. auto resPtr = builder.create( - loc, ptrResTy, temp, ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); auto castResPtr = [&]() -> Value { if (castToTy == ptrResTy) @@ -1496,8 +1724,8 @@ class GenerateKernelExecution // type for the memcpy, so the device should return an (aggregate) // value of suitable size. auto resPtr = builder.create( - loc, ptrResTy, temp, - ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); auto castMsgBuff = builder.create(loc, ptrI8Ty, resPtr); Type eleTy = @@ -1516,19 +1744,6 @@ class GenerateKernelExecution builder.create(loc, results); } - /// A kernel function that takes a quantum type argument (also known as a pure - /// device kernel) cannot be called directly from C++ (classical) code. It - /// must be called via other quantum code. - bool hasLegalType(FunctionType funTy) { - for (auto ty : funTy.getInputs()) - if (quake::isQuantumType(ty)) - return false; - for (auto ty : funTy.getResults()) - if (quake::isQuantumType(ty)) - return false; - return true; - } - /// Generate a function to be executed at load-time which will register the /// kernel with the runtime. 
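  ///
  /// Conceptually (an illustrative sketch, not the emitted IR), the hook is
  /// equivalent to running the following from a global constructor:
  /// ```
  /// namespace {
  /// struct KernelRegistrar {
  ///   KernelRegistrar() { cudaqRegisterKernelName("<kernel name>"); }
  /// } kernelRegistrar;
  /// } // namespace
  /// ```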
LLVM::LLVMFuncOp registerKernelWithRuntimeForExecution( @@ -1649,6 +1864,10 @@ class GenerateKernelExecution irBuilder.loadIntrinsic(module, cudaq::stdvecBoolUnpackToInitList))) return module.emitError(std::string("could not load ") + cudaq::stdvecBoolUnpackToInitList); + if (failed(irBuilder.loadIntrinsic(module, + cudaq::stdvecBoolFreeTemporaryLists))) + return module.emitError(std::string("could not load ") + + cudaq::stdvecBoolFreeTemporaryLists); if (failed(irBuilder.loadIntrinsic(module, cudaq::llvmMemCopyIntrinsic))) return module.emitError(std::string("could not load ") + cudaq::llvmMemCopyIntrinsic); @@ -1656,6 +1875,10 @@ class GenerateKernelExecution return module.emitError("could not load __nvqpp_zeroDynamicResult"); if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_createDynamicResult"))) return module.emitError("could not load __nvqpp_createDynamicResult"); + if (failed( + irBuilder.loadIntrinsic(module, cudaq::runtime::getPauliWordSize))) + return module.emitError( + "could not load cudaq::pauli_word::_nvqpp_size or _nvqpp_data"); return success(); } @@ -1665,8 +1888,6 @@ class GenerateKernelExecution auto builder = OpBuilder::atBlockEnd(module.getBody()); auto mangledNameMap = module->getAttrOfType(cudaq::runtime::mangledNameMap); - DataLayoutAnalysis dla(module); // caches module's data layout information. - dataLayout = &dla.getAtOrAbove(module); std::error_code ec; llvm::ToolOutputFile out(outputFilename, ec, llvm::sys::fs::OF_None); if (ec) { @@ -1744,7 +1965,7 @@ class GenerateKernelExecution // Generate the argsCreator function used by synthesis. if (startingArgIdx == 0) { argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy, classNameStr, hostFuncTy, + loc, builder, module, funcTy, structTy, classNameStr, hostFuncTy, hasThisPtr); } else { // We are operating in a very special case where we want the @@ -1756,7 +1977,7 @@ class GenerateKernelExecution cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx); argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy_argsCreator, classNameStr, + loc, builder, module, funcTy, structTy_argsCreator, classNameStr, hostFuncTy, hasThisPtr); } } @@ -1764,8 +1985,8 @@ class GenerateKernelExecution // Generate a new mangled function on the host side to call the // callback function. if (hostEntryNeeded) - genNewHostEntryPoint(loc, builder, funcTy, kernelNameObj, hostFunc, - hasThisPtr, structTy, thunk); + genNewHostEntryPoint(loc, builder, module, funcTy, kernelNameObj, + hostFunc, hasThisPtr, structTy, thunk); // Generate a function at startup to register this kernel as having // been processed for kernel execution. @@ -1783,7 +2004,5 @@ class GenerateKernelExecution } out.keep(); } - - const DataLayout *dataLayout = nullptr; }; } // namespace diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp index 166f558275..82e6896c06 100644 --- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp +++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp @@ -122,15 +122,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, ATTR arrayAttr, MAKER makeElementValue) { auto *ctx = builder.getContext(); auto argTy = argument.getType(); - assert(isa(argTy) || - isa(argTy)); - ELETY eleTy = [&]() -> ELETY { - if (auto strTy = dyn_cast(argTy)) - return cast(strTy.getElementType()); - // Force cast this to ELETY. This will only happen for CharspanType. 
- return cast(cudaq::opt::factory::getCharType(ctx)); - }(); - auto strTy = cudaq::cc::StdvecType::get(ctx, eleTy); + assert(isa(argTy)); + auto strTy = cast(argTy); + auto eleTy = cast(strTy.getElementType()); builder.setInsertionPointToStart(argument.getOwner()); auto argLoc = argument.getLoc(); auto conArray = builder.create( @@ -572,7 +566,7 @@ class QuakeSynthesizer // If std::vector type, add it to the list of vector info. // These will be processed when we reach the buffer's appendix. - if (auto vecTy = dyn_cast(type)) { + if (auto vecTy = dyn_cast(type)) { auto eleTy = vecTy.getElementType(); if (!isa( eleTy)) { @@ -621,19 +615,6 @@ class QuakeSynthesizer continue; } - if (auto charSpanTy = dyn_cast(type)) { - const char *ptrToSizeInBuffer = - static_cast(args) + offset; - auto sizeFromBuffer = - *reinterpret_cast(ptrToSizeInBuffer); - std::size_t bytesInType = sizeof(char); - auto vectorSize = sizeFromBuffer / bytesInType; - stdVecInfo.emplace_back( - argNum, cudaq::opt::factory::getCharType(builder.getContext()), - vectorSize); - continue; - } - funcOp.emitOpError("We cannot synthesize argument(s) of this type."); signalPassFailure(); return; diff --git a/python/tests/kernel/test_observe_kernel.py b/python/tests/kernel/test_observe_kernel.py index 5bf9d5a812..24c63ba90a 100644 --- a/python/tests/kernel/test_observe_kernel.py +++ b/python/tests/kernel/test_observe_kernel.py @@ -302,8 +302,7 @@ def test_pack_args_pauli_list(): def generateRandomPauliStrings(numQubits, numPaulis): s = ['X', 'Y', 'Z', 'I'] return [ - ''.join([random.choice(s) - for i in range(numQubits)]) + ''.join([random.choice(s) for i in range(numQubits)]) for i in range(numPaulis) ] @@ -336,7 +335,8 @@ def gqeCirc2(N: int, thetas: list[float], paulis: list[cudaq.pauli_word]): ts = np.random.rand(len(pauliStings)) exp_val1 = cudaq.observe_async(gqeCirc1, obs, numQubits, list(ts), - pauliStings[0]).get().expectation() + cudaq.pauli_word( + pauliStings[0])).get().expectation() print('observe_async exp_val1', exp_val1) exp_val2 = cudaq.observe_async(gqeCirc2, obs, numQubits, list(ts), pauliStings).get().expectation() diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h index 3e410a07b6..46afd2fedc 100644 --- a/python/utils/OpaqueArguments.h +++ b/python/utils/OpaqueArguments.h @@ -101,7 +101,7 @@ inline py::args simplifiedValidateInputArguments(py::args &args) { arg = args[i].attr("tolist")(); } else if (py::isinstance(arg)) { - arg = cudaq::pauli_word(py::cast(arg)); + arg = py::cast(arg); } else if (py::isinstance(arg)) { py::list arg_list = py::cast(arg); const bool all_strings = [&]() { @@ -330,8 +330,7 @@ inline void packArgs(OpaqueArguments &argData, py::args args, addArgument(argData, arg.cast()); }) .Case([&](cudaq::cc::CharspanType ty) { - addArgument(argData, - cudaq::pauli_word(arg.cast().str())); + addArgument(argData, arg.cast().str()); }) .Case([&](cudaq::cc::PointerType ty) { if (isa(ty.getElementType())) { @@ -432,8 +431,7 @@ inline void packArgs(OpaqueArguments &argData, py::args args, .Case([&](cudaq::cc::CharspanType type) { genericVecAllocator.template operator()( [](py::handle element, int index, int elementIndex) { - auto pw = element.cast(); - return cudaq::pauli_word(pw.str()); + return element.cast().str(); }); return; }) diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp index 0de2589752..c310966a07 100644 --- a/runtime/common/ArgumentConversion.cpp +++ b/runtime/common/ArgumentConversion.cpp @@ -77,14 +77,16 @@ static Value 
genConstant(OpBuilder &builder, FloatType fltTy, long double *v) {
 
 static Value genConstant(OpBuilder &builder, const std::string &v,
                          ModuleOp substMod) {
   auto loc = builder.getUnknownLoc();
-  cudaq::IRBuilder irBuilder(builder);
-  auto cString = irBuilder.genCStringLiteralAppendNul(loc, substMod, v);
-  auto addr = builder.create<cudaq::cc::AddressOfOp>(
-      loc, cudaq::cc::PointerType::get(cString.getType()), cString.getName());
-  auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-  auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, addr);
+  auto *ctx = builder.getContext();
+  auto i8Ty = builder.getI8Type();
+  auto strLitTy = cudaq::cc::PointerType::get(
+      cudaq::cc::ArrayType::get(ctx, i8Ty, v.size() + 1));
+  auto strLit =
+      builder.create<cudaq::cc::CreateStringLiteralOp>(loc, strLitTy, v);
+  auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty);
+  auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, strLit);
   auto size = builder.create<arith::ConstantIntOp>(loc, v.size(), 64);
-  auto chSpanTy = cudaq::cc::CharspanType::get(builder.getContext());
+  auto chSpanTy = cudaq::cc::CharspanType::get(ctx);
   return builder.create<cudaq::cc::StdvecInitOp>(loc, chSpanTy, cast, size);
 }
@@ -218,6 +220,21 @@ Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod,
       .Default({});
 }
 
+// Get the size of \p eleTy on the host side in bytes.
+static std::size_t getHostSideElementSize(Type eleTy,
+                                          llvm::DataLayout &layout) {
+  if (isa<cudaq::cc::StdvecType>(eleTy))
+    return sizeof(std::vector<char>);
+  if (isa<cudaq::cc::CharspanType>(eleTy)) {
+    // A char span type is a std::string on the host side.
+    return sizeof(std::string);
+  }
+  // Note: we want the size on the host side, but `getDataSize()` returns the
+  // size on the device side. This is ok for now since they are the same for
+  // most types and the special cases are handled above.
+  return cudaq::opt::getDataSize(layout, eleTy);
+}
+
 Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
                   ModuleOp substMod, llvm::DataLayout &layout) {
   typedef const char *VectorType[3];
@@ -227,11 +244,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
     return {};
   auto eleTy = vecTy.getElementType();
   auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
-  auto eleSize = cudaq::opt::getDataSize(layout, eleTy);
-  if (isa<cudaq::cc::CharspanType>(eleTy)) {
-    // char span type (i.e. pauli word) is a `vector<char>`
-    eleSize = sizeof(VectorType);
-  }
+  auto eleSize = getHostSideElementSize(eleTy, layout);
   assert(eleSize && "element must have a size");
   auto loc = builder.getUnknownLoc();
diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp
index 10ecc3b914..ca84a43121 100644
--- a/runtime/cudaq/cudaq.cpp
+++ b/runtime/cudaq/cudaq.cpp
@@ -470,20 +470,44 @@ void __nvqpp_initializer_list_to_vector_bool(std::vector<bool> &result,
 /// `std::vector<bool>` overload. The conversion turns the `std::vector<bool>`
 /// into a mock vector structure that looks like a `std::vector<char>`. The
 /// calling routine must clean up the buffer allocated by this code.
-void __nvqpp_vector_bool_to_initializer_list(void *outData,
-                                             const std::vector<bool> &inVec) {
+/// This helper routine may only be called on the host side.
+void __nvqpp_vector_bool_to_initializer_list(
+    void *outData, const std::vector<bool> &inVec,
+    std::vector<char *> **allocations) {
   // The MockVector must be allocated by the caller.
   struct MockVector {
     char *start;
     char *end;
+    char *end2;
   };
   MockVector *mockVec = reinterpret_cast<MockVector *>(outData);
   auto outSize = inVec.size();
   // The buffer allocated here must be freed by the caller.
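// Editor's note: a minimal sketch of how generated host code is expected to
// drive this helper and its companion free routine below (local variable
// names here are illustrative, not taken from this patch):
//
//   std::vector<char *> *allocs = nullptr; // created lazily by the helper
//   MockVector tmp;                        // {start, end, end2} byte span
//   std::vector<bool> flags = {true, false, true};
//   __nvqpp_vector_bool_to_initializer_list(&tmp, flags, &allocs);
//   // ... tmp now refers to malloc'd bytes {1, 0, 1}; hand it to the thunk ...
//   if (allocs)
//     __nvqpp_vector_bool_free_temporary_initlists(allocs);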
-  mockVec->start = static_cast<char *>(malloc(outSize));
-  mockVec->end = mockVec->start + outSize;
+  if (!*allocations)
+    *allocations = new std::vector<char *>;
+  char *newData = static_cast<char *>(malloc(outSize));
+  (*allocations)->push_back(newData);
+  mockVec->start = newData;
+  mockVec->end2 = mockVec->end = newData + outSize;
   for (unsigned i = 0; i < outSize; ++i)
-    (mockVec->start)[i] = static_cast<char>(inVec[i]);
+    newData[i] = static_cast<char>(inVec[i]);
 }
+
+/// This helper routine deletes the vector that tracks all the temporaries that
+/// were created, as well as the temporaries themselves.
+/// This routine may only be called on the host side.
+void __nvqpp_vector_bool_free_temporary_initlists(
+    std::vector<char *> *allocations) {
+  for (auto *p : *allocations)
+    free(p);
+  delete allocations;
+}
+
+/// Quasi-portable string helpers for Python (non-C++ front ends). These
+/// library helper functions allow non-C++ front ends to remain portable with
+/// respect to the core layer. Since these helpers are built along with the
+/// bindings, there should not be a compatibility issue.
+const char *__nvqpp_getStringData(const std::string &s) { return s.data(); }
+std::uint64_t __nvqpp_getStringSize(const std::string &s) { return s.size(); }
 }
 } // namespace cudaq::support
diff --git a/runtime/cudaq/qis/pauli_word.h b/runtime/cudaq/qis/pauli_word.h
index afcd446e77..4a49a706a1 100644
--- a/runtime/cudaq/qis/pauli_word.h
+++ b/runtime/cudaq/qis/pauli_word.h
@@ -5,23 +5,59 @@
  * This source code and the accompanying materials are made available under *
  * the terms of the Apache License 2.0 which accompanies this distribution. *
  ******************************************************************************/
+
 #pragma once
+#include <algorithm>
+#include <cstdint>
 #include <string>
-#include <vector>
 
 namespace cudaq {
-/// @brief The `pauli_word` is a thin wrapper on a
-/// Pauli tensor product string, e.g. `XXYZ` on 4
-// qubits.
-class pauli_word {
-private:
-  std::vector<char> term;
+/// @brief The `pauli_word` is a thin wrapper on a Pauli tensor product string,
+/// e.g. `XXYZ` on 4 qubits.
+class pauli_word {
 public:
   pauli_word() = default;
-  pauli_word(const std::string t) : term(t.begin(), t.end()) {}
-  std::string str() const { return std::string(term.begin(), term.end()); }
-  const std::vector<char> &data() const { return term; }
+  pauli_word(std::string &&t) : term{std::move(t)} { to_upper_case(); }
+  pauli_word(const std::string &t) : term(t) { to_upper_case(); }
+  pauli_word(const char *const p) : term{p} { to_upper_case(); }
+  pauli_word &operator=(const std::string &t) {
+    term = t;
+    to_upper_case();
+    return *this;
+  }
+  pauli_word &operator=(const char *const p) {
+    term = p;
+    to_upper_case();
+    return *this;
+  }
+
+  std::string str() const { return term; }
+
+  // TODO: Obsolete? Used by KernelWrapper.h only.
+  const std::vector<char> data() const { return {term.begin(), term.end()}; }
+
+private:
+  // Convert the string member to upper case at construction/assignment.
+  // TODO: This should probably also verify that the string contains only
+  // letters from the Pauli alphabet: I, X, Y, and Z.
+  void to_upper_case() {
+    std::transform(term.begin(), term.end(), term.begin(), ::toupper);
+  }
+
+  // These methods are used by the compiler.
+  __attribute__((used)) const char *_nvqpp_data() const { return term.data(); }
+  __attribute__((used)) std::uint64_t _nvqpp_size() const {
+    return term.size();
+  }
+
+  std::string term; ///< Pauli words are string-like.
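  // Editor's note: a short usage sketch of the normalization above (a
  // hypothetical check, not part of this header):
  //
  //   cudaq::pauli_word w{"xYz"};
  //   assert(w.str() == "XYZ");  // to_upper_case() ran at construction
  //   w = "izzy";                // assignment normalizes as well
  //   assert(w.str() == "IZZY"); // alphabet validation remains a TODO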
 };
-} // namespace cudaq
\ No newline at end of file
+
+namespace details {
+static_assert(sizeof(std::string) == sizeof(pauli_word));
+// This constant is used by the compiler.
+static constexpr std::uint64_t _nvqpp_sizeof = sizeof(pauli_word);
+} // namespace details
+} // namespace cudaq
diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h
index c83dffe844..c05a862bf9 100644
--- a/runtime/cudaq/qis/qubit_qis.h
+++ b/runtime/cudaq/qis/qubit_qis.h
@@ -17,6 +17,7 @@
 #include "cudaq/qis/qreg.h"
 #include "cudaq/qis/qvector.h"
 #include "cudaq/spin_op.h"
+#include
 #include
 #include
@@ -828,11 +829,13 @@ std::vector<measure_result> mz(qubit &q, Qs &&...qs) {
 }
 
 namespace support {
-// Helper to initialize a `vector<bool>` data structure.
+// Helpers to deal with the `vector<bool>` specialized template type.
 extern "C" {
 void __nvqpp_initializer_list_to_vector_bool(std::vector<bool> &, char *,
                                              std::size_t);
-void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector<bool> &);
+void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector<bool> &,
+                                             std::vector<char *> **);
+void __nvqpp_vector_bool_free_temporary_initlists(std::vector<char *> *);
 }
 } // namespace support
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 987bfd4c34..c16b43ddb7 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -202,12 +202,11 @@ void test_scalars(mlir::MLIRContext *ctx) {
 
 // CHECK: Substitution module:
 // CHECK-LABEL: cc.arg_subst[0] {
-// CHECK: %[[VAL_0:.*]] = cc.address_of @cstr.58595A00 : !cc.ptr<!cc.array<i8 x 4>>
-// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<i8 x 4>>) -> !cc.ptr<i8>
+// CHECK: %[[VAL_0:.*]] = cc.string_literal "XYZ" : !cc.ptr<!cc.array<i8 x 4>>
+// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<i8 x 4>>) -> !cc.ptr<i8>
 // CHECK: %[[VAL_2:.*]] = arith.constant 3 : i64
 // CHECK: %[[VAL_3:.*]] = cc.stdvec_init %[[VAL_1]], %[[VAL_2]] : (!cc.ptr<i8>, i64) -> !cc.charspan
 // CHECK: }
-// CHECK-DAG: llvm.mlir.global private constant @cstr.58595A00("XYZ\00") {addr_space = 0 : i32}
 // clang-format on
 }
 
@@ -250,14 +249,14 @@ void test_vectors(mlir::MLIRContext *ctx) {
 // clang-format off
 // CHECK-LABEL: cc.arg_subst[0] {
 // CHECK: %[[VAL_0:.*]] = cc.alloca !cc.array<!cc.charspan x 2>
-// CHECK: %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr<!cc.array<i8 x 3>>
-// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
+// CHECK: %[[VAL_1:.*]] = cc.string_literal "XX" : !cc.ptr<!cc.array<i8 x 3>>
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
 // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64
 // CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_2]], %[[VAL_3]] : (!cc.ptr<i8>, i64) -> !cc.charspan
 // CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr<!cc.array<!cc.charspan x 2>>) -> !cc.ptr<!cc.charspan>
 // CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr<!cc.charspan>
-// CHECK: %[[VAL_6:.*]] = cc.address_of @cstr.585900 : !cc.ptr<!cc.array<i8 x 3>>
-// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
+// CHECK: %[[VAL_6:.*]] = cc.string_literal "XY" : !cc.ptr<!cc.array<i8 x 3>>
+// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
 // CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64
 // CHECK: %[[VAL_9:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_8]] : (!cc.ptr<i8>, i64) -> !cc.charspan
 // CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr<!cc.array<!cc.charspan x 2>>) -> !cc.ptr<!cc.charspan>
@@ -265,8 +264,6 @@ void test_vectors(mlir::MLIRContext *ctx) {
 // CHECK: %[[VAL_11:.*]] = arith.constant 2 : i64
 // CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_0]], %[[VAL_11]] : (!cc.ptr<!cc.array<!cc.charspan x 2>>, i64) -> !cc.stdvec<!cc.charspan>
 // CHECK: }
-// CHECK-DAG: llvm.mlir.global private constant
@cstr.585800("XX\00") {addr_space = 0 : i32} -// CHECK-DAG: llvm.mlir.global private constant @cstr.585900("XY\00") {addr_space = 0 : i32} // clang-format on } @@ -502,14 +499,14 @@ void test_combinations(mlir::MLIRContext *ctx) { // CHECK-DAG: func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr, i64) -> !cc.ptr // CHECK-LABEL: cc.arg_subst[2] { // CHECK: %[[VAL_0:.*]] = cc.alloca !cc.array -// CHECK: %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr> -// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_1:.*]] = cc.string_literal "XX" : !cc.ptr> +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_2]], %[[VAL_3]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr>) -> !cc.ptr // CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.address_of @cstr.585900 : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.string_literal "XY" : !cc.ptr> +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_9:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_8]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr>) -> !cc.ptr @@ -517,8 +514,6 @@ void test_combinations(mlir::MLIRContext *ctx) { // CHECK: %[[VAL_11:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_0]], %[[VAL_11]] : (!cc.ptr>, i64) -> !cc.stdvec // CHECK: } -// CHECK-DAG: llvm.mlir.global private constant @cstr.585800("XX\00") {addr_space = 0 : i32} -// CHECK-DAG: llvm.mlir.global private constant @cstr.585900("XY\00") {addr_space = 0 : i32} // clang-format on } diff --git a/targettests/Kernel/signature-0.cpp b/targettests/Kernel/signature-0.cpp index 882fb24704..0adf9c8779 100644 --- a/targettests/Kernel/signature-0.cpp +++ b/targettests/Kernel/signature-0.cpp @@ -65,12 +65,9 @@ class Qernel6 { } }; -// FIXME: unhandled ctor call -#define NYI /*__qpu__*/ - class Qernel7 { public: - std::vector operator()(std::vector v) NYI { return v; } + std::vector operator()(std::vector v) __qpu__ { return v; } }; int main() { diff --git a/targettests/Kernel/signature-4.cpp b/targettests/Kernel/signature-4.cpp index 14deb5c55f..00e9effc93 100644 --- a/targettests/Kernel/signature-4.cpp +++ b/targettests/Kernel/signature-4.cpp @@ -14,10 +14,8 @@ // Tests that we can take a small struct, a struct with a vector member, a // vector of small structs, and a large struct as an argument and return the -// same. Currently, DefaultQPU::launchKernel does not handle return values at -// all. +// same. 
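// Editor's note: in minimal form, the argument/return pattern these tests now
// exercise looks like the following sketch (struct and kernel names here are
// illustrative, not taken from this patch):
//
//   struct V { int _1; double _2; };
//   struct QernelV {
//     std::vector<V> operator()(std::vector<V> v) __qpu__ {
//       v[0]._1++; // mutated copy is returned via the dynamic-result path
//       return v;
//     }
//   };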
-// FIXME #define NYI /*__qpu__*/ void ok() { std::cout << "ok\n"; } @@ -48,7 +46,6 @@ struct QernelS1 { } }; -// struct with vector member not yet supported struct S2 { int _1; std::vector _2; @@ -66,6 +63,7 @@ struct QernelS2a { }; struct QernelS2 { + // kernel result type not supported (bridge) S2 operator()(S2 s) NYI { s._1++; s._2[0] = 0.0; @@ -84,16 +82,14 @@ class QernelS3a { } }; -// ctor in return not supported struct QernelS3 { - std::vector operator()(std::vector s) NYI { + std::vector operator()(std::vector s) __qpu__ { s[0]._1++; s[0]._2 = 0.0; return s; } }; -// bug in bridge std::vector mock_ctor(const std::vector &v) { return v; } struct QernelS4 { diff --git a/targettests/Kernel/signature-5.cpp b/targettests/Kernel/signature-5.cpp index a42b5b8518..a2fa263560 100644 --- a/targettests/Kernel/signature-5.cpp +++ b/targettests/Kernel/signature-5.cpp @@ -15,7 +15,6 @@ // Test kernels can take arguments of tuple or pair as well as return values of // same. -// FIXME: tuple and pair are not handled. #define NYI /*__qpu__*/ void ok() { std::cout << "ok\n"; } @@ -24,7 +23,7 @@ void fail() { std::cout << "fail\n"; } using S1 = std::tuple; struct QernelS1a { - void operator()(S1 s) NYI { + void operator()(S1 s) __qpu__ { if (std::get<0>(s) == 1 && std::get<1>(s) == 2 && std::get<2>(s) == 4) ok(); else @@ -38,10 +37,18 @@ struct QernelS1 { } }; +S1 qernel_s1b_helper(S1 s) { + return {std::get<2>(s) + 1, std::get<1>(s) + 1, std::get<0>(s) + 1}; +} + +struct QernelS1b { + S1 operator()(S1 s) NYI { return qernel_s1b_helper(s); } +}; + using S2 = std::tuple>; struct QernelS2a { - void operator()(S2 s) NYI { + void operator()(S2 s) __qpu__ { if (std::get<0>(s) == 8.16 && std::get<1>(s) == 32.64f && std::get<2>(s).size() == 2) ok(); @@ -88,6 +95,13 @@ int main() { ok(); else fail(); + std::cout << "QernelS1b "; + auto updated_s1b = QernelS1b{}(s1); + if (std::get<0>(updated_s1b) == 5 && std::get<1>(updated_s1b) == 3 && + std::get<2>(updated_s1b) == 2) + ok(); + else + fail(); std::vector v = {128, 256}; S2 s2 = {8.16, 32.64f, v}; @@ -117,6 +131,7 @@ int main() { // clang-format off // CHECK-LABEL: QernelS1a ok // CHECK-NEXT: QernelS1 ok +// CHECK-NEXT: QernelS1b ok // CHECK-NEXT: QernelS2a ok // CHECK-NEXT: QernelS2 ok // CHECK-NEXT: ok diff --git a/targettests/Remote-Sim/pauli_word.cpp b/targettests/Remote-Sim/pauli_word.cpp index cd68042325..7624d948c0 100644 --- a/targettests/Remote-Sim/pauli_word.cpp +++ b/targettests/Remote-Sim/pauli_word.cpp @@ -10,7 +10,6 @@ // clang-format off // RUN: nvq++ %cpp_std --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t -// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t // clang-format on #include "remote_test_assert.h" diff --git a/targettests/SeparateCompilation/arith_spans.cpp b/targettests/SeparateCompilation/arith_spans.cpp new file mode 100644 index 0000000000..4de3979ed1 --- /dev/null +++ b/targettests/SeparateCompilation/arith_spans.cpp @@ -0,0 +1,353 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
*
+ ******************************************************************************/
+
+// clang-format off
+// RUN: if command -v split-file > /dev/null; then \
+// RUN: split-file %s %t && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_dumps.cpp -o %t/span_dumps.o && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_exercise.cpp -o %t/span_exercise.o && \
+// RUN: nvq++ %cpp_std --enable-mlir %t/span_dumps.o %t/span_exercise.o -o %t/spanaroo.out && \
+// RUN: %t/spanaroo.out | FileCheck %s ; else \
+// RUN: echo "skipping" ; fi
+// clang-format on
+
+//--- span_dumps.cpp
+
+#include <iostream>
+#include <span>
+
+extern "C" {
+void dump_bool_vector(std::span<bool> x) {
+  std::cout << "booleans: ";
+  for (auto i : x)
+    std::cout << i << ' ';
+  std::cout << '\n';
+}
+
+void dump_int_vector(std::span<int> x) {
+  std::cout << "integers: ";
+  for (auto i : x)
+    std::cout << i << ' ';
+  std::cout << '\n';
+}
+
+void dump_2d_int_vector(std::span<std::span<int>> x) {
+  std::cout << "integer matrix: {\n";
+  for (auto s : x) {
+    std::cout << "  ";
+    for (auto i : s)
+      std::cout << i << " ";
+    std::cout << '\n';
+  }
+  std::cout << "}\n";
+}
+
+void dump_int_scalar(int x) { std::cout << "scalar integer: " << x << '\n'; }
+
+void dump_double_vector(std::span<double> x) {
+  std::cout << "doubles: ";
+  for (auto d : x)
+    std::cout << d << ' ';
+  std::cout << '\n';
+}
+}
+
+//--- span_exercise.cpp
+
+#include "cudaq.h"
+#include <iostream>
+
+// Fake host C++ signatures that match.
+extern "C" {
+void dump_int_vector(const std::vector<int> &pw);
+void dump_int_scalar(int v);
+void dump_bool_vector(const std::vector<bool> &pw);
+void dump_double_vector(const std::vector<double> &pw);
+void dump_2d_int_vector(const std::vector<std::vector<int>> &pw);
+}
+
+__qpu__ void kern1(std::vector<int> arg) { dump_int_vector(arg); }
+
+__qpu__ void kern2(std::vector<std::vector<int>> arg) {
+  for (unsigned i = 0; i < arg.size(); ++i)
+    dump_int_vector(arg[i]);
+}
+
+struct IntVectorPair {
+  std::vector<int> _0;
+  std::vector<int> _1;
+};
+
+__qpu__ void kern3(IntVectorPair ivp) {
+  dump_int_vector(ivp._0);
+  dump_int_vector(ivp._1);
+}
+
+__qpu__ void kern4(std::vector<IntVectorPair> vivp) {
+  for (unsigned i = 0; i < vivp.size(); ++i) {
+    dump_int_vector(vivp[i]._0);
+    dump_int_vector(vivp[i]._1);
+  }
+}
+
+__qpu__ void qern1(std::vector<double> arg) { dump_double_vector(arg); }
+
+__qpu__ void qern2(std::vector<std::vector<double>> arg) {
+  for (unsigned i = 0; i < arg.size(); ++i)
+    dump_double_vector(arg[i]);
+}
+
+struct DoubleVectorPair {
+  std::vector<double> _0;
+  std::vector<double> _1;
+};
+
+__qpu__ void qern3(DoubleVectorPair ivp) {
+  dump_double_vector(ivp._0);
+  dump_double_vector(ivp._1);
+}
+
+__qpu__ void qern4(std::vector<DoubleVectorPair> vivp) {
+  for (unsigned i = 0; i < vivp.size(); ++i) {
+    dump_double_vector(vivp[i]._0);
+    dump_double_vector(vivp[i]._1);
+  }
+}
+
+__qpu__ void cern1(std::vector<bool> arg) { dump_bool_vector(arg); }
+
+__qpu__ void cern2(std::vector<std::vector<bool>> arg) {
+  for (unsigned i = 0; i < arg.size(); ++i)
+    dump_bool_vector(arg[i]);
+}
+
+struct BoolVectorPair {
+  std::vector<bool> _0;
+  std::vector<bool> _1;
+};
+
+__qpu__ void cern3(BoolVectorPair ivp) {
+  dump_bool_vector(ivp._0);
+  dump_bool_vector(ivp._1);
+}
+
+__qpu__ void cern4(std::vector<BoolVectorPair> vivp) {
+  for (unsigned i = 0; i < vivp.size(); ++i) {
+    dump_bool_vector(vivp[i]._0);
+    dump_bool_vector(vivp[i]._1);
+  }
+}
+
+struct Interesting {
+  std::vector<std::vector<std::vector<int>>> ragged3d;
+  int flags;
+  std::vector<double> angular;
+};
+
+__qpu__ void exciting(std::vector<Interesting> vi) {
+  for (unsigned i = 0; i < vi.size(); ++i) {
+    for (unsigned j = 0; j < vi[i].ragged3d.size(); ++j)
+      dump_2d_int_vector(vi[i].ragged3d[j]);
+    dump_int_scalar(vi[i].flags);
+    dump_double_vector(vi[i].angular);
+  }
+}
+
+int main() {
+  std::vector<int> pw0 = {345, 1, 2};
+  std::cout << "---\n";
+  kern1(pw0);
+  std::vector<int> pw1 = {92347, 3, 4};
+  std::vector<int> pw2 = {2358, 5, 6};
+  std::vector<int> pw3 = {45, 7, 18};
+  std::vector<std::vector<int>> vpw{pw0, pw1, pw2, pw3};
+  std::cout << "---\n";
+  kern2(vpw);
+
+  IntVectorPair ivp = {{8, 238, 44}, {0, -4, 81, 92745}};
+  std::cout << "---\n";
+  kern3(ivp);
+
+  IntVectorPair ivp2 = {{5, -87, 43, 1, 76}, {0, 0, 2, 1}};
+  IntVectorPair ivp3 = {{1}, {-2, 3}};
+  IntVectorPair ivp4 = {{-4, -5, 6}, {-7, -8, -9, 88}};
+  std::vector<IntVectorPair> vivp = {ivp, ivp2, ivp3, ivp4};
+  std::cout << "---\n";
+  kern4(vivp);
+
+  std::vector<double> dpw0 = {3.45, 1., 2.};
+  std::cout << "---\n";
+  qern1(dpw0);
+  std::vector<double> dpw1 = {92.347, 2.3, 4.};
+  std::vector<double> dpw2 = {235.8, 5.5, 6.4};
+  std::vector<double> dpw3 = {4.5, 77.7, 18.2};
+  std::vector<std::vector<double>> vdpw{dpw0, dpw1, dpw2, dpw3};
+  std::cout << "---\n";
+  qern2(vdpw);
+
+  DoubleVectorPair dvp = {{8., 2.38, 4.4}, {0., -4.99, 81.5, 92.745}};
+  std::cout << "---\n";
+  qern3(dvp);
+
+  DoubleVectorPair dvp2 = {{5., -8.7, 4.3, 1., 7.6}, {0., 0., 2., 1.}};
+  DoubleVectorPair dvp3 = {{1.}, {-2., 3.}};
+  DoubleVectorPair dvp4 = {{-4., -5., 6.}, {-7., -8., -9., .88}};
+  std::vector<DoubleVectorPair> vdvp = {dvp, dvp2, dvp3, dvp4};
+  std::cout << "---\n";
+  qern4(vdvp);
+
+  std::vector<bool> bpw0 = {true, false};
+  std::cout << "---\n";
+  cern1(bpw0);
+  std::vector<bool> bpw1 = {false, false, false};
+  std::vector<bool> bpw2 = {false, true, false, true};
+  std::vector<bool> bpw3 = {false, false, true, false, true};
+  std::vector<std::vector<bool>> vbpw{bpw0, bpw1, bpw2, bpw3};
+  std::cout << "---\n";
+  cern2(vbpw);
+
+  BoolVectorPair bvp = {{false, false}, {false, true, true, false}};
+  std::cout << "---\n";
+  cern3(bvp);
+
+  BoolVectorPair bvp2 = {{false, true, true, false, true, false},
+                         {false, true, true, false, false, false, true, false}};
+  BoolVectorPair bvp3 = {{false}, {true, true}};
+  BoolVectorPair bvp4 = {{true, false, false}, {false, true, false, true}};
+  std::vector<BoolVectorPair> vbvp = {bvp, bvp2, bvp3, bvp4};
+  std::cout << "---\n";
+  cern4(vbvp);
+
+  std::vector<std::vector<int>> ix0 = {pw0, pw0};
+  std::vector<std::vector<int>> ix1 = {pw1, pw0};
+  std::vector<std::vector<int>> ix2 = {pw2, pw3, pw3};
+  std::vector<std::vector<int>> ix3 = {{404}, {101, 202}};
+  std::vector<std::vector<std::vector<int>>> i3d0 = {ix0, ix1};
+  std::vector<std::vector<std::vector<int>>> i3d1 = {ix1};
+  std::vector<std::vector<std::vector<int>>> i3d2 = {ix2, ix3};
+  std::vector<std::vector<std::vector<int>>> i3d3 = {ix3};
+  std::vector<std::vector<std::vector<int>>> i3d4 = {ix2, ix0, ix0};
+  Interesting in0 = {i3d0, 66, {2.0, 4.0}};
+  Interesting in1 = {i3d1, 123, {3.0, 6.0}};
+  Interesting in2 = {i3d2, 561, {4.0, 8.0}};
+  Interesting in3 = {i3d3, 72341, {5.0, 10.0}};
+  Interesting in4 = {i3d4, -2348, {12.0, 5280.1}};
+  std::vector<Interesting> ving = {in0, in1, in2, in3, in4};
+  std::cout << "===\n";
+  exciting(ving);
+
+  return 0;
+}
+
+// CHECK: ---
+// CHECK: integers: 345 1 2
+// CHECK: ---
+// CHECK: integers: 345 1 2
+// CHECK-NEXT: integers: 92347 3 4
+// CHECK-NEXT: integers: 2358 5 6
+// CHECK-NEXT: integers: 45 7 18
+// CHECK: ---
+// CHECK: integers: 8 238 44
+// CHECK-NEXT: integers: 0 -4 81 92745
+// CHECK: ---
+// CHECK: integers: 8 238 44
+// CHECK-NEXT: integers: 0 -4 81 92745
+// CHECK-NEXT: integers: 5 -87 43 1 76
+// CHECK-NEXT: integers: 0 0 2 1
+// CHECK-NEXT: integers: 1
+// CHECK-NEXT: integers: -2 3
+// CHECK-NEXT: integers: -4 -5 6
+// CHECK-NEXT: integers: -7 -8 -9 88
+// CHECK: ---
+// CHECK: doubles: 3.45 1 2
+// CHECK: ---
+// CHECK: doubles: 3.45 1 2
+// CHECK-NEXT: doubles: 92.347 2.3 4
+// CHECK-NEXT: doubles: 235.8 5.5 6.4
+// CHECK-NEXT: doubles: 4.5 77.7 18.2
+// CHECK: ---
+// CHECK: doubles: 8 2.38 4.4
+// CHECK-NEXT: doubles: 0 -4.99 81.5 92.745
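// Editor's note: on the device side each std::vector argument above arrives as
// a span, so a struct such as IntVectorPair is seen roughly as the following
// sketch (illustrative, not generated code):
//
//   struct IntVectorPairDeviceView {
//     std::span<int> _0; // {pointer, length} view of the host vector's data
//     std::span<int> _1;
//   };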
+// CHECK: --- +// CHECK: doubles: 8 2.38 4.4 +// CHECK-NEXT: doubles: 0 -4.99 81.5 92.745 +// CHECK-NEXT: doubles: 5 -8.7 4.3 1 7.6 +// CHECK-NEXT: doubles: 0 0 2 1 +// CHECK-NEXT: doubles: 1 +// CHECK-NEXT: doubles: -2 3 +// CHECK-NEXT: doubles: -4 -5 6 +// CHECK-NEXT: doubles: -7 -8 -9 0.88 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK-NEXT: booleans: 0 0 0 +// CHECK-NEXT: booleans: 0 1 0 1 +// CHECK-NEXT: booleans: 0 0 1 0 1 +// CHECK: --- +// CHECK: booleans: 0 0 +// CHECK-NEXT: booleans: 0 1 1 0 +// CHECK: --- +// CHECK: booleans: 0 0 +// CHECK-NEXT: booleans: 0 1 1 0 +// CHECK-NEXT: booleans: 0 1 1 0 1 0 +// CHECK-NEXT: booleans: 0 1 1 0 0 0 1 0 +// CHECK-NEXT: booleans: 0 +// CHECK-NEXT: booleans: 1 1 +// CHECK-NEXT: booleans: 1 0 0 +// CHECK-NEXT: booleans: 0 1 0 1 +// CHECK: === +// CHECK: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 92347 3 4 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 66 +// CHECK-NEXT: doubles: 2 4 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 92347 3 4 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 123 +// CHECK-NEXT: doubles: 3 6 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 2358 5 6 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 404 +// CHECK-NEXT: 101 202 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 561 +// CHECK-NEXT: doubles: 4 8 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 404 +// CHECK-NEXT: 101 202 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 72341 +// CHECK-NEXT: doubles: 5 10 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 2358 5 6 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: -2348 +// CHECK-NEXT: doubles: 12 5280.1 diff --git a/targettests/SeparateCompilation/pauli_words.cpp b/targettests/SeparateCompilation/pauli_words.cpp new file mode 100644 index 0000000000..31ac339e0c --- /dev/null +++ b/targettests/SeparateCompilation/pauli_words.cpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
*
+ ******************************************************************************/
+
+// clang-format off
+// RUN: if command -v split-file > /dev/null; then \
+// RUN: split-file %s %t && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/pauli_word_display.cpp -o %t/pauli_word_display.o && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/pauli_wordle.cpp -o %t/pauli_wordle.o && \
+// RUN: nvq++ %cpp_std --enable-mlir %t/pauli_word_display.o %t/pauli_wordle.o -o %t/pauli_wordle.out && \
+// RUN: %t/pauli_wordle.out | FileCheck %s ; else \
+// RUN: echo "skipping" ; fi
+// clang-format on
+
+//--- pauli_word_display.cpp
+
+#include <iostream>
+#include <span>
+#include <string>
+
+extern "C" {
+void display(std::span<char> x) {
+  std::string s{x.data(), x.size()};
+  std::cout << "pauli word: " << s << '\n';
+}
+}
+
+//--- pauli_wordle.cpp
+
+#include "cudaq.h"
+
+// Fake host C++ signature that matches. Since this is called on the device
+// side, the pauli_word will have been converted to a span.
+extern "C" void display(const cudaq::pauli_word &pw);
+
+__qpu__ void kerny(std::vector<cudaq::pauli_word> arg) {
+  display(arg[0]);
+  display(arg[1]);
+  display(arg[2]);
+  display(arg[3]);
+}
+
+__qpu__ void kernub(cudaq::pauli_word arg) { display(arg); }
+
+int main() {
+  cudaq::pauli_word pw0 = "YYZ";
+  kernub(pw0);
+
+  cudaq::pauli_word pw1 = "ZIZ";
+  cudaq::pauli_word pw2 = "XXXY";
+  cudaq::pauli_word pw3 = "YIIII";
+  std::vector<cudaq::pauli_word> vpw{pw0, pw1, pw2, pw3};
+  kerny(vpw);
+  return 0;
+}
+
+// CHECK: pauli word: YYZ
+// CHECK: pauli word: YYZ
+// CHECK: pauli word: ZIZ
+// CHECK: pauli word: XXXY
+// CHECK: pauli word: YIIII
diff --git a/targettests/execution/exp_pauli.cpp b/targettests/execution/exp_pauli.cpp
index bf7ed5bac1..014d86ccf6 100644
--- a/targettests/execution/exp_pauli.cpp
+++ b/targettests/execution/exp_pauli.cpp
@@ -8,17 +8,18 @@
 // clang-format off
 // Simulators
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 --enable-mlir -target remote-mqpu %s -o %t && %t | FileCheck %s
 //
 // Quantum emulators
-// RUN: nvq++ %cpp_std --target quantinuum --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target ionq --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target quantinuum --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target ionq --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target oqc --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target anyon --emulate %s -o %t && %t | FileCheck %s
+
 // 2 different IQM machines for 2 different topologies
-// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target oqc --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target anyon --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
// clang-format
on #include diff --git a/test/AST-Quake/calling_convention-aarch64.cpp b/test/AST-Quake/calling_convention-aarch64.cpp index 174aaf3558..22d60856e0 100644 --- a/test/AST-Quake/calling_convention-aarch64.cpp +++ b/test/AST-Quake/calling_convention-aarch64.cpp @@ -271,7 +271,7 @@ struct V3 { // CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( // CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, // CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, -// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.array}>>) // clang-format on //===----------------------------------------------------------------------===// diff --git a/test/AST-Quake/calling_convention.cpp b/test/AST-Quake/calling_convention.cpp index 3d2c6e2e4a..fcf7c26cda 100644 --- a/test/AST-Quake/calling_convention.cpp +++ b/test/AST-Quake/calling_convention.cpp @@ -278,9 +278,7 @@ struct V3 { // CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, // CHECK-SAME: %[[VAL_3:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) // CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, -// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, %[[VAL_2:.*]]: !cc.ptr, !cc.array}>>) // clang-format on //===----------------------------------------------------------------------===// diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 37ac7c7229..cd079998ae 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -6,15 +6,13 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --kernel-execution=codegen=1 %s | FileCheck %s -// RUN: cudaq-opt --kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAM %s -// RUN: cudaq-opt --kernel-execution %s | FileCheck --check-prefix=HYBRID %s +// RUN: cudaq-opt -kernel-execution=codegen=1 %s | FileCheck --check-prefix=ALT %s +// RUN: cudaq-opt -kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAMLINED %s +// RUN: cudaq-opt -kernel-execution %s | FileCheck --check-prefix=HYBRID %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} { -// CHECK-LABEL: func.func @__nvqpp__mlirgen__ghz( - func.func @__nvqpp__mlirgen__ghz(%arg0: i32) -> f64 { %0 = cc.alloca i32 cc.store %arg0, %0 : !cc.ptr @@ -83,174 +81,369 @@ module attributes {quake.mangled_name_map = { } } -// Check the generated code. 
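// Editor's note: the three check prefixes below correspond to the three
// codegen modes of the kernel-execution pass. Reading the declarations in the
// CHECK lines as C-style prototypes gives roughly the following (parameter
// names are inferred, not from the source; DynamicResult is the
// {void *, uint64_t} pair spelled !cc.struct<{!cc.ptr<i8>, i64}> below):
//
//   DynamicResult altLaunchKernel(const char *kernelName, void *thunk,
//                                 void *argsBuffer, uint64_t bufferSize,
//                                 uint64_t resultOffset);        // codegen=1
//   void streamlinedLaunchKernel(const char *kernelName,
//                                void *rawArgsPointerBlock);     // codegen=2
//   DynamicResult hybridLaunchKernel(const char *kernelName, void *thunk,
//                                    void *argsBuffer, uint64_t bufferSize,
//                                    uint64_t resultOffset,
//                                    void *rawArgsPointerBlock); // default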
+// ALT-LABEL: func.func @_ZN3ghzclEi( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// ALT: %[[VAL_2:.*]] = cc.alloca i64 +// ALT: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] +// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> +// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr +// ALT: %[[VAL_7:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_8:.*]] = cc.func_ptr %[[VAL_7]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// ALT: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// ALT: %[[VAL_11:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// ALT: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!llvm.ptr>) -> !cc.ptr +// ALT: %[[VAL_13:.*]] = call @altLaunchKernel(%[[VAL_12]], %[[VAL_8]], %[[VAL_9]], %[[VAL_3]], %[[VAL_10]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_14:.*]] = cc.extract_value %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// ALT: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// ALT: %[[VAL_16:.*]] = arith.constant 0 : i64 +// ALT: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_16]] : i64 +// ALT: cf.cond_br %[[VAL_17]], ^bb1, ^bb2 +// ALT: ^bb1: +// ALT: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cf.br ^bb3(%[[VAL_19]] : !cc.ptr) +// ALT: ^bb2: +// ALT: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cf.br ^bb3(%[[VAL_20]] : !cc.ptr) +// ALT: ^bb3(%[[VAL_21:.*]]: !cc.ptr): +// ALT: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr +// ALT: return %[[VAL_23]] : f64 +// ALT: } +// ALT: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// ALT: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// ALT: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// ALT: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// ALT: func.func private @cudaqRegisterKernelName(!cc.ptr) +// ALT: func.func private @malloc(i64) -> !cc.ptr +// ALT: func.func private @free(!cc.ptr) +// ALT: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// ALT: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// ALT: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// ALT-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_0:.*]] = arith.constant 0 : i64 +// ALT: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// ALT: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } + 
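// Editor's note: the next block checks the IR of __nvqpp_createDynamicResult.
// In rough C++ terms the helper behaves like this sketch (inferred from the
// CHECK lines themselves, not from the runtime source; parameter names are
// illustrative):
//
//   struct DynamicResult { void *data; uint64_t size; };
//   DynamicResult __nvqpp_createDynamicResult(void *msg, uint64_t msgSize,
//                                             DynamicResult *span,
//                                             uint64_t ptrOffset) {
//     char *buf = (char *)malloc(msgSize + span->size);
//     memcpy(buf, msg, msgSize);                     // copy original message
//     memcpy(buf + msgSize, span->data, span->size); // append span payload
//     *(char **)(buf + ptrOffset) = buf + msgSize;   // patch inner pointer
//     return {buf, msgSize + span->size};
//   }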
+// ALT-LABEL: func.func private @__nvqpp_createDynamicResult( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i64, +// ALT-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// ALT-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// ALT: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// ALT: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// ALT: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// ALT: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_9:.*]] = arith.constant false +// ALT: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// ALT: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// ALT: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// ALT: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// ALT: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// ALT: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// ALT: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } +// ALT: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} + +// ALT-LABEL: func.func @ghz.returnOffset() -> i64 { +// ALT: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// ALT: return %[[VAL_0]] : i64 +// ALT: } + +// ALT-LABEL: func.func @ghz.thunk( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// ALT: %[[VAL_8:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 +// ALT: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr +// ALT: %[[VAL_10:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// ALT: return %[[VAL_10]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } -// CHECK-LABEL: func.func @_ZN3ghzclEi( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] -// CHECK: %[[VAL_8:.*]] = cc.cast 
%[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i64 -// CHECK: cf.cond_br %[[VAL_20]], ^bb1, ^bb2 -// CHECK: ^bb1: -// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: cf.br ^bb3(%[[VAL_22]] : !cc.ptr) -// CHECK: ^bb2: -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr): -// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr -// CHECK: return %[[VAL_26]] : f64 -// CHECK: } +// ALT-LABEL: func.func @ghz.argsCreator( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// ALT-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// ALT: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// ALT: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// ALT: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// ALT: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// ALT: %[[VAL_7:.*]] = cc.alloca i64 +// ALT: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_9:.*]] = call @malloc(%[[VAL_8]]) : (i64) -> !cc.ptr +// ALT: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_6]], %[[VAL_11]] : !cc.ptr +// ALT: cc.store %[[VAL_9]], %[[VAL_1]] : !cc.ptr> +// ALT: return %[[VAL_8]] : i64 +// ALT: } -// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// ALT-LABEL: llvm.func @ghz.kernelRegFunc() { +// ALT: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// ALT: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// ALT: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// ALT: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// ALT: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// ALT: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// ALT: llvm.return +// ALT: } +// ALT: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities 
= [17 : i32]} -// CHECK: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} +// STREAMLINED-LABEL: func.func @_ZN3ghzclEi( +// STREAMLINED-SAME: %[[VAL_0:.*]]: !cc.ptr, +// STREAMLINED-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// STREAMLINED: %[[VAL_2:.*]] = cc.alloca i64 +// STREAMLINED: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// STREAMLINED: %[[VAL_4:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// STREAMLINED: %[[VAL_5:.*]] = cc.alloca !cc.array x 1> +// STREAMLINED: %[[VAL_6:.*]] = cc.sizeof !cc.array x 1> : i64 +// STREAMLINED: %[[VAL_7:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_8:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_7]], %[[VAL_8]] : !cc.ptr>> +// STREAMLINED: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr x 1>>) -> i64 +// STREAMLINED: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_6]] : i64 +// STREAMLINED: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (i64) -> !cc.ptr> +// STREAMLINED: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_11]], %[[VAL_12]] : !cc.ptr>> +// STREAMLINED: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_4]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_11]], %[[VAL_13]] : !cc.ptr>> +// STREAMLINED: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_15:.*]] = cc.alloca i32 +// STREAMLINED: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr +// STREAMLINED: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr +// STREAMLINED: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> +// STREAMLINED: %[[VAL_17:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// STREAMLINED: %[[VAL_18:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAMLINED: %[[VAL_19:.*]] = cc.cast %[[VAL_18]] : (!llvm.ptr>) -> !cc.ptr +// STREAMLINED: call @streamlinedLaunchKernel(%[[VAL_19]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr) -> () +// STREAMLINED: %[[VAL_20:.*]] = cc.undef f64 +// STREAMLINED: return %[[VAL_20]] : f64 +// STREAMLINED: } +// STREAMLINED: func.func private @streamlinedLaunchKernel(!cc.ptr, !cc.ptr) +// STREAMLINED: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// STREAMLINED: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// STREAMLINED: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// STREAMLINED: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// STREAMLINED: func.func private @cudaqRegisterKernelName(!cc.ptr) +// STREAMLINED: func.func private @malloc(i64) -> !cc.ptr +// STREAMLINED: func.func private @free(!cc.ptr) +// STREAMLINED: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// STREAMLINED: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// STREAMLINED: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -// CHECK-LABEL: func.func @ghz.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { -// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_0]] : 
(!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_20]][%[[VAL_7]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, f64}>) -> i32 -// CHECK: %[[VAL_10:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_9]]) : (i32) -> f64 -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: cc.store %[[VAL_10]], %[[VAL_11]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> -// CHECK: } +// STREAMLINED-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// STREAMLINED: %[[VAL_0:.*]] = arith.constant 0 : i64 +// STREAMLINED: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// STREAMLINED: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: } -// CHECK-LABEL: func.func @ghz.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_14]][0] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_11:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_12:.*]] = call @malloc(%[[VAL_11]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr) -> !cc.ptr> -// CHECK: cc.store %[[VAL_8]], %[[VAL_13]] : !cc.ptr> -// CHECK: cc.store %[[VAL_12]], %[[VAL_1]] : !cc.ptr> -// CHECK: return %[[VAL_11]] : i64 -// CHECK: } +// STREAMLINED-LABEL: func.func private @__nvqpp_createDynamicResult( +// STREAMLINED-SAME: %[[VAL_0:.*]]: !cc.ptr, +// STREAMLINED-SAME: %[[VAL_1:.*]]: i64, +// STREAMLINED-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// STREAMLINED-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// STREAMLINED: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// STREAMLINED: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// STREAMLINED: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// STREAMLINED: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// STREAMLINED: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// STREAMLINED: %[[VAL_9:.*]] = arith.constant false +// STREAMLINED: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// STREAMLINED: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// STREAMLINED: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// STREAMLINED: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], 
%[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// STREAMLINED: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// STREAMLINED: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// STREAMLINED: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// STREAMLINED: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: } +// STREAMLINED: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} -// CHECK-LABEL: llvm.func @ghz.kernelRegFunc() { -// CHECK: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () -// CHECK: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 -// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr -// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () -// CHECK: llvm.return -// CHECK: } +// STREAMLINED-LABEL: llvm.func @ghz.kernelRegFunc() { +// STREAMLINED: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAMLINED: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// STREAMLINED: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// STREAMLINED: llvm.return +// STREAMLINED: } +// STREAMLINED: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} -// STREAM-LABEL: func.func @_ZN3ghzclEi( -// STREAM-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// STREAM: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// STREAM: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> -// STREAM: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 -// STREAM: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> -// STREAM: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> -// STREAM: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 -// STREAM: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 -// STREAM: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> -// STREAM: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> -// STREAM: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> -// STREAM: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// STREAM: %[[VAL_15:.*]] = cc.alloca i32 -// STREAM: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr -// STREAM: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr -// STREAM: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> -// STREAM: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// STREAM: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// STREAM: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr -// STREAM: call 
@streamlinedLaunchKernel(%[[VAL_21]], %[[VAL_19]]) : (!cc.ptr, !cc.ptr) -> () -// STREAM: %[[VAL_22:.*]] = cc.undef f64 -// STREAM: return %[[VAL_22]] : f64 -// STREAM: } // HYBRID-LABEL: func.func @_ZN3ghzclEi( -// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 -// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 -// HYBRID: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] -// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> -// HYBRID: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> -// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> -// HYBRID: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// HYBRID: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// HYBRID: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// HYBRID: %[[VAL_14:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// HYBRID: %[[VAL_15:.*]] = cc.alloca !cc.array x 1> -// HYBRID: %[[VAL_16:.*]] = cc.sizeof !cc.array x 1> : i64 -// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>> -// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> i64 -// HYBRID: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] : i64 -// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr> -// HYBRID: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_14]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>> -// HYBRID: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_14]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>> -// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_15]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_25:.*]] = cc.alloca i32 -// HYBRID: cc.store %[[VAL_1]], %[[VAL_25]] : !cc.ptr -// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// HYBRID: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr> -// HYBRID: %[[VAL_27:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// HYBRID: %[[VAL_28:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// HYBRID: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> -// HYBRID: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 -// HYBRID: %[[VAL_33:.*]] = arith.constant 0 : i64 -// HYBRID: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_33]] : i64 -// HYBRID: cf.cond_br %[[VAL_34]], ^bb1, ^bb2 +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// HYBRID: %[[VAL_2:.*]] = cc.alloca i64 +// HYBRID: %[[VAL_3:.*]] = cc.sizeof 
!cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> +// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr +// HYBRID: %[[VAL_7:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_8:.*]] = cc.func_ptr %[[VAL_7]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: %[[VAL_11:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// HYBRID: %[[VAL_12:.*]] = cc.alloca !cc.array x 1> +// HYBRID: %[[VAL_13:.*]] = cc.sizeof !cc.array x 1> : i64 +// HYBRID: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_15:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_14]], %[[VAL_15]] : !cc.ptr>> +// HYBRID: %[[VAL_16:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr x 1>>) -> i64 +// HYBRID: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_13]] : i64 +// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_18]], %[[VAL_19]] : !cc.ptr>> +// HYBRID: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_11]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_18]], %[[VAL_20]] : !cc.ptr>> +// HYBRID: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_12]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_22:.*]] = cc.alloca i32 +// HYBRID: cc.store %[[VAL_1]], %[[VAL_22]] : !cc.ptr +// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr) -> !cc.ptr +// HYBRID: cc.store %[[VAL_23]], %[[VAL_21]] : !cc.ptr> +// HYBRID: %[[VAL_24:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// HYBRID: %[[VAL_25:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_27:.*]] = call @hybridLaunchKernel(%[[VAL_26]], %[[VAL_8]], %[[VAL_9]], %[[VAL_3]], %[[VAL_10]], %[[VAL_24]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_28:.*]] = cc.extract_value %[[VAL_27]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!cc.ptr) -> i64 +// HYBRID: %[[VAL_30:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_31:.*]] = arith.cmpi ne, %[[VAL_29]], %[[VAL_30]] : i64 +// HYBRID: cf.cond_br %[[VAL_31]], ^bb1, ^bb2 // HYBRID: ^bb1: -// HYBRID: %[[VAL_35:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr> -// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr>) -> !cc.ptr -// HYBRID: cf.br ^bb3(%[[VAL_36]] : !cc.ptr) +// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_28]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_33:.*]] = cc.compute_ptr %[[VAL_32]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_33]] : !cc.ptr) // HYBRID: ^bb2: -// HYBRID: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: cf.br ^bb3(%[[VAL_37]] : !cc.ptr) -// HYBRID: ^bb3(%[[VAL_38:.*]]: !cc.ptr): -// HYBRID: %[[VAL_39:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr -// HYBRID: return %[[VAL_40]] : f64 +// 
HYBRID: %[[VAL_34:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_34]] : !cc.ptr) +// HYBRID: ^bb3(%[[VAL_35:.*]]: !cc.ptr): +// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_37:.*]] = cc.load %[[VAL_36]] : !cc.ptr +// HYBRID: return %[[VAL_37]] : f64 +// HYBRID: } +// HYBRID: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// HYBRID: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// HYBRID: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// HYBRID: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// HYBRID: func.func private @cudaqRegisterKernelName(!cc.ptr) +// HYBRID: func.func private @malloc(i64) -> !cc.ptr +// HYBRID: func.func private @free(!cc.ptr) +// HYBRID: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// HYBRID: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// HYBRID: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// HYBRID-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_0:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// HYBRID: } + +// HYBRID-LABEL: func.func private @__nvqpp_createDynamicResult( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i64, +// HYBRID-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// HYBRID-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// HYBRID: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// HYBRID: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_9:.*]] = arith.constant false +// HYBRID: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// HYBRID: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// HYBRID: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// HYBRID: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// HYBRID: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> 
!cc.ptr +// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// HYBRID: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> // HYBRID: } +// HYBRID: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} + +// HYBRID-LABEL: func.func @ghz.returnOffset() -> i64 { +// HYBRID: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: return %[[VAL_0]] : i64 +// HYBRID: } + +// HYBRID-LABEL: func.func @ghz.thunk( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// HYBRID: %[[VAL_8:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 +// HYBRID: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr +// HYBRID: %[[VAL_10:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: return %[[VAL_10]] : !cc.struct<{!cc.ptr, i64}> +// HYBRID: } + +// HYBRID-LABEL: func.func @ghz.argsCreator( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// HYBRID-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// HYBRID: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// HYBRID: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// HYBRID: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// HYBRID: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// HYBRID: %[[VAL_7:.*]] = cc.alloca i64 +// HYBRID: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_9:.*]] = call @malloc(%[[VAL_8]]) : (i64) -> !cc.ptr +// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_6]], %[[VAL_11]] : !cc.ptr +// HYBRID: cc.store %[[VAL_9]], %[[VAL_1]] : !cc.ptr> +// HYBRID: return %[[VAL_8]] : i64 +// HYBRID: } + +// HYBRID-LABEL: llvm.func @ghz.kernelRegFunc() { +// HYBRID: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// HYBRID: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// HYBRID: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// HYBRID: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// HYBRID: llvm.return +// HYBRID: } +// HYBRID: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} + diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index a9b04b8449..b94412cb11 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -6,7 +6,7 @@ // the terms of the Apache License 2.0 which accompanies this distribution. 
// // ========================================================================== // -// RUN: cudaq-opt --kernel-execution=codegen=1 %s | FileCheck %s +// RUN: cudaq-opt -kernel-execution %s | FileCheck %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__function_hawaiian = "shirt", @@ -36,120 +36,210 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { } } +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_cargo( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.stdvec, +// CHECK-SAME: %[[VAL_1:.*]]: !quake.ref) attributes {"cudaq-kernel", no_this} { +// CHECK: return +// CHECK: } + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_hawaiian( +// CHECK-SAME: %[[VAL_0:.*]]: i1, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.stdvec) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.ref +// CHECK: cc.if(%[[VAL_0]]) { +// CHECK: quake.x %[[VAL_4]] : (!quake.ref) -> () +// CHECK: } +// CHECK: call @__nvqpp__mlirgen__function_cargo(%[[VAL_1]], %[[VAL_4]]) : (!cc.stdvec, !quake.ref) -> () +// CHECK: return +// CHECK: } + // CHECK-LABEL: func.func @shirt( -// CHECK-SAME: %[[VAL_0:.*]]: i1, %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i1, i64}> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_2]][0] : (!cc.struct<{i1, i64}>, i1) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK-SAME: %[[VAL_0:.*]]: i1, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) { +// CHECK: %[[VAL_2:.*]] = cc.alloca i64 +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_4]] : !cc.ptr> // CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_5]] : !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_9]], %[[VAL_10]] : i64 -// CHECK: %[[VAL_12:.*]] = cc.insert_value %[[VAL_11]], %[[VAL_4]][1] : (!cc.struct<{i1, i64}>, i64) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_3]], %[[VAL_11]] : i64 -// CHECK: %[[VAL_16:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_18:.*]] = cc.alloca i8[%[[VAL_17]] : i64] -// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_18]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_12]], %[[VAL_19]] : !cc.ptr> -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_18]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_18]][%[[VAL_16]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_22:.*]] = cc.extract_value %[[VAL_12]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_23:.*]] = arith.constant false -// CHECK: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_25:.*]] = cc.load 
%[[VAL_24]] : !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_21]], %[[VAL_26]], %[[VAL_22]], %[[VAL_23]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_21]] : -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_90]][%[[VAL_22]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_29:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_31:.*]] = cc.func_ptr %[[VAL_29]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64 -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_8]], %[[VAL_9]] : i64 +// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_12:.*]] = cc.alloca i8{{\[}}%[[VAL_11]] : i64] +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr>) -> !cc.ptr> +// CHECK: %[[VAL_14:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_12]]{{\[}}%[[VAL_15]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.compute_ptr %[[VAL_13]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_0]], %[[VAL_17]] : !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_13]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_19]] : !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_20]] : !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.subi %[[VAL_23]], %[[VAL_24]] : i64 +// CHECK: cc.store %[[VAL_25]], %[[VAL_18]] : !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.load %[[VAL_26]] : !cc.ptr> +// CHECK: %[[VAL_28:.*]] = arith.constant false +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_29]], %[[VAL_27]], %[[VAL_25]], %[[VAL_28]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_25]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_32:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_33:.*]] = cc.func_ptr %[[VAL_32]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_13]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_35:.*]] = arith.constant 2147483647 : i64 +// CHECK: %[[VAL_36:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_37:.*]] = cc.alloca !cc.array x 2> +// CHECK: %[[VAL_38:.*]] = cc.sizeof !cc.array x 2> : i64 +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr x 2>>) -> !cc.ptr> +// 
CHECK: %[[VAL_40:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_39]], %[[VAL_40]] : !cc.ptr>> +// CHECK: %[[VAL_41:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr x 2>>) -> i64 +// CHECK: %[[VAL_42:.*]] = arith.addi %[[VAL_41]], %[[VAL_38]] : i64 +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_42]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.compute_ptr %[[VAL_36]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_43]], %[[VAL_44]] : !cc.ptr>> +// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_36]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_43]], %[[VAL_45]] : !cc.ptr>> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_37]][0] : (!cc.ptr x 2>>) -> !cc.ptr> +// CHECK: %[[VAL_47:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_0]], %[[VAL_47]] : !cc.ptr +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_47]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_48]], %[[VAL_46]] : !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_37]][1] : (!cc.ptr x 2>>) -> !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr +// CHECK: cc.store %[[VAL_50]], %[[VAL_49]] : !cc.ptr> +// CHECK: %[[VAL_51:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_52:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> +// CHECK: %[[VAL_53:.*]] = cc.cast %[[VAL_52]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_54:.*]] = call @hybridLaunchKernel(%[[VAL_53]], %[[VAL_33]], %[[VAL_34]], %[[VAL_11]], %[[VAL_35]], %[[VAL_51]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> // CHECK: return // CHECK: } - -// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK-DAG: func.func private @cudaqRegisterKernelName(!cc.ptr) -// CHECK-DAG: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) -// CHECK-DAG: func.func private @malloc(i64) -> !cc.ptr -// CHECK-DAG: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// CHECK-DAG: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) +// CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// CHECK: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// CHECK: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// CHECK: func.func private @malloc(i64) -> !cc.ptr +// CHECK: func.func private @free(!cc.ptr) +// CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, 
!cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } // CHECK-LABEL: func.func private @__nvqpp_createDynamicResult( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: !cc.ptr, i64}>>, %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_9:.*]] = arith.constant false +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } +// CHECK: llvm.mlir.global external constant @function_hawaiian.kernelName("function_hawaiian\00") {addr_space = 0 : i32} -// CHECK: llvm.mlir.global external constant @function_hawaiian.kernelName("function +// CHECK-LABEL: func.func @function_hawaiian.returnOffset() -> i64 { +// CHECK: %[[VAL_0:.*]] = arith.constant 2147483647 : i64 +// CHECK: return %[[VAL_0]] : i64 +// CHECK: } // CHECK-LABEL: func.func @function_hawaiian.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_20]][%[[VAL_7]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i1, i64}>) -> i1 -// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_3]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.constant 4 : i64 -// CHECK: %[[VAL_12:.*]] = arith.divsi %[[VAL_10]], %[[VAL_11]] : i64 -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.stdvec_init %[[VAL_13]], %[[VAL_12]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_8]] : -// CHECK: %[[VAL_15:.*]] = 
cc.compute_ptr %[[VAL_90]][%[[VAL_10]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_9]], %[[VAL_14]]) : (i1, !cc.stdvec) -> () +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.sizeof i32 : i64 +// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_8]] : !cc.ptr +// CHECK: %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_9]] : i64 +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.stdvec_init %[[VAL_12]], %[[VAL_11]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_14]]{{\[}}%[[VAL_10]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_7]], %[[VAL_13]]) : (i1, !cc.stdvec) -> () // CHECK: %[[VAL_16:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> // CHECK: return %[[VAL_16]] : !cc.struct<{!cc.ptr, i64}> // CHECK: } // CHECK-LABEL: func.func @function_hawaiian.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, %[[VAL_1:.*]]: !cc.ptr>) -> i64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i1, i64}> -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_90]][0] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_2]][0] : (!cc.struct<{i1, i64}>, i1) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_90]][1] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_9]] : !cc.ptr> -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_11]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// CHECK: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr x ?>>) -> !cc.ptr> +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_9:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_10:.*]] = cc.alloca i64 +// CHECK: %[[VAL_11:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_8]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> // CHECK: %[[VAL_14:.*]] = 
cc.load %[[VAL_12]] : !cc.ptr> // CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_13]] : !cc.ptr> // CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_16]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_19:.*]] = cc.insert_value %[[VAL_18]], %[[VAL_8]][1] : (!cc.struct<{i1, i64}>, i64) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_3]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_11]] : i64 +// CHECK: %[[VAL_20:.*]] = call @malloc(%[[VAL_19]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.alloca !cc.ptr // CHECK: %[[VAL_23:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_20]] : i64 -// CHECK: %[[VAL_25:.*]] = call @malloc(%[[VAL_24]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr> -// CHECK: cc.store %[[VAL_19]], %[[VAL_26]] : !cc.ptr> -// CHECK: %[[VAL_80:.*]] = cc.cast %[[VAL_25]] : -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_80]][%[[VAL_23]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.extract_value %[[VAL_19]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_90]][1] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_30:.*]] = cc.load %[[VAL_29]] : !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_32:.*]] = arith.constant false -// CHECK: %[[VAL_33:.*]] = cc.compute_ptr %[[VAL_31]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_33]] : !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!cc.ptr) -> !cc.ptr -// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_27]], %[[VAL_35]], %[[VAL_28]], %[[VAL_32]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () -// CHECK: %[[VAL_83:.*]] = cc.cast %[[VAL_27]] : -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_83]][%[[VAL_28]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_25]], %[[VAL_1]] : !cc.ptr> -// CHECK: return %[[VAL_24]] : i64 +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_24]]{{\[}}%[[VAL_23]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_21]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_9]], %[[VAL_26]] : !cc.ptr +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_8]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_30:.*]] = cc.load %[[VAL_28]] : !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.load %[[VAL_29]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64 +// CHECK: cc.store %[[VAL_34]], %[[VAL_27]] : !cc.ptr +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_36:.*]] = cc.load %[[VAL_35]] : !cc.ptr> +// CHECK: %[[VAL_37:.*]] = arith.constant false +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_38]], %[[VAL_36]], %[[VAL_34]], %[[VAL_37]]) : 
(!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_39]]{{\[}}%[[VAL_34]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_20]], %[[VAL_1]] : !cc.ptr> +// CHECK: return %[[VAL_19]] : i64 // CHECK: } // CHECK-LABEL: llvm.func @function_hawaiian.kernelRegFunc() { @@ -161,6 +251,5 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () // CHECK: llvm.return // CHECK: } - // CHECK: llvm.mlir.global_ctors {ctors = [@function_hawaiian.kernelRegFunc], priorities = [17 : i32]} diff --git a/test/Quake/lambda_kernel_exec.qke b/test/Quake/lambda_kernel_exec.qke index 606b644ffe..aedb9564b5 100644 --- a/test/Quake/lambda_kernel_exec.qke +++ b/test/Quake/lambda_kernel_exec.qke @@ -15,7 +15,7 @@ // CHECK: llvm.call @cudaqRegisterLambdaName(%[[VAL_1]], %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> () module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHaveMultiple = "_ZZ4mainENK3$_1clEv", __nvqpp__mlirgen__lambda.main.test = "_ZZ4mainENK3$_0clEv"}} { - func.func @__nvqpp__mlirgen__lambda.main.test() attributes {"cudaq-entrypoint"} { + func.func @__nvqpp__mlirgen__lambda.main.test() attributes {"cudaq-entrypoint", no_this} { %c2_i32 = arith.constant 2 : i32 %0 = arith.extsi %c2_i32 : i32 to i64 %1 = quake.alloca !quake.veq[%0 : i64] @@ -54,7 +54,7 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHa // CHECK-NEXT: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!llvm.ptr>) -> !llvm.ptr // CHECK: llvm.call @cudaqRegisterLambdaName(%[[VAL_4]], %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> () - func.func @__nvqpp__mlirgen__lambda.main.canHaveMultiple() attributes {"cudaq-entrypoint"} { + func.func @__nvqpp__mlirgen__lambda.main.canHaveMultiple() attributes {"cudaq-entrypoint", no_this} { %c2_i32 = arith.constant 2 : i32 %0 = arith.extsi %c2_i32 : i32 to i64 %1 = quake.alloca !quake.veq[%0 : i64] diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 90ccc90610..58bcd2f089 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -6,8 +6,7 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s \ -// RUN: | FileCheck %s +// RUN: cudaq-opt -add-dealloc -kernel-execution -canonicalize %s | FileCheck %s // NB: the mangled name map is required for the kernel-execution pass. 
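// For readers unfamiliar with the pass: the map pairs each device-side kernel
// (the `__nvqpp__mlirgen__` symbol) with the mangled name of the host-side
// stub that kernel-execution rewrites into argument marshaling plus a call to
// the launcher (here `hybridLaunchKernel`, as the CHECK lines below verify).
// A minimal sketch of the expected shape, using hypothetical kernel/stub
// names rather than the ones in this test:
//
//   module attributes {quake.mangled_name_map = {
//       __nvqpp__mlirgen__my_kernel = "_Z14my_kernel_stubi"}} {
//     func.func @__nvqpp__mlirgen__my_kernel(%arg0: i32) attributes {"cudaq-entrypoint"} {
//       return
//     }
//     // Host-side stub; the pass replaces its body with buffer packing and a
//     // call to the launcher.
//     func.func @_Z14my_kernel_stubi(%arg0: i32) {
//       return
//     }
//   }
//
// Without the attribute the pass cannot associate a kernel with its host stub,
// hence the NB above.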
module attributes{ quake.mangled_name_map = { @@ -29,61 +28,88 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec { -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 8 : i64 -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 256 : i64 -// CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: return %[[VAL_5]] : !cc.stdvec +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK: %[[VAL_1:.*]] = arith.constant 8 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 256 : i64 +// CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_3]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_4]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @test_0( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 -// CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: %[[VAL_4:.*]] = arith.constant 8 : i64 +// CHECK: %[[VAL_5:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_7:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.alloca i64 +// CHECK: %[[VAL_10:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_11:.*]] = cc.alloca i8{{\[}}%[[VAL_10]] : i64] +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_13:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_14:.*]] 
= cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_14]] : !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_6]] : i64 +// CHECK: cc.if(%[[VAL_17]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_15]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_18:.*]] = cc.func_ptr %[[VAL_5]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_20:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_22:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> +// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_26]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_21]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_29]] : !cc.ptr>> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_31]] : !cc.ptr +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_32]], %[[VAL_30]] : !cc.ptr> +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_34:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_36:.*]] = call @hybridLaunchKernel(%[[VAL_35]], %[[VAL_18]], %[[VAL_19]], %[[VAL_10]], %[[VAL_20]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_37:.*]] = cc.extract_value %[[VAL_36]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_38]], %[[VAL_6]] : i64 +// CHECK: cf.cond_br %[[VAL_39]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_40:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_40]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: 
%[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> -// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_42:.*]] = cc.compute_ptr %[[VAL_12]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_42]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_43:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_43]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.load %[[VAL_44]] : !cc.ptr> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_43]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.load %[[VAL_46]] : !cc.ptr +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_50]], %[[VAL_49]] : !cc.ptr> +// CHECK: %[[VAL_51:.*]] = cc.compute_ptr %[[VAL_48]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_52:.*]] = arith.muli %[[VAL_47]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_53:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_53]]{{\[}}%[[VAL_52]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_54]], %[[VAL_51]] : !cc.ptr> +// CHECK: %[[VAL_55:.*]] = cc.compute_ptr %[[VAL_48]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_54]], %[[VAL_55]] : !cc.ptr> +// CHECK: call @free(%[[VAL_37]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } @@ -100,72 +126,150 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec { -// CHECK: %[[VAL_2:.*]] = arith.constant 9 : i64 -// CHECK: %[[VAL_3:.*]] = arith.constant 520 : i64 -// CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: return +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK: %[[VAL_1:.*]] = arith.constant 9 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 520 : i64 +// CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_3]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_4]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @test_1( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: 
i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> // CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: %[[VAL_6:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_7]], %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.alloca i64 +// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] +// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_12:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_13]] : !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_5]] : i64 +// CHECK: cc.if(%[[VAL_16]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_14]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_17:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_20:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_22]], %[[VAL_23]] : !cc.ptr>> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_3]] : i64 
+// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_27]] : !cc.ptr>> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_20]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_30:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_30]] : !cc.ptr +// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_31]], %[[VAL_29]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_33:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_33]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_35:.*]] = call @hybridLaunchKernel(%[[VAL_34]], %[[VAL_17]], %[[VAL_18]], %[[VAL_9]], %[[VAL_19]], %[[VAL_32]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_36:.*]] = cc.extract_value %[[VAL_35]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_37:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_38:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_5]] : i64 +// CHECK: cf.cond_br %[[VAL_38]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_39]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_40]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> -// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> 
+// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_42:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_42]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.load %[[VAL_43]] : !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_42]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_46:.*]] = cc.load %[[VAL_45]] : !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_49]], %[[VAL_48]] : !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.compute_ptr %[[VAL_47]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_51:.*]] = arith.muli %[[VAL_46]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_52:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_53:.*]] = cc.compute_ptr %[[VAL_52]]{{\[}}%[[VAL_51]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_53]], %[[VAL_50]] : !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_47]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_53]], %[[VAL_54]] : !cc.ptr> +// CHECK: call @free(%[[VAL_36]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } } +// CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// CHECK: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// CHECK: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// CHECK: func.func private @free(!cc.ptr) +// CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } + +// CHECK-LABEL: func.func private @__nvqpp_createDynamicResult( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i64, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// CHECK-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_4:.*]] = arith.constant false +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_7:.*]] = arith.addi %[[VAL_1]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_8:.*]] = call @malloc(%[[VAL_7]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> !cc.ptr> +// CHECK: call 
@llvm.memcpy.p0i8.p0i8.i64(%[[VAL_8]], %[[VAL_0]], %[[VAL_1]], %[[VAL_4]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_6]], %[[VAL_4]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_14:.*]] = cc.insert_value %[[VAL_8]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_15:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } +// CHECK: llvm.mlir.global external constant @test_0.kernelName("test_0\00") {addr_space = 0 : i32} + +// CHECK-LABEL: func.func @test_0.returnOffset() -> i64 { +// CHECK: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: return %[[VAL_0]] : i64 +// CHECK: } + // CHECK-LABEL: func.func @test_0.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32 +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr // CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_5]]) : (i32) -> !cc.stdvec // CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> // CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr> @@ -174,19 +278,53 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>>) -> !cc.ptr, i64}>> // CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_3]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}> // CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}> // CHECK: ^bb2: // CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> // CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> // CHECK: } +// CHECK-LABEL: func.func @test_0.argsCreator( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr> +// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr) -> !cc.ptr +// 
CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr<i32>
+// CHECK: %[[VAL_5:.*]] = cc.alloca i64
+// CHECK: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
+// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr<i8>
+// CHECK: %[[VAL_8:.*]] = cc.alloca !cc.ptr<i8>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_1]] : !cc.ptr<!cc.ptr<i8>>
+// CHECK: return %[[VAL_6]] : i64
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @test_0.kernelRegFunc() {
+// CHECK: %[[VAL_0:.*]] = func.constant @test_0.argsCreator : (!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr<array<7 x i8>>
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!llvm.ptr<array<7 x i8>>) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_2]]) : (!cc.ptr<i8>) -> ()
+// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_0]] : ((!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_2]], %[[VAL_3]]) : (!cc.ptr<i8>, !cc.ptr<i8>) -> ()
+// CHECK: llvm.return
+// CHECK: }
+// CHECK: llvm.mlir.global_ctors {ctors = [@test_0.kernelRegFunc], priorities = [17 : i32]}
+// CHECK: llvm.mlir.global external constant @test_1.kernelName("test_1\00") {addr_space = 0 : i32}
+
+// CHECK-LABEL: func.func @test_1.returnOffset() -> i64 {
+// CHECK: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> [1] : i64
+// CHECK: return %[[VAL_0]] : i64
+// CHECK: }
+
 // CHECK-LABEL: func.func @test_1.thunk(
-// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr<i8>, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr<i8>, i64}> {
+// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr<i8>,
+// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr<i8>, i64}> {
// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<i8>) -> !cc.ptr<!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>>
-// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr<!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
-// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>) -> i32
+// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
+// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr<i32>
// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_5]]) : (i32) -> !cc.stdvec<i1>
// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr<!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>>) -> !cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>
// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>) -> !cc.ptr<!cc.ptr<i1>>
@@ -195,9 +333,37 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>>) -> !cc.ptr, i64}>>
// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> [1] : i64
-// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr<i8>, i64, !cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_3]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr<i8>, i64, !cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: ^bb2:
// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: }
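Both thunks above follow the same convention for dynamically sized return values: on the path where the i1 flag requests a heap result, the thunk passes the message buffer, its `cc.sizeof`, the embedded `{i1*, i64}` span, and that span's `cc.offsetof` to `__nvqpp_createDynamicResult`; the other path returns `__nvqpp_zeroDynamicResult`. For orientation, a minimal C++ model of what such a helper has to do, inferred from the malloc/memcpy/pointer-patch sequence pinned down at the LLVM level later in this diff; the signature and LP64 layout here are assumptions, not the runtime's actual declaration:

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Sketch of the {ptr, size} pair the thunk returns. Hypothetical shape;
// the real helper lives in the CUDA-Q runtime library.
struct DynamicResult {
  void *data;
  std::int64_t size;
};

// msg:     fixed-size message buffer (24 bytes here: i32 arg + {i1*, i64}).
// msgSize: cc.sizeof of the message struct (%VAL_3 in the thunk).
// span:    pointer to the {i1*, i64} span inside msg (%VAL_9).
// spanOff: cc.offsetof of that span within the struct (%VAL_10).
static DynamicResult createDynamicResult(char *msg, std::int64_t msgSize,
                                         char **span, std::int64_t spanOff) {
  char *vecData = span[0]; // the vector's data pointer
  std::int64_t vecSize;    // the vector's length in bytes
  std::memcpy(&vecSize, span + 1, sizeof(vecSize)); // LP64 layout assumed
  // Allocate one buffer holding the message struct plus the vector bytes.
  auto *buf = static_cast<char *>(std::malloc(msgSize + vecSize));
  std::memcpy(buf, msg, msgSize);
  std::memcpy(buf + msgSize, vecData, vecSize);
  // Patch the copied span so its data pointer targets the appended bytes.
  char *appended = buf + msgSize;
  std::memcpy(buf + spanOff, &appended, sizeof(appended));
  return {buf, msgSize + vecSize};
}
```

The `add i64 %n, 24` / double-memcpy / store-at-offset-8 sequence in the test_0.thunk LLVM checks further down is exactly this shape, with `msgSize` = 24 and `spanOff` = 8 for the `{i32, {i1*, i64}}` message struct.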
+
+// CHECK-LABEL: func.func @test_1.argsCreator(
+// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr<!cc.ptr<i8>>,
+// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr<!cc.ptr<i8>>) -> i64 {
+// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.ptr<i8>>
+// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr<i32>
+// CHECK: %[[VAL_5:.*]] = cc.alloca i64
+// CHECK: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
+// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr<i8>
+// CHECK: %[[VAL_8:.*]] = cc.alloca !cc.ptr<i8>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_1]] : !cc.ptr<!cc.ptr<i8>>
+// CHECK: return %[[VAL_6]] : i64
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @test_1.kernelRegFunc() {
+// CHECK: %[[VAL_0:.*]] = func.constant @test_1.argsCreator : (!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr<array<7 x i8>>
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!llvm.ptr<array<7 x i8>>) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_2]]) : (!cc.ptr<i8>) -> ()
+// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_0]] : ((!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_2]], %[[VAL_3]]) : (!cc.ptr<i8>, !cc.ptr<i8>) -> ()
+// CHECK: llvm.return
+// CHECK: }
+// CHECK: llvm.mlir.global_ctors {ctors = [@test_1.kernelRegFunc], priorities = [17 : i32]}
+
diff --git a/test/Translate/argument.qke b/test/Translate/argument.qke
index 6a3532805a..865a622a55 100644
--- a/test/Translate/argument.qke
+++ b/test/Translate/argument.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.
 //
 // ========================================================================== //
-// RUN: cudaq-opt --kernel-execution=codegen=1 --canonicalize %s | \
+// RUN: cudaq-opt -kernel-execution -canonicalize %s | \
 // RUN: cudaq-translate --convert-to=qir | FileCheck %s

 // NB: the mangled name map is required for the kernel-execution pass.
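Before the QIR-level checks below: the `.argsCreator` / `.kernelRegFunc` pairs verified above are the standard static-registration idiom, with the constructor placed in `llvm.mlir.global_ctors` at priority 17. A hedged C++ equivalent of what the generated constructor does at library load time; the extern signatures are assumptions for illustration, since the generated code erases both arguments to `i8*`:

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Assumed signatures; only the names come from the checks above.
extern "C" void cudaqRegisterKernelName(const char *kernelName);
extern "C" void cudaqRegisterArgsCreator(const char *kernelName,
                                         void *argsCreator);

// Mirrors @test_1.argsCreator: pull the i32 argument out of the argument
// array, copy it into a malloc'ed message buffer sized by cc.sizeof
// (24 bytes for !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>), and
// report that size back to the caller.
static std::int64_t test_1_argsCreator(void **args, void **msg) {
  void *buf = std::malloc(24);
  std::memcpy(buf, args[0], sizeof(std::int32_t));
  *msg = buf;
  return 24;
}

// Mirrors @test_1.kernelRegFunc, which runs before main() exactly like a
// C++ static initializer (the MLIR pins ctor priority 17).
__attribute__((constructor)) static void test_1_kernelRegFunc() {
  cudaqRegisterKernelName("test_1");
  cudaqRegisterArgsCreator(
      "test_1", reinterpret_cast<void *>(&test_1_argsCreator));
}
```

Once registered, the runtime can look up a kernel by name and use its argsCreator to marshal caller-side arguments into the message buffer the corresponding thunk expects.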
@@ -31,7 +31,7 @@ func.func @test_0(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, !cc.std func.func @test_3(%0: !cc.ptr, %1: !cc.ptr, !cc.ptr, !cc.ptr}>, !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}>>) { return } -} // CHECK-LABEL: define void @__nvqpp__mlirgen__test_3({ { i16*, i64 }, { float*, i64 } } -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { // CHECK: %[[VAL_1:.*]] = extractvalue { { i16*, i64 }, { float*, i64 } } %[[VAL_0]], 0 // CHECK: %[[VAL_2:.*]] = extractvalue { i16*, i64 } %[[VAL_1]], 0 // CHECK: %[[VAL_3:.*]] = extractvalue { i16*, i64 } %[[VAL_1]], 1 @@ -202,7 +246,7 @@ func.func @test_3(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr !cc.stdvec { func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %this: !cc.ptr, %2: i32) { return } - -// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0( -// CHECK-SAME: i32 %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = sext i32 %[[VAL_1]] to i64 -// CHECK: %[[VAL_3:.*]] = tail call %[[VAL_4:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_2]]) -// CHECK: %[[VAL_5:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_4]]* %[[VAL_3]]) -// CHECK: %[[VAL_6:.*]] = icmp sgt i64 %[[VAL_5]], 0 -// CHECK: br i1 %[[VAL_6]], label %[[VAL_7:.*]], label %[[VAL_8:.*]] -// CHECK: ._crit_edge.thread: ; preds = %[[VAL_9:.*]] -// CHECK: %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_5]], align 1 -// CHECK: br label %[[VAL_11:.*]] -// CHECK: .lr.ph: ; preds = %[[VAL_9]], %[[VAL_7]] -// CHECK: %[[VAL_12:.*]] = phi i64 [ %[[VAL_13:.*]], %[[VAL_7]] ], [ 0, %[[VAL_9]] ] -// CHECK: %[[VAL_14:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_4]]* %[[VAL_3]], i64 %[[VAL_12]]) -// CHECK: %[[VAL_15:.*]] = bitcast i8* %[[VAL_14]] to %[[VAL_16:.*]]** -// CHECK: %[[VAL_17:.*]] = load %[[VAL_16]]*, %[[VAL_16]]** %[[VAL_15]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_16]]* %[[VAL_17]]) -// CHECK: %[[VAL_13]] = add nuw nsw i64 %[[VAL_12]], 1 -// CHECK: %[[VAL_18:.*]] = icmp eq i64 %[[VAL_13]], %[[VAL_5]] -// CHECK: br i1 %[[VAL_18]], label %[[VAL_19:.*]], label %[[VAL_7]] -// CHECK: ._crit_edge: ; preds = %[[VAL_7]] -// CHECK: %[[VAL_20:.*]] = alloca i8, i64 %[[VAL_5]], align 1 -// CHECK: br i1 %[[VAL_6]], label %[[VAL_21:.*]], label %[[VAL_11]] -// CHECK: .lr.ph4: ; preds = %[[VAL_19]], %[[VAL_21]] -// CHECK: %[[VAL_22:.*]] = phi i64 [ %[[VAL_23:.*]], %[[VAL_21]] ], [ 0, %[[VAL_19]] ] -// CHECK: %[[VAL_24:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_4]]* %[[VAL_3]], i64 %[[VAL_22]]) -// CHECK: %[[VAL_25:.*]] = bitcast i8* %[[VAL_24]] to %[[VAL_16]]** -// CHECK: %[[VAL_26:.*]] = load %[[VAL_16]]*, %[[VAL_16]]** %[[VAL_25]], align 8 -// CHECK: %[[VAL_27:.*]] = tail call %[[VAL_28:.*]]* @__quantum__qis__mz(%[[VAL_16]]* %[[VAL_26]]) -// CHECK: %[[VAL_29:.*]] = bitcast %[[VAL_28]]* %[[VAL_27]] to i1* -// CHECK: %[[VAL_30:.*]] = load i1, i1* %[[VAL_29]], align 1 -// CHECK: %[[VAL_31:.*]] = getelementptr i8, i8* %[[VAL_20]], i64 %[[VAL_22]] -// CHECK: %[[VAL_32:.*]] = zext i1 %[[VAL_30]] to i8 -// CHECK: store i8 %[[VAL_32]], i8* %[[VAL_31]], align 1 -// CHECK: %[[VAL_23]] = add nuw nsw i64 %[[VAL_22]], 1 -// CHECK: %[[VAL_33:.*]] = icmp eq i64 %[[VAL_23]], %[[VAL_5]] -// CHECK: br i1 %[[VAL_33]], label %[[VAL_11]], label %[[VAL_21]] -// CHECK: ._crit_edge5: ; preds = 
%[[VAL_21]], %[[VAL_8]], %[[VAL_19]] -// CHECK: %[[VAL_34:.*]] = phi i8* [ %[[VAL_10]], %[[VAL_8]] ], [ %[[VAL_20]], %[[VAL_19]] ], [ %[[VAL_20]], %[[VAL_21]] ] -// CHECK: %[[VAL_35:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_34]], i64 %[[VAL_5]], i64 1) -// CHECK: %[[VAL_36:.*]] = bitcast i8* %[[VAL_35]] to i1* -// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_36]], 0 -// CHECK: %[[VAL_38:.*]] = insertvalue { i1*, i64 } %[[VAL_37]], i64 %[[VAL_5]], 1 -// CHECK: call void @__quantum__rt__qubit_release_array(%Array* %[[VAL_3]]) -// CHECK: ret { i1*, i64 } %[[VAL_38]] +// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0(i32 +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = sext i32 %[[VAL_0]] to i64 +// CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_1]]) +// CHECK: %[[VAL_4:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_3]]* %[[VAL_2]]) +// CHECK: %[[VAL_5:.*]] = icmp sgt i64 %[[VAL_4]], 0 +// CHECK: br i1 %[[VAL_5]], label %[[VAL_6:.*]], label %[[VAL_7:.*]] +// CHECK: ._crit_edge.thread: ; preds = %[[VAL_8:.*]] +// CHECK: %[[VAL_9:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br label %[[VAL_10:.*]] +// CHECK: .lr.ph: ; preds = %[[VAL_8]], %[[VAL_6]] +// CHECK: %[[VAL_11:.*]] = phi i64 [ %[[VAL_12:.*]], %[[VAL_6]] ], [ 0, %[[VAL_8]] ] +// CHECK: %[[VAL_13:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_11]]) +// CHECK: %[[VAL_14:.*]] = bitcast i8* %[[VAL_13]] to %[[VAL_15:.*]]** +// CHECK: %[[VAL_16:.*]] = load %[[VAL_15]]*, %[[VAL_15]]** %[[VAL_14]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_15]]* %[[VAL_16]]) +// CHECK: %[[VAL_12]] = add nuw nsw i64 %[[VAL_11]], 1 +// CHECK: %[[VAL_17:.*]] = icmp eq i64 %[[VAL_12]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_17]], label %[[VAL_18:.*]], label %[[VAL_6]] +// CHECK: ._crit_edge: ; preds = %[[VAL_6]] +// CHECK: %[[VAL_19:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br i1 %[[VAL_5]], label %[[VAL_20:.*]], label %[[VAL_10]] +// CHECK: .lr.ph4: ; preds = %[[VAL_18]], %[[VAL_20]] +// CHECK: %[[VAL_21:.*]] = phi i64 [ %[[VAL_22:.*]], %[[VAL_20]] ], [ 0, %[[VAL_18]] ] +// CHECK: %[[VAL_23:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_21]]) +// CHECK: %[[VAL_24:.*]] = bitcast i8* %[[VAL_23]] to %[[VAL_15]]** +// CHECK: %[[VAL_25:.*]] = load %[[VAL_15]]*, %[[VAL_15]]** %[[VAL_24]], align 8 +// CHECK: %[[VAL_26:.*]] = tail call %[[VAL_27:.*]]* @__quantum__qis__mz(%[[VAL_15]]* %[[VAL_25]]) +// CHECK: %[[VAL_28:.*]] = bitcast %[[VAL_27]]* %[[VAL_26]] to i1* +// CHECK: %[[VAL_29:.*]] = load i1, i1* %[[VAL_28]], align 1 +// CHECK: %[[VAL_30:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 %[[VAL_21]] +// CHECK: %[[VAL_31:.*]] = zext i1 %[[VAL_29]] to i8 +// CHECK: store i8 %[[VAL_31]], i8* %[[VAL_30]], align 1 +// CHECK: %[[VAL_22]] = add nuw nsw i64 %[[VAL_21]], 1 +// CHECK: %[[VAL_32:.*]] = icmp eq i64 %[[VAL_22]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_32]], label %[[VAL_10]], label %[[VAL_20]] +// CHECK: ._crit_edge5: ; preds = %[[VAL_20]], %[[VAL_7]], %[[VAL_18]] +// CHECK: %[[VAL_33:.*]] = phi i8* [ %[[VAL_9]], %[[VAL_7]] ], [ %[[VAL_19]], %[[VAL_18]] ], [ %[[VAL_19]], %[[VAL_20]] ] +// CHECK: %[[VAL_34:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_33]], i64 %[[VAL_4]], i64 1) +// CHECK: %[[VAL_35:.*]] = bitcast i8* %[[VAL_34]] to i1* +// CHECK: %[[VAL_36:.*]] = insertvalue { i1*, i64 
} undef, i1* %[[VAL_35]], 0 +// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } %[[VAL_36]], i64 %[[VAL_4]], 1 +// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_3]]* %[[VAL_2]]) +// CHECK: ret { i1*, i64 } %[[VAL_37]] // CHECK: } // CHECK-LABEL: define void @test_0({ i8*, i8*, i8* }* sret({ i8*, i8*, i8* }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]], i32 %[[VAL_2:.*]]) {{.*}}{ -// CHECK: %[[VAL_3:.*]] = alloca { i32, { i1*, i64 } }, align 8 +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]], i32 +// CHECK-SAME: %[[VAL_2:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_3:.*]] = alloca { i32, { i1*, i64 } }, align 4 // CHECK: %[[VAL_4:.*]] = bitcast { i32, { i1*, i64 } }* %[[VAL_3]] to i8* // CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 0 -// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 8 -// CHECK: %[[VAL_6:.*]] = call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) -// CHECK: %[[VAL_7:.*]] = extractvalue { i8*, i64 } %[[VAL_6]], 0 -// CHECK: %[[VAL_8:.*]] = icmp eq i8* %[[VAL_7]], null -// CHECK: %[[VAL_9:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 8 -// CHECK: %[[VAL_10:.*]] = bitcast i8* %[[VAL_9]] to { i1*, i64 }* -// CHECK: %[[VAL_11:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 -// CHECK: %[[VAL_12:.*]] = select i1 %[[VAL_8]], { i1*, i64 }* %[[VAL_11]], { i1*, i64 }* %[[VAL_10]] -// CHECK: %[[VAL_13:.*]] = bitcast { i1*, i64 }* %[[VAL_12]] to i8** -// CHECK: %[[VAL_14:.*]] = load i8*, i8** %[[VAL_13]], align 8 -// CHECK: %[[VAL_15:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 -// CHECK: %[[VAL_16:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 16 -// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to i64* -// CHECK: %[[VAL_18:.*]] = select i1 %[[VAL_8]], i64* %[[VAL_15]], i64* %[[VAL_17]] -// CHECK: %[[VAL_19:.*]] = load i64, i64* %[[VAL_18]], align 4 -// CHECK: %[[VAL_20:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* -// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_20]], i8* %[[VAL_14]], i64 %[[VAL_19]]) -// CHECK: call void @free(i8* %[[VAL_7]]) +// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 4 +// CHECK: %[[VAL_6:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_7:.*]] = alloca [1 x i8*], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %[[VAL_7]], i64 0, i64 0 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_8]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = ptrtoint [1 x i8*]* %[[VAL_7]] to i64 +// CHECK: %[[VAL_11:.*]] = add i64 %[[VAL_10]], 8 +// CHECK: %[[VAL_12:.*]] = inttoptr i64 %[[VAL_11]] to i8** +// CHECK: %[[VAL_13:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_12]], i8*** %[[VAL_13]], align 8 +// CHECK: %[[VAL_14:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_12]], i8*** %[[VAL_14]], align 8 +// CHECK: %[[VAL_15:.*]] = alloca i32, align 4 +// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_15]], align 
4 +// CHECK: %[[VAL_16:.*]] = bitcast [1 x i8*]* %[[VAL_7]] to i32** +// CHECK: store i32* %[[VAL_15]], i32** %[[VAL_16]], align 8 +// CHECK: %[[VAL_17:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_6]] to i8* +// CHECK: %[[VAL_18:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8, i8* nonnull %[[VAL_17]]) +// CHECK: %[[VAL_19:.*]] = extractvalue { i8*, i64 } %[[VAL_18]], 0 +// CHECK: %[[VAL_20:.*]] = icmp eq i8* %[[VAL_19]], null +// CHECK: %[[VAL_21:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 8 +// CHECK: %[[VAL_22:.*]] = bitcast i8* %[[VAL_21]] to { i1*, i64 }* +// CHECK: %[[VAL_23:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 +// CHECK: %[[VAL_24:.*]] = select i1 %[[VAL_20]], { i1*, i64 }* %[[VAL_23]], { i1*, i64 }* %[[VAL_22]] +// CHECK: %[[VAL_25:.*]] = bitcast { i1*, i64 }* %[[VAL_24]] to i8** +// CHECK: %[[VAL_26:.*]] = load i8*, i8** %[[VAL_25]], align 8 +// CHECK: %[[VAL_27:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 +// CHECK: %[[VAL_28:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 16 +// CHECK: %[[VAL_29:.*]] = bitcast i8* %[[VAL_28]] to i64* +// CHECK: %[[VAL_30:.*]] = select i1 %[[VAL_20]], i64* %[[VAL_27]], i64* %[[VAL_29]] +// CHECK: %[[VAL_31:.*]] = load i64, i64* %[[VAL_30]], align 4 +// CHECK: %[[VAL_32:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* +// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_32]], i8* %[[VAL_26]], i64 %[[VAL_31]]) +// CHECK: call void @free(i8* %[[VAL_19]]) // CHECK: ret void // CHECK: } @@ -152,35 +171,45 @@ func.func @test_1(%this: !cc.ptr) -> i16 { return %0 : i16 } -// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() -// CHECK: %[[VAL_1:.*]] = tail call %[[VAL_2:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) -// CHECK: %[[VAL_3:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast i8* %[[VAL_3]] to %[[VAL_5:.*]]** -// CHECK: %[[VAL_6:.*]] = load %[[VAL_5]]*, %[[VAL_5]]** %[[VAL_4]], align 8 -// CHECK: %[[VAL_7:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 1) -// CHECK: %[[VAL_8:.*]] = bitcast i8* %[[VAL_7]] to %[[VAL_5]]** -// CHECK: %[[VAL_9:.*]] = load %[[VAL_5]]*, %[[VAL_5]]** %[[VAL_8]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: tail call void (i64, void (%[[VAL_2]]*, %[[VAL_5]]*)*, ...) 
@invokeWithControlQubits(i64 1, void (%[[VAL_2]]*, %[[VAL_5]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_5]]* %[[VAL_6]], %[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_10:.*]] = tail call %[[VAL_11:.*]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: %[[VAL_12:.*]] = bitcast %Result* %[[VAL_10]] to i1* -// CHECK: %[[VAL_13:.*]] = load i1, i1* %[[VAL_12]], align 1 -// CHECK: %[[VAL_14:.*]] = tail call %[[VAL_11]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_15:.*]] = bitcast %Result* %[[VAL_14]] to i1* -// CHECK: %[[VAL_16:.*]] = load i1, i1* %[[VAL_15]], align 1 -// CHECK: %[[VAL_20:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_13]], 0 -// CHECK: %[[VAL_19:.*]] = insertvalue { i1, i1 } %[[VAL_20]], i1 %[[VAL_16]], 1 -// CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_2]]* %[[VAL_1]]) -// CHECK: ret { i1, i1 } %[[VAL_19]] -// CHECK: } - -// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) -// CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 -// CHECK: ret i16 %[[VAL_4]] +// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() local_unnamed_addr { +// CHECK: %[[VAL_0:.*]] = tail call %[[VAL_1:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) +// CHECK: %[[VAL_2:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 0) +// CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to %[[VAL_4:.*]]** +// CHECK: %[[VAL_5:.*]] = load %[[VAL_4]]*, %[[VAL_4]]** %[[VAL_3]], align 8 +// CHECK: %[[VAL_6:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 1) +// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to %[[VAL_4]]** +// CHECK: %[[VAL_8:.*]] = load %[[VAL_4]]*, %[[VAL_4]]** %[[VAL_7]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_4]]* %[[VAL_5]]) +// CHECK: tail call void (i64, void (%[[VAL_1]]*, %[[VAL_4]]*)*, ...) 
@invokeWithControlQubits(i64 1, void (%[[VAL_1]]*, %[[VAL_4]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_4]]* %[[VAL_5]], %[[VAL_4]]* %[[VAL_8]]) +// CHECK: %[[VAL_9:.*]] = tail call %[[VAL_10:.*]]* @__quantum__qis__mz(%[[VAL_4]]* %[[VAL_5]]) +// CHECK: %[[VAL_11:.*]] = bitcast %[[VAL_10]]* %[[VAL_9]] to i1* +// CHECK: %[[VAL_12:.*]] = load i1, i1* %[[VAL_11]], align 1 +// CHECK: %[[VAL_13:.*]] = tail call %[[VAL_10]]* @__quantum__qis__mz(%[[VAL_4]]* %[[VAL_8]]) +// CHECK: %[[VAL_14:.*]] = bitcast %[[VAL_10]]* %[[VAL_13]] to i1* +// CHECK: %[[VAL_15:.*]] = load i1, i1* %[[VAL_14]], align 1 +// CHECK: %[[VAL_16:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_12]], 0 +// CHECK: %[[VAL_17:.*]] = insertvalue { i1, i1 } %[[VAL_16]], i1 %[[VAL_15]], 1 +// CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_1]]* %[[VAL_0]]) +// CHECK: ret { i1, i1 } %[[VAL_17]] +// CHECK: } + +// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_2:.*]] = alloca i16 +// CHECK: %[[VAL_3:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_4:.*]] = bitcast i16* %[[VAL_2]] to i8* +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_1]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_6]], align 8 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_3]] to i8* +// CHECK: %[[VAL_10:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_4]], i64 2, i64 0, i8* nonnull %[[VAL_9]]) +// CHECK: %[[VAL_11:.*]] = load i16, i16* %[[VAL_2]] +// CHECK: ret i16 %[[VAL_11]] // CHECK: } // struct{i16, f32, f64, i64} -> sret ptr @@ -201,20 +230,32 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc return } -// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() +// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() local_unnamed_addr {{.*}} { // CHECK: ret { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 } // CHECK: } // CHECK-LABEL: define void @test_2({ i16, float, double, i64 }* nocapture writeonly sret({ i16, float, double, i64 }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* noundef nonnull 
align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_2:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_3:.*]] = alloca [24 x i8], align 1 +// CHECK: %[[VAL_4:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [24 x i8], [24 x i8]* %[[VAL_3]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_4]] to i8* +// CHECK: %[[VAL_11:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_5]], i64 24, i64 0, i8* nonnull %[[VAL_10]]) +// CHECK: %[[VAL_12:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_12]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_5]], i64 24, i1 false) // CHECK: ret void // CHECK: } + // array -> sret ptr func.func @__nvqpp__mlirgen__test_3() -> !cc.array { %rv = cc.undef !cc.array @@ -235,17 +276,28 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.struct return } -// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() {{.*}}{ +// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() local_unnamed_addr {{.*}} { // CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 } // CHECK: } // CHECK-LABEL: define void @test_4({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false) +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_2:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_3:.*]] = alloca [16 x i8], align 1 +// CHECK: %[[VAL_4:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[VAL_3]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, 
i8** }* %[[VAL_4]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_4]] to i8* +// CHECK: %[[VAL_11:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_5]], i64 16, i64 0, i8* nonnull %[[VAL_10]]) +// CHECK: %[[VAL_12:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_12]], i8* noundef nonnull align 1 dereferenceable(16) %[[VAL_5]], i64 16, i1 false) // CHECK: ret void // CHECK: } @@ -284,102 +347,114 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct return } -// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() {{.*}}{ +// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() local_unnamed_addr {{.*}} { // CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 } // CHECK: } // CHECK-LABEL: define void @test_5({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8 -// CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) -// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false) +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_2:.*]] = alloca [16 x i8], align 1 +// CHECK: %[[VAL_3:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_1]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_6]], align 8 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_3]] to i8* +// CHECK: %[[VAL_10:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_4]], i64 16, i64 0, i8* nonnull %[[VAL_9]]) +// CHECK: 
%[[VAL_11:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_11]], i8* noundef nonnull align 1 dereferenceable(16) %[[VAL_4]], i64 16, i1 false) // CHECK: ret void // CHECK: } } - //===----------------------------------------------------------------------===// -// CHECK-LABEL: define i64 @test_0.returnOffset() +// CHECK-LABEL: define i64 @test_0.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 8 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_0.thunk(i8* nocapture -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i32* // CHECK: %[[VAL_3:.*]] = load i32, i32* %[[VAL_2]], align 4 -// CHECK: %[[VAL_5:.*]] = sext i32 %[[VAL_3]] to i64 -// CHECK: %[[VAL_6:.*]] = tail call %[[VAL_7:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_5]]) -// CHECK: %[[VAL_8:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_7]]* %[[VAL_6]]) -// CHECK: %[[VAL_9:.*]] = icmp sgt i64 %[[VAL_8]], 0 -// CHECK: br i1 %[[VAL_9]], label %[[VAL_10:.*]], label %[[VAL_11:.*]] -// CHECK: ._crit_edge.thread: ; preds = %[[VAL_12:.*]] -// CHECK: %[[VAL_13:.*]] = alloca i8, i64 %[[VAL_8]], align 1 -// CHECK: br label %[[VAL_14:.*]] -// CHECK: .lr.ph: ; preds = %[[VAL_12]], %[[VAL_10]] -// CHECK: %[[VAL_15:.*]] = phi i64 [ %[[VAL_16:.*]], %[[VAL_10]] ], [ 0, %[[VAL_12]] ] -// CHECK: %[[VAL_17:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_15]]) -// CHECK: %[[VAL_18:.*]] = bitcast i8* %[[VAL_17]] to %[[VAL_19:.*]]** -// CHECK: %[[VAL_20:.*]] = load %[[VAL_19]]*, %[[VAL_19]]** %[[VAL_18]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_19]]* %[[VAL_20]]) -// CHECK: %[[VAL_16]] = add nuw nsw i64 %[[VAL_15]], 1 -// CHECK: %[[VAL_21:.*]] = icmp eq i64 %[[VAL_16]], %[[VAL_8]] -// CHECK: br i1 %[[VAL_21]], label %[[VAL_22:.*]], label %[[VAL_10]] -// CHECK: ._crit_edge: ; preds = %[[VAL_10]] -// CHECK: %[[VAL_23:.*]] = alloca i8, i64 %[[VAL_8]], align 1 -// CHECK: br i1 %[[VAL_9]], label %[[VAL_24:.*]], label %[[VAL_14]] -// CHECK: [[VAL_24]]: ; preds = %[[VAL_22]], %[[VAL_24]] -// CHECK: %[[VAL_25:.*]] = phi i64 [ %[[VAL_26:.*]], %[[VAL_24]] ], [ 0, %[[VAL_22]] ] -// CHECK: %[[VAL_27:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_25]]) -// CHECK: %[[VAL_28:.*]] = bitcast i8* %[[VAL_27]] to %[[VAL_19]]** -// CHECK: %[[VAL_29:.*]] = load %[[VAL_19]]*, %[[VAL_19]]** %[[VAL_28]], align 8 -// CHECK: %[[VAL_30:.*]] = tail call %[[VAL_31:.*]]* @__quantum__qis__mz(%[[VAL_19]]* %[[VAL_29]]) -// CHECK: %[[VAL_32:.*]] = bitcast %[[VAL_31]]* %[[VAL_30]] to i1* -// CHECK: %[[VAL_33:.*]] = load i1, i1* %[[VAL_32]], align 1 -// CHECK: %[[VAL_34:.*]] = getelementptr i8, i8* %[[VAL_23]], i64 %[[VAL_25]] -// CHECK: %[[VAL_35:.*]] = zext i1 %[[VAL_33]] to i8 -// CHECK: store i8 %[[VAL_35]], i8* %[[VAL_34]], align 1 -// CHECK: %[[VAL_26]] = add nuw nsw i64 %[[VAL_25]], 1 -// CHECK: %[[VAL_36:.*]] = icmp eq i64 %[[VAL_26]], %[[VAL_8]] -// CHECK: br i1 %[[VAL_36]], label %[[VAL_14]], label %[[VAL_24]] -// CHECK: [[VAL_14]]: ; preds = %[[VAL_24]], %[[VAL_11]], %[[VAL_22]] -// CHECK: %[[VAL_37:.*]] = phi i8* [ %[[VAL_13]], %[[VAL_11]] ], [ %[[VAL_23]], %[[VAL_22]] ], [ %[[VAL_23]], %[[VAL_24]] ] -// CHECK: %[[VAL_38:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_37]], i64 %[[VAL_8]], i64 
1) -// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_7]]* %[[VAL_6]]) -// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 -// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8** -// CHECK: store i8* %[[VAL_38]], i8** %[[VAL_51]], align 8 -// CHECK: %[[VAL_52:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16 -// CHECK: %[[VAL_53:.*]] = bitcast i8* %[[VAL_52]] to i64* -// CHECK: store i64 %[[VAL_8]], i64* %[[VAL_53]], align 8 +// CHECK: %[[VAL_4:.*]] = sext i32 %[[VAL_3]] to i64 +// CHECK: %[[VAL_5:.*]] = tail call %[[VAL_6:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_4]]) +// CHECK: %[[VAL_7:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_6]]* %[[VAL_5]]) +// CHECK: %[[VAL_8:.*]] = icmp sgt i64 %[[VAL_7]], 0 +// CHECK: br i1 %[[VAL_8]], label %[[VAL_9:.*]], label %[[VAL_10:.*]] +// CHECK: ._crit_edge.thread: ; preds = %[[VAL_11:.*]] +// CHECK: %[[VAL_12:.*]] = alloca i8, i64 %[[VAL_7]], align 1 +// CHECK: br label %[[VAL_13:.*]] +// CHECK: .lr.ph: ; preds = %[[VAL_11]], %[[VAL_9]] +// CHECK: %[[VAL_14:.*]] = phi i64 [ %[[VAL_15:.*]], %[[VAL_9]] ], [ 0, %[[VAL_11]] ] +// CHECK: %[[VAL_16:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_6]]* %[[VAL_5]], i64 %[[VAL_14]]) +// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to %[[VAL_18:.*]]** +// CHECK: %[[VAL_19:.*]] = load %[[VAL_18]]*, %[[VAL_18]]** %[[VAL_17]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_18]]* %[[VAL_19]]) +// CHECK: %[[VAL_15]] = add nuw nsw i64 %[[VAL_14]], 1 +// CHECK: %[[VAL_20:.*]] = icmp eq i64 %[[VAL_15]], %[[VAL_7]] +// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_9]] +// CHECK: ._crit_edge: ; preds = %[[VAL_9]] +// CHECK: %[[VAL_22:.*]] = alloca i8, i64 %[[VAL_7]], align 1 +// CHECK: br i1 %[[VAL_8]], label %[[VAL_23:.*]], label %[[VAL_13]] +// CHECK: .lr.ph6: ; preds = %[[VAL_21]], %[[VAL_23]] +// CHECK: %[[VAL_24:.*]] = phi i64 [ %[[VAL_25:.*]], %[[VAL_23]] ], [ 0, %[[VAL_21]] ] +// CHECK: %[[VAL_26:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_6]]* %[[VAL_5]], i64 %[[VAL_24]]) +// CHECK: %[[VAL_27:.*]] = bitcast i8* %[[VAL_26]] to %[[VAL_18]]** +// CHECK: %[[VAL_28:.*]] = load %[[VAL_18]]*, %[[VAL_18]]** %[[VAL_27]], align 8 +// CHECK: %[[VAL_29:.*]] = tail call %[[VAL_30:.*]]* @__quantum__qis__mz(%[[VAL_18]]* %[[VAL_28]]) +// CHECK: %[[VAL_31:.*]] = bitcast %[[VAL_30]]* %[[VAL_29]] to i1* +// CHECK: %[[VAL_32:.*]] = load i1, i1* %[[VAL_31]], align 1 +// CHECK: %[[VAL_33:.*]] = getelementptr i8, i8* %[[VAL_22]], i64 %[[VAL_24]] +// CHECK: %[[VAL_34:.*]] = zext i1 %[[VAL_32]] to i8 +// CHECK: store i8 %[[VAL_34]], i8* %[[VAL_33]], align 1 +// CHECK: %[[VAL_25]] = add nuw nsw i64 %[[VAL_24]], 1 +// CHECK: %[[VAL_35:.*]] = icmp eq i64 %[[VAL_25]], %[[VAL_7]] +// CHECK: br i1 %[[VAL_35]], label %[[VAL_13]], label %[[VAL_23]] +// CHECK: ._crit_edge7: ; preds = %[[VAL_23]], %[[VAL_10]], %[[VAL_21]] +// CHECK: %[[VAL_36:.*]] = phi i8* [ %[[VAL_12]], %[[VAL_10]] ], [ %[[VAL_22]], %[[VAL_21]] ], [ %[[VAL_22]], %[[VAL_23]] ] +// CHECK: %[[VAL_37:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_36]], i64 %[[VAL_7]], i64 1) +// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_6]]* %[[VAL_5]]) +// CHECK: %[[VAL_38:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 +// CHECK: %[[VAL_39:.*]] = bitcast i8* %[[VAL_38]] to i8** +// CHECK: store i8* %[[VAL_37]], i8** %[[VAL_39]], align 8 +// CHECK: %[[VAL_40:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16 +// CHECK: %[[VAL_41:.*]] = 
bitcast i8* %[[VAL_40]] to i64* +// CHECK: store i64 %[[VAL_7]], i64* %[[VAL_41]], align 8 // CHECK: br i1 %[[VAL_1]], label %[[VAL_42:.*]], label %[[VAL_43:.*]] -// CHECK: [[VAL_43]]: ; preds = %[[VAL_14]], %[[VAL_42]] -// CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_14]] ] +// CHECK: common.ret: ; preds = %[[VAL_13]], %[[VAL_42]] +// CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_13]] ] // CHECK: ret { i8*, i64 } %[[VAL_44]] -// CHECK: [[VAL_42]]: ; preds = %[[VAL_14]] -// CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_8]], 24 +// CHECK: 31: ; preds = %[[VAL_13]] +// CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_7]], 24 // CHECK: %[[VAL_47:.*]] = call i8* @malloc(i64 %[[VAL_46]]) // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_47]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_0]], i64 24, i1 false) // CHECK: %[[VAL_48:.*]] = getelementptr i8, i8* %[[VAL_47]], i64 24 -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[VAL_48]], i8* align 1 %[[VAL_38]], i64 %[[VAL_8]], i1 false) +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[VAL_48]], i8* align 1 %[[VAL_37]], i64 %[[VAL_7]], i1 false) // CHECK: %[[VAL_49:.*]] = insertvalue { i8*, i64 } undef, i8* %[[VAL_47]], 0 // CHECK: %[[VAL_45]] = insertvalue { i8*, i64 } %[[VAL_49]], i64 %[[VAL_46]], 1 +// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_47]], i64 8 +// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8** +// CHECK: store i8* %[[VAL_48]], i8** %[[VAL_51]], align 8 // CHECK: br label %[[VAL_43]] // CHECK: } // CHECK-LABEL: define i64 @test_0.argsCreator(i8** nocapture readonly -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8** %[[VAL_0]] to i32** // CHECK: %[[VAL_3:.*]] = load i32*, i32** %[[VAL_2]], align 8 // CHECK: %[[VAL_4:.*]] = load i32, i32* %[[VAL_3]], align 4 -// CHECK: %[[VAL_5:.*]] = insertvalue { i32, { i1*, i64 } } undef, i32 %[[VAL_4]], 0 -// CHECK: %[[VAL_6:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) -// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to { i32, { i1*, i64 } }* -// CHECK: store { i32, { i1*, i64 } } %[[VAL_5]], { i32, { i1*, i64 } }* %[[VAL_7]], align 8 -// CHECK: store i8* %[[VAL_6]], i8** %[[VAL_1]], align 8 +// CHECK: %[[VAL_5:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) +// CHECK: %[[VAL_6:.*]] = bitcast i8* %[[VAL_5]] to i32* +// CHECK: store i32 %[[VAL_4]], i32* %[[VAL_6]], align 4 +// CHECK: store i8* %[[VAL_5]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 24 // CHECK: } @@ -389,13 +464,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_1.returnOffset() +// CHECK-LABEL: define i64 @test_1.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_1.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 -// CHECK-SAME: %[[VAL_1:.*]]) { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) { // CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) // CHECK: %[[VAL_4:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 0) // CHECK: %[[VAL_5:.*]] = bitcast i8* 
%[[VAL_4]] to %[[VAL_6:.*]]** @@ -421,8 +496,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_1.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(2) i8* @malloc(i64 2) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 2 @@ -434,21 +509,21 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_2.returnOffset() +// CHECK-LABEL: define i64 @test_2.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_2.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to { i16, float, double, i64 }* // CHECK: store { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 }, { i16, float, double, i64 }* %[[VAL_2]], align 8 // CHECK: ret { i8*, i64 } zeroinitializer // CHECK: } // CHECK-LABEL: define i64 @test_2.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 24 @@ -460,12 +535,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_3.returnOffset() +// CHECK-LABEL: define i64 @test_3.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_3.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 5, i64* %[[VAL_2]], align 4 // CHECK: %[[VAL_3:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 8 @@ -484,8 +560,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_3.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(40) i8* @malloc(i64 40) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 40 @@ -497,12 +573,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_4.returnOffset() +// CHECK-LABEL: define i64 @test_4.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_4.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 537892, i64* %[[VAL_2]], align 4 // 
CHECK: %[[VAL_3:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 @@ -512,8 +589,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_4.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(16) i8* @malloc(i64 16) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 16 @@ -525,12 +602,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_5.returnOffset() +// CHECK-LABEL: define i64 @test_5.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_5.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 537892, i64* %[[VAL_2]], align 4 // CHECK: %[[VAL_3:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 @@ -540,8 +618,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_5.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(16) i8* @malloc(i64 16) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 16
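Taken together, the tail of this test pins down the degenerate form of the protocol for kernels with no arguments and a trivially copyable result (test_2 through test_5): `argsCreator` merely mallocs a buffer the size of the result slot (24, 40, and 16 bytes above) and reports that size, the thunk writes the constant result into the buffer and returns a `zeroinitializer` pair, and `returnOffset` is 0. A sketch of how a launcher could consume that triple; the struct and launcher are invented for illustration, and only the sizes and offsets come from the checks:

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustration only. The real thunk also takes an i1 flag and returns a
// {i8*, i64} pair, which is zeroinitializer for these fixed-size results,
// so both are omitted here.
struct GeneratedKernel {
  std::int64_t (*argsCreator)(void **args, void **msg); // mallocs, returns size
  void (*thunk)(void *msg);                             // writes result into msg
  std::int64_t (*returnOffset)();                       // 0 for test_2..test_5
};

// Mimics the host stub: obtain the message buffer from argsCreator, run the
// thunk, then copy the result out at returnOffset, like the final memcpy in
// the @test_5 wrapper above.
static void launchAndCopyResult(const GeneratedKernel &k, void *resultOut,
                                std::size_t resultBytes) {
  void *msg = nullptr;
  k.argsCreator(nullptr, &msg); // e.g. malloc(16) for a {i64, double} result
  k.thunk(msg);                 // e.g. stores the constant {i64, double} pair
  std::memcpy(resultOut, static_cast<char *>(msg) + k.returnOffset(),
              resultBytes);
  std::free(msg);
}
```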