diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
index 24e933117a..596381ee17 100644
--- a/include/cudaq/Optimizer/Builder/Factory.h
+++ b/include/cudaq/Optimizer/Builder/Factory.h
@@ -128,9 +128,13 @@ inline mlir::Type stateImplType(mlir::Type eleTy) {
   return cudaq::opt::factory::getPointerType(eleTy.getContext());
 }
-// Host side types for std::string and std::vector
+// Generate the host-side type for std::string. The result is the type of a
+// block of bytes and the length to allocate. This allows for the creation of
+// code to allocate a variable, stride across such a variable, etc. The
+// ModuleOp must contain the size of a pauli_word in its attributes.
+cudaq::cc::ArrayType genHostStringType(mlir::ModuleOp module);
-cudaq::cc::StructType stlStringType(mlir::MLIRContext *ctx);
+// Host side types for std::vector
 cudaq::cc::StructType stlVectorType(mlir::Type eleTy);
 //===----------------------------------------------------------------------===//
@@ -246,6 +250,9 @@ bool hasSRet(mlir::func::FuncOp funcOp);
 mlir::FunctionType toHostSideFuncType(mlir::FunctionType funcTy,
                                       bool addThisPtr, mlir::ModuleOp module);
+/// Convert device type, \p ty, to host side type.
+mlir::Type convertToHostSideType(mlir::Type ty, mlir::ModuleOp module);
+
 // Return `true` if the given type corresponds to a standard vector type
 // according to our convention.
 // The convention is a `ptr<struct<ptr<T>, ptr<T>, ptr<T>>>`.
diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h
index fa9ce53097..5884dbb39e 100644
--- a/include/cudaq/Optimizer/Builder/Intrinsics.h
+++ b/include/cudaq/Optimizer/Builder/Intrinsics.h
@@ -36,11 +36,16 @@ static constexpr const char getCudaqSizeFromTriple[] =
 // typically specialized to be bit packed).
 static constexpr const char stdvecBoolCtorFromInitList[] =
     "__nvqpp_initializer_list_to_vector_bool";
+
 // Convert a (likely packed) std::vector<bool> into a sequence of bytes, each
 // holding a boolean value.
 static constexpr const char stdvecBoolUnpackToInitList[] =
     "__nvqpp_vector_bool_to_initializer_list";
+// Free any temporary buffers used to hold std::vector<bool> data.
+static constexpr const char stdvecBoolFreeTemporaryLists[] =
+    "__nvqpp_vector_bool_free_temporary_initlists";
+
 // The internal data of the cudaq::state object must be `2**n` in length. This
 // function returns the value `n`.
 static constexpr const char getNumQubitsFromCudaqState[] =
diff --git a/include/cudaq/Optimizer/Builder/Runtime.h b/include/cudaq/Optimizer/Builder/Runtime.h
index e65c05a857..4fc9405272 100644
--- a/include/cudaq/Optimizer/Builder/Runtime.h
+++ b/include/cudaq/Optimizer/Builder/Runtime.h
@@ -10,6 +10,16 @@
 #include "cudaq/Optimizer/Builder/Factory.h"
+//===----------------------------------------------------------------------===//
+//
+// Runtime helper functions are functions that will appear in the runtime
+// library (implementations are defined in either the headers or libraries in
+// the `runtime` directory). These helper functions must never be assumed to
+// be present on the device side, so they should only be used in host-side
+// code.
+//
+//===----------------------------------------------------------------------===//
+
 namespace cudaq::runtime {
 /// Prefix for all kernel entry functions.
@@ -52,4 +62,15 @@ static constexpr const char CudaqRegisterKernelName[] =
 static constexpr const char cudaqAHSPrefixName[] =
     "__analog_hamiltonian_kernel__";
+// Host-side helper functions for working with `cudaq::pauli_word` or a
+// `std::string`. These include both fully dynamic and binding-time (library
+// build time) helper functions.
+static constexpr const char sizeofStringAttrName[] = "cc.sizeof_string";
+static constexpr const char getPauliWordSize[] =
+    "_ZNK5cudaq10pauli_word11_nvqpp_sizeEv";
+static constexpr const char getPauliWordData[] =
+    "_ZNK5cudaq10pauli_word11_nvqpp_dataEv";
+static constexpr const char bindingGetStringData[] = "__nvqpp_getStringData";
+static constexpr const char bindingGetStringSize[] = "__nvqpp_getStringSize";
+
 } // namespace cudaq::runtime
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
index d8a5820abe..aa03aedc07 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td
@@ -119,7 +119,12 @@ def cc_StructType : CCType<"Struct", "struct",
   ];
   let extraClassDeclaration = [{
+    // O(1)
     bool isEmpty() const { return getMembers().empty(); }
+
+    // O(n)
+    std::size_t getNumMembers() const { return getMembers().size(); }
+
     Type getMember(unsigned position) { return getMembers()[position]; }
   }];
 }
diff --git a/lib/Frontend/nvqpp/ASTBridge.cpp b/lib/Frontend/nvqpp/ASTBridge.cpp
index 806f3c6bde..2e4f1d810f 100644
--- a/lib/Frontend/nvqpp/ASTBridge.cpp
+++ b/lib/Frontend/nvqpp/ASTBridge.cpp
@@ -153,10 +153,10 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor<QPUCodeFinder> {
   using Base = clang::RecursiveASTVisitor<QPUCodeFinder>;
   explicit QPUCodeFinder(
       cudaq::EmittedFunctionsCollection &funcsToEmit, clang::CallGraph &cgb,
-      clang::ItaniumMangleContext *mangler,
+      clang::ItaniumMangleContext *mangler, ModuleOp module,
       std::unordered_map &customOperations)
       : functionsToEmit(funcsToEmit), callGraphBuilder(cgb), mangler(mangler),
-        customOperationNames(customOperations) {}
+        module(module), customOperationNames(customOperations) {}
   /// Add a kernel to the list of kernels to process.
   template
@@ -332,6 +332,25 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor<QPUCodeFinder> {
         tuplesAreReversed = !opt->isZero();
       }
     }
+    if (cudaq::isInNamespace(x, "cudaq") &&
+        cudaq::isInNamespace(x, "details") &&
+        x->getName().equals("_nvqpp_sizeof")) {
+      // This constexpr is the sizeof a pauli_word and a std::string.
+      auto loc = x->getLocation();
+      auto opt = x->getAnyInitializer()->getIntegerConstantExpr(
+          x->getASTContext(), &loc, false);
+      assert(opt && "must compute the sizeof a cudaq::pauli_word");
+      auto sizeofString = opt->getZExtValue();
+      auto sizeAttr = module->getAttr(cudaq::runtime::sizeofStringAttrName);
+      if (sizeAttr) {
+        assert(sizeofString == cast<IntegerAttr>(sizeAttr).getUInt());
+      } else {
+        auto *ctx = module.getContext();
+        auto i64Ty = IntegerType::get(ctx, 64);
+        module->setAttr(cudaq::runtime::sizeofStringAttrName,
+                        IntegerAttr::get(i64Ty, sizeofString));
+      }
+    }
     // The check to make sure that quantum data types are only used in kernels
     // is done here. This checks both variable declarations and parameters.
     if (quantumTypesNotAllowed)
@@ -357,6 +376,7 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor<QPUCodeFinder> {
   cudaq::EmittedFunctionsCollection &functionsToEmit;
   clang::CallGraph &callGraphBuilder;
   clang::ItaniumMangleContext *mangler;
+  ModuleOp module;
   std::unordered_map &customOperationNames;
   // A class that is being visited.
Need to run semantics checks on it if and
  // only if it has a quantum kernel.
@@ -648,7 +668,7 @@ void ASTBridgeAction::ASTBridgeConsumer::HandleTranslationUnit(
 bool ASTBridgeAction::ASTBridgeConsumer::HandleTopLevelDecl(
     clang::DeclGroupRef dg) {
-  QPUCodeFinder finder(functionsToEmit, callGraphBuilder, mangler,
+  QPUCodeFinder finder(functionsToEmit, callGraphBuilder, mangler, module.get(),
                        customOperationNames);
   // Loop over all decls, saving the function decls that are quantum kernels.
   for (const auto *decl : dg)
diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp
index 8c67f68283..efc6c889c9 100644
--- a/lib/Frontend/nvqpp/ConvertStmt.cpp
+++ b/lib/Frontend/nvqpp/ConvertStmt.cpp
@@ -331,7 +331,9 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) {
           ValueRange{heapCopy, dynSize});
     };
     IRBuilder irb(builder);
-    Value tySize = irb.getByteSizeOfType(loc, eleTy);
+    Value tySize;
+    if (!cudaq::cc::isDynamicType(eleTy))
+      tySize = irb.getByteSizeOfType(loc, eleTy);
     if (!tySize) {
       TODO_x(toLocation(x), x, mangler, "unhandled vector element type");
       return false;
diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp
index 421943c6c9..5bdd71e67e 100644
--- a/lib/Optimizer/Builder/Factory.cpp
+++ b/lib/Optimizer/Builder/Factory.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
+#include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
@@ -305,12 +306,15 @@ cc::LoopOp factory::createMonotonicLoop(
   return loop;
 }
-cc::StructType factory::stlStringType(MLIRContext *ctx) {
+cc::ArrayType factory::genHostStringType(ModuleOp mod) {
+  auto *ctx = mod.getContext();
   auto i8Ty = IntegerType::get(ctx, 8);
-  auto ptrI8Ty = cc::PointerType::get(i8Ty);
-  auto i64Ty = IntegerType::get(ctx, 64);
-  auto padTy = cc::ArrayType::get(ctx, i8Ty, 16);
-  return cc::StructType::get(ctx, ArrayRef<Type>{ptrI8Ty, i64Ty, padTy});
+  auto sizeAttr = mod->getAttr(cudaq::runtime::sizeofStringAttrName);
+  if (sizeAttr) {
+    auto size = cast<IntegerAttr>(sizeAttr).getInt();
+    return cc::ArrayType::get(ctx, i8Ty, size);
+  }
+  return cc::ArrayType::get(ctx, i8Ty, sizeof(std::string));
 }
 // FIXME: We should get the underlying structure of a std::vector from the
@@ -321,6 +325,22 @@ cc::StructType factory::stlVectorType(Type eleTy) {
   return cc::StructType::get(ctx, ArrayRef<Type>{ptrTy, ptrTy, ptrTy});
 }
+// Note that this is the raw host type, where std::vector<bool> is distinct.
+// When converting to the device side, the distinction is deliberately removed,
+// making std::vector<bool> the same format as std::vector<T>.
+static cc::StructType stlHostVectorType(Type eleTy) {
+  MLIRContext *ctx = eleTy.getContext();
+  if (eleTy != IntegerType::get(ctx, 1)) {
+    // std::vector<T> where T != bool.
+    return factory::stlVectorType(eleTy);
+  }
+  // std::vector<bool> is a different type than std::vector<T>.
+  auto ptrTy = cc::PointerType::get(eleTy);
+  auto i8Ty = IntegerType::get(ctx, 8);
+  auto padout = cc::ArrayType::get(ctx, i8Ty, 32);
+  return cc::StructType::get(ctx, ArrayRef<Type>{ptrTy, padout});
+}
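[Reviewer note] For reference, the host-side layouts that stlVectorType and stlHostVectorType above model, restated as plain C++. This is an illustrative sketch, not code from this patch: the type names are invented, and the sizes assume a libstdc++-style ABI on a 64-bit target (which is exactly why sizeof(std::string) is captured as a module attribute rather than hard-coded).

#include <vector>

namespace sketch {
// std::vector<T> (T != bool) is modeled as a triple of pointers: start of
// data, end of data, and end of the allocation.
template <typename T>
struct HostVec {
  T *begin;
  T *end;
  T *allocEnd;
};
// std::vector<bool> is a distinct type: a pointer into a packed bitmap plus
// implementation bookkeeping, modeled here as a 32-byte pad (8 + 32 = 40).
struct HostVecBool {
  bool *data;
  char pad[32];
};
} // namespace sketch

// Holds for libstdc++ on x86-64; other ABIs may differ.
static_assert(sizeof(sketch::HostVec<int>) == sizeof(std::vector<int>), "");
static_assert(sizeof(sketch::HostVecBool) == sizeof(std::vector<bool>), "");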
// FIXME: Give these front-end names so we can disambiguate more types.
cc::StructType factory::getDynamicBufferType(MLIRContext *ctx) {
  auto ptrTy = cc::PointerType::get(IntegerType::get(ctx, 8));
@@ -342,24 +362,19 @@ Type factory::getSRetElementType(FunctionType funcTy) {
   return funcTy.getResult(0);
 }
-static Type convertToHostSideType(Type ty) {
+Type factory::convertToHostSideType(Type ty, ModuleOp mod) {
   if (auto memrefTy = dyn_cast(ty))
-    return convertToHostSideType(
-        factory::stlVectorType(memrefTy.getElementType()));
+    return stlHostVectorType(
+        convertToHostSideType(memrefTy.getElementType(), mod));
   if (isa(ty))
     return cc::PointerType::get(IntegerType::get(ty.getContext(), 8));
-  if (auto memrefTy = dyn_cast(ty)) {
-    // `pauli_word` is an object with a std::vector in the header files at
-    // present. This data type *must* be updated if it becomes a std::string
-    // once again.
-    return convertToHostSideType(
-        factory::stlVectorType(IntegerType::get(ty.getContext(), 8)));
-  }
+  if (auto csTy = dyn_cast<cc::CharspanType>(ty))
+    return genHostStringType(mod);
   auto *ctx = ty.getContext();
   if (auto structTy = dyn_cast<cc::StructType>(ty)) {
     SmallVector<Type> newMembers;
     for (auto mem : structTy.getMembers())
-      newMembers.push_back(convertToHostSideType(mem));
+      newMembers.push_back(convertToHostSideType(mem, mod));
     if (structTy.getName())
       return cc::StructType::get(ctx, structTy.getName(), newMembers,
                                  structTy.getBitSize(), structTy.getAlignment(),
@@ -579,7 +594,7 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
     // returned via a sret argument in the first position. When this argument
     // is added, the this pointer becomes the second argument. Both are opaque
     // pointers at this point.
-    auto eleTy = convertToHostSideType(getSRetElementType(funcTy));
+    auto eleTy = convertToHostSideType(getSRetElementType(funcTy), module);
     inputTys.push_back(cc::PointerType::get(eleTy));
     hasSRet = true;
   } else {
@@ -595,7 +610,7 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
   // Add all the explicit (not hidden) arguments after the hidden ones.
   for (auto kernelTy : funcTy.getInputs()) {
-    auto hostTy = convertToHostSideType(kernelTy);
+    auto hostTy = convertToHostSideType(kernelTy, module);
     if (auto strTy = dyn_cast<cc::StructType>(hostTy)) {
       // On x86_64 and aarch64, a struct that is smaller than 128 bits may be
       // passed in registers as separate arguments. See classifyArgumentType()
@@ -636,6 +651,9 @@ FunctionType factory::toHostSideFuncType(FunctionType funcTy, bool addThisPtr,
       }
       // Pass a struct as a byval pointer.
      hostTy = cc::PointerType::get(hostTy);
+    } else if (isa<cc::ArrayType>(hostTy)) {
+      // Pass a raw data block as a pointer. (It's a struct passed as a blob.)
+      hostTy = cc::PointerType::get(hostTy);
     }
     inputTys.push_back(hostTy);
   }
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index 1774475b1b..1826241eaa 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -49,6 +49,18 @@ inline bool operator<(const IntrinsicCode &icode, const IntrinsicCode &jcode) {
 /// well as prototypes for LLVM intrinsics and C library calls that are used by
 /// the compiler. The table should be kept in sorted order.
 static constexpr IntrinsicCode intrinsicTable[] = {
+    // The following pauli_word helper functions are only available on the
+    // host side. They ought not be called in kernel code.
+    {cudaq::runtime::getPauliWordData,
+     {},
+     "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_dataEv(%pw : "
+     "!cc.ptr) -> !cc.ptr"},
+    {cudaq::runtime::getPauliWordSize,
+     {cudaq::runtime::getPauliWordData, cudaq::runtime::bindingGetStringData,
+      cudaq::runtime::bindingGetStringSize},
+     "func.func private @_ZNK5cudaq10pauli_word11_nvqpp_sizeEv(%pw : "
+     "!cc.ptr) -> i64"},
+
    // Initialize a (preallocated) buffer (the first parameter) with i64 values
    // on the semi-open range `[0..n)` where `n` is the second parameter.
    {cudaq::runtime::getLinkableKernelKey,
@@ -292,6 +304,15 @@ static constexpr IntrinsicCode intrinsicTable[] = {
  func.func private @__nvqpp_getStateVectorLength_fp64(%p : i64, %o : i64) -> i64
)#"},
+    // Quasi-portable entry points for use with non-C++ front ends (Python).
+    {cudaq::runtime::bindingGetStringData,
+     {},
+     "func.func private @__nvqpp_getStringData(%p: !cc.ptr) -> "
+     "!cc.ptr"},
+    {cudaq::runtime::bindingGetStringSize,
+     {},
+     "func.func private @__nvqpp_getStringSize(%p: !cc.ptr) -> i64"},
+
    // __nvqpp_initializer_list_to_vector_bool
    {cudaq::stdvecBoolCtorFromInitList,
     {},
     R"#(
@@ -307,11 +328,17 @@ static constexpr IntrinsicCode intrinsicTable[] = {
    return %0 : !cc.ptr
  })#"},
+    // __nvqpp_vector_bool_free_temporary_initlists
+    {cudaq::stdvecBoolFreeTemporaryLists,
+     {},
+     R"#(
+  func.func private @__nvqpp_vector_bool_free_temporary_initlists(!cc.ptr) -> ())#"},
+
    // __nvqpp_vector_bool_to_initializer_list
    {cudaq::stdvecBoolUnpackToInitList,
     {},
     R"#(
-  func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.ptr, !cc.ptr}>>) -> ())#"},
+  func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) -> ())#"},
    {"__nvqpp_zeroDynamicResult", {}, R"#(
  func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> {
diff --git a/lib/Optimizer/Dialect/CC/CCOps.cpp b/lib/Optimizer/Dialect/CC/CCOps.cpp
index 9d539640ac..8221aa5e81 100644
--- a/lib/Optimizer/Dialect/CC/CCOps.cpp
+++ b/lib/Optimizer/Dialect/CC/CCOps.cpp
@@ -106,6 +106,10 @@ Value cudaq::cc::getByteSizeOfType(OpBuilder &builder, Location loc, Type ty,
            return builder.create(loc, builder.getI64Type(), v, scale);
          })
+          .Case([&](cudaq::cc::SpanLikeType) -> Value {
+            // Uniformly on the device side: {ptr, i64}
+            return createInt(16);
+          })
          .Default({});
}
diff --git a/lib/Optimizer/Dialect/CC/CCTypes.cpp b/lib/Optimizer/Dialect/CC/CCTypes.cpp
index 816695e173..0543a12a51 100644
--- a/lib/Optimizer/Dialect/CC/CCTypes.cpp
+++ b/lib/Optimizer/Dialect/CC/CCTypes.cpp
@@ -158,7 +158,7 @@ Type cc::SpanLikeType::getElementType() const {
}
bool isDynamicType(Type ty) {
-  if (isa<cc::StdvecType>(ty))
+  if (isa<cc::SpanLikeType>(ty))
    return true;
  if (auto strTy = dyn_cast<cc::StructType>(ty)) {
    for (auto memTy : strTy.getMembers())
diff --git a/lib/Optimizer/Transforms/DecompositionPatterns.cpp b/lib/Optimizer/Transforms/DecompositionPatterns.cpp
index de32b86e45..bdf8e9244c 100644
--- a/lib/Optimizer/Transforms/DecompositionPatterns.cpp
+++ b/lib/Optimizer/Transforms/DecompositionPatterns.cpp
@@ -362,6 +362,9 @@ struct ExpPauliDecomposition : public OpRewritePattern {
        auto strAttr = cast<StringAttr>(attr.value());
        optPauliWordStr = strAttr.getValue();
      }
+    } else if (auto lit = addrOp.getDefiningOp<
+                   cudaq::cc::CreateStringLiteralOp>()) {
+      optPauliWordStr = lit.getStringLiteral();
    }
  }
}
@@ -369,7 +372,7 @@ struct ExpPauliDecomposition : public OpRewritePattern {
  // Assert that we have a constant known pauli word
  if (!optPauliWordStr.has_value())
-    return failure();
+
return expPauliOp.emitOpError("cannot determine pauli word string"); auto pauliWordStr = optPauliWordStr.value(); diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 2e45c8df96..69f4e93681 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -15,10 +15,10 @@ #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "clang/Basic/Version.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ToolOutputFile.h" -#include "mlir/Analysis/DataLayoutAnalysis.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/IR/Diagnostics.h" #include "mlir/Transforms/Passes.h" @@ -48,283 +48,1149 @@ static bool isCodegenArgumentGather(std::size_t kind) { return kind == 0 || kind == 2; } -/// This pass adds a `.thunk` function and a rewritten C++ host -/// side (mangled) stub to the code for every entry-point kernel in the module. -/// It may also generate a `.argsCreator` function. Finally, it -/// creates registration hooks for the CUDA-Q runtime to be able to find the -/// kernel by name and, as appropriate, the `.argsCreator` -/// function. -namespace { -class GenerateKernelExecution - : public cudaq::opt::impl::GenerateKernelExecutionBase< - GenerateKernelExecution> { -public: - using GenerateKernelExecutionBase::GenerateKernelExecutionBase; +static bool isStateType(Type ty) { + if (auto ptrTy = dyn_cast(ty)) + return isa(ptrTy.getElementType()); + return false; +} - /// Creates the function signature for a thunk function. The signature is - /// always the same for all thunk functions. - /// - /// Every thunk function has an identical signature, making it callable from a - /// generic "kernel launcher" in the CUDA-Q runtime. - /// - /// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. - /// - /// The first argument is a pointer to a data buffer that encodes all the - /// arguments (and static return) values to (and from) the kernel in the - /// pointer-free encoding. The second argument indicates if this call is to a - /// remote process (if true). The result is a pointer and size (span) if the - /// kernel returns a dynamically sized result, otherwise it will be - /// `{nullptr, 0}`. It is the responsibility of calling code to free any - /// dynamic result buffer(s) and convert those to `std::vector` objects. - FunctionType getThunkType(MLIRContext *ctx) { - auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); - return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, - {cudaq::opt::factory::getDynamicBufferType(ctx)}); +/// Creates the function signature for a thunk function. The signature is always +/// the same for all thunk functions. +/// +/// Every thunk function has an identical signature, making it callable from a +/// generic "kernel launcher" in the CUDA-Q runtime. +/// +/// This signature is defined as: `(ptr, bool) -> {ptr, i64}`. +/// +/// The first argument is a pointer to a data buffer that encodes all the +/// arguments (and static return) values to (and from) the kernel in the +/// pointer-free encoding. The second argument indicates if this call is to a +/// remote process (if true). The result is a pointer and size (span) if the +/// kernel returns a dynamically sized result, otherwise it will be +/// `{nullptr, 0}`. 
It is the responsibility of calling code to free any +/// dynamic result buffer(s) and convert those to `std::vector` objects. +static FunctionType getThunkType(MLIRContext *ctx) { + auto ptrTy = cudaq::cc::PointerType::get(IntegerType::get(ctx, 8)); + return FunctionType::get(ctx, {ptrTy, IntegerType::get(ctx, 1)}, + {cudaq::opt::factory::getDynamicBufferType(ctx)}); +} + +/// Generate code to read the length from a host-side string object. (On the +/// device side, a string is encoded as a span.) The length of a string is the +/// number of bytes of data. +/// +/// In order to handle a std::string value it is assumed to be laid out in +/// memory as the following structure. +/// +/// +/// struct vector { +/// i8* data; +/// i64 length; +/// [i8 x 16] inlinedata; +/// }; +/// +/// +/// This implementation does \e not support wide characters. +static Value genStringLength(Location loc, OpBuilder &builder, Value stringArg, + ModuleOp module) { + Type stringTy = stringArg.getType(); + assert(isa(stringTy) && + isa( + cast(stringTy).getElementType()) && + "host side string expected"); + auto callArg = builder.create( + loc, cudaq::cc::PointerType::get(builder.getI8Type()), stringArg); + StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) + ? cudaq::runtime::getPauliWordSize + : cudaq::runtime::bindingGetStringSize; + auto lenRes = builder.create(loc, builder.getI64Type(), + helperName, ValueRange{callArg}); + return lenRes.getResult(0); +} + +/// Generate code that computes the size in bytes of a `std::vector` array +/// in the same way as a `std::vector::size()`. This assumes the vector is +/// laid out in memory as the following structure. +/// +/// +/// struct vector { +/// T* begin; +/// T* end; +/// T* allocated_end; +/// }; +/// +/// +/// The first two elements are pointers to the beginning and end of the data +/// in the vector, respectively. This data is kept in a contiguous memory +/// range. The following implementation follows what Clang CodeGen produces +/// for `std::vector::size()` without the final `sdiv` op that divides the +/// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required +/// memory size for the vector data itself in \e bytes. +static Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { + auto vecTy = cast(vecArg.getType()); + auto vecStructTy = cast(vecTy.getElementType()); + assert(vecStructTy.getNumMembers() == 3 && + vecStructTy.getMember(0) == vecStructTy.getMember(1) && + vecStructTy.getMember(0) == vecStructTy.getMember(2) && + "host side vector expected"); + auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); + + // Get the pointer to the pointer of the end of the array + Value endPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{1}); + + // Get the pointer to the pointer of the beginning of the array + Value beginPtr = builder.create( + loc, vecElePtrTy, vecArg, ArrayRef{0}); + + // Load to a T* + endPtr = builder.create(loc, endPtr); + beginPtr = builder.create(loc, beginPtr); + + // Map those pointers to integers + Type i64Ty = builder.getI64Type(); + Value endInt = builder.create(loc, i64Ty, endPtr); + Value beginInt = builder.create(loc, i64Ty, beginPtr); + + // Subtracting these will give us the size in bytes. 
+  return builder.create(loc, endInt, beginInt);
+}
+
+static Value genComputeReturnOffset(Location loc, OpBuilder &builder,
+                                    FunctionType funcTy,
+                                    cudaq::cc::StructType msgStructTy) {
+  if (funcTy.getNumResults() == 0)
+    return builder.create(loc, NoResultOffset, 64);
+  std::int32_t numKernelArgs = funcTy.getNumInputs();
+  auto i64Ty = builder.getI64Type();
+  return builder.create(
+      loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs});
+}
+
+/// Create a function that determines the return value offset in the message
+/// buffer.
+static void genReturnOffsetFunction(Location loc, OpBuilder &builder,
+                                    FunctionType devKernelTy,
+                                    cudaq::cc::StructType msgStructTy,
+                                    const std::string &classNameStr) {
+  auto *ctx = builder.getContext();
+  auto i64Ty = builder.getI64Type();
+  auto funcTy = FunctionType::get(ctx, {}, {i64Ty});
+  auto returnOffsetFunc =
+      builder.create(loc, classNameStr + ".returnOffset", funcTy);
+  OpBuilder::InsertionGuard guard(builder);
+  auto *entry = returnOffsetFunc.addEntryBlock();
+  builder.setInsertionPointToStart(entry);
+  auto result = genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy);
+  builder.create(loc, result);
+}
+
+static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) {
+  return cudaq::cc::PointerType::get(
+      cudaq::cc::ArrayType::get(builder.getI8Type()));
+}
+
+static cudaq::cc::PointerType getPointerToPointerType(OpBuilder &builder) {
+  return cudaq::cc::PointerType::get(
+      cudaq::cc::PointerType::get(builder.getI8Type()));
+}
+
+static bool isDynamicSignature(FunctionType devFuncTy) {
+  for (auto t : devFuncTy.getInputs())
+    if (cudaq::cc::isDynamicType(t))
+      return true;
+  for (auto t : devFuncTy.getResults())
+    if (cudaq::cc::isDynamicType(t))
+      return true;
+  return false;
+}
+
+static std::pair<Value, Value>
+genByteSizeAndElementCount(Location loc, OpBuilder &builder, ModuleOp module,
+                           Type eleTy, Value size, Value arg, Type t) {
+  // If this is a vector<vector<...>>, convert the bytes of vector to bytes of
+  // length (i64).
+  if (auto sty = dyn_cast<cudaq::cc::StdvecType>(eleTy)) {
+    auto eTy = cast<cudaq::cc::PointerType>(arg.getType()).getElementType();
+    auto fTy = cast<cudaq::cc::StructType>(eTy).getMember(0);
+    auto tTy = cast<cudaq::cc::PointerType>(fTy).getElementType();
+    auto i64Ty = builder.getI64Type();
+    auto eleSize = builder.create(loc, i64Ty, tTy);
+    Value count = builder.create(loc, size, eleSize);
+    auto ate = builder.create(loc, 8, 64);
+    size = builder.create(loc, count, ate);
+    return {size, count};
  }
-  /// Add LLVM code with the OpBuilder that computes the size in bytes
-  /// of a `std::vector<T>` array in the same way as a `std::vector<T>::size()`.
-  /// This assumes the vector is laid out in memory as the following structure.
-  ///
-  ///   struct vector {
-  ///     T* begin;
-  ///     T* end;
-  ///     T* allocated_end;
-  ///   };
-  ///
-  /// The first two elements are pointers to the beginning and end of the data
-  /// in the vector, respectively. This data is kept in a contiguous memory
-  /// range. The following implementation follows what Clang CodeGen produces
-  /// for `std::vector<T>::size()` without the final `sdiv` op that divides the
-  /// `sizeof(data[N])` by the `sizeof(T)`. The result is the total required
-  /// memory size for the vector data itself in \e bytes.
-  ///
-  /// In order to handle a std::string value it is assumed to be laid out in
-  /// memory as the following structure.
-  ///
-  ///   struct vector {
-  ///     i8* data;
-  ///     i64 length;
-  ///     [i8 x 16] inlinedata;
-  ///   };
-  ///
-  /// In the string case, the size can just be read from the data structure.
-  Value getVectorSize(Location loc, OpBuilder &builder,
-                      cudaq::cc::PointerType ptrTy, Value arg) {
-    // Create the i64 type
-    Type i64Ty = builder.getI64Type();
+  // If this is a vector<string>, convert the bytes of string to bytes of
+  // length (i64).
+  if (isa<cudaq::cc::CharspanType>(eleTy)) {
+    auto arrTy = cudaq::opt::factory::genHostStringType(module);
+    auto words =
+        builder.create(loc, arrTy.getSize() / 8, 64);
+    size = builder.create(loc, size, words);
+    auto ate = builder.create(loc, 8, 64);
+    Value count = builder.create(loc, size, ate);
+    return {size, count};
+  }
-    // We're given ptr<vector<T>>, get that struct type (struct<T*, T*, T*>)
-    auto inpStructTy = cast<cudaq::cc::StructType>(ptrTy.getElementType());
-    if (inpStructTy.getMember(1) == i64Ty) {
-      // This is a string, so just read the length out.
-      auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty);
-      auto lenPtr = builder.create(
-          loc, ptrI64Ty, arg, SmallVector{1});
-      return builder.create(loc, lenPtr);
-    }
+  // If this is a vector<struct<...>>, convert the bytes of struct to bytes of
+  // struct with converted members.
+  if (isa<cudaq::cc::StructType>(eleTy)) {
+    auto vecTy = cast<cudaq::cc::PointerType>(arg.getType()).getElementType();
+    auto vecEleRefTy = cast<cudaq::cc::StructType>(vecTy).getMember(0);
+    auto vecEleTy = cast<cudaq::cc::PointerType>(vecEleRefTy).getElementType();
+    auto i64Ty = builder.getI64Type();
+    auto hostStrSize =
+        builder.create(loc, i64Ty, vecEleTy);
+    Value count = builder.create(loc, size, hostStrSize);
+    Type packedTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+    auto packSize = builder.create(loc, i64Ty, packedTy);
+    size = builder.create(loc, count, packSize);
+    return {size, count};
+  }
+  return {};
+}
-    // For the following GEP calls, we'll expect them to return T**
-    auto ptrTtype = cudaq::cc::PointerType::get(inpStructTy.getMember(0));
+static bool isStdVectorBool(Type ty) {
+  auto stdvecTy = dyn_cast<cudaq::cc::StdvecType>(ty);
+  return stdvecTy &&
+         (stdvecTy.getElementType() == IntegerType::get(ty.getContext(), 1));
+}
-    // Get the pointer to the pointer of the end of the array
-    Value endPtr = builder.create(
-        loc, ptrTtype, arg, SmallVector{1});
+/// Recursively check if \p ty contains a `std::vector<bool>`.
+static bool hasStdVectorBool(Type ty) {
+  if (isStdVectorBool(ty))
+    return true;
+  if (auto sty = dyn_cast<cudaq::cc::SpanLikeType>(ty))
+    return hasStdVectorBool(sty.getElementType());
+  if (auto sty = dyn_cast<cudaq::cc::StructType>(ty))
+    for (auto mem : sty.getMembers())
+      if (hasStdVectorBool(mem))
+        return true;
+  return false;
+}
-    // Get the pointer to the pointer of the beginning of the array
-    Value beginPtr = builder.create(
-        loc, ptrTtype, arg, SmallVector{0});
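[Reviewer note] A worked example of the size/count conversion that genByteSizeAndElementCount above performs, under the same 64-bit layout assumptions; the constants here are illustrative, not taken from the patch.

#include <cstddef>
#include <cstdint>

// For a host std::vector<std::vector<int>> holding 3 inner vectors, the outer
// byte span is end - begin = 3 * sizeof(std::vector<int>) = 72 bytes.
constexpr std::size_t hostVecBytes = 24; // triple-pointer vector, 64-bit ABI
constexpr std::size_t outerSpanBytes = 3 * hostVecBytes;
// Dividing by the host element size recovers the number of inner vectors...
constexpr std::size_t count = outerSpanBytes / hostVecBytes; // == 3
// ...and the message buffer then reserves one i64 length slot per element.
constexpr std::size_t lengthSlotBytes = count * sizeof(std::int64_t); // == 24
static_assert(count == 3 && lengthSlotBytes == 24, "");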
+// The host-side type of a `std::vector<bool>` is distinct from the transient
+// type for a `std::vector<bool>`. The former is a unique data type with a size
+// of 40 bytes. The latter is identical to `std::vector<T>` (which has a size
+// of 24 bytes).
+static Type convertToTransientType(Type ty, ModuleOp mod) {
+  if (isStdVectorBool(ty)) {
+    auto *ctx = ty.getContext();
+    return cudaq::opt::factory::stlVectorType(IntegerType::get(ctx, 1));
+  }
+  if (auto sty = dyn_cast<cudaq::cc::StdvecType>(ty))
+    return cudaq::opt::factory::stlVectorType(
+        convertToTransientType(sty.getElementType(), mod));
+  if (auto sty = dyn_cast<cudaq::cc::StructType>(ty)) {
+    SmallVector<Type> newMems;
+    for (auto mem : sty.getMembers())
+      newMems.push_back(convertToTransientType(mem, mod));
+    auto *ctx = ty.getContext();
+    return cudaq::cc::StructType::get(ctx, newMems);
+  }
+  return cudaq::opt::factory::convertToHostSideType(ty, mod);
+}
+
+static std::pair<Value, bool>
+convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module,
+                        Value arg, Type ty, Value heapTracker,
+                        std::optional<Value> preallocated = std::nullopt) {
+  // If we are here, `ty` must be a `std::vector<bool>` or recursively contain
+  // a `std::vector<bool>`.
+
+  // Handle `std::vector<bool>`.
+  if (isStdVectorBool(ty)) {
+    auto stdvecTy = cast<cudaq::cc::StdvecType>(ty);
+    Type stdvecHostTy =
+        cudaq::opt::factory::stlVectorType(stdvecTy.getElementType());
+    Value tmp = preallocated.has_value()
+                    ? *preallocated
+                    : builder.create(loc, stdvecHostTy);
+    builder.create(loc, std::nullopt,
+                   cudaq::stdvecBoolUnpackToInitList,
+                   ArrayRef<Value>{tmp, arg, heapTracker});
+    return {tmp, true};
+  }
+
+  // Handle `std::vector<T>` where `T != bool`.
+  if (auto sty = dyn_cast<cudaq::cc::StdvecType>(ty)) {
+    // arg is a std::vector<T>.
+    // Its type must be ptr<struct<ptr<T>, ptr<T>, ptr<T>>>.
+    auto seleTy = sty.getElementType();
+    auto ptrArgTy = cast<cudaq::cc::PointerType>(arg.getType());
+    auto argVecTy = cast<cudaq::cc::StructType>(ptrArgTy.getElementType());
+    auto subVecPtrTy = cudaq::cc::PointerType::get(argVecTy.getMember(0));
+    // Compute the pointer to the pointer to the first T element.
+    auto inputRef = builder.create(
+        loc, subVecPtrTy, arg, ArrayRef{0});
+    auto startInput = builder.create(loc, inputRef);
+    auto startTy = startInput.getType();
+    auto subArrTy = cudaq::cc::ArrayType::get(
+        cast<cudaq::cc::PointerType>(startTy).getElementType());
+    auto input = builder.create(
+        loc, cudaq::cc::PointerType::get(subArrTy), startInput);
+    auto transientTy = convertToTransientType(sty, module);
+    auto tmp = [&]() -> Value {
+      if (preallocated)
+        return builder.create(
+            loc, cudaq::cc::PointerType::get(transientTy), *preallocated);
+      return builder.create(loc, transientTy);
+    }();
+    Value sizeDelta = genVectorSize(loc, builder, arg);
+    auto count = [&]() -> Value {
+      if (cudaq::cc::isDynamicType(seleTy)) {
+        auto p = genByteSizeAndElementCount(loc, builder, module, seleTy,
+                                            sizeDelta, arg, sty);
+        return p.second;
+      }
+      auto sizeEle = builder.create(
+          loc, builder.getI64Type(), seleTy);
+      return builder.create(loc, sizeDelta, sizeEle);
+    }();
+    auto transEleTy = cast<cudaq::cc::StructType>(transientTy).getMember(0);
+    auto dataTy = cast<cudaq::cc::PointerType>(transEleTy).getElementType();
+    auto sizeTransientTy =
+        builder.create(loc, builder.getI64Type(), dataTy);
+    Value sizeInBytes =
+        builder.create(loc, count, sizeTransientTy);
+
+    // Create a new vector that we'll store the converted data into.
+    Value byteBuffer = builder.create(
+        loc, builder.getI8Type(), sizeInBytes);
+
+    // Initialize the temporary vector.
+    auto vecEleTy = cudaq::cc::PointerType::get(transEleTy);
+    auto tmpBegin = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{0});
+    auto bufferBegin =
+        builder.create(loc, transEleTy, byteBuffer);
+    builder.create(loc, bufferBegin, tmpBegin);
+    auto tmpEnd = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{1});
+    auto byteBufferEnd = builder.create(
+        loc, cudaq::cc::PointerType::get(builder.getI8Type()), byteBuffer,
+        ArrayRef{sizeInBytes});
+    auto bufferEnd =
+        builder.create(loc, transEleTy, byteBufferEnd);
+    builder.create(loc, bufferEnd, tmpEnd);
+    auto tmpEnd2 = builder.create(
+        loc, vecEleTy, tmp, ArrayRef{2});
+    builder.create(loc, bufferEnd, tmpEnd2);
+
+    // Loop over each element in the outer vector and initialize it to the
+    // inner vector value. (The data may be heap allocated.)
+    auto transientEleTy = convertToTransientType(seleTy, module);
+    auto transientBufferTy =
+        cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(transientEleTy));
+    auto buffer =
+        builder.create(loc, transientBufferTy, byteBuffer);
-    // Load to a T*
-    endPtr = builder.create(loc, endPtr);
-    beginPtr = builder.create(loc, beginPtr);
+    cudaq::opt::factory::createInvariantLoop(
+        builder, loc, count,
+        [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+          Value i = block.getArgument(0);
+          Value inp = builder.create(
+              loc, startTy, input, ArrayRef{i});
+          auto currentVector = builder.create(
+              loc, cudaq::cc::PointerType::get(transientEleTy), buffer,
+              ArrayRef{i});
+          convertAllStdVectorBool(loc, builder, module, inp, seleTy,
+                                  heapTracker, currentVector);
+        });
+    return {tmp, true};
+  }
-    // Map those pointers to integers
-    Value endInt = builder.create(loc, i64Ty, endPtr);
-    Value beginInt = builder.create(loc, i64Ty, beginPtr);
+  // Handle `struct { ... };`.
+  if (auto sty = dyn_cast<cudaq::cc::StructType>(ty)) {
+    auto bufferTy = convertToTransientType(ty, module);
+    auto argPtrTy = cast<cudaq::cc::PointerType>(arg.getType());
+    auto argStrTy = cast<cudaq::cc::StructType>(argPtrTy.getElementType());
+
+    // If a struct was preallocated, use it. Otherwise, create a new struct
+    // that we'll store the converted data into.
+    auto buffer = [&]() -> Value {
+      if (preallocated)
+        return builder.create(
+            loc, cudaq::cc::PointerType::get(bufferTy), *preallocated);
+      return builder.create(loc, bufferTy);
+    }();
-    // Subtracting these will give us the size in bytes.
-    return builder.create(loc, endInt, beginInt);
+    // Loop over each element. Replace each with the converted value.
+    for (auto iter : llvm::enumerate(sty.getMembers())) {
+      std::int32_t i = iter.index();
+      Type memTy = iter.value();
+      auto fromPtr = builder.create(
+          loc, cudaq::cc::PointerType::get(argStrTy.getMember(i)), arg,
+          ArrayRef{i});
+      auto transientTy = convertToTransientType(memTy, module);
+      Value toPtr = builder.create(
+          loc, cudaq::cc::PointerType::get(transientTy), buffer,
+          ArrayRef{i});
+      convertAllStdVectorBool(loc, builder, module, fromPtr, memTy, heapTracker,
+                              toPtr);
+    }
+    return {buffer, true};
  }
+  return {arg, false};
+}
+
+static std::pair<Value, bool>
+unpackAnyStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module,
+                       Value arg, Type ty, Value heapTracker) {
+  if (hasStdVectorBool(ty))
+    return convertAllStdVectorBool(loc, builder, module, arg, ty, heapTracker);
+  return {arg, false};
+}
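[Reviewer note] What the unpacking helper is expected to do at runtime, sketched in C++. This is a plausible analogue of __nvqpp_vector_bool_to_initializer_list, not the actual runtime implementation: each packed bit becomes one byte so a std::vector<bool> can be marshaled like any other vector, and the temporary is recorded so __nvqpp_vector_bool_free_temporary_initlists can release it later.

#include <cstddef>
#include <vector>

struct FakeVec { // the {begin, end, allocEnd} triple the compiler expects
  char *begin, *end, *allocEnd;
};

void unpackBools(FakeVec *out, const std::vector<bool> &in, void **tracker) {
  std::size_t n = in.size();
  char *bytes = new char[n]; // temporary; real code would chain it on *tracker
  for (std::size_t i = 0; i < n; ++i)
    bytes[i] = in[i] ? 1 : 0; // one byte per packed bit
  out->begin = bytes;
  out->end = out->allocEnd = bytes + n;
  (void)tracker; // tracking elided in this sketch
}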
-  /// Helper that converts a byte length to a length of i64.
-  Value convertLengthBytesToLengthI64(Location loc, OpBuilder &builder,
-                                      Value length) {
-    auto eight = builder.create(loc, 8, 64);
-    return builder.create(loc, length, eight);
+// Take the list of host-side arguments and device side argument types and zip
+// them together logically with the position. Generates any fixup code that's
+// needed, like when the device side uses a pair of arguments for a single
+// logical device side argument. May drop some arguments on the floor if they
+// cannot be encoded.
+template <bool argsAreReferences>
+static SmallVector<std::tuple<unsigned, Value, Type>>
+zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ModuleOp module,
+                            ValueRange args, TypeRange types,
+                            Value heapTracker) {
+  SmallVector<std::tuple<unsigned, Value, Type>> result;
+  if constexpr (argsAreReferences) {
+    // Simple case: the number of args must be equal to the types.
+    assert(args.size() == types.size() &&
+           "arguments and types must have same size");
+    for (auto iter : llvm::enumerate(llvm::zip(args, types))) {
+      // Remove the reference.
+      Value v = std::get<0>(iter.value());
+      Type ty = std::get<1>(iter.value());
+      if (!(cudaq::cc::isDynamicType(ty) || isStateType(ty) ||
+            isa(ty)))
+        v = builder.create(loc, v);
+      // Python will pass a std::vector<bool> to us here. Unpack it.
+      auto pear =
+          unpackAnyStdVectorBool(loc, builder, module, v, ty, heapTracker);
+      v = pear.first;
+      result.emplace_back(iter.index(), v, ty);
+    }
+  } else /*constexpr*/ {
+    // In this case, we *may* have logical arguments that are passed in pairs.
+    auto *ctx = builder.getContext();
+    auto *parent = builder.getBlock()->getParentOp();
+    auto module = parent->getParentOfType<ModuleOp>();
+    auto lastArg = args.end();
+    auto tyIter = types.begin();
+    unsigned argPos = 0;
+    for (auto argIter = args.begin(); argIter != lastArg;
+         ++argIter, ++tyIter, ++argPos) {
+      assert(tyIter != types.end());
+      Type devTy = *tyIter;
+
+      // std::vector<bool> isn't really a std::vector<>. Use the helper
+      // function to unpack it so it looks like any other vector.
+      auto pear = unpackAnyStdVectorBool(loc, builder, module, *argIter, devTy,
+                                         heapTracker);
+      if (pear.second) {
+        result.emplace_back(argPos, pear.first, devTy);
+        continue;
+      }
+
+      // Check for a struct passed in a pair of arguments.
+      if (isa<cudaq::cc::StructType>(devTy) &&
+          !isa<cudaq::cc::PointerType>((*argIter).getType()) &&
+          cudaq::opt::factory::isX86_64(module) &&
+          cudaq::opt::factory::structUsesTwoArguments(devTy)) {
+        auto first = *argIter++;
+        auto second = *argIter;
+        // TODO: Investigate if it's correct to assume the register layout
+        // will match the memory layout of the small struct.
+        auto pairTy = cudaq::cc::StructType::get(
+            ctx, ArrayRef<Type>{first.getType(), second.getType()});
+        auto tmp = builder.create(loc, pairTy);
+        auto tmp1 = builder.create(
+            loc, cudaq::cc::PointerType::get(first.getType()), tmp);
+        builder.create(loc, first, tmp1);
+        auto tmp2 = builder.create(
+            loc, cudaq::cc::PointerType::get(second.getType()), tmp,
+            ArrayRef{1});
+        builder.create(loc, second, tmp2);
+        auto devPtrTy = cudaq::cc::PointerType::get(devTy);
+        Value devVal = builder.create(loc, devPtrTy, tmp);
+        if (!cudaq::cc::isDynamicType(devTy))
+          devVal = builder.create(loc, devVal);
+        result.emplace_back(argPos, devVal, devTy);
+        continue;
+      }
+
+      // Is this a static struct passed as a byval pointer?
+ if (isa(devTy) && + isa((*argIter).getType()) && + !cudaq::cc::isDynamicType(devTy)) { + Value devVal = builder.create(loc, *argIter); + result.emplace_back(argPos, devVal, devTy); + continue; + } + result.emplace_back(argPos, *argIter, devTy); + } } + return result; +} + +static Value descendThroughDynamicType(Location loc, OpBuilder &builder, + ModuleOp module, Type ty, Value addend, + Value arg, Value tmp) { + auto i64Ty = builder.getI64Type(); + Value tySize = + TypeSwitch(ty) + // A char span is dynamic, but it is not recursively dynamic. Just + // read the length of the string out. + .Case([&](cudaq::cc::CharspanType t) -> Value { + return genStringLength(loc, builder, arg, module); + }) + // A std::vector is dynamic and may be recursive dynamic as well. + .Case([&](cudaq::cc::StdvecType t) -> Value { + // Compute the byte span of the vector. + Value size = genVectorSize(loc, builder, arg); + auto eleTy = t.getElementType(); + if (!cudaq::cc::isDynamicType(eleTy)) + return size; + + // Otherwise, we have a recursively dynamic case. + auto [bytes, count] = genByteSizeAndElementCount( + loc, builder, module, eleTy, size, arg, t); + assert(count && "vector must have elements"); + size = bytes; + + // At this point, arg is a known vector of elements of dynamic + // type, so walk over the vector and recurse on each element. + // `size` is already the proper size of the lengths of each of the + // elements in turn. + builder.create(loc, size, tmp); + auto ptrTy = cast(arg.getType()); + auto strTy = cast(ptrTy.getElementType()); + auto memTy = cast(strTy.getMember(0)); + auto arrTy = + cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(memTy.getElementType()))); + auto castPtr = builder.create(loc, arrTy, arg); + auto castArg = builder.create(loc, castPtr); + auto castPtrTy = + cudaq::cc::PointerType::get(memTy.getElementType()); + cudaq::opt::factory::createInvariantLoop( + builder, loc, count, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value i = block.getArgument(0); + auto ai = builder.create( + loc, castPtrTy, castArg, + ArrayRef{i}); + auto tmpVal = builder.create(loc, tmp); + Value innerSize = descendThroughDynamicType( + loc, builder, module, eleTy, tmpVal, ai, tmp); + builder.create(loc, innerSize, tmp); + }); + return builder.create(loc, tmp); + }) + // A struct can be dynamic if it contains dynamic members. Get the + // static portion of the struct first, which will have length slots. + // Then get the dynamic sizes for the dynamic members. + .Case([&](cudaq::cc::StructType t) -> Value { + if (cudaq::cc::isDynamicType(t)) { + Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); + Value strSize = + builder.create(loc, i64Ty, packedTy); + for (auto iter : llvm::enumerate(t.getMembers())) { + std::int32_t i = iter.index(); + auto m = iter.value(); + if (cudaq::cc::isDynamicType(m)) { + auto hostPtrTy = cast(arg.getType()); + auto hostStrTy = + cast(hostPtrTy.getElementType()); + auto pm = cudaq::cc::PointerType::get(hostStrTy.getMember(i)); + auto ai = builder.create( + loc, pm, arg, ArrayRef{i}); + strSize = descendThroughDynamicType(loc, builder, module, m, + strSize, ai, tmp); + } + } + return strSize; + } + return builder.create(loc, i64Ty, t); + }) + .Default([&](Type t) -> Value { + return builder.create(loc, i64Ty, t); + }); + return builder.create(loc, tySize, addend); +} - /// This computes a vector's size and handles recursive vector types. 
This - /// first value returned is the size of the top level (outermost) vector in - /// bytes. The second value is the recursive size of all the vectors within - /// the outer vector. - std::pair - computeRecursiveVectorSize(Location loc, OpBuilder &builder, Value hostArg, - cudaq::cc::PointerType hostVecTy, - cudaq::cc::SpanLikeType stdvecTy) { - Value topLevelSize; - Value recursiveSize; - auto eleTy = stdvecTy.getElementType(); - if (auto sTy = dyn_cast(eleTy)) { - // This is the recursive case. vector>. Convert size of - // vectors to i64s. - topLevelSize = computeHostVectorLengthInBytes( - loc, builder, hostArg, stdvecTy.getElementType(), hostVecTy); - auto nested = fetchHostVectorFront(loc, builder, hostArg, hostVecTy); - auto tmp = builder.create(loc, builder.getI64Type()); - builder.create(loc, topLevelSize, tmp); - // Convert bytes to units of i64. (Divide by 8) - auto topLevelCount = - convertLengthBytesToLengthI64(loc, builder, topLevelSize); - // Now walk the vectors recursively. - auto topLevelIndex = builder.create( - loc, builder.getI64Type(), topLevelCount, - cudaq::cc::CastOpMode::Unsigned); +static Value +genSizeOfDynamicMessageBuffer(Location loc, OpBuilder &builder, ModuleOp module, + cudaq::cc::StructType structTy, + ArrayRef> zippy, + Value tmp) { + auto i64Ty = builder.getI64Type(); + Value initSize = builder.create(loc, i64Ty, structTy); + for (auto [_, a, t] : zippy) + if (cudaq::cc::isDynamicType(t)) + initSize = + descendThroughDynamicType(loc, builder, module, t, initSize, a, tmp); + return initSize; +} + +static Value populateStringAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, Value addendum, + ModuleOp module) { + Value size = genStringLength(loc, builder, host, module); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto fromPtr = builder.create(loc, ptrI8Ty, host); + StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) + ? cudaq::runtime::getPauliWordData + : cudaq::runtime::bindingGetStringData; + auto dataPtr = builder.create(loc, ptrI8Ty, helperName, + ValueRange{fromPtr}); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, dataPtr.getResult(0), size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} + +// Simple case when the vector data is known to not hold dynamic data. 
+static Value populateVectorAddendum(Location loc, OpBuilder &builder, + Value host, Value sizeSlot, + Value addendum) { + Value size = genVectorSize(loc, builder, host); + builder.create(loc, size, sizeSlot); + auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); + auto ptrPtrI8 = getPointerToPointerType(builder); + auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); + auto fromPtr = builder.create(loc, fromPtrPtr); + auto notVolatile = builder.create(loc, 0, 1); + auto toPtr = builder.create(loc, ptrI8Ty, addendum); + builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, fromPtr, size, notVolatile}); + auto ptrI8Arr = getByteAddressableType(builder); + auto addBytes = builder.create(loc, ptrI8Arr, addendum); + return builder.create( + loc, ptrI8Ty, addBytes, ArrayRef{size}); +} + +static Value populateDynamicAddendum(Location loc, OpBuilder &builder, + ModuleOp module, Type devArgTy, Value host, + Value sizeSlot, Value addendum, + Value addendumScratch) { + if (isa(devArgTy)) + return populateStringAddendum(loc, builder, host, sizeSlot, addendum, + module); + if (auto vecTy = dyn_cast(devArgTy)) { + auto eleTy = vecTy.getElementType(); + if (cudaq::cc::isDynamicType(eleTy)) { + // Recursive case. Visit each dynamic element, copying it. + Value size = genVectorSize(loc, builder, host); + auto [bytes, count] = genByteSizeAndElementCount( + loc, builder, module, eleTy, size, host, devArgTy); + size = bytes; + builder.create(loc, size, sizeSlot); + + // Convert from bytes to vector length in elements. + // Compute new addendum start. + auto addrTy = getByteAddressableType(builder); + auto castEnd = builder.create(loc, addrTy, addendum); + Value newAddendum = builder.create( + loc, addendum.getType(), castEnd, + ArrayRef{size}); + builder.create(loc, newAddendum, addendumScratch); + Type dataTy = cudaq::opt::factory::genArgumentBufferType(eleTy); + auto arrDataTy = cudaq::cc::ArrayType::get(dataTy); + auto sizeBlockTy = cudaq::cc::PointerType::get(arrDataTy); + auto ptrDataTy = cudaq::cc::PointerType::get(dataTy); + + // In the recursive case, the next block of addendum is a vector of + // elements which are either sizes or contain sizes. The sizes are i64 + // and expressed in bytes. Each size will be the size of the span of the + // element (or its subfields) at that offset. + auto sizeBlock = + builder.create(loc, sizeBlockTy, addendum); + auto hostEleTy = + cast(host.getType()).getElementType(); + auto ptrPtrBlockTy = cudaq::cc::PointerType::get( + cast(hostEleTy).getMember(0)); + + // The host argument is a std::vector, so we want to get the address of + // "front" out of the vector (the first pointer in the triple) and step + // over the contiguous range of vectors in the host block. The vector of + // vectors forms a ragged array structure in host memory. + auto hostBeginPtrRef = builder.create( + loc, ptrPtrBlockTy, host, ArrayRef{0}); + auto hostBegin = builder.create(loc, hostBeginPtrRef); + auto hostBeginEleTy = cast(hostBegin.getType()); + auto hostBlockTy = cudaq::cc::PointerType::get( + cudaq::cc::ArrayType::get(hostBeginEleTy.getElementType())); + auto hostBlock = + builder.create(loc, hostBlockTy, hostBegin); + + // Loop over each vector element in the vector (recursively). 
    cudaq::opt::factory::createInvariantLoop(
-        builder, loc, topLevelIndex,
+        builder, loc, count,
        [&](OpBuilder &builder, Location loc, Region &, Block &block) {
          Value i = block.getArgument(0);
-          auto sub = builder.create(loc, hostVecTy,
-                                    nested, i);
-          auto p =
-              computeRecursiveVectorSize(loc, builder, sub, hostVecTy, sTy);
-          auto subSz = builder.create(loc, tmp);
-          auto sum = builder.create(loc, p.second, subSz);
-          builder.create(loc, sum, tmp);
+          Value addm =
+              builder.create(loc, addendumScratch);
+          auto subSlot = builder.create(
+              loc, ptrDataTy, sizeBlock,
+              ArrayRef{i});
+          auto subHost = builder.create(
+              loc, hostBeginEleTy, hostBlock,
+              ArrayRef{i});
+          Value newAddm =
+              populateDynamicAddendum(loc, builder, module, eleTy, subHost,
+                                      subSlot, addm, addendumScratch);
+          builder.create(loc, newAddm, addendumScratch);
        });
-    recursiveSize = builder.create(loc, tmp);
+    return builder.create(loc, addendumScratch);
+  }
+  return populateVectorAddendum(loc, builder, host, sizeSlot, addendum);
+  }
+  auto devStrTy = cast<cudaq::cc::StructType>(devArgTy);
+  auto hostStrTy = cast<cudaq::cc::StructType>(
+      cast<cudaq::cc::PointerType>(sizeSlot.getType()).getElementType());
+  assert(devStrTy.getNumMembers() == hostStrTy.getNumMembers());
+  for (auto iter : llvm::enumerate(devStrTy.getMembers())) {
+    std::int32_t iterIdx = iter.index();
+    auto hostPtrTy = cast<cudaq::cc::PointerType>(host.getType());
+    auto hostMemTy = cast<cudaq::cc::StructType>(hostPtrTy.getElementType())
+                         .getMember(iterIdx);
+    auto val = builder.create(
+        loc, cudaq::cc::PointerType::get(hostMemTy), host,
+        ArrayRef{iterIdx});
+    Type iterTy = iter.value();
+    if (cudaq::cc::isDynamicType(iterTy)) {
+      Value fieldInSlot = builder.create(
+          loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot,
+          ArrayRef{iterIdx});
+      addendum =
+          populateDynamicAddendum(loc, builder, module, iterTy, val,
+                                  fieldInSlot, addendum, addendumScratch);
    } else {
-      // Non-recursive case. Just compute the size of the top-level vector.
-      topLevelSize = getVectorSize(loc, builder, hostVecTy, hostArg);
-      recursiveSize = topLevelSize;
+      Value fieldInSlot = builder.create(
+          loc, cudaq::cc::PointerType::get(iterTy), sizeSlot,
+          ArrayRef{iterIdx});
+      auto v = builder.create(loc, val);
+      builder.create(loc, v, fieldInSlot);
    }
-    return {topLevelSize, recursiveSize};
  }
+  return addendum;
+}
-  /// This computes a dynamic struct's size and handles recursive dynamic types.
-  /// The first value returned is the initial value of the top level
-  /// (outermost) struct to be saved in the buffer. More specifically, any
-  /// (recursive) member that is a vector is replaced by an i64 byte size. The
-  /// offset of the trailing data is, as always, implicit. The second value is
-  /// the recursive size of all the dynamic components within the outer struct.
- std::pair computeRecursiveDynamicStructSize( - Location loc, OpBuilder &builder, cudaq::cc::StructType structTy, - Value arg, Value totalSize, cudaq::cc::StructType genTy) { - Value retval = builder.create(loc, genTy); - auto argTy = cast(arg.getType()); - for (auto iter : llvm::enumerate(structTy.getMembers())) { - auto memTy = iter.value(); - std::int32_t off = iter.index(); - auto structMemTy = - cast(argTy.getElementType()).getMember(off); - auto structMemPtrTy = cudaq::cc::PointerType::get(structMemTy); - auto memPtrVal = builder.create( - loc, structMemPtrTy, arg, ArrayRef{off}); - if (cudaq::cc::isDynamicType(memTy)) { - if (auto sTy = dyn_cast(memTy)) { - auto gTy = cast(structMemTy); - auto pr = computeRecursiveDynamicStructSize( - loc, builder, sTy, memPtrVal, totalSize, gTy); - retval = builder.create( - loc, retval.getType(), retval, pr.first, off); - totalSize = builder.create(loc, totalSize, pr.second); - continue; - } - auto memStdVecTy = cast(memTy); - Type eTy = memStdVecTy.getElementType(); - auto stlVecTy = cudaq::opt::factory::stlVectorType(eTy); - auto ptrMemTy = cudaq::cc::PointerType::get(stlVecTy); - auto pr = computeRecursiveVectorSize(loc, builder, memPtrVal, ptrMemTy, - memStdVecTy); - retval = builder.create( - loc, retval.getType(), retval, pr.second, off); - totalSize = builder.create(loc, totalSize, pr.first); - continue; - } - auto memVal = builder.create(loc, memPtrVal); - retval = builder.create(loc, retval.getType(), - retval, memVal, off); +static void +populateMessageBuffer(Location loc, OpBuilder &builder, ModuleOp module, + Value msgBufferBase, + ArrayRef> zippy, + Value addendum = {}, Value addendumScratch = {}) { + auto structTy = cast( + cast(msgBufferBase.getType()).getElementType()); + // Loop over all the arguments and populate the message buffer. + for (auto [idx, arg, devArgTy] : zippy) { + std::int32_t i = idx; + if (cudaq::cc::isDynamicType(devArgTy)) { + assert(addendum && "must have addendum to encode dynamic argument(s)"); + // Get the address of the slot to be filled. + auto memberTy = cast(structTy).getMember(i); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + auto slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{i}); + addendum = populateDynamicAddendum(loc, builder, module, devArgTy, arg, + slot, addendum, addendumScratch); + continue; + } + + // If the argument is a callable, skip it. + if (isa(devArgTy)) + continue; + // If the argument is an empty struct, skip it. + if (auto strTy = dyn_cast(devArgTy); + strTy && strTy.isEmpty()) + continue; + + // Get the address of the slot to be filled. + auto memberTy = cast(structTy).getMember(i); + auto ptrTy = cudaq::cc::PointerType::get(memberTy); + Value slot = builder.create( + loc, ptrTy, msgBufferBase, ArrayRef{i}); + + // Argument is a packaged kernel. In this case, the argument is some + // unknown kernel that may be called. The packaged argument is coming + // from opaque C++ host code, so we need to identify what kernel it + // references and then pass its name as a span of characters to the + // launch kernel. + if (isa(devArgTy)) { + auto i64Ty = builder.getI64Type(); + auto kernKey = builder.create( + loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); + builder.create(loc, kernKey.getResult(0), slot); + continue; } - return {retval, totalSize}; + + // Just pass the raw pointer. The buffer is supposed to be pointer-free + // since it may be unpacked in a different address space. 
However, if this
+    // is a simulation and things are in the same address space, we pass the
+    // pointer for convenience.
+    if (isa(devArgTy))
+      arg = builder.create(loc, memberTy, arg);
+
+    if (isa(arg.getType()) &&
+        (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) {
+      slot = builder.create(
+          loc, cudaq::cc::PointerType::get(arg.getType()), slot);
+    }
+    builder.create(loc, arg, slot);
+  }
+}
+
+/// A kernel function that takes a quantum type argument (also known as a pure
+/// device kernel) cannot be called directly from C++ (classical) code. It must
+/// be called via other quantum code.
+static bool hasLegalType(FunctionType funTy) {
+  for (auto ty : funTy.getInputs())
+    if (quake::isQuantumType(ty))
+      return false;
+  for (auto ty : funTy.getResults())
+    if (quake::isQuantumType(ty))
+      return false;
+  return true;
+}
+
+static MutableArrayRef<BlockArgument>
+dropAnyHiddenArguments(MutableArrayRef<BlockArgument> args, FunctionType funcTy,
+                       bool hasThisPointer) {
+  const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy);
+  const unsigned count =
+      cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet);
+  if (count > 0 && args.size() >= count &&
+      std::all_of(args.begin(), args.begin() + count, [](auto i) {
+        return isa<cudaq::cc::PointerType>(i.getType());
+      }))
+    return args.drop_front(count);
+  return args;
+}
+
+static std::pair<bool, func::FuncOp>
+lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module,
+                         func::FuncOp funcOp) {
+  if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") ||
+      mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) {
+    // No host entry point needed.
+    return {false, func::FuncOp{}};
+  }
+  if (auto *decl = module.lookupSymbol(mangledEntryPointName))
+    if (auto func = dyn_cast<func::FuncOp>(decl)) {
+      func.eraseBody();
+      return {true, func};
+    }
+  funcOp.emitOpError("could not generate the host-side kernel function (" +
+                     mangledEntryPointName + ")");
+  return {true, func::FuncOp{}};
+}
+
+/// Generate code to initialize the std::vector<bool>, \p sret, from an
+/// initializer list with data at \p data and length \p size. Use the library
+/// helper routine. This function takes two !llvm.ptr arguments.
+static void genStdvecBoolFromInitList(Location loc, OpBuilder &builder,
+                                      Value sret, Value data, Value size) {
+  auto ptrTy = cudaq::cc::PointerType::get(builder.getContext());
+  auto castData = builder.create(loc, ptrTy, data);
+  auto castSret = builder.create(loc, ptrTy, sret);
+  builder.create(loc, std::nullopt,
+                 cudaq::stdvecBoolCtorFromInitList,
+                 ArrayRef{castSret, castData, size});
+}
+
+/// Generate a `std::vector<T>` (where `T != bool`) from an initializer list.
+/// This is done with the assumption that `std::vector<T>` is implemented as a
+/// triple of pointers. The original content of the vector is freed and the new
+/// content, which is already on the stack, is moved into the `std::vector<T>`.
+static void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret,
+                                   Value data, Value tSize, Value vecSize) {
+  auto i8Ty = builder.getI8Type();
+  auto stlVectorTy =
+      cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty));
+  auto ptrTy = cudaq::cc::PointerType::get(i8Ty);
+  auto castSret = builder.create(loc, stlVectorTy, sret);
+  auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy);
+  auto sret0 = builder.create(
+      loc, ptrPtrTy, castSret, SmallVector{0});
+  auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty);
+  auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty);
+  auto buffPtr0 = builder.create(loc, ptrTy, data);
+  builder.create(loc, buffPtr0, sret0);
+  auto sret1 = builder.create(
+      loc, ptrPtrTy, castSret, SmallVector{1});
+  Value byteLen = builder.create(loc, tSize, vecSize);
+  auto buffPtr = builder.create(loc, ptrArrTy, data);
+  auto endPtr = builder.create(
+      loc, ptrTy, buffPtr, SmallVector{byteLen});
+  builder.create(loc, endPtr, sret1);
+  auto sret2 = builder.create(
+      loc, ptrPtrTy, castSret, SmallVector{2});
+  builder.create(loc, endPtr, sret2);
+}
+
+// Alloca a pointer to a pointer and initialize it to nullptr.
+static Value createEmptyHeapTracker(Location loc, OpBuilder &builder) {
+  auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type());
+  auto result = builder.create(loc, ptrI8Ty);
+  auto zero = builder.create(loc, 0, 64);
+  auto null = builder.create(loc, ptrI8Ty, zero);
+  builder.create(loc, null, result);
+  return result;
+}
+
+// If there are temporaries, call the helper to free them.
+static void maybeFreeHeapAllocations(Location loc, OpBuilder &builder,
+                                     Value heapTracker) {
+  auto head = builder.create(loc, heapTracker);
+  auto zero = builder.create(loc, 0, 64);
+  auto headAsInt =
+      builder.create(loc, builder.getI64Type(), head);
+  auto cmp = builder.create(loc, arith::CmpIPredicate::ne,
+                            headAsInt, zero);
+  // If there are no `std::vector<bool>` to unpack, then the heapTracker will
+  // be set to `nullptr` and otherwise unused. That will allow the compiler to
+  // DCE this call after constant propagation.
+  builder.create(
+      loc, TypeRange{}, cmp,
+      [&](OpBuilder &builder, Location loc, Region &region) {
+        region.push_back(new Block());
+        auto &body = region.front();
+        OpBuilder::InsertionGuard guard(builder);
+        builder.setInsertionPointToStart(&body);
+        builder.create(loc, std::nullopt,
+                       cudaq::stdvecBoolFreeTemporaryLists,
+                       ArrayRef{head});
+        builder.create(loc);
+      });
+}
-
-  /// Given that \p arg is a SpanLikeType value, compute its extent size (the
-  /// number of elements in the outermost vector times `sizeof(int64_t)`) and
-  /// total recursive size (both values are in bytes). We add the extent size
-  /// into the message buffer field and increase the size of the addend by the
-  /// total recursive size.
-  std::pair insertVectorSizeAndIncrementExtraBytes(
-      Location loc, OpBuilder &builder, Value arg,
-      cudaq::cc::PointerType ptrInTy, cudaq::cc::SpanLikeType stdvecTy,
-      Value stVal, std::int32_t idx, Value extraBytes) {
-    auto [extentSize, recursiveSize] =
-        computeRecursiveVectorSize(loc, builder, arg, ptrInTy, stdvecTy);
-    stVal = builder.create(loc, stVal.getType(),
-                           stVal, extentSize, idx);
-    extraBytes = builder.create(loc, extraBytes, recursiveSize);
-    return {stVal, extraBytes};
+/// Fetch an argument from the comm buffer. Here, the argument is not dynamic,
+/// so it can be read as-is out of the buffer.
+static Value fetchInputValue(Location loc, OpBuilder &builder, Type devTy,
+                             Value ptr) {
+  assert(!cudaq::cc::isDynamicType(devTy) && "must not be a dynamic type");
+  if (isa(devTy)) {
+    // An indirect callable passes a key value which will be used to determine
+    // the kernel that is being called.
+    auto key = builder.create(loc, ptr);
+    return builder.create(loc, devTy, key);
   }

-  Value genComputeReturnOffset(Location loc, OpBuilder &builder,
-                               FunctionType funcTy,
-                               cudaq::cc::StructType msgStructTy) {
-    if (funcTy.getNumResults() == 0)
-      return builder.create(loc, NoResultOffset, 64);
-    std::int32_t numKernelArgs = funcTy.getNumInputs();
-    auto i64Ty = builder.getI64Type();
-    return builder.create(
-        loc, i64Ty, msgStructTy, ArrayRef{numKernelArgs});
+  if (isa(devTy)) {
+    // A direct callable will have already been effectively inlined and this
+    // argument should not be referenced.
+    return builder.create(loc, devTy);
   }

-  /// Create a function that determines the return value offset in the message
-  /// buffer.
-  void genReturnOffsetFunction(Location loc, OpBuilder &builder,
-                               FunctionType devKernelTy,
-                               cudaq::cc::StructType msgStructTy,
-                               const std::string &classNameStr) {
-    auto *ctx = builder.getContext();
+  auto ptrDevTy = cudaq::cc::PointerType::get(devTy);
+  if (auto strTy = dyn_cast(devTy)) {
+    // Argument is a struct.
+    if (strTy.isEmpty())
+      return builder.create(loc, devTy);
+
+    // Cast to avoid conflicts between layout-compatible, distinct struct
+    // types.
+    auto structPtr = builder.create(loc, ptrDevTy, ptr);
+    return builder.create(loc, structPtr);
+  }
+
+  // Default case: the argument is passed as a value in place.
+  return builder.create(loc, ptr);
+}
+
+/// Helper routine to generate code to increment the trailing data pointer to
+/// the next block of data (if any).
+static Value incrementTrailingDataPointer(Location loc, OpBuilder &builder,
+                                          Value trailingData, Value bytes) {
+  auto i8Ty = builder.getI8Type();
+  auto bufferTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty));
+  auto buffPtr = builder.create(loc, bufferTy, trailingData);
+  auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty);
+  return builder.create(
+      loc, i8PtrTy, buffPtr, ArrayRef{bytes});
+}
+
+/// In the thunk, we need to unpack any `std::vector` objects encoded in the
+/// packet. Since these have dynamic size, they are encoded as trailing bytes
+/// by offset and size. The offset is implicit from the values of the
+/// arguments. All sizes are encoded as `int64_t`.
+///
+/// A vector of vector of ... T is encoded as an int64_t (length). At the
+/// offset of the level `i` vector will be a sequence of sizes for the level
+/// `i+1` vectors. For the leaf vector level, `n`, the blocks of data for each
+/// vector will be immediately following for each vector at level `n` for the
+/// branch of the tree being encoded.
+///
+/// For example, a variable defined and initialized as
+/// ```
+/// vector<vector<vector<char>>> example =
+///     {{{'a'}, {'b', 'c'}, {'z'}}, {{'d', 'e', 'f'}}};
+/// ```
+///
+/// and passed as an argument to a kernel will be encoded as the following
+/// block. The block will have a structure with the declared arguments
+/// followed by an addendum of variable data, where the vector data is
+/// encoded.
+///
+/// ```
+/// arguments: { ..., 1, ... }
+/// addendum: [[3; 1 2 1, a, b c, z] [1; 3, d e f]]
+/// ```
+static std::pair constructDynamicInputValue(Location loc,
+                                            OpBuilder &builder,
+                                            Type devTy, Value ptr,
+                                            Value trailingData) {
+  assert(cudaq::cc::isDynamicType(devTy) && "must be dynamic type");
+  // There are 2 cases.
+  // 1. The dynamic type is a std::span of any legal device argument type.
+  // 2. The dynamic type is a struct containing at least 1 std::span.
+  if (auto spanTy = dyn_cast(devTy)) {
+    // ptr: a pointer to the length of the block in bytes.
+    // trailingData: the block of data to decode.
+    auto eleTy = spanTy.getElementType();
    auto i64Ty = builder.getI64Type();
-    auto funcTy = FunctionType::get(ctx, {}, {i64Ty});
-    auto returnOffsetFunc = builder.create(
-        loc, classNameStr + ".returnOffset", funcTy);
-    OpBuilder::InsertionGuard guard(builder);
-    auto *entry = returnOffsetFunc.addEntryBlock();
-    builder.setInsertionPointToStart(entry);
-    auto result =
-        genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy);
-    builder.create(loc, result);
+    auto buffEleTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+
+    // Get the size of each element in the vector and compute the vector's
+    // logical length.
+    auto eleSize = builder.create(loc, i64Ty, buffEleTy);
+    Value bytes = builder.create(loc, ptr);
+    auto vecLength = builder.create(loc, bytes, eleSize);
+
+    if (cudaq::cc::isDynamicType(eleTy)) {
+      // The vector is recursively dynamic.
+      // Create a new block in which to place the stdvec/struct data in
+      // device-side format.
+      Value newVecData =
+          builder.create(loc, eleTy, vecLength);
+      // Compute new trailing data, skipping the current vector's data.
+      auto nextTrailingData =
+          incrementTrailingDataPointer(loc, builder, trailingData, bytes);
+
+      // For each element in the vector, convert it to device-side format and
+      // save the result in newVecData.
+      auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
+      auto packTy = cudaq::opt::factory::genArgumentBufferType(eleTy);
+      Type packedArrTy =
+          cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(packTy));
+      Type packedEleTy = cudaq::cc::PointerType::get(packTy);
+      auto arrPtr =
+          builder.create(loc, packedArrTy, trailingData);
+      auto trailingDataVar =
+          builder.create(loc, nextTrailingData.getType());
+      builder.create(loc, nextTrailingData,
+                     trailingDataVar);
+      cudaq::opt::factory::createInvariantLoop(
+          builder, loc, vecLength,
+          [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+            Value i = block.getArgument(0);
+            auto nextTrailingData =
+                builder.create(loc, trailingDataVar);
+            auto vecMemPtr = builder.create(
+                loc, packedEleTy, arrPtr,
+                ArrayRef{i});
+            auto r = constructDynamicInputValue(loc, builder, eleTy, vecMemPtr,
+                                                nextTrailingData);
+            auto newVecPtr = builder.create(
+                loc, elePtrTy, newVecData,
+                ArrayRef{i});
+            builder.create(loc, r.first, newVecPtr);
+            builder.create(loc, r.second, trailingDataVar);
+          });
+
+      // Create the new outer stdvec span as the result.
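+      // (For the `example` value in the comment above, this outermost span
+      // covers the two freshly built inner spans, while each leaf span
+      // points directly at its character payload in the trailing data.)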
+      Value stdvecResult = builder.create(
+          loc, spanTy, newVecData, vecLength);
+      nextTrailingData =
+          builder.create(loc, trailingDataVar);
+      return {stdvecResult, nextTrailingData};
+    }
+
+    // This vector has constant data, so just use the data in-place and
+    // construct the stdvec span with it.
+    auto castTrailingData = builder.create(
+        loc, cudaq::cc::PointerType::get(eleTy), trailingData);
+    Value stdvecResult = builder.create(
+        loc, spanTy, castTrailingData, vecLength);
+    auto nextTrailingData =
+        incrementTrailingDataPointer(loc, builder, trailingData, bytes);
+    return {stdvecResult, nextTrailingData};
+  }
+
+  // Argument must be a struct.
+  // The struct contains dynamic components. Extract them and build up the
+  // struct value to be passed as an argument.
+  // ptr: pointer to the first element of the struct or a vector length.
+  // trailingData: the block of data for the first dynamic type field.
+  auto strTy = cast(devTy);
+  auto ptrEleTy = cast(ptr.getType()).getElementType();
+  auto packedTy = cast(ptrEleTy);
+  Value result = builder.create(loc, strTy);
+  assert(strTy.getNumMembers() == packedTy.getNumMembers());
+  for (auto iter :
+       llvm::enumerate(llvm::zip(strTy.getMembers(), packedTy.getMembers()))) {
+    auto devMemTy = std::get<0>(iter.value());
+    std::int32_t off = iter.index();
+    auto packedMemTy = std::get<1>(iter.value());
+    auto dataPtr = builder.create(
+        loc, cudaq::cc::PointerType::get(packedMemTy), ptr,
+        ArrayRef{off});
+    if (cudaq::cc::isDynamicType(devMemTy)) {
+      auto r = constructDynamicInputValue(loc, builder, devMemTy, dataPtr,
+                                          trailingData);
+      result = builder.create(loc, strTy, result,
+                              r.first, off);
+      trailingData = r.second;
+      continue;
+    }
+    auto val = fetchInputValue(loc, builder, devMemTy, dataPtr);
+    result =
+        builder.create(loc, strTy, result, val, off);
  }
+  return {result, trailingData};
+}
+
+/// Translate the buffer data to a sequence of arguments suitable for the
+/// actual kernel call.
+///
+/// \param inTy The actual expected type of the argument.
+/// \param structTy The modified buffer type over all the arguments at the
+/// current level.
+static std::pair
+processInputValue(Location loc, OpBuilder &builder, Value trailingData,
+                  Value ptrPackedStruct, Type inTy, std::int32_t off,
+                  cudaq::cc::StructType packedStructTy) {
+  auto packedPtr = builder.create(
+      loc, cudaq::cc::PointerType::get(packedStructTy.getMember(off)),
+      ptrPackedStruct, ArrayRef{off});
+  if (cudaq::cc::isDynamicType(inTy))
+    return constructDynamicInputValue(loc, builder, inTy, packedPtr,
+                                      trailingData);
+  auto val = fetchInputValue(loc, builder, inTy, packedPtr);
+  return {val, trailingData};
+}
+
+/// This pass adds a `<kernel name>.thunk` function and a rewritten C++ host
+/// side (mangled) stub to the code for every entry-point kernel in the module.
+/// It may also generate a `<kernel name>.argsCreator` function. Finally, it
+/// creates registration hooks for the CUDA-Q runtime to be able to find the
+/// kernel by name and, as appropriate, the `<kernel name>.argsCreator`
+/// function.
+namespace {
+class GenerateKernelExecution
+    : public cudaq::opt::impl::GenerateKernelExecutionBase<
+          GenerateKernelExecution> {
+public:
+  using GenerateKernelExecutionBase::GenerateKernelExecutionBase;

  /// Creates a function that can take a block of pointers to argument values
  /// and using the compiler's knowledge of a kernel encodes those argument
@@ -340,6 +1206,7 @@ class GenerateKernelExecution
  /// buffer. (Message buffers are at least the size of \p structTy but may be
  /// extended.)
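  ///
  /// A hypothetical host-side use of the generated function (the names here
  /// are illustrative only, not emitted symbols):
  /// ```
  /// void *rawArgs[] = {&arg0, &arg1};
  /// void *msgBuffer = nullptr;
  /// std::uint64_t size = kernel_argsCreator(rawArgs, &msgBuffer);
  /// // ... pass msgBuffer to the launch machinery, which must free it ...
  /// ```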
func::FuncOp genKernelArgsCreatorFunction(Location loc, OpBuilder &builder, + ModuleOp module, FunctionType devKernelTy, cudaq::cc::StructType msgStructTy, const std::string &classNameStr, @@ -348,14 +1215,18 @@ class GenerateKernelExecution auto *ctx = builder.getContext(); Type i8Ty = builder.getI8Type(); Type ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto ptrPtrType = cudaq::cc::PointerType::get(ptrI8Ty); + auto ptrPtrType = getPointerToPointerType(builder); Type i64Ty = builder.getI64Type(); auto structPtrTy = cudaq::cc::PointerType::get(msgStructTy); - auto getHostArgType = [&](unsigned idx) { - bool hasSRet = cudaq::opt::factory::hasHiddenSRet(hostFuncTy); - unsigned count = cudaq::cc::numberOfHiddenArgs(hasThisPtr, hasSRet); - return hostFuncTy.getInput(count + idx); - }; + auto passedDevArgTys = devKernelTy.getInputs().drop_front(startingArgIdx); + + SmallVector passedHostArgTys; + for (auto ty : passedDevArgTys) { + Type hostTy = cudaq::opt::factory::convertToHostSideType(ty, module); + if (cudaq::cc::isDynamicType(ty)) + hostTy = cudaq::cc::PointerType::get(hostTy); + passedHostArgTys.push_back(hostTy); + } // Create the function that we'll fill. auto funcType = FunctionType::get(ctx, {ptrPtrType, ptrPtrType}, {i64Ty}); @@ -365,365 +1236,83 @@ class GenerateKernelExecution auto *entry = argsCreatorFunc.addEntryBlock(); builder.setInsertionPointToStart(entry); - // Get the original function args - auto kernelArgTypes = devKernelTy.getInputs().drop_front(startingArgIdx); + // Convert all the arguments passed in the array of void* to appear as if + // they had been naturally passed as C++ arguments. + // This means, casting to the correct type (host-side) and removing the + // outer pointer by a dereference. Each argument must be a valid reference + // at this point, so if the dereference fails (say it is a nullptr), it is a + // bug in the code that is calling this argsCreator. - // Init the struct - Value stVal = builder.create(loc, msgStructTy); - - // Get the variadic void* args - auto variadicArgs = builder.create( + // Get the array of void* args. + auto argsArray = builder.create( loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(ptrI8Ty)), entry->getArgument(0)); - // Initialize the counter for extra size. - Value zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - - // Process all the arguments for the original call by looping over the - // kernel's arguments. - bool hasTrailingData = false; - DenseMap replacementArgs; - for (auto kaIter : llvm::enumerate(kernelArgTypes)) { - std::int32_t idx = kaIter.index(); - - // The current cudaq kernel arg and message buffer element type. - Type currArgTy = kaIter.value(); - Type currEleTy = msgStructTy.getMember(idx); - - // Skip any elements that are callables or empty structures. - if (isa(currEleTy)) - continue; - if (auto strTy = dyn_cast(currEleTy)) - if (strTy.isEmpty()) - continue; - - // Get the pointer to the argument from out of the block of pointers, - // which are the variadic args. - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - SmallVector{idx}); - Value argPtr = builder.create(loc, ptrI8Ty, argPtrPtr); - - if (auto stdvecTy = dyn_cast(currArgTy)) { - // If this is a vector argument, then we will add data to the message - // buffer's addendum (unless the vector is length 0). 
- auto ptrInTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType())); - - Value arg = builder.create(loc, ptrInTy, argPtr); - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value temp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{temp, arg}); - replacementArgs[idx] = temp; - arg = temp; - } - - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } + // Loop over the array and cast the void* to the host-side type. + SmallVector pseudoArgs; + for (auto iter : llvm::enumerate(passedHostArgTys)) { + std::int32_t i = iter.index(); + auto parg = builder.create( + loc, ptrPtrType, argsArray, ArrayRef{i}); + Type ty = iter.value(); + // parg is a pointer to a pointer as it is an element of an array of + // pointers. Always dereference the first layer here. + Value deref = builder.create(loc, parg); + if (!isa(ty)) + ty = cudaq::cc::PointerType::get(ty); + pseudoArgs.push_back(builder.create(loc, ty, deref)); + } - if (auto strTy = dyn_cast(currArgTy)) { - Value v = argPtr; - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - v = builder.create( - loc, cudaq::cc::PointerType::get(currEleTy), v); - v = builder.create(loc, v); - stVal = builder.create(loc, stVal.getType(), - stVal, v, idx); - continue; - } - auto genTy = cast(currEleTy); - Value zero = builder.create(loc, 0, 64); - Type hostArgTy = getHostArgType(idx); - v = builder.create(loc, hostArgTy, v); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, v, zero, genTy); - stVal = builder.create(loc, stVal.getType(), - stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; - } - if (auto ptrTy = dyn_cast(currEleTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just pass - // the pointer. We can do that in this case because the synthesis step - // (which will receive the argument data) is assumed to run in the - // same memory space. - argPtr = builder.create(loc, currEleTy, argPtr); - stVal = builder.create(loc, stVal.getType(), - stVal, argPtr, idx); - } - continue; - } + // Zip the arguments with the device side argument types. Recall that some + // of the (left-most) arguments may have been dropped on the floor. + const bool hasDynamicSignature = isDynamicSignature(devKernelTy); + Value heapTracker = createEmptyHeapTracker(loc, builder); + auto zippy = zipArgumentsWithDeviceTypes( + loc, builder, module, pseudoArgs, passedDevArgTys, heapTracker); + auto sizeScratch = builder.create(loc, i64Ty); + auto messageBufferSize = [&]() -> Value { + if (hasDynamicSignature) + return genSizeOfDynamicMessageBuffer(loc, builder, module, msgStructTy, + zippy, sizeScratch); + return builder.create(loc, i64Ty, msgStructTy); + }(); - // cast to the struct element type, void* -> TYPE * - argPtr = builder.create( - loc, cudaq::cc::PointerType::get(currEleTy), argPtr); - Value loadedVal = - builder.create(loc, currEleTy, argPtr); - stVal = builder.create(loc, stVal.getType(), - stVal, loadedVal, idx); + // Allocate the message buffer on the heap. It must outlive this call. 
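+    // (Ownership of this allocation transfers to the caller: the buffer's
+    // address is stored through the second function argument below, and the
+    // caller is expected to free it once the launch has consumed it.)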
+ auto buff = builder.create(loc, ptrI8Ty, "malloc", + ValueRange(messageBufferSize)); + Value rawMessageBuffer = buff.getResult(0); + Value msgBufferPrefix = + builder.create(loc, structPtrTy, rawMessageBuffer); + + // Populate the message buffer with the pointer-free argument values. + if (hasDynamicSignature) { + auto addendumScratch = builder.create(loc, ptrI8Ty); + Value prefixSize = + builder.create(loc, i64Ty, msgStructTy); + auto arrMessageBuffer = builder.create( + loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)), + rawMessageBuffer); + // Compute the position of the addendum. + Value addendumPtr = builder.create( + loc, ptrI8Ty, arrMessageBuffer, + ArrayRef{prefixSize}); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy, + addendumPtr, addendumScratch); + } else { + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy); } - // Compute the struct size - Value structSize = - builder.create(loc, i64Ty, msgStructTy); - - // Here we do have vector args - Value extendedStructSize = - hasTrailingData - ? builder.create(loc, structSize, extraBytes) - : structSize; - // If no vector args, handle this simple case and drop out - Value buff = builder - .create(loc, ptrI8Ty, "malloc", - ValueRange(extendedStructSize)) - .getResult(0); - - Value casted = builder.create(loc, structPtrTy, buff); - builder.create(loc, stVal, casted); - if (hasTrailingData) { - auto arrTy = cudaq::cc::ArrayType::get(i8Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrTy); - auto cast1 = builder.create(loc, ptrArrTy, buff); - Value vecToBuffer = builder.create( - loc, ptrI8Ty, cast1, SmallVector{structSize}); - for (auto iter : llvm::enumerate(msgStructTy.getMembers())) { - std::int32_t idx = iter.index(); - if (idx == static_cast(kernelArgTypes.size())) - break; - // Get the corresponding cudaq kernel arg type - auto currArgTy = kernelArgTypes[idx]; - if (auto stdvecTy = dyn_cast(currArgTy)) { - auto bytes = builder.create( - loc, builder.getI64Type(), stVal, idx); - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - ArrayRef{idx}); - auto ptrInTy = cudaq::cc::PointerType::get( - cudaq::opt::factory::stlVectorType(stdvecTy.getElementType())); - Value arg = - builder.create(loc, ptrI8Ty, argPtrPtr); - arg = builder.create(loc, ptrInTy, arg); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - assert(replacementArgs.count(idx) && "must be in map"); - auto arg = replacementArgs[idx]; - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = builder.create(loc, heapPtr); - auto i8Ty = builder.getI8Type(); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - } else if (auto strTy = dyn_cast(currArgTy)) { - if (cudaq::cc::isDynamicType(strTy)) { - Value argPtrPtr = builder.create( - loc, ptrPtrType, variadicArgs, - ArrayRef{idx}); - Value arg = - builder.create(loc, ptrI8Ty, argPtrPtr); - Type hostArgTy = getHostArgType(idx); - arg = builder.create(loc, hostArgTy, arg); - auto structPtrArrTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(msgStructTy)); - auto temp = - builder.create(loc, structPtrArrTy, buff); - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); - } - 
} - } - } - builder.create(loc, buff, entry->getArgument(1)); - builder.create(loc, ValueRange{extendedStructSize}); - return argsCreatorFunc; - } + maybeFreeHeapAllocations(loc, builder, heapTracker); - /// In the thunk, we need to unpack any `std::vector` objects encoded in the - /// packet. Since these have dynamic size, they are encoded as trailing bytes - /// by offset and size. The offset is implicit from the values of the - /// arguments. All sizes are encoded as `int64_t`. - /// - /// A vector of vector of ... T is encoded as a int64_t (length). At the - /// offset of the level `i` vector will be a sequence of sizes for the level - /// `i+1` vectors. For the leaf vector level, `n`, the blocks of data for each - /// vector will be immediately following for each vector at level `n` for the - /// branch of the tree being encoded. - /// - /// For example, a variable defined and initialized as - /// ``` - /// vector>> example = - /// {{{'a'}, {'b', 'c'}, {'z'}}, {{'d' 'e', 'f'}}}; - /// ``` - /// - /// and passed as an argument to a kernel will be encoded as the following - /// block. The block will have a structure with the declared arguments - /// followed by an addendum of variable data, where the vector data is - /// encoded. - /// - /// ``` - /// arguments: { ..., 1, ... } - /// addendum: [[3; 1 2 1, a, b c, z] [1; 3, d e f]] - /// ``` - std::pair unpackStdVector(Location loc, OpBuilder &builder, - cudaq::cc::SpanLikeType stdvecTy, - Value vecSize, Value trailingData) { - // Convert the pointer-free std::vector to a span structure to be - // passed. A span structure is a pointer and a size (in element - // units). Note that this structure may be recursive. - auto i8Ty = builder.getI8Type(); - auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - auto bytesTy = cudaq::cc::PointerType::get(arrI8Ty); - Type eleTy = stdvecTy.getElementType(); - auto innerStdvecTy = dyn_cast(eleTy); - std::size_t eleSize = - innerStdvecTy ? /*(i64Type/8)*/ 8 : dataLayout->getTypeSize(eleTy); - auto eleSizeVal = [&]() -> Value { - if (eleSize) - return builder.create(loc, eleSize, 64); - assert(isa(eleTy) || - (isa(eleTy) && - !cast(eleTy).isUnknownSize())); - auto i64Ty = builder.getI64Type(); - return builder.create(loc, i64Ty, eleTy); - }(); - auto vecLength = builder.create(loc, vecSize, eleSizeVal); - if (innerStdvecTy) { - // Recursive case: std::vector> - // TODO: Uses stack allocation, however it may be better to use heap - // allocation. It's not clear the QPU has heap memory allocation. If this - // uses heap allocation, then the thunk must free that memory *after* the - // kernel proper returns. - auto vecTmp = builder.create(loc, eleTy, vecLength); - auto currentEnd = builder.create(loc, ptrI8Ty); - auto i64Ty = builder.getI64Type(); - auto arrI64Ty = cudaq::cc::ArrayType::get(i64Ty); - auto arrTy = cudaq::cc::PointerType::get(arrI64Ty); - auto innerVec = - builder.create(loc, arrTy, trailingData); - auto trailingBytes = - builder.create(loc, bytesTy, trailingData); - trailingData = builder.create( - loc, ptrI8Ty, trailingBytes, vecSize); - builder.create(loc, trailingData, currentEnd); - // Loop over each subvector in the vector and recursively unpack it into - // the vecTmp variable. Leaf vectors do not need a fresh variable. This - // effectively translates all the size/offset information for all the - // subvectors into temps. 
- Value vecLengthIndex = builder.create( - loc, builder.getI64Type(), vecLength, - cudaq::cc::CastOpMode::Unsigned); - cudaq::opt::factory::createInvariantLoop( - builder, loc, vecLengthIndex, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - auto innerPtr = builder.create( - loc, cudaq::cc::PointerType::get(i64Ty), innerVec, - SmallVector{i}); - Value innerVecSize = - builder.create(loc, innerPtr); - Value tmp = builder.create(loc, currentEnd); - auto unpackPair = - unpackStdVector(loc, builder, innerStdvecTy, innerVecSize, tmp); - auto ptrInnerTy = cudaq::cc::PointerType::get(innerStdvecTy); - auto subVecPtr = builder.create( - loc, ptrInnerTy, vecTmp, - SmallVector{i}); - builder.create(loc, unpackPair.first, - subVecPtr); - builder.create(loc, unpackPair.second, - currentEnd); - }); - auto coerceResult = builder.create( - loc, cudaq::cc::PointerType::get(stdvecTy), vecTmp); - trailingData = builder.create(loc, currentEnd); - Value result = builder.create( - loc, stdvecTy, coerceResult, vecLength); - return {result, trailingData}; - } - // Must divide by byte, 8 bits. - // The data is at trailingData and is valid for vecLength of eleTy. - auto castData = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), trailingData); - Value stdVecResult = builder.create( - loc, stdvecTy, castData, vecLength); - auto arrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)); - Value casted = builder.create(loc, arrTy, trailingData); - trailingData = - builder.create(loc, ptrI8Ty, casted, vecSize); - return {stdVecResult, trailingData}; - } + // Return the message buffer and its size in bytes. + builder.create(loc, rawMessageBuffer, + entry->getArgument(1)); + builder.create(loc, ValueRange{messageBufferSize}); - /// Translate the buffer data to a sequence of arguments suitable to the - /// actual kernel call. - /// - /// \param inTy The actual expected type of the argument. - /// \param structTy The modified buffer type over all the arguments at the - /// current level. - std::pair processInputValue(Location loc, OpBuilder &builder, - Value trailingData, Value val, - Type inTy, std::int64_t off, - cudaq::cc::StructType structTy) { - if (isa(inTy)) { - auto i64Ty = builder.getI64Type(); - auto key = - builder.create(loc, i64Ty, val, off); - return {builder.create(loc, inTy, key), trailingData}; - } - if (isa(inTy)) - return {builder.create(loc, inTy), trailingData}; - if (auto stdVecTy = dyn_cast(inTy)) { - Value vecSize = builder.create( - loc, builder.getI64Type(), val, off); - return unpackStdVector(loc, builder, stdVecTy, vecSize, trailingData); - } - if (auto strTy = dyn_cast(inTy)) { - if (!cudaq::cc::isDynamicType(strTy)) { - if (strTy.isEmpty()) - return {builder.create(loc, inTy), trailingData}; - return {builder.create(loc, inTy, val, off), - trailingData}; - } - // The struct contains dynamic components. Extract them and build up the - // struct value to be passed as an argument. - Type buffMemTy = structTy.getMember(off); - Value strVal = builder.create(loc, inTy); - Value subVal = - builder.create(loc, buffMemTy, val, off); - // Convert the argument type, strTy, to a buffer type. 
- auto memberArgTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - for (auto iter : llvm::enumerate(strTy.getMembers())) { - auto [a, t] = - processInputValue(loc, builder, trailingData, subVal, iter.value(), - iter.index(), memberArgTy); - trailingData = t; - strVal = builder.create(loc, inTy, strVal, a, - iter.index()); - } - return {strVal, trailingData}; - } - return {builder.create(loc, inTy, val, off), - trailingData}; + // Note: the .argsCreator will have allocated space for a static result in + // the message buffer. If the kernel returns a dynamic result, the launch + // kernel code will have to properly return it in the appropriate context. + return argsCreatorFunc; } /// Generate the thunk function. This function is called by the library @@ -747,7 +1336,6 @@ class GenerateKernelExecution auto castOp = builder.create(loc, structPtrTy, thunkEntry->getArgument(0)); auto isClientServer = thunkEntry->getArgument(1); - Value val = builder.create(loc, castOp); auto i64Ty = builder.getI64Type(); // Compute the struct size without the trailing bytes, structSize. @@ -768,7 +1356,7 @@ class GenerateKernelExecution SmallVector args; const std::int32_t offset = funcTy.getNumInputs(); for (auto inp : llvm::enumerate(funcTy.getInputs())) { - auto [a, t] = processInputValue(loc, builder, trailingData, val, + auto [a, t] = processInputValue(loc, builder, trailingData, castOp, inp.value(), inp.index(), structTy); trailingData = t; args.push_back(a); @@ -846,438 +1434,76 @@ class GenerateKernelExecution return thunk; } - /// Generate code to initialize the std::vector, \p sret, from an - /// initializer list with data at \p data and length \p size. Use the library - /// helper routine. This function takes two !llvm.ptr arguments. - void genStdvecBoolFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value size) { - auto ptrTy = cudaq::cc::PointerType::get(builder.getContext()); - auto castData = builder.create(loc, ptrTy, data); - auto castSret = builder.create(loc, ptrTy, sret); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolCtorFromInitList, - ArrayRef{castSret, castData, size}); - } - - /// Generate a `std::vector` (where `T != bool`) from an initializer list. - /// This is done with the assumption that `std::vector` is implemented as a - /// triple of pointers. The original content of the vector is freed and the - /// new content, which is already on the stack, is moved into the - /// `std::vector`. 
- void genStdvecTFromInitList(Location loc, OpBuilder &builder, Value sret, - Value data, Value tSize, Value vecSize) { - auto i8Ty = builder.getI8Type(); - auto stlVectorTy = - cudaq::cc::PointerType::get(cudaq::opt::factory::stlVectorType(i8Ty)); - auto ptrTy = cudaq::cc::PointerType::get(i8Ty); - auto castSret = builder.create(loc, stlVectorTy, sret); - auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); - auto sret0 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{0}); - auto arrI8Ty = cudaq::cc::ArrayType::get(i8Ty); - auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); - auto buffPtr0 = builder.create(loc, ptrTy, data); - builder.create(loc, buffPtr0, sret0); - auto sret1 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{1}); - Value byteLen = builder.create(loc, tSize, vecSize); - auto buffPtr = builder.create(loc, ptrArrTy, data); - auto endPtr = builder.create( - loc, ptrTy, buffPtr, SmallVector{byteLen}); - builder.create(loc, endPtr, sret1); - auto sret2 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{2}); - builder.create(loc, endPtr, sret2); - } - - static MutableArrayRef - dropAnyHiddenArguments(MutableArrayRef args, - FunctionType funcTy, bool hasThisPointer) { - const bool hiddenSRet = cudaq::opt::factory::hasHiddenSRet(funcTy); - const unsigned count = - cudaq::cc::numberOfHiddenArgs(hasThisPointer, hiddenSRet); - if (count > 0 && args.size() >= count && - std::all_of(args.begin(), args.begin() + count, [](auto i) { - return isa(i.getType()); - })) - return args.drop_front(count); - return args; - } - - // Return the vector's length, computed on the CPU side, in bytes. - Value computeHostVectorLengthInBytes(Location loc, OpBuilder &builder, - Value hostArg, Type eleTy, - cudaq::cc::PointerType hostVecTy) { - auto rawSize = getVectorSize(loc, builder, hostVecTy, hostArg); - if (isa(eleTy)) { - auto three = builder.create(loc, 3, 64); - return builder.create(loc, rawSize, three); - } - return rawSize; - } - - Value fetchHostVectorFront(Location loc, OpBuilder &builder, Value hostArg, - cudaq::cc::PointerType hostVecTy) { - auto inpStructTy = cast(hostVecTy.getElementType()); - auto ptrTtype = cudaq::cc::PointerType::get(inpStructTy.getMember(0)); - auto beginPtr = builder.create( - loc, ptrTtype, hostArg, SmallVector{0}); - auto ptrArrSTy = cudaq::opt::factory::getIndexedObjectType(inpStructTy); - auto vecPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrArrSTy), beginPtr); - return builder.create(loc, vecPtr); - } - - Value recursiveVectorDataCopy(Location loc, OpBuilder &builder, Value hostArg, - Value buffPtr, cudaq::cc::SpanLikeType stdvecTy, - cudaq::cc::PointerType hostVecTy) { - auto vecLen = computeHostVectorLengthInBytes(loc, builder, hostArg, - stdvecTy, hostVecTy); - auto nested = fetchHostVectorFront(loc, builder, hostArg, hostVecTy); - auto vecLogicalLen = convertLengthBytesToLengthI64(loc, builder, vecLen); - auto vecLenIndex = builder.create( - loc, builder.getI64Type(), vecLogicalLen, - cudaq::cc::CastOpMode::Unsigned); - auto buffPtrTy = cast(buffPtr.getType()); - auto tmp = builder.create(loc, buffPtrTy); - auto buffArrTy = cudaq::cc::ArrayType::get(buffPtrTy.getElementType()); - auto castPtr = builder.create( - loc, cudaq::cc::PointerType::get(buffArrTy), buffPtr); - auto newEnd = builder.create( - loc, buffPtrTy, castPtr, SmallVector{vecLen}); - builder.create(loc, newEnd, tmp); - auto i64Ty = builder.getI64Type(); - auto arrI64Ty = cudaq::cc::ArrayType::get(i64Ty); - auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); 
- auto ptrArrTy = cudaq::cc::PointerType::get(arrI64Ty); - auto vecBasePtr = builder.create(loc, ptrArrTy, buffPtr); - auto nestedArr = builder.create(loc, hostVecTy, nested); - auto hostArrVecTy = cudaq::cc::PointerType::get( - cudaq::cc::ArrayType::get(hostVecTy.getElementType())); - cudaq::opt::factory::createInvariantLoop( - builder, loc, vecLenIndex, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value i = block.getArgument(0); - auto currBuffPtr = builder.create( - loc, ptrI64Ty, vecBasePtr, ArrayRef{i}); - auto upCast = - builder.create(loc, hostArrVecTy, nestedArr); - auto hostSubVec = builder.create( - loc, hostVecTy, upCast, ArrayRef{i}); - Value buff = builder.create(loc, tmp); - // Compute and save the byte size. - auto vecSz = computeHostVectorLengthInBytes( - loc, builder, hostSubVec, stdvecTy.getElementType(), hostVecTy); - builder.create(loc, vecSz, currBuffPtr); - // Recursively copy vector data. - auto endBuff = encodeVectorData(loc, builder, vecSz, stdvecTy, - hostSubVec, buff, hostVecTy); - builder.create(loc, endBuff, tmp); - }); - return builder.create(loc, tmp); - } - - /// Recursively encode a `std::vector` into a buffer's addendum. The data is - /// read from \p hostArg. The data is \p bytes size long if this is a leaf - /// vector, otherwise the size is computed on-the-fly during the encoding of - /// the ragged array. - /// \return The new pointer to the end of the addendum block. - Value encodeVectorData(Location loc, OpBuilder &builder, Value bytes, - cudaq::cc::SpanLikeType stdvecTy, Value hostArg, - Value bufferAddendum, cudaq::cc::PointerType ptrInTy) { - auto eleTy = stdvecTy.getElementType(); - if (auto subVecTy = dyn_cast(eleTy)) - return recursiveVectorDataCopy(loc, builder, hostArg, bufferAddendum, - subVecTy, ptrInTy); - return copyVectorData(loc, builder, bytes, hostArg, bufferAddendum); - } - - /// Recursively encode a struct which has dynamically sized members (such as - /// vectors). The vector members are encoded as i64 sizes with the data - /// attached to the buffer addendum. - /// \return The new pointer to the end of the addendum block. - Value encodeDynamicStructData(Location loc, OpBuilder &builder, - cudaq::cc::StructType deviceTy, Value hostArg, - Value bufferArg, Value bufferAddendum) { - for (auto iter : llvm::enumerate(deviceTy.getMembers())) { - auto memTy = iter.value(); - if (auto vecTy = dyn_cast(memTy)) { - Type eTy = vecTy.getElementType(); - auto hostTy = cudaq::opt::factory::stlVectorType(eTy); - auto ptrHostTy = cudaq::cc::PointerType::get(hostTy); - auto ptrI64Ty = cudaq::cc::PointerType::get(builder.getI64Type()); - std::int32_t offset = iter.index(); - auto sizeAddr = builder.create( - loc, ptrI64Ty, bufferArg, - ArrayRef{0, 0, offset}); - auto size = builder.create(loc, sizeAddr); - auto vecAddr = builder.create( - loc, ptrHostTy, hostArg, - ArrayRef{offset}); - bufferAddendum = encodeVectorData(loc, builder, size, vecTy, vecAddr, - bufferAddendum, ptrHostTy); - } else if (auto strTy = dyn_cast(memTy)) { - if (cudaq::cc::isDynamicType(strTy)) { - auto ptrStrTy = cudaq::cc::PointerType::get(strTy); - std::int32_t idx = iter.index(); - auto strAddr = builder.create( - loc, ptrStrTy, bufferArg, - ArrayRef{idx}); - bufferAddendum = encodeDynamicStructData(loc, builder, strTy, strAddr, - bufferArg, bufferAddendum); - } - } else if (auto arrTy = dyn_cast(memTy)) { - // This is like vector type if the array has dynamic size. If it has a - // constant size, it is like a struct with n identical members. 
- TODO_loc(loc, "array type"); - } - } - return bufferAddendum; - } - - static std::pair - lookupHostEntryPointFunc(StringRef mangledEntryPointName, ModuleOp module, - func::FuncOp funcOp) { - if (mangledEntryPointName.equals("BuilderKernel.EntryPoint") || - mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) { - // No host entry point needed. - return {false, func::FuncOp{}}; - } - if (auto *decl = module.lookupSymbol(mangledEntryPointName)) - if (auto func = dyn_cast(decl)) { - func.eraseBody(); - return {true, func}; - } - funcOp.emitOpError("could not generate the host-side kernel function (" + - mangledEntryPointName + ")"); - return {true, func::FuncOp{}}; - } - - /// Generate an all new entry point body, calling launchKernel in the runtime - /// library. Pass along the thunk, so the runtime can call the quantum - /// circuit. These entry points are `operator()` member functions in a class, - /// so account for the `this` argument here. - void genNewHostEntryPoint(Location loc, OpBuilder &builder, + /// Generate an all new entry point body, calling someLaunchKernel in + /// the runtime library. Pass along the thunk, so the runtime can call the + /// quantum circuit. These entry points may be `operator()` member functions + /// in a class, so account for the `this` argument here. + void genNewHostEntryPoint(Location loc, OpBuilder &builder, ModuleOp module, FunctionType devFuncTy, LLVM::GlobalOp kernelNameObj, func::FuncOp hostFunc, bool addThisPtr, cudaq::cc::StructType structTy, func::FuncOp thunkFunc) { auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); - std::int32_t offset = devFuncTy.getNumInputs(); + auto i8Ty = builder.getI8Type(); + auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); auto thunkTy = getThunkType(ctx); auto structPtrTy = cudaq::cc::PointerType::get(structTy); - Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); + const std::int32_t offset = devFuncTy.getNumInputs(); + Block *hostFuncEntryBlock = hostFunc.addEntryBlock(); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(hostFuncEntryBlock); - auto i8Ty = builder.getI8Type(); - auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); - Value temp; + SmallVector blockArgs{dropAnyHiddenArguments( + hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; + SmallVector blockValues(blockArgs.size()); + std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); + const bool hasDynamicSignature = isDynamicSignature(devFuncTy); + Value heapTracker = createEmptyHeapTracker(loc, builder); + auto zippy = zipArgumentsWithDeviceTypes( + loc, builder, module, blockValues, devFuncTy.getInputs(), heapTracker); + auto sizeScratch = builder.create(loc, i64Ty); + auto messageBufferSize = [&]() -> Value { + if (hasDynamicSignature) + return genSizeOfDynamicMessageBuffer(loc, builder, module, structTy, + zippy, sizeScratch); + return builder.create(loc, i64Ty, structTy); + }(); + + Value msgBufferPrefix; Value castTemp; Value resultOffset; Value castLoadThunk; Value extendedStructSize; if (isCodegenPackedData(codegenKind)) { - Value stVal = builder.create(loc, structTy); - - // Process all the arguments for the original call, ignoring any hidden - // arguments (such as the `this` pointer). 
- auto zero = builder.create(loc, 0, 64); - Value extraBytes = zero; - bool hasTrailingData = false; - SmallVector blockArgs{dropAnyHiddenArguments( - hostFuncEntryBlock->getArguments(), devFuncTy, addThisPtr)}; - std::int32_t idx = 0; - SmallVector blockValues(blockArgs.size()); - std::copy(blockArgs.begin(), blockArgs.end(), blockValues.begin()); - for (auto iter = blockArgs.begin(), end = blockArgs.end(); iter != end; - ++iter, ++idx) { - Value arg = *iter; - Type inTy = arg.getType(); - Type quakeTy = devFuncTy.getInput(idx); - // If the argument is a callable, skip it. - if (isa(quakeTy)) - continue; - - // Argument is a packaged kernel. In this case, the argument is some - // unknown kernel that may be called. The packaged argument is coming - // from opaque C++ host code, so we need to identify what kernel it - // references and then pass its name as a span of characters to the - // launch kernel. - if (isa(quakeTy)) { - auto kernKey = builder.create( - loc, i64Ty, cudaq::runtime::getLinkableKernelKey, - ValueRange{arg}); - stVal = builder.create( - loc, stVal.getType(), stVal, kernKey.getResult(0), idx); - continue; - } - - // If the argument is an empty struct, skip it. - if (auto strTy = dyn_cast(quakeTy)) - if (strTy.isEmpty()) - continue; - - if (auto stdvecTy = dyn_cast(quakeTy)) { - // Per the CUDA-Q spec, an entry point kernel must take a `[const] - // std::vector` value argument. - // Should the spec stipulate that pure device kernels must pass by - // read-only reference, i.e., take `const std::vector &` arguments? - auto ptrInTy = cast(inTy); - // If this is a std::vector, unpack it. - if (stdvecTy.getElementType() == builder.getI1Type()) { - // Create a mock vector of i8 and populate the bools, 1 per char. - Value tmp = builder.create( - loc, ptrInTy.getElementType()); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ValueRange{tmp, arg}); - arg = blockValues[idx] = tmp; - } - // FIXME: call the `size` member function. For expediency, assume this - // is an std::vector and the size is the scaled delta between the - // first two pointers. Use the unscaled size for now. - auto [p1, p2] = insertVectorSizeAndIncrementExtraBytes( - loc, builder, arg, ptrInTy, stdvecTy, stVal, idx, extraBytes); - stVal = p1; - extraBytes = p2; - hasTrailingData = true; - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (!isa(arg.getType())) { - // If argument is not a pointer, then struct was promoted into a - // register. - auto *parent = builder.getBlock()->getParentOp(); - auto module = parent->getParentOfType(); - auto tmp = builder.create(loc, quakeTy); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), tmp); - if (cudaq::opt::factory::isX86_64(module)) { - builder.create(loc, arg, cast); - if (cudaq::opt::factory::structUsesTwoArguments(quakeTy)) { - auto arrTy = cudaq::cc::ArrayType::get(builder.getI8Type()); - auto cast = builder.create( - loc, cudaq::cc::PointerType::get(arrTy), tmp); - auto hiPtr = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), cast, - cudaq::cc::ComputePtrArg{8}); - ++iter; - Value nextArg = *iter; - auto cast2 = builder.create( - loc, cudaq::cc::PointerType::get(nextArg.getType()), hiPtr); - builder.create(loc, nextArg, cast2); - } - } else { - builder.create(loc, arg, cast); - } - // Load the assembled (sub-)struct and insert into the buffer value. 
- Value v = builder.create(loc, tmp); - stVal = builder.create( - loc, stVal.getType(), stVal, v, idx); - continue; - } - if (!cudaq::cc::isDynamicType(strTy)) { - // struct is static size, so just load the value (byval ptr). - Value v = builder.create(loc, arg); - stVal = builder.create( - loc, stVal.getType(), stVal, v, idx); - continue; - } - auto genTy = cast( - cudaq::opt::factory::genArgumentBufferType(strTy)); - Value zero = builder.create(loc, 0, 64); - auto [quakeVal, recursiveSize] = computeRecursiveDynamicStructSize( - loc, builder, strTy, arg, zero, genTy); - stVal = builder.create( - loc, stVal.getType(), stVal, quakeVal, idx); - extraBytes = - builder.create(loc, extraBytes, recursiveSize); - hasTrailingData = true; - continue; - } - if (auto ptrTy = dyn_cast(inTy)) { - if (isa(ptrTy.getElementType())) { - // Special case: if the argument is a `cudaq::state*`, then just - // pass the pointer. We can do that in this case because the - // synthesis step (which will receive the argument data) is assumed - // to run in the same memory space. - Value argPtr = builder.create(loc, inTy, arg); - stVal = builder.create( - loc, stVal.getType(), stVal, argPtr, idx); - } - continue; - } - - stVal = builder.create(loc, stVal.getType(), - stVal, arg, idx); + auto rawMessageBuffer = + builder.create(loc, i8Ty, messageBufferSize); + msgBufferPrefix = + builder.create(loc, structPtrTy, rawMessageBuffer); + + if (hasDynamicSignature) { + auto addendumScratch = + builder.create(loc, ptrI8Ty); + Value prefixSize = + builder.create(loc, i64Ty, structTy); + Value addendumPtr = builder.create( + loc, ptrI8Ty, rawMessageBuffer, + ArrayRef{prefixSize}); + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy, + addendumPtr, addendumScratch); + } else { + populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy); } - // Compute the struct size without the trailing bytes, structSize, and - // with the trailing bytes, extendedStructSize. - Value structSize = - builder.create(loc, i64Ty, structTy); - extendedStructSize = - builder.create(loc, structSize, extraBytes); - - // Allocate our struct to save the argument to. - auto buff = - builder.create(loc, i8Ty, extendedStructSize); - - temp = builder.create(loc, structPtrTy, buff); - - // Store the arguments to the argument section. - builder.create(loc, stVal, temp); - - auto structPtrArrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(structTy)); - temp = builder.create(loc, structPtrArrTy, buff); - - // Append the vector data to the end of the struct. - if (hasTrailingData) { - Value vecToBuffer = builder.create( - loc, ptrI8Ty, buff, SmallVector{structSize}); - // Ignore any hidden `this` argument. 
- for (auto inp : llvm::enumerate(blockValues)) { - Value arg = inp.value(); - Type inTy = arg.getType(); - std::int32_t idx = inp.index(); - Type quakeTy = devFuncTy.getInput(idx); - if (auto stdvecTy = dyn_cast(quakeTy)) { - auto bytes = builder.create(loc, i64Ty, - stVal, idx); - assert(stdvecTy == devFuncTy.getInput(idx)); - auto ptrInTy = cast(inTy); - vecToBuffer = encodeVectorData(loc, builder, bytes, stdvecTy, arg, - vecToBuffer, ptrInTy); - if (stdvecTy.getElementType() == builder.getI1Type()) { - auto ptrI1Ty = cudaq::cc::PointerType::get(builder.getI1Type()); - auto heapPtr = builder.create( - loc, cudaq::cc::PointerType::get(ptrI1Ty), arg, - ArrayRef{0}); - auto loadHeapPtr = - builder.create(loc, heapPtr); - Value heapCast = builder.create( - loc, cudaq::cc::PointerType::get(i8Ty), loadHeapPtr); - builder.create(loc, std::nullopt, "free", - ArrayRef{heapCast}); - } - continue; - } - if (auto strTy = dyn_cast(quakeTy)) { - if (cudaq::cc::isDynamicType(strTy)) - vecToBuffer = encodeDynamicStructData(loc, builder, strTy, arg, - temp, vecToBuffer); - } - } - } + maybeFreeHeapAllocations(loc, builder, heapTracker); + extendedStructSize = messageBufferSize; Value loadThunk = builder.create(loc, thunkTy, thunkFunc.getName()); castLoadThunk = builder.create(loc, ptrI8Ty, loadThunk); - castTemp = builder.create(loc, ptrI8Ty, temp); + castTemp = + builder.create(loc, ptrI8Ty, msgBufferPrefix); resultOffset = genComputeReturnOffset(loc, builder, devFuncTy, structTy); } @@ -1397,7 +1623,8 @@ class GenerateKernelExecution builder.setInsertionPointToEnd(elseBlock); // span was returned in the original buffer. Value mRes = builder.create( - loc, ptrResTy, temp, ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); builder.create(loc, endifBlock, ArrayRef{mRes}); builder.setInsertionPointToEnd(endifBlock); launchResult = endifBlock->getArgument(0); @@ -1454,7 +1681,8 @@ class GenerateKernelExecution if (resultVal) { // Static values. std::vector are necessarily sret, see below. auto resPtr = builder.create( - loc, ptrResTy, temp, ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); auto castResPtr = [&]() -> Value { if (castToTy == ptrResTy) @@ -1496,8 +1724,8 @@ class GenerateKernelExecution // type for the memcpy, so the device should return an (aggregate) // value of suitable size. auto resPtr = builder.create( - loc, ptrResTy, temp, - ArrayRef{0, offset}); + loc, ptrResTy, msgBufferPrefix, + ArrayRef{offset}); auto castMsgBuff = builder.create(loc, ptrI8Ty, resPtr); Type eleTy = @@ -1516,19 +1744,6 @@ class GenerateKernelExecution builder.create(loc, results); } - /// A kernel function that takes a quantum type argument (also known as a pure - /// device kernel) cannot be called directly from C++ (classical) code. It - /// must be called via other quantum code. - bool hasLegalType(FunctionType funTy) { - for (auto ty : funTy.getInputs()) - if (quake::isQuantumType(ty)) - return false; - for (auto ty : funTy.getResults()) - if (quake::isQuantumType(ty)) - return false; - return true; - } - /// Generate a function to be executed at load-time which will register the /// kernel with the runtime. 
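  ///
  /// Conceptually (an illustrative sketch, not the emitted IR), the hook is
  /// equivalent to running the following from a global constructor:
  /// ```
  /// namespace {
  /// struct KernelRegistrar {
  ///   KernelRegistrar() { cudaqRegisterKernelName("<kernel name>"); }
  /// } kernelRegistrar;
  /// } // namespace
  /// ```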
LLVM::LLVMFuncOp registerKernelWithRuntimeForExecution( @@ -1649,6 +1864,10 @@ class GenerateKernelExecution irBuilder.loadIntrinsic(module, cudaq::stdvecBoolUnpackToInitList))) return module.emitError(std::string("could not load ") + cudaq::stdvecBoolUnpackToInitList); + if (failed(irBuilder.loadIntrinsic(module, + cudaq::stdvecBoolFreeTemporaryLists))) + return module.emitError(std::string("could not load ") + + cudaq::stdvecBoolFreeTemporaryLists); if (failed(irBuilder.loadIntrinsic(module, cudaq::llvmMemCopyIntrinsic))) return module.emitError(std::string("could not load ") + cudaq::llvmMemCopyIntrinsic); @@ -1656,6 +1875,10 @@ class GenerateKernelExecution return module.emitError("could not load __nvqpp_zeroDynamicResult"); if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_createDynamicResult"))) return module.emitError("could not load __nvqpp_createDynamicResult"); + if (failed( + irBuilder.loadIntrinsic(module, cudaq::runtime::getPauliWordSize))) + return module.emitError( + "could not load cudaq::pauli_word::_nvqpp_size or _nvqpp_data"); return success(); } @@ -1665,8 +1888,6 @@ class GenerateKernelExecution auto builder = OpBuilder::atBlockEnd(module.getBody()); auto mangledNameMap = module->getAttrOfType(cudaq::runtime::mangledNameMap); - DataLayoutAnalysis dla(module); // caches module's data layout information. - dataLayout = &dla.getAtOrAbove(module); std::error_code ec; llvm::ToolOutputFile out(outputFilename, ec, llvm::sys::fs::OF_None); if (ec) { @@ -1744,7 +1965,7 @@ class GenerateKernelExecution // Generate the argsCreator function used by synthesis. if (startingArgIdx == 0) { argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy, classNameStr, hostFuncTy, + loc, builder, module, funcTy, structTy, classNameStr, hostFuncTy, hasThisPtr); } else { // We are operating in a very special case where we want the @@ -1756,7 +1977,7 @@ class GenerateKernelExecution cudaq::opt::factory::buildInvokeStructType(funcTy, startingArgIdx); argsCreatorFunc = genKernelArgsCreatorFunction( - loc, builder, funcTy, structTy_argsCreator, classNameStr, + loc, builder, module, funcTy, structTy_argsCreator, classNameStr, hostFuncTy, hasThisPtr); } } @@ -1764,8 +1985,8 @@ class GenerateKernelExecution // Generate a new mangled function on the host side to call the // callback function. if (hostEntryNeeded) - genNewHostEntryPoint(loc, builder, funcTy, kernelNameObj, hostFunc, - hasThisPtr, structTy, thunk); + genNewHostEntryPoint(loc, builder, module, funcTy, kernelNameObj, + hostFunc, hasThisPtr, structTy, thunk); // Generate a function at startup to register this kernel as having // been processed for kernel execution. @@ -1783,7 +2004,5 @@ class GenerateKernelExecution } out.keep(); } - - const DataLayout *dataLayout = nullptr; }; } // namespace diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp index 166f558275..82e6896c06 100644 --- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp +++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp @@ -122,15 +122,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, ATTR arrayAttr, MAKER makeElementValue) { auto *ctx = builder.getContext(); auto argTy = argument.getType(); - assert(isa(argTy) || - isa(argTy)); - ELETY eleTy = [&]() -> ELETY { - if (auto strTy = dyn_cast(argTy)) - return cast(strTy.getElementType()); - // Force cast this to ELETY. This will only happen for CharspanType. 
- return cast(cudaq::opt::factory::getCharType(ctx)); - }(); - auto strTy = cudaq::cc::StdvecType::get(ctx, eleTy); + assert(isa(argTy)); + auto strTy = cast(argTy); + auto eleTy = cast(strTy.getElementType()); builder.setInsertionPointToStart(argument.getOwner()); auto argLoc = argument.getLoc(); auto conArray = builder.create( @@ -572,7 +566,7 @@ class QuakeSynthesizer // If std::vector type, add it to the list of vector info. // These will be processed when we reach the buffer's appendix. - if (auto vecTy = dyn_cast(type)) { + if (auto vecTy = dyn_cast(type)) { auto eleTy = vecTy.getElementType(); if (!isa( eleTy)) { @@ -621,19 +615,6 @@ class QuakeSynthesizer continue; } - if (auto charSpanTy = dyn_cast(type)) { - const char *ptrToSizeInBuffer = - static_cast(args) + offset; - auto sizeFromBuffer = - *reinterpret_cast(ptrToSizeInBuffer); - std::size_t bytesInType = sizeof(char); - auto vectorSize = sizeFromBuffer / bytesInType; - stdVecInfo.emplace_back( - argNum, cudaq::opt::factory::getCharType(builder.getContext()), - vectorSize); - continue; - } - funcOp.emitOpError("We cannot synthesize argument(s) of this type."); signalPassFailure(); return; diff --git a/python/tests/kernel/test_observe_kernel.py b/python/tests/kernel/test_observe_kernel.py index 5bf9d5a812..24c63ba90a 100644 --- a/python/tests/kernel/test_observe_kernel.py +++ b/python/tests/kernel/test_observe_kernel.py @@ -302,8 +302,7 @@ def test_pack_args_pauli_list(): def generateRandomPauliStrings(numQubits, numPaulis): s = ['X', 'Y', 'Z', 'I'] return [ - ''.join([random.choice(s) - for i in range(numQubits)]) + ''.join([random.choice(s) for i in range(numQubits)]) for i in range(numPaulis) ] @@ -336,7 +335,8 @@ def gqeCirc2(N: int, thetas: list[float], paulis: list[cudaq.pauli_word]): ts = np.random.rand(len(pauliStings)) exp_val1 = cudaq.observe_async(gqeCirc1, obs, numQubits, list(ts), - pauliStings[0]).get().expectation() + cudaq.pauli_word( + pauliStings[0])).get().expectation() print('observe_async exp_val1', exp_val1) exp_val2 = cudaq.observe_async(gqeCirc2, obs, numQubits, list(ts), pauliStings).get().expectation() diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h index 3e410a07b6..46afd2fedc 100644 --- a/python/utils/OpaqueArguments.h +++ b/python/utils/OpaqueArguments.h @@ -101,7 +101,7 @@ inline py::args simplifiedValidateInputArguments(py::args &args) { arg = args[i].attr("tolist")(); } else if (py::isinstance(arg)) { - arg = cudaq::pauli_word(py::cast(arg)); + arg = py::cast(arg); } else if (py::isinstance(arg)) { py::list arg_list = py::cast(arg); const bool all_strings = [&]() { @@ -330,8 +330,7 @@ inline void packArgs(OpaqueArguments &argData, py::args args, addArgument(argData, arg.cast()); }) .Case([&](cudaq::cc::CharspanType ty) { - addArgument(argData, - cudaq::pauli_word(arg.cast().str())); + addArgument(argData, arg.cast().str()); }) .Case([&](cudaq::cc::PointerType ty) { if (isa(ty.getElementType())) { @@ -432,8 +431,7 @@ inline void packArgs(OpaqueArguments &argData, py::args args, .Case([&](cudaq::cc::CharspanType type) { genericVecAllocator.template operator()( [](py::handle element, int index, int elementIndex) { - auto pw = element.cast(); - return cudaq::pauli_word(pw.str()); + return element.cast().str(); }); return; }) diff --git a/runtime/common/ArgumentConversion.cpp b/runtime/common/ArgumentConversion.cpp index 0de2589752..c310966a07 100644 --- a/runtime/common/ArgumentConversion.cpp +++ b/runtime/common/ArgumentConversion.cpp @@ -77,14 +77,16 @@ static Value 
genConstant(OpBuilder &builder, FloatType fltTy, long double *v) {
 
 static Value genConstant(OpBuilder &builder, const std::string &v,
                          ModuleOp substMod) {
   auto loc = builder.getUnknownLoc();
-  cudaq::IRBuilder irBuilder(builder);
-  auto cString = irBuilder.genCStringLiteralAppendNul(loc, substMod, v);
-  auto addr = builder.create<cudaq::cc::AddressOfOp>(
-      loc, cudaq::cc::PointerType::get(cString.getType()), cString.getName());
-  auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type());
-  auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, addr);
+  auto *ctx = builder.getContext();
+  auto i8Ty = builder.getI8Type();
+  auto strLitTy = cudaq::cc::PointerType::get(
+      cudaq::cc::ArrayType::get(ctx, i8Ty, v.size() + 1));
+  auto strLit =
+      builder.create<cudaq::cc::CreateStringLiteralOp>(loc, strLitTy, v);
+  auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty);
+  auto cast = builder.create<cudaq::cc::CastOp>(loc, i8PtrTy, strLit);
   auto size = builder.create<arith::ConstantIntOp>(loc, v.size(), 64);
-  auto chSpanTy = cudaq::cc::CharspanType::get(builder.getContext());
+  auto chSpanTy = cudaq::cc::CharspanType::get(ctx);
   return builder.create<cudaq::cc::StdvecInitOp>(loc, chSpanTy, cast, size);
 }
@@ -218,6 +220,21 @@ Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod,
       .Default({});
 }
 
+// Get the size of \p eleTy on the host side in bytes.
+static std::size_t getHostSideElementSize(Type eleTy,
+                                          llvm::DataLayout &layout) {
+  if (isa<cudaq::cc::StdvecType>(eleTy))
+    return sizeof(std::vector<char>);
+  if (isa<cudaq::cc::CharspanType>(eleTy)) {
+    // A char span type is a std::string on the host side.
+    return sizeof(std::string);
+  }
+  // Note: we want the size on the host side, but `getDataSize()` returns the
+  // size on the device side. This is ok for now since they are the same for
+  // most types and the special cases are handled above.
+  return cudaq::opt::getDataSize(layout, eleTy);
+}
+
 Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
                   ModuleOp substMod, llvm::DataLayout &layout) {
   typedef const char *VectorType[3];
@@ -227,11 +244,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
     return {};
   auto eleTy = vecTy.getElementType();
   auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
-  auto eleSize = cudaq::opt::getDataSize(layout, eleTy);
-  if (isa<cudaq::cc::CharspanType>(eleTy)) {
-    // char span type (i.e. pauli word) is a `vector<char>`
-    eleSize = sizeof(VectorType);
-  }
+  auto eleSize = getHostSideElementSize(eleTy, layout);
   assert(eleSize && "element must have a size");
   auto loc = builder.getUnknownLoc();
diff --git a/runtime/cudaq/cudaq.cpp b/runtime/cudaq/cudaq.cpp
index 10ecc3b914..ca84a43121 100644
--- a/runtime/cudaq/cudaq.cpp
+++ b/runtime/cudaq/cudaq.cpp
@@ -470,20 +470,44 @@ void __nvqpp_initializer_list_to_vector_bool(std::vector<bool> &result,
 /// `std::vector<bool>` overload. The conversion turns the `std::vector<bool>`
 /// into a mock vector structure that looks like a `std::vector<char>`. The
 /// calling routine must clean up the buffer allocated by this code.
-void __nvqpp_vector_bool_to_initializer_list(void *outData,
-                                             const std::vector<bool> &inVec) {
+/// This helper routine may only be called on the host side.
+void __nvqpp_vector_bool_to_initializer_list(
+    void *outData, const std::vector<bool> &inVec,
+    std::vector<char *> **allocations) {
   // The MockVector must be allocated by the caller.
   struct MockVector {
     char *start;
     char *end;
+    char *end2;
   };
   MockVector *mockVec = reinterpret_cast<MockVector *>(outData);
   auto outSize = inVec.size();
   // The buffer allocated here must be freed by the caller.
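// Editor's note: a minimal sketch of how generated host code is expected to
// drive this helper and its companion free routine below (local variable
// names here are illustrative, not taken from this patch):
//
//   std::vector<char *> *allocs = nullptr; // created lazily by the helper
//   MockVector tmp;                        // {start, end, end2} byte span
//   std::vector<bool> flags = {true, false, true};
//   __nvqpp_vector_bool_to_initializer_list(&tmp, flags, &allocs);
//   // ... tmp now refers to malloc'd bytes {1, 0, 1}; hand it to the thunk ...
//   if (allocs)
//     __nvqpp_vector_bool_free_temporary_initlists(allocs);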
-  mockVec->start = static_cast<char *>(malloc(outSize));
-  mockVec->end = mockVec->start + outSize;
+  if (!*allocations)
+    *allocations = new std::vector<char *>;
+  char *newData = static_cast<char *>(malloc(outSize));
+  (*allocations)->push_back(newData);
+  mockVec->start = newData;
+  mockVec->end2 = mockVec->end = newData + outSize;
   for (unsigned i = 0; i < outSize; ++i)
-    (mockVec->start)[i] = static_cast<char>(inVec[i]);
+    newData[i] = static_cast<char>(inVec[i]);
 }
+
+/// This helper routine deletes the vector that tracks all the temporaries that
+/// were created, as well as the temporaries themselves.
+/// This routine may only be called on the host side.
+void __nvqpp_vector_bool_free_temporary_initlists(
+    std::vector<char *> *allocations) {
+  for (auto *p : *allocations)
+    free(p);
+  delete allocations;
+}
+
+/// Quasi-portable string helpers for Python (non-C++ front ends). These
+/// library helper functions allow non-C++ front ends to remain portable with
+/// respect to the core layer. Since these helpers are built along with the
+/// bindings, there should not be a compatibility issue.
+const char *__nvqpp_getStringData(const std::string &s) { return s.data(); }
+std::uint64_t __nvqpp_getStringSize(const std::string &s) { return s.size(); }
 }
 } // namespace cudaq::support
diff --git a/runtime/cudaq/qis/pauli_word.h b/runtime/cudaq/qis/pauli_word.h
index afcd446e77..4a49a706a1 100644
--- a/runtime/cudaq/qis/pauli_word.h
+++ b/runtime/cudaq/qis/pauli_word.h
@@ -5,23 +5,59 @@
  * This source code and the accompanying materials are made available under *
  * the terms of the Apache License 2.0 which accompanies this distribution. *
  ******************************************************************************/
+
 #pragma once
+#include <algorithm>
+#include <cstdint>
 #include <string>
-#include <vector>
 
 namespace cudaq {
-/// @brief The `pauli_word` is a thin wrapper on a
-/// Pauli tensor product string, e.g. `XXYZ` on 4
-// qubits.
-class pauli_word {
-private:
-  std::vector<char> term;
+/// @brief The `pauli_word` is a thin wrapper on a Pauli tensor product string,
+/// e.g. `XXYZ` on 4 qubits.
+class pauli_word {
 public:
   pauli_word() = default;
-  pauli_word(const std::string t) : term(t.begin(), t.end()) {}
-  std::string str() const { return std::string(term.begin(), term.end()); }
-  const std::vector<char> &data() const { return term; }
+  pauli_word(std::string &&t) : term{std::move(t)} { to_upper_case(); }
+  pauli_word(const std::string &t) : term(t) { to_upper_case(); }
+  pauli_word(const char *const p) : term{p} { to_upper_case(); }
+  pauli_word &operator=(const std::string &t) {
+    term = t;
+    to_upper_case();
+    return *this;
+  }
+  pauli_word &operator=(const char *const p) {
+    term = p;
+    to_upper_case();
+    return *this;
+  }
+
+  std::string str() const { return term; }
+
+  // TODO: Obsolete? Used by KernelWrapper.h only.
+  const std::vector<char> data() const { return {term.begin(), term.end()}; }
+
+private:
+  // Convert the string member to upper case at construction/assignment.
+  // TODO: This should probably also verify that the string contains only
+  // letters from the Pauli alphabet: I, X, Y, and Z.
+  void to_upper_case() {
+    std::transform(term.begin(), term.end(), term.begin(), ::toupper);
+  }
+
+  // These methods are used by the compiler.
+  __attribute__((used)) const char *_nvqpp_data() const { return term.data(); }
+  __attribute__((used)) std::uint64_t _nvqpp_size() const {
+    return term.size();
+  }
+
+  std::string term; ///< Pauli words are string-like.
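  // Editor's note: a short usage sketch of the normalization above (a
  // hypothetical check, not part of this header):
  //
  //   cudaq::pauli_word w{"xYz"};
  //   assert(w.str() == "XYZ");  // to_upper_case() ran at construction
  //   w = "izzy";                // assignment normalizes as well
  //   assert(w.str() == "IZZY"); // alphabet validation remains a TODO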
 };
-} // namespace cudaq
\ No newline at end of file
+
+namespace details {
+static_assert(sizeof(std::string) == sizeof(pauli_word));
+// This constant is used by the compiler.
+static constexpr std::uint64_t _nvqpp_sizeof = sizeof(pauli_word);
+} // namespace details
+} // namespace cudaq
diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h
index c83dffe844..c05a862bf9 100644
--- a/runtime/cudaq/qis/qubit_qis.h
+++ b/runtime/cudaq/qis/qubit_qis.h
@@ -17,6 +17,7 @@
 #include "cudaq/qis/qreg.h"
 #include "cudaq/qis/qvector.h"
 #include "cudaq/spin_op.h"
+#include
 #include
 #include
@@ -828,11 +829,13 @@ std::vector<measure_result> mz(qubit &q, Qs &&...qs) {
 }
 
 namespace support {
-// Helper to initialize a `vector<bool>` data structure.
+// Helpers to deal with the `vector<bool>` specialized template type.
 extern "C" {
 void __nvqpp_initializer_list_to_vector_bool(std::vector<bool> &, char *,
                                              std::size_t);
-void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector<bool> &);
+void __nvqpp_vector_bool_to_initializer_list(void *, const std::vector<bool> &,
+                                             std::vector<char *> **);
+void __nvqpp_vector_bool_free_temporary_initlists(std::vector<char *> *);
 }
 } // namespace support
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 987bfd4c34..c16b43ddb7 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -202,12 +202,11 @@ void test_scalars(mlir::MLIRContext *ctx) {
 
 // CHECK: Substitution module:
 // CHECK-LABEL: cc.arg_subst[0] {
-// CHECK: %[[VAL_0:.*]] = cc.address_of @cstr.58595A00 : !cc.ptr<!cc.array<i8 x 4>>
-// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<i8 x 4>>) -> !cc.ptr<i8>
+// CHECK: %[[VAL_0:.*]] = cc.string_literal "XYZ" : !cc.ptr<!cc.array<i8 x 4>>
+// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<!cc.array<i8 x 4>>) -> !cc.ptr<i8>
 // CHECK: %[[VAL_2:.*]] = arith.constant 3 : i64
 // CHECK: %[[VAL_3:.*]] = cc.stdvec_init %[[VAL_1]], %[[VAL_2]] : (!cc.ptr<i8>, i64) -> !cc.charspan
 // CHECK: }
-// CHECK-DAG: llvm.mlir.global private constant @cstr.58595A00("XYZ\00") {addr_space = 0 : i32}
 // clang-format on
 }
 
@@ -250,14 +249,14 @@ void test_vectors(mlir::MLIRContext *ctx) {
 // clang-format off
 // CHECK-LABEL: cc.arg_subst[0] {
 // CHECK: %[[VAL_0:.*]] = cc.alloca !cc.array<!cc.charspan x 2>
-// CHECK: %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr<!cc.array<i8 x 3>>
-// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
+// CHECK: %[[VAL_1:.*]] = cc.string_literal "XX" : !cc.ptr<!cc.array<i8 x 3>>
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
 // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64
 // CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_2]], %[[VAL_3]] : (!cc.ptr<i8>, i64) -> !cc.charspan
 // CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr<!cc.array<!cc.charspan x 2>>) -> !cc.ptr<!cc.charspan>
 // CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr<!cc.charspan>
-// CHECK: %[[VAL_6:.*]] = cc.address_of @cstr.585900 : !cc.ptr<!cc.array<i8 x 3>>
-// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
+// CHECK: %[[VAL_6:.*]] = cc.string_literal "XY" : !cc.ptr<!cc.array<i8 x 3>>
+// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr<!cc.array<i8 x 3>>) -> !cc.ptr<i8>
 // CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64
 // CHECK: %[[VAL_9:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_8]] : (!cc.ptr<i8>, i64) -> !cc.charspan
 // CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr<!cc.array<!cc.charspan x 2>>) -> !cc.ptr<!cc.charspan>
@@ -265,8 +264,6 @@ void test_vectors(mlir::MLIRContext *ctx) {
 // CHECK: %[[VAL_11:.*]] = arith.constant 2 : i64
 // CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_0]], %[[VAL_11]] : (!cc.ptr<!cc.array<!cc.charspan x 2>>, i64) -> !cc.stdvec<!cc.charspan>
 // CHECK: }
-// CHECK-DAG: llvm.mlir.global private constant
@cstr.585800("XX\00") {addr_space = 0 : i32} -// CHECK-DAG: llvm.mlir.global private constant @cstr.585900("XY\00") {addr_space = 0 : i32} // clang-format on } @@ -502,14 +499,14 @@ void test_combinations(mlir::MLIRContext *ctx) { // CHECK-DAG: func.func private @__nvqpp_cudaq_state_createFromData_fp64(!cc.ptr, i64) -> !cc.ptr // CHECK-LABEL: cc.arg_subst[2] { // CHECK: %[[VAL_0:.*]] = cc.alloca !cc.array -// CHECK: %[[VAL_1:.*]] = cc.address_of @cstr.585800 : !cc.ptr> -// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_1:.*]] = cc.string_literal "XX" : !cc.ptr> +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_2]], %[[VAL_3]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_0]][0] : (!cc.ptr>) -> !cc.ptr // CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.address_of @cstr.585900 : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.string_literal "XY" : !cc.ptr> +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr // CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_9:.*]] = cc.stdvec_init %[[VAL_7]], %[[VAL_8]] : (!cc.ptr, i64) -> !cc.charspan // CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_0]][1] : (!cc.ptr>) -> !cc.ptr @@ -517,8 +514,6 @@ void test_combinations(mlir::MLIRContext *ctx) { // CHECK: %[[VAL_11:.*]] = arith.constant 2 : i64 // CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_0]], %[[VAL_11]] : (!cc.ptr>, i64) -> !cc.stdvec // CHECK: } -// CHECK-DAG: llvm.mlir.global private constant @cstr.585800("XX\00") {addr_space = 0 : i32} -// CHECK-DAG: llvm.mlir.global private constant @cstr.585900("XY\00") {addr_space = 0 : i32} // clang-format on } diff --git a/targettests/Kernel/signature-0.cpp b/targettests/Kernel/signature-0.cpp index 882fb24704..0adf9c8779 100644 --- a/targettests/Kernel/signature-0.cpp +++ b/targettests/Kernel/signature-0.cpp @@ -65,12 +65,9 @@ class Qernel6 { } }; -// FIXME: unhandled ctor call -#define NYI /*__qpu__*/ - class Qernel7 { public: - std::vector operator()(std::vector v) NYI { return v; } + std::vector operator()(std::vector v) __qpu__ { return v; } }; int main() { diff --git a/targettests/Kernel/signature-4.cpp b/targettests/Kernel/signature-4.cpp index 14deb5c55f..00e9effc93 100644 --- a/targettests/Kernel/signature-4.cpp +++ b/targettests/Kernel/signature-4.cpp @@ -14,10 +14,8 @@ // Tests that we can take a small struct, a struct with a vector member, a // vector of small structs, and a large struct as an argument and return the -// same. Currently, DefaultQPU::launchKernel does not handle return values at -// all. +// same. 
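// Editor's note: in minimal form, the argument/return pattern these tests now
// exercise looks like the following sketch (struct and kernel names here are
// illustrative, not taken from this patch):
//
//   struct V { int _1; double _2; };
//   struct QernelV {
//     std::vector<V> operator()(std::vector<V> v) __qpu__ {
//       v[0]._1++; // mutated copy is returned via the dynamic-result path
//       return v;
//     }
//   };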
-// FIXME #define NYI /*__qpu__*/ void ok() { std::cout << "ok\n"; } @@ -48,7 +46,6 @@ struct QernelS1 { } }; -// struct with vector member not yet supported struct S2 { int _1; std::vector _2; @@ -66,6 +63,7 @@ struct QernelS2a { }; struct QernelS2 { + // kernel result type not supported (bridge) S2 operator()(S2 s) NYI { s._1++; s._2[0] = 0.0; @@ -84,16 +82,14 @@ class QernelS3a { } }; -// ctor in return not supported struct QernelS3 { - std::vector operator()(std::vector s) NYI { + std::vector operator()(std::vector s) __qpu__ { s[0]._1++; s[0]._2 = 0.0; return s; } }; -// bug in bridge std::vector mock_ctor(const std::vector &v) { return v; } struct QernelS4 { diff --git a/targettests/Kernel/signature-5.cpp b/targettests/Kernel/signature-5.cpp index a42b5b8518..a2fa263560 100644 --- a/targettests/Kernel/signature-5.cpp +++ b/targettests/Kernel/signature-5.cpp @@ -15,7 +15,6 @@ // Test kernels can take arguments of tuple or pair as well as return values of // same. -// FIXME: tuple and pair are not handled. #define NYI /*__qpu__*/ void ok() { std::cout << "ok\n"; } @@ -24,7 +23,7 @@ void fail() { std::cout << "fail\n"; } using S1 = std::tuple; struct QernelS1a { - void operator()(S1 s) NYI { + void operator()(S1 s) __qpu__ { if (std::get<0>(s) == 1 && std::get<1>(s) == 2 && std::get<2>(s) == 4) ok(); else @@ -38,10 +37,18 @@ struct QernelS1 { } }; +S1 qernel_s1b_helper(S1 s) { + return {std::get<2>(s) + 1, std::get<1>(s) + 1, std::get<0>(s) + 1}; +} + +struct QernelS1b { + S1 operator()(S1 s) NYI { return qernel_s1b_helper(s); } +}; + using S2 = std::tuple>; struct QernelS2a { - void operator()(S2 s) NYI { + void operator()(S2 s) __qpu__ { if (std::get<0>(s) == 8.16 && std::get<1>(s) == 32.64f && std::get<2>(s).size() == 2) ok(); @@ -88,6 +95,13 @@ int main() { ok(); else fail(); + std::cout << "QernelS1b "; + auto updated_s1b = QernelS1b{}(s1); + if (std::get<0>(updated_s1b) == 5 && std::get<1>(updated_s1b) == 3 && + std::get<2>(updated_s1b) == 2) + ok(); + else + fail(); std::vector v = {128, 256}; S2 s2 = {8.16, 32.64f, v}; @@ -117,6 +131,7 @@ int main() { // clang-format off // CHECK-LABEL: QernelS1a ok // CHECK-NEXT: QernelS1 ok +// CHECK-NEXT: QernelS1b ok // CHECK-NEXT: QernelS2a ok // CHECK-NEXT: QernelS2 ok // CHECK-NEXT: ok diff --git a/targettests/Remote-Sim/pauli_word.cpp b/targettests/Remote-Sim/pauli_word.cpp index cd68042325..7624d948c0 100644 --- a/targettests/Remote-Sim/pauli_word.cpp +++ b/targettests/Remote-Sim/pauli_word.cpp @@ -10,7 +10,6 @@ // clang-format off // RUN: nvq++ %cpp_std --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t -// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu --remote-mqpu-auto-launch 1 %s -o %t && %t // clang-format on #include "remote_test_assert.h" diff --git a/targettests/SeparateCompilation/arith_spans.cpp b/targettests/SeparateCompilation/arith_spans.cpp new file mode 100644 index 0000000000..4de3979ed1 --- /dev/null +++ b/targettests/SeparateCompilation/arith_spans.cpp @@ -0,0 +1,353 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
*
+ ******************************************************************************/
+
+// clang-format off
+// RUN: if command -v split-file > /dev/null; then \
+// RUN: split-file %s %t && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_dumps.cpp -o %t/span_dumps.o && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/span_exercise.cpp -o %t/span_exercise.o && \
+// RUN: nvq++ %cpp_std --enable-mlir %t/span_dumps.o %t/span_exercise.o -o %t/spanaroo.out && \
+// RUN: %t/spanaroo.out | FileCheck %s ; else \
+// RUN: echo "skipping" ; fi
+// clang-format on
+
+//--- span_dumps.cpp
+
+#include <iostream>
+#include <span>
+
+extern "C" {
+void dump_bool_vector(std::span<bool> x) {
+  std::cout << "booleans: ";
+  for (auto i : x)
+    std::cout << i << ' ';
+  std::cout << '\n';
+}
+
+void dump_int_vector(std::span<int> x) {
+  std::cout << "integers: ";
+  for (auto i : x)
+    std::cout << i << ' ';
+  std::cout << '\n';
+}
+
+void dump_2d_int_vector(std::span<std::span<int>> x) {
+  std::cout << "integer matrix: {\n";
+  for (auto s : x) {
+    std::cout << "  ";
+    for (auto i : s)
+      std::cout << i << " ";
+    std::cout << '\n';
+  }
+  std::cout << "}\n";
+}
+
+void dump_int_scalar(int x) { std::cout << "scalar integer: " << x << '\n'; }
+
+void dump_double_vector(std::span<double> x) {
+  std::cout << "doubles: ";
+  for (auto d : x)
+    std::cout << d << ' ';
+  std::cout << '\n';
+}
+}
+
+//--- span_exercise.cpp
+
+#include "cudaq.h"
+#include <iostream>
+
+// Fake host C++ signatures that match.
+extern "C" {
+void dump_int_vector(const std::vector<int> &pw);
+void dump_int_scalar(int v);
+void dump_bool_vector(const std::vector<bool> &pw);
+void dump_double_vector(const std::vector<double> &pw);
+void dump_2d_int_vector(const std::vector<std::vector<int>> &pw);
+}
+
+__qpu__ void kern1(std::vector<int> arg) { dump_int_vector(arg); }
+
+__qpu__ void kern2(std::vector<std::vector<int>> arg) {
+  for (unsigned i = 0; i < arg.size(); ++i)
+    dump_int_vector(arg[i]);
+}
+
+struct IntVectorPair {
+  std::vector<int> _0;
+  std::vector<int> _1;
+};
+
+__qpu__ void kern3(IntVectorPair ivp) {
+  dump_int_vector(ivp._0);
+  dump_int_vector(ivp._1);
+}
+
+__qpu__ void kern4(std::vector<IntVectorPair> vivp) {
+  for (unsigned i = 0; i < vivp.size(); ++i) {
+    dump_int_vector(vivp[i]._0);
+    dump_int_vector(vivp[i]._1);
+  }
+}
+
+__qpu__ void qern1(std::vector<double> arg) { dump_double_vector(arg); }
+
+__qpu__ void qern2(std::vector<std::vector<double>> arg) {
+  for (unsigned i = 0; i < arg.size(); ++i)
+    dump_double_vector(arg[i]);
+}
+
+struct DoubleVectorPair {
+  std::vector<double> _0;
+  std::vector<double> _1;
+};
+
+__qpu__ void qern3(DoubleVectorPair ivp) {
+  dump_double_vector(ivp._0);
+  dump_double_vector(ivp._1);
+}
+
+__qpu__ void qern4(std::vector<DoubleVectorPair> vivp) {
+  for (unsigned i = 0; i < vivp.size(); ++i) {
+    dump_double_vector(vivp[i]._0);
+    dump_double_vector(vivp[i]._1);
+  }
+}
+
+__qpu__ void cern1(std::vector<bool> arg) { dump_bool_vector(arg); }
+
+__qpu__ void cern2(std::vector<std::vector<bool>> arg) {
+  for (unsigned i = 0; i < arg.size(); ++i)
+    dump_bool_vector(arg[i]);
+}
+
+struct BoolVectorPair {
+  std::vector<bool> _0;
+  std::vector<bool> _1;
+};
+
+__qpu__ void cern3(BoolVectorPair ivp) {
+  dump_bool_vector(ivp._0);
+  dump_bool_vector(ivp._1);
+}
+
+__qpu__ void cern4(std::vector<BoolVectorPair> vivp) {
+  for (unsigned i = 0; i < vivp.size(); ++i) {
+    dump_bool_vector(vivp[i]._0);
+    dump_bool_vector(vivp[i]._1);
+  }
+}
+
+struct Interesting {
+  std::vector<std::vector<std::vector<int>>> ragged3d;
+  int flags;
+  std::vector<double> angular;
+};
+
+__qpu__ void exciting(std::vector<Interesting> vi) {
+  for (unsigned i = 0; i < vi.size(); ++i) {
+    for (unsigned j = 0; j < vi[i].ragged3d.size(); ++j)
+      dump_2d_int_vector(vi[i].ragged3d[j]);
+    dump_int_scalar(vi[i].flags);
+    dump_double_vector(vi[i].angular);
+  }
+}
+
+int main() {
+  std::vector<int> pw0 = {345, 1, 2};
+  std::cout << "---\n";
+  kern1(pw0);
+  std::vector<int> pw1 = {92347, 3, 4};
+  std::vector<int> pw2 = {2358, 5, 6};
+  std::vector<int> pw3 = {45, 7, 18};
+  std::vector<std::vector<int>> vpw{pw0, pw1, pw2, pw3};
+  std::cout << "---\n";
+  kern2(vpw);
+
+  IntVectorPair ivp = {{8, 238, 44}, {0, -4, 81, 92745}};
+  std::cout << "---\n";
+  kern3(ivp);
+
+  IntVectorPair ivp2 = {{5, -87, 43, 1, 76}, {0, 0, 2, 1}};
+  IntVectorPair ivp3 = {{1}, {-2, 3}};
+  IntVectorPair ivp4 = {{-4, -5, 6}, {-7, -8, -9, 88}};
+  std::vector<IntVectorPair> vivp = {ivp, ivp2, ivp3, ivp4};
+  std::cout << "---\n";
+  kern4(vivp);
+
+  std::vector<double> dpw0 = {3.45, 1., 2.};
+  std::cout << "---\n";
+  qern1(dpw0);
+  std::vector<double> dpw1 = {92.347, 2.3, 4.};
+  std::vector<double> dpw2 = {235.8, 5.5, 6.4};
+  std::vector<double> dpw3 = {4.5, 77.7, 18.2};
+  std::vector<std::vector<double>> vdpw{dpw0, dpw1, dpw2, dpw3};
+  std::cout << "---\n";
+  qern2(vdpw);
+
+  DoubleVectorPair dvp = {{8., 2.38, 4.4}, {0., -4.99, 81.5, 92.745}};
+  std::cout << "---\n";
+  qern3(dvp);
+
+  DoubleVectorPair dvp2 = {{5., -8.7, 4.3, 1., 7.6}, {0., 0., 2., 1.}};
+  DoubleVectorPair dvp3 = {{1.}, {-2., 3.}};
+  DoubleVectorPair dvp4 = {{-4., -5., 6.}, {-7., -8., -9., .88}};
+  std::vector<DoubleVectorPair> vdvp = {dvp, dvp2, dvp3, dvp4};
+  std::cout << "---\n";
+  qern4(vdvp);
+
+  std::vector<bool> bpw0 = {true, false};
+  std::cout << "---\n";
+  cern1(bpw0);
+  std::vector<bool> bpw1 = {false, false, false};
+  std::vector<bool> bpw2 = {false, true, false, true};
+  std::vector<bool> bpw3 = {false, false, true, false, true};
+  std::vector<std::vector<bool>> vbpw{bpw0, bpw1, bpw2, bpw3};
+  std::cout << "---\n";
+  cern2(vbpw);
+
+  BoolVectorPair bvp = {{false, false}, {false, true, true, false}};
+  std::cout << "---\n";
+  cern3(bvp);
+
+  BoolVectorPair bvp2 = {{false, true, true, false, true, false},
+                         {false, true, true, false, false, false, true, false}};
+  BoolVectorPair bvp3 = {{false}, {true, true}};
+  BoolVectorPair bvp4 = {{true, false, false}, {false, true, false, true}};
+  std::vector<BoolVectorPair> vbvp = {bvp, bvp2, bvp3, bvp4};
+  std::cout << "---\n";
+  cern4(vbvp);
+
+  std::vector<std::vector<int>> ix0 = {pw0, pw0};
+  std::vector<std::vector<int>> ix1 = {pw1, pw0};
+  std::vector<std::vector<int>> ix2 = {pw2, pw3, pw3};
+  std::vector<std::vector<int>> ix3 = {{404}, {101, 202}};
+  std::vector<std::vector<std::vector<int>>> i3d0 = {ix0, ix1};
+  std::vector<std::vector<std::vector<int>>> i3d1 = {ix1};
+  std::vector<std::vector<std::vector<int>>> i3d2 = {ix2, ix3};
+  std::vector<std::vector<std::vector<int>>> i3d3 = {ix3};
+  std::vector<std::vector<std::vector<int>>> i3d4 = {ix2, ix0, ix0};
+  Interesting in0 = {i3d0, 66, {2.0, 4.0}};
+  Interesting in1 = {i3d1, 123, {3.0, 6.0}};
+  Interesting in2 = {i3d2, 561, {4.0, 8.0}};
+  Interesting in3 = {i3d3, 72341, {5.0, 10.0}};
+  Interesting in4 = {i3d4, -2348, {12.0, 5280.1}};
+  std::vector<Interesting> ving = {in0, in1, in2, in3, in4};
+  std::cout << "===\n";
+  exciting(ving);
+
+  return 0;
+}
+
+// CHECK: ---
+// CHECK: integers: 345 1 2
+// CHECK: ---
+// CHECK: integers: 345 1 2
+// CHECK-NEXT: integers: 92347 3 4
+// CHECK-NEXT: integers: 2358 5 6
+// CHECK-NEXT: integers: 45 7 18
+// CHECK: ---
+// CHECK: integers: 8 238 44
+// CHECK-NEXT: integers: 0 -4 81 92745
+// CHECK: ---
+// CHECK: integers: 8 238 44
+// CHECK-NEXT: integers: 0 -4 81 92745
+// CHECK-NEXT: integers: 5 -87 43 1 76
+// CHECK-NEXT: integers: 0 0 2 1
+// CHECK-NEXT: integers: 1
+// CHECK-NEXT: integers: -2 3
+// CHECK-NEXT: integers: -4 -5 6
+// CHECK-NEXT: integers: -7 -8 -9 88
+// CHECK: ---
+// CHECK: doubles: 3.45 1 2
+// CHECK: ---
+// CHECK: doubles: 3.45 1 2
+// CHECK-NEXT: doubles: 92.347 2.3 4
+// CHECK-NEXT: doubles: 235.8 5.5 6.4
+// CHECK-NEXT: doubles: 4.5 77.7 18.2
+// CHECK: ---
+// CHECK: doubles: 8 2.38 4.4
+// CHECK-NEXT: doubles: 0 -4.99 81.5 92.745
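// Editor's note: on the device side each std::vector argument above arrives as
// a span, so a struct such as IntVectorPair is seen roughly as the following
// sketch (illustrative, not generated code):
//
//   struct IntVectorPairDeviceView {
//     std::span<int> _0; // {pointer, length} view of the host vector's data
//     std::span<int> _1;
//   };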
+// CHECK: --- +// CHECK: doubles: 8 2.38 4.4 +// CHECK-NEXT: doubles: 0 -4.99 81.5 92.745 +// CHECK-NEXT: doubles: 5 -8.7 4.3 1 7.6 +// CHECK-NEXT: doubles: 0 0 2 1 +// CHECK-NEXT: doubles: 1 +// CHECK-NEXT: doubles: -2 3 +// CHECK-NEXT: doubles: -4 -5 6 +// CHECK-NEXT: doubles: -7 -8 -9 0.88 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK: --- +// CHECK: booleans: 1 0 +// CHECK-NEXT: booleans: 0 0 0 +// CHECK-NEXT: booleans: 0 1 0 1 +// CHECK-NEXT: booleans: 0 0 1 0 1 +// CHECK: --- +// CHECK: booleans: 0 0 +// CHECK-NEXT: booleans: 0 1 1 0 +// CHECK: --- +// CHECK: booleans: 0 0 +// CHECK-NEXT: booleans: 0 1 1 0 +// CHECK-NEXT: booleans: 0 1 1 0 1 0 +// CHECK-NEXT: booleans: 0 1 1 0 0 0 1 0 +// CHECK-NEXT: booleans: 0 +// CHECK-NEXT: booleans: 1 1 +// CHECK-NEXT: booleans: 1 0 0 +// CHECK-NEXT: booleans: 0 1 0 1 +// CHECK: === +// CHECK: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 92347 3 4 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 66 +// CHECK-NEXT: doubles: 2 4 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 92347 3 4 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 123 +// CHECK-NEXT: doubles: 3 6 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 2358 5 6 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 404 +// CHECK-NEXT: 101 202 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 561 +// CHECK-NEXT: doubles: 4 8 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 404 +// CHECK-NEXT: 101 202 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: 72341 +// CHECK-NEXT: doubles: 5 10 +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 2358 5 6 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: 45 7 18 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: integer matrix: { +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: 345 1 2 +// CHECK-NEXT: } +// CHECK-NEXT: scalar integer: -2348 +// CHECK-NEXT: doubles: 12 5280.1 diff --git a/targettests/SeparateCompilation/pauli_words.cpp b/targettests/SeparateCompilation/pauli_words.cpp new file mode 100644 index 0000000000..31ac339e0c --- /dev/null +++ b/targettests/SeparateCompilation/pauli_words.cpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
*
+ ******************************************************************************/
+
+// clang-format off
+// RUN: if command -v split-file > /dev/null; then \
+// RUN: split-file %s %t && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/pauli_word_display.cpp -o %t/pauli_word_display.o && \
+// RUN: nvq++ %cpp_std --enable-mlir -c %t/pauli_wordle.cpp -o %t/pauli_wordle.o && \
+// RUN: nvq++ %cpp_std --enable-mlir %t/pauli_word_display.o %t/pauli_wordle.o -o %t/pauli_wordle.out && \
+// RUN: %t/pauli_wordle.out | FileCheck %s ; else \
+// RUN: echo "skipping" ; fi
+// clang-format on
+
+//--- pauli_word_display.cpp
+
+#include <iostream>
+#include <span>
+#include <string>
+
+extern "C" {
+void display(std::span<char> x) {
+  std::string s{x.data(), x.size()};
+  std::cout << "pauli word: " << s << '\n';
+}
+}
+
+//--- pauli_wordle.cpp
+
+#include "cudaq.h"
+
+// Fake host C++ signature that matches. Since this is called on the device
+// side, the pauli_word will have been converted to a span.
+extern "C" void display(const cudaq::pauli_word &pw);
+
+__qpu__ void kerny(std::vector<cudaq::pauli_word> arg) {
+  display(arg[0]);
+  display(arg[1]);
+  display(arg[2]);
+  display(arg[3]);
+}
+
+__qpu__ void kernub(cudaq::pauli_word arg) { display(arg); }
+
+int main() {
+  cudaq::pauli_word pw0 = "YYZ";
+  kernub(pw0);
+
+  cudaq::pauli_word pw1 = "ZIZ";
+  cudaq::pauli_word pw2 = "XXXY";
+  cudaq::pauli_word pw3 = "YIIII";
+  std::vector<cudaq::pauli_word> vpw{pw0, pw1, pw2, pw3};
+  kerny(vpw);
+  return 0;
+}
+
+// CHECK: pauli word: YYZ
+// CHECK: pauli word: YYZ
+// CHECK: pauli word: ZIZ
+// CHECK: pauli word: XXXY
+// CHECK: pauli word: YIIII
diff --git a/targettests/execution/exp_pauli.cpp b/targettests/execution/exp_pauli.cpp
index bf7ed5bac1..014d86ccf6 100644
--- a/targettests/execution/exp_pauli.cpp
+++ b/targettests/execution/exp_pauli.cpp
@@ -8,17 +8,18 @@
 // clang-format off
 // Simulators
-// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --enable-mlir --target remote-mqpu -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 --enable-mlir -target remote-mqpu %s -o %t && %t | FileCheck %s
 //
 // Quantum emulators
-// RUN: nvq++ %cpp_std --target quantinuum --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target ionq --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target quantinuum --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target ionq --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target oqc --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target anyon --emulate %s -o %t && %t | FileCheck %s
+
 // 2 different IQM machines for 2 different topologies
-// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target oqc --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
-// RUN: nvq++ %cpp_std --target anyon --emulate -fkernel-exec-kind=2 %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
+// RUN: nvq++ %cpp_std -fkernel-exec-kind=2 -target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
// clang-format
on #include diff --git a/test/AST-Quake/calling_convention-aarch64.cpp b/test/AST-Quake/calling_convention-aarch64.cpp index 174aaf3558..22d60856e0 100644 --- a/test/AST-Quake/calling_convention-aarch64.cpp +++ b/test/AST-Quake/calling_convention-aarch64.cpp @@ -271,7 +271,7 @@ struct V3 { // CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( // CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, // CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, -// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.array}>>) // clang-format on //===----------------------------------------------------------------------===// diff --git a/test/AST-Quake/calling_convention.cpp b/test/AST-Quake/calling_convention.cpp index 3d2c6e2e4a..fcf7c26cda 100644 --- a/test/AST-Quake/calling_convention.cpp +++ b/test/AST-Quake/calling_convention.cpp @@ -278,9 +278,7 @@ struct V3 { // CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, // CHECK-SAME: %[[VAL_3:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) // CHECK-LABEL: func.func @_ZN2V3clESt6vectorIlSaIlEES0_IbSaIbEE( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, -// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>, %[[VAL_2:.*]]: !cc.ptr, !cc.array}>>) // clang-format on //===----------------------------------------------------------------------===// diff --git a/test/Quake/kernel_exec-1.qke b/test/Quake/kernel_exec-1.qke index 37ac7c7229..cd079998ae 100644 --- a/test/Quake/kernel_exec-1.qke +++ b/test/Quake/kernel_exec-1.qke @@ -6,15 +6,13 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --kernel-execution=codegen=1 %s | FileCheck %s -// RUN: cudaq-opt --kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAM %s -// RUN: cudaq-opt --kernel-execution %s | FileCheck --check-prefix=HYBRID %s +// RUN: cudaq-opt -kernel-execution=codegen=1 %s | FileCheck --check-prefix=ALT %s +// RUN: cudaq-opt -kernel-execution=codegen=2 %s | FileCheck --check-prefix=STREAMLINED %s +// RUN: cudaq-opt -kernel-execution %s | FileCheck --check-prefix=HYBRID %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} { -// CHECK-LABEL: func.func @__nvqpp__mlirgen__ghz( - func.func @__nvqpp__mlirgen__ghz(%arg0: i32) -> f64 { %0 = cc.alloca i32 cc.store %arg0, %0 : !cc.ptr @@ -83,174 +81,369 @@ module attributes {quake.mangled_name_map = { } } -// Check the generated code. 
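// Editor's note: the three check prefixes below correspond to the three
// codegen modes of the kernel-execution pass. Reading the declarations in the
// CHECK lines as C-style prototypes gives roughly the following (parameter
// names are inferred, not from the source; DynamicResult is the
// {void *, uint64_t} pair spelled !cc.struct<{!cc.ptr<i8>, i64}> below):
//
//   DynamicResult altLaunchKernel(const char *kernelName, void *thunk,
//                                 void *argsBuffer, uint64_t bufferSize,
//                                 uint64_t resultOffset);        // codegen=1
//   void streamlinedLaunchKernel(const char *kernelName,
//                                void *rawArgsPointerBlock);     // codegen=2
//   DynamicResult hybridLaunchKernel(const char *kernelName, void *thunk,
//                                    void *argsBuffer, uint64_t bufferSize,
//                                    uint64_t resultOffset,
//                                    void *rawArgsPointerBlock); // default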
+// ALT-LABEL: func.func @_ZN3ghzclEi( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// ALT: %[[VAL_2:.*]] = cc.alloca i64 +// ALT: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] +// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> +// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr +// ALT: %[[VAL_7:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_8:.*]] = cc.func_ptr %[[VAL_7]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// ALT: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// ALT: %[[VAL_11:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// ALT: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!llvm.ptr>) -> !cc.ptr +// ALT: %[[VAL_13:.*]] = call @altLaunchKernel(%[[VAL_12]], %[[VAL_8]], %[[VAL_9]], %[[VAL_3]], %[[VAL_10]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_14:.*]] = cc.extract_value %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// ALT: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// ALT: %[[VAL_16:.*]] = arith.constant 0 : i64 +// ALT: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_16]] : i64 +// ALT: cf.cond_br %[[VAL_17]], ^bb1, ^bb2 +// ALT: ^bb1: +// ALT: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cf.br ^bb3(%[[VAL_19]] : !cc.ptr) +// ALT: ^bb2: +// ALT: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cf.br ^bb3(%[[VAL_20]] : !cc.ptr) +// ALT: ^bb3(%[[VAL_21:.*]]: !cc.ptr): +// ALT: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_23:.*]] = cc.load %[[VAL_22]] : !cc.ptr +// ALT: return %[[VAL_23]] : f64 +// ALT: } +// ALT: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// ALT: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// ALT: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// ALT: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// ALT: func.func private @cudaqRegisterKernelName(!cc.ptr) +// ALT: func.func private @malloc(i64) -> !cc.ptr +// ALT: func.func private @free(!cc.ptr) +// ALT: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// ALT: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// ALT: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// ALT-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_0:.*]] = arith.constant 0 : i64 +// ALT: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// ALT: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } + 
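// Editor's note: the next block checks the IR of __nvqpp_createDynamicResult.
// In rough C++ terms the helper behaves like this sketch (inferred from the
// CHECK lines themselves, not from the runtime source; parameter names are
// illustrative):
//
//   struct DynamicResult { void *data; uint64_t size; };
//   DynamicResult __nvqpp_createDynamicResult(void *msg, uint64_t msgSize,
//                                             DynamicResult *span,
//                                             uint64_t ptrOffset) {
//     char *buf = (char *)malloc(msgSize + span->size);
//     memcpy(buf, msg, msgSize);                     // copy original message
//     memcpy(buf + msgSize, span->data, span->size); // append span payload
//     *(char **)(buf + ptrOffset) = buf + msgSize;   // patch inner pointer
//     return {buf, msgSize + span->size};
//   }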
+// ALT-LABEL: func.func private @__nvqpp_createDynamicResult( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i64, +// ALT-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// ALT-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// ALT: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// ALT: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// ALT: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// ALT: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_9:.*]] = arith.constant false +// ALT: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// ALT: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// ALT: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// ALT: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// ALT: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// ALT: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// ALT: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// ALT: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } +// ALT: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} + +// ALT-LABEL: func.func @ghz.returnOffset() -> i64 { +// ALT: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// ALT: return %[[VAL_0]] : i64 +// ALT: } + +// ALT-LABEL: func.func @ghz.thunk( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr, +// ALT-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// ALT: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// ALT: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// ALT: %[[VAL_8:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 +// ALT: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr +// ALT: %[[VAL_10:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// ALT: return %[[VAL_10]] : !cc.struct<{!cc.ptr, i64}> +// ALT: } -// CHECK-LABEL: func.func @_ZN3ghzclEi( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// CHECK-DAG: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] -// CHECK: %[[VAL_8:.*]] = cc.cast 
%[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i64 -// CHECK: cf.cond_br %[[VAL_20]], ^bb1, ^bb2 -// CHECK: ^bb1: -// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: cf.br ^bb3(%[[VAL_22]] : !cc.ptr) -// CHECK: ^bb2: -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr): -// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr -// CHECK: return %[[VAL_26]] : f64 -// CHECK: } +// ALT-LABEL: func.func @ghz.argsCreator( +// ALT-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// ALT-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// ALT: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// ALT: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// ALT: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// ALT: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// ALT: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// ALT: %[[VAL_7:.*]] = cc.alloca i64 +// ALT: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// ALT: %[[VAL_9:.*]] = call @malloc(%[[VAL_8]]) : (i64) -> !cc.ptr +// ALT: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr) -> !cc.ptr> +// ALT: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr>) -> !cc.ptr +// ALT: cc.store %[[VAL_6]], %[[VAL_11]] : !cc.ptr +// ALT: cc.store %[[VAL_9]], %[[VAL_1]] : !cc.ptr> +// ALT: return %[[VAL_8]] : i64 +// ALT: } -// CHECK: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// ALT-LABEL: llvm.func @ghz.kernelRegFunc() { +// ALT: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// ALT: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// ALT: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// ALT: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// ALT: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// ALT: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// ALT: llvm.return +// ALT: } +// ALT: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities 
= [17 : i32]} -// CHECK: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} +// STREAMLINED-LABEL: func.func @_ZN3ghzclEi( +// STREAMLINED-SAME: %[[VAL_0:.*]]: !cc.ptr, +// STREAMLINED-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// STREAMLINED: %[[VAL_2:.*]] = cc.alloca i64 +// STREAMLINED: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// STREAMLINED: %[[VAL_4:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// STREAMLINED: %[[VAL_5:.*]] = cc.alloca !cc.array x 1> +// STREAMLINED: %[[VAL_6:.*]] = cc.sizeof !cc.array x 1> : i64 +// STREAMLINED: %[[VAL_7:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_8:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_7]], %[[VAL_8]] : !cc.ptr>> +// STREAMLINED: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr x 1>>) -> i64 +// STREAMLINED: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_6]] : i64 +// STREAMLINED: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (i64) -> !cc.ptr> +// STREAMLINED: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_11]], %[[VAL_12]] : !cc.ptr>> +// STREAMLINED: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_4]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// STREAMLINED: cc.store %[[VAL_11]], %[[VAL_13]] : !cc.ptr>> +// STREAMLINED: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_15:.*]] = cc.alloca i32 +// STREAMLINED: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr +// STREAMLINED: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr +// STREAMLINED: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> +// STREAMLINED: %[[VAL_17:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// STREAMLINED: %[[VAL_18:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAMLINED: %[[VAL_19:.*]] = cc.cast %[[VAL_18]] : (!llvm.ptr>) -> !cc.ptr +// STREAMLINED: call @streamlinedLaunchKernel(%[[VAL_19]], %[[VAL_17]]) : (!cc.ptr, !cc.ptr) -> () +// STREAMLINED: %[[VAL_20:.*]] = cc.undef f64 +// STREAMLINED: return %[[VAL_20]] : f64 +// STREAMLINED: } +// STREAMLINED: func.func private @streamlinedLaunchKernel(!cc.ptr, !cc.ptr) +// STREAMLINED: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// STREAMLINED: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// STREAMLINED: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// STREAMLINED: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// STREAMLINED: func.func private @cudaqRegisterKernelName(!cc.ptr) +// STREAMLINED: func.func private @malloc(i64) -> !cc.ptr +// STREAMLINED: func.func private @free(!cc.ptr) +// STREAMLINED: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// STREAMLINED: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// STREAMLINED: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -// CHECK-LABEL: func.func @ghz.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { -// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_0]] : 
(!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_20]][%[[VAL_7]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, f64}>) -> i32 -// CHECK: %[[VAL_10:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_9]]) : (i32) -> f64 -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: cc.store %[[VAL_10]], %[[VAL_11]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> -// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> -// CHECK: } +// STREAMLINED-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// STREAMLINED: %[[VAL_0:.*]] = arith.constant 0 : i64 +// STREAMLINED: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// STREAMLINED: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: } -// CHECK-LABEL: func.func @ghz.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_14]][0] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// CHECK: %[[VAL_11:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// CHECK: %[[VAL_12:.*]] = call @malloc(%[[VAL_11]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr) -> !cc.ptr> -// CHECK: cc.store %[[VAL_8]], %[[VAL_13]] : !cc.ptr> -// CHECK: cc.store %[[VAL_12]], %[[VAL_1]] : !cc.ptr> -// CHECK: return %[[VAL_11]] : i64 -// CHECK: } +// STREAMLINED-LABEL: func.func private @__nvqpp_createDynamicResult( +// STREAMLINED-SAME: %[[VAL_0:.*]]: !cc.ptr, +// STREAMLINED-SAME: %[[VAL_1:.*]]: i64, +// STREAMLINED-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// STREAMLINED-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// STREAMLINED: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// STREAMLINED: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// STREAMLINED: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// STREAMLINED: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// STREAMLINED: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// STREAMLINED: %[[VAL_9:.*]] = arith.constant false +// STREAMLINED: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// STREAMLINED: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// STREAMLINED: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// STREAMLINED: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// STREAMLINED: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], 
%[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// STREAMLINED: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// STREAMLINED: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// STREAMLINED: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// STREAMLINED: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// STREAMLINED: } +// STREAMLINED: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} -// CHECK-LABEL: llvm.func @ghz.kernelRegFunc() { -// CHECK: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () -// CHECK: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 -// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr -// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () -// CHECK: llvm.return -// CHECK: } +// STREAMLINED-LABEL: llvm.func @ghz.kernelRegFunc() { +// STREAMLINED: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// STREAMLINED: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// STREAMLINED: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// STREAMLINED: llvm.return +// STREAMLINED: } +// STREAMLINED: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} -// STREAM-LABEL: func.func @_ZN3ghzclEi( -// STREAM-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// STREAM: %[[VAL_2:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// STREAM: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> -// STREAM: %[[VAL_4:.*]] = cc.sizeof !cc.array x 1> : i64 -// STREAM: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> !cc.ptr> -// STREAM: %[[VAL_6:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr>> -// STREAM: %[[VAL_7:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr x 1>>) -> i64 -// STREAM: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_4]] : i64 -// STREAM: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (i64) -> !cc.ptr> -// STREAM: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_9]], %[[VAL_10]] : !cc.ptr>> -// STREAM: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// STREAM: cc.store %[[VAL_9]], %[[VAL_11]] : !cc.ptr>> -// STREAM: %[[VAL_14:.*]] = cc.compute_ptr %[[VAL_3]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// STREAM: %[[VAL_15:.*]] = cc.alloca i32 -// STREAM: cc.store %[[VAL_1]], %[[VAL_15]] : !cc.ptr -// STREAM: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> !cc.ptr -// STREAM: cc.store %[[VAL_16]], %[[VAL_14]] : !cc.ptr> -// STREAM: %[[VAL_19:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// STREAM: %[[VAL_20:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// STREAM: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!llvm.ptr>) -> !cc.ptr -// STREAM: call 
@streamlinedLaunchKernel(%[[VAL_21]], %[[VAL_19]]) : (!cc.ptr, !cc.ptr) -> () -// STREAM: %[[VAL_22:.*]] = cc.undef f64 -// STREAM: return %[[VAL_22]] : f64 -// STREAM: } // HYBRID-LABEL: func.func @_ZN3ghzclEi( -// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i32) -> f64 { -// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_3:.*]] = arith.constant 0 : i64 -// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{i32, f64}>, i32) -> !cc.struct<{i32, f64}> -// HYBRID: %[[VAL_5:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 -// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_3]] : i64 -// HYBRID: %[[VAL_7:.*]] = cc.alloca i8{{\[}}%[[VAL_6]] : i64] -// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr> -// HYBRID: cc.store %[[VAL_4]], %[[VAL_8]] : !cc.ptr> -// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr>) -> !cc.ptr x ?>> -// HYBRID: %[[VAL_10:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// HYBRID: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_10]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// HYBRID: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 -// HYBRID: %[[VAL_14:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> -// HYBRID: %[[VAL_15:.*]] = cc.alloca !cc.array x 1> -// HYBRID: %[[VAL_16:.*]] = cc.sizeof !cc.array x 1> : i64 -// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_17]], %[[VAL_18]] : !cc.ptr>> -// HYBRID: %[[VAL_19:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr x 1>>) -> i64 -// HYBRID: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] : i64 -// HYBRID: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (i64) -> !cc.ptr> -// HYBRID: %[[VAL_22:.*]] = cc.compute_ptr %[[VAL_14]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_21]], %[[VAL_22]] : !cc.ptr>> -// HYBRID: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_14]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> -// HYBRID: cc.store %[[VAL_21]], %[[VAL_23]] : !cc.ptr>> -// HYBRID: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_15]][0] : (!cc.ptr x 1>>) -> !cc.ptr> -// HYBRID: %[[VAL_25:.*]] = cc.alloca i32 -// HYBRID: cc.store %[[VAL_1]], %[[VAL_25]] : !cc.ptr -// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// HYBRID: cc.store %[[VAL_26]], %[[VAL_24]] : !cc.ptr> -// HYBRID: %[[VAL_27:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr -// HYBRID: %[[VAL_28:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> -// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// HYBRID: %[[VAL_30:.*]] = call @hybridLaunchKernel(%[[VAL_29]], %[[VAL_11]], %[[VAL_12]], %[[VAL_6]], %[[VAL_13]], %[[VAL_27]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> -// HYBRID: %[[VAL_31:.*]] = cc.extract_value %[[VAL_30]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 -// HYBRID: %[[VAL_33:.*]] = arith.constant 0 : i64 -// HYBRID: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_32]], %[[VAL_33]] : i64 -// HYBRID: cf.cond_br %[[VAL_34]], ^bb1, ^bb2 +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i32) -> f64 { +// HYBRID: %[[VAL_2:.*]] = cc.alloca i64 +// HYBRID: %[[VAL_3:.*]] = cc.sizeof 
!cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> +// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_1]], %[[VAL_6]] : !cc.ptr +// HYBRID: %[[VAL_7:.*]] = constant @ghz.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_8:.*]] = cc.func_ptr %[[VAL_7]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: %[[VAL_11:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// HYBRID: %[[VAL_12:.*]] = cc.alloca !cc.array x 1> +// HYBRID: %[[VAL_13:.*]] = cc.sizeof !cc.array x 1> : i64 +// HYBRID: %[[VAL_14:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_15:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_14]], %[[VAL_15]] : !cc.ptr>> +// HYBRID: %[[VAL_16:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr x 1>>) -> i64 +// HYBRID: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_13]] : i64 +// HYBRID: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (i64) -> !cc.ptr> +// HYBRID: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_18]], %[[VAL_19]] : !cc.ptr>> +// HYBRID: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_11]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// HYBRID: cc.store %[[VAL_18]], %[[VAL_20]] : !cc.ptr>> +// HYBRID: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_12]][0] : (!cc.ptr x 1>>) -> !cc.ptr> +// HYBRID: %[[VAL_22:.*]] = cc.alloca i32 +// HYBRID: cc.store %[[VAL_1]], %[[VAL_22]] : !cc.ptr +// HYBRID: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr) -> !cc.ptr +// HYBRID: cc.store %[[VAL_23]], %[[VAL_21]] : !cc.ptr> +// HYBRID: %[[VAL_24:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// HYBRID: %[[VAL_25:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_27:.*]] = call @hybridLaunchKernel(%[[VAL_26]], %[[VAL_8]], %[[VAL_9]], %[[VAL_3]], %[[VAL_10]], %[[VAL_24]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_28:.*]] = cc.extract_value %[[VAL_27]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// HYBRID: %[[VAL_29:.*]] = cc.cast %[[VAL_28]] : (!cc.ptr) -> i64 +// HYBRID: %[[VAL_30:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_31:.*]] = arith.cmpi ne, %[[VAL_29]], %[[VAL_30]] : i64 +// HYBRID: cf.cond_br %[[VAL_31]], ^bb1, ^bb2 // HYBRID: ^bb1: -// HYBRID: %[[VAL_35:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr> -// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_35]][1] : (!cc.ptr>) -> !cc.ptr -// HYBRID: cf.br ^bb3(%[[VAL_36]] : !cc.ptr) +// HYBRID: %[[VAL_32:.*]] = cc.cast %[[VAL_28]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_33:.*]] = cc.compute_ptr %[[VAL_32]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_33]] : !cc.ptr) // HYBRID: ^bb2: -// HYBRID: %[[VAL_37:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: cf.br ^bb3(%[[VAL_37]] : !cc.ptr) -// HYBRID: ^bb3(%[[VAL_38:.*]]: !cc.ptr): -// HYBRID: %[[VAL_39:.*]] = cc.compute_ptr %[[VAL_9]][0, 1] : (!cc.ptr x ?>>) -> !cc.ptr -// HYBRID: %[[VAL_40:.*]] = cc.load %[[VAL_39]] : !cc.ptr -// HYBRID: return %[[VAL_40]] : f64 +// 
HYBRID: %[[VAL_34:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cf.br ^bb3(%[[VAL_34]] : !cc.ptr) +// HYBRID: ^bb3(%[[VAL_35:.*]]: !cc.ptr): +// HYBRID: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_37:.*]] = cc.load %[[VAL_36]] : !cc.ptr +// HYBRID: return %[[VAL_37]] : f64 +// HYBRID: } +// HYBRID: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// HYBRID: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// HYBRID: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// HYBRID: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// HYBRID: func.func private @cudaqRegisterKernelName(!cc.ptr) +// HYBRID: func.func private @malloc(i64) -> !cc.ptr +// HYBRID: func.func private @free(!cc.ptr) +// HYBRID: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// HYBRID: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// HYBRID: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// HYBRID-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_0:.*]] = arith.constant 0 : i64 +// HYBRID: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// HYBRID: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// HYBRID: } + +// HYBRID-LABEL: func.func private @__nvqpp_createDynamicResult( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i64, +// HYBRID-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// HYBRID-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// HYBRID: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// HYBRID: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// HYBRID: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// HYBRID: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_9:.*]] = arith.constant false +// HYBRID: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// HYBRID: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// HYBRID: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// HYBRID: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// HYBRID: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> 
!cc.ptr +// HYBRID: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// HYBRID: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> // HYBRID: } +// HYBRID: llvm.mlir.global external constant @ghz.kernelName("ghz\00") {addr_space = 0 : i32} + +// HYBRID-LABEL: func.func @ghz.returnOffset() -> i64 { +// HYBRID: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, f64}> [1] : i64 +// HYBRID: return %[[VAL_0]] : i64 +// HYBRID: } + +// HYBRID-LABEL: func.func @ghz.thunk( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr, +// HYBRID-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// HYBRID: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// HYBRID: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// HYBRID: %[[VAL_8:.*]] = call @__nvqpp__mlirgen__ghz(%[[VAL_7]]) : (i32) -> f64 +// HYBRID: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr +// HYBRID: %[[VAL_10:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> +// HYBRID: return %[[VAL_10]] : !cc.struct<{!cc.ptr, i64}> +// HYBRID: } + +// HYBRID-LABEL: func.func @ghz.argsCreator( +// HYBRID-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// HYBRID-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// HYBRID: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// HYBRID: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// HYBRID: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// HYBRID: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// HYBRID: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// HYBRID: %[[VAL_7:.*]] = cc.alloca i64 +// HYBRID: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, f64}> : i64 +// HYBRID: %[[VAL_9:.*]] = call @malloc(%[[VAL_8]]) : (i64) -> !cc.ptr +// HYBRID: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr) -> !cc.ptr> +// HYBRID: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr>) -> !cc.ptr +// HYBRID: cc.store %[[VAL_6]], %[[VAL_11]] : !cc.ptr +// HYBRID: cc.store %[[VAL_9]], %[[VAL_1]] : !cc.ptr> +// HYBRID: return %[[VAL_8]] : i64 +// HYBRID: } + +// HYBRID-LABEL: llvm.func @ghz.kernelRegFunc() { +// HYBRID: %[[VAL_0:.*]] = llvm.mlir.addressof @ghz.kernelName : !llvm.ptr> +// HYBRID: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!llvm.ptr>) -> !cc.ptr +// HYBRID: func.call @cudaqRegisterKernelName(%[[VAL_1]]) : (!cc.ptr) -> () +// HYBRID: %[[VAL_2:.*]] = func.constant @ghz.argsCreator : (!cc.ptr>, !cc.ptr>) -> i64 +// HYBRID: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_2]] : ((!cc.ptr>, !cc.ptr>) -> i64) -> !cc.ptr +// HYBRID: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () +// HYBRID: llvm.return +// HYBRID: } +// HYBRID: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} + diff --git a/test/Quake/kernel_exec-2.qke b/test/Quake/kernel_exec-2.qke index a9b04b8449..b94412cb11 100644 --- a/test/Quake/kernel_exec-2.qke +++ b/test/Quake/kernel_exec-2.qke @@ -6,7 +6,7 @@ // the terms of the Apache License 2.0 which accompanies this distribution. 
// // ========================================================================== // -// RUN: cudaq-opt --kernel-execution=codegen=1 %s | FileCheck %s +// RUN: cudaq-opt -kernel-execution %s | FileCheck %s module attributes {quake.mangled_name_map = { __nvqpp__mlirgen__function_hawaiian = "shirt", @@ -36,120 +36,210 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { } } +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_cargo( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.stdvec, +// CHECK-SAME: %[[VAL_1:.*]]: !quake.ref) attributes {"cudaq-kernel", no_this} { +// CHECK: return +// CHECK: } + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_hawaiian( +// CHECK-SAME: %[[VAL_0:.*]]: i1, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.stdvec) attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.ref +// CHECK: cc.if(%[[VAL_0]]) { +// CHECK: quake.x %[[VAL_4]] : (!quake.ref) -> () +// CHECK: } +// CHECK: call @__nvqpp__mlirgen__function_cargo(%[[VAL_1]], %[[VAL_4]]) : (!cc.stdvec, !quake.ref) -> () +// CHECK: return +// CHECK: } + // CHECK-LABEL: func.func @shirt( -// CHECK-SAME: %[[VAL_0:.*]]: i1, %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i1, i64}> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_2]][0] : (!cc.struct<{i1, i64}>, i1) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK-SAME: %[[VAL_0:.*]]: i1, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>>) { +// CHECK: %[[VAL_2:.*]] = cc.alloca i64 +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_4]] : !cc.ptr> // CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_5]] : !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_9]], %[[VAL_10]] : i64 -// CHECK: %[[VAL_12:.*]] = cc.insert_value %[[VAL_11]], %[[VAL_4]][1] : (!cc.struct<{i1, i64}>, i64) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_3]], %[[VAL_11]] : i64 -// CHECK: %[[VAL_16:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_13]] : i64 -// CHECK: %[[VAL_18:.*]] = cc.alloca i8[%[[VAL_17]] : i64] -// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_18]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_12]], %[[VAL_19]] : !cc.ptr> -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_18]] : (!cc.ptr>) -> !cc.ptr x ?>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_18]][%[[VAL_16]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_22:.*]] = cc.extract_value %[[VAL_12]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_23:.*]] = arith.constant false -// CHECK: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_25:.*]] = cc.load 
%[[VAL_24]] : !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr -// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_21]], %[[VAL_26]], %[[VAL_22]], %[[VAL_23]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_21]] : -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_90]][%[[VAL_22]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_29:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_31:.*]] = cc.func_ptr %[[VAL_29]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr x ?>>) -> !cc.ptr -// CHECK: %[[VAL_33:.*]] = arith.constant 2147483647 : i64 -// CHECK: %[[VAL_28:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_28]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: call @altLaunchKernel(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_17]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_8]], %[[VAL_9]] : i64 +// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_12:.*]] = cc.alloca i8{{\[}}%[[VAL_11]] : i64] +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr>) -> !cc.ptr> +// CHECK: %[[VAL_14:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_12]]{{\[}}%[[VAL_15]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.compute_ptr %[[VAL_13]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_0]], %[[VAL_17]] : !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_13]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_1]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_1]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_19]] : !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_20]] : !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.subi %[[VAL_23]], %[[VAL_24]] : i64 +// CHECK: cc.store %[[VAL_25]], %[[VAL_18]] : !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.load %[[VAL_26]] : !cc.ptr> +// CHECK: %[[VAL_28:.*]] = arith.constant false +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_29]], %[[VAL_27]], %[[VAL_25]], %[[VAL_28]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_30]]{{\[}}%[[VAL_25]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_32:.*]] = constant @function_hawaiian.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_33:.*]] = cc.func_ptr %[[VAL_32]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_13]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_35:.*]] = arith.constant 2147483647 : i64 +// CHECK: %[[VAL_36:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_37:.*]] = cc.alloca !cc.array x 2> +// CHECK: %[[VAL_38:.*]] = cc.sizeof !cc.array x 2> : i64 +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr x 2>>) -> !cc.ptr> +// 
CHECK: %[[VAL_40:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_39]], %[[VAL_40]] : !cc.ptr>> +// CHECK: %[[VAL_41:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr x 2>>) -> i64 +// CHECK: %[[VAL_42:.*]] = arith.addi %[[VAL_41]], %[[VAL_38]] : i64 +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_42]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.compute_ptr %[[VAL_36]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_43]], %[[VAL_44]] : !cc.ptr>> +// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_36]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_43]], %[[VAL_45]] : !cc.ptr>> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_37]][0] : (!cc.ptr x 2>>) -> !cc.ptr> +// CHECK: %[[VAL_47:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_0]], %[[VAL_47]] : !cc.ptr +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_47]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_48]], %[[VAL_46]] : !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.compute_ptr %[[VAL_37]][1] : (!cc.ptr x 2>>) -> !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr +// CHECK: cc.store %[[VAL_50]], %[[VAL_49]] : !cc.ptr> +// CHECK: %[[VAL_51:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_52:.*]] = llvm.mlir.addressof @function_hawaiian.kernelName : !llvm.ptr> +// CHECK: %[[VAL_53:.*]] = cc.cast %[[VAL_52]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_54:.*]] = call @hybridLaunchKernel(%[[VAL_53]], %[[VAL_33]], %[[VAL_34]], %[[VAL_11]], %[[VAL_35]], %[[VAL_51]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> // CHECK: return // CHECK: } - -// CHECK-DAG: func.func private @altLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK-DAG: func.func private @cudaqRegisterKernelName(!cc.ptr) -// CHECK-DAG: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) -// CHECK-DAG: func.func private @malloc(i64) -> !cc.ptr -// CHECK-DAG: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -// CHECK-DAG: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) +// CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// CHECK: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// CHECK: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// CHECK: func.func private @malloc(i64) -> !cc.ptr +// CHECK: func.func private @free(!cc.ptr) +// CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) // CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, 
!cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } // CHECK-LABEL: func.func private @__nvqpp_createDynamicResult( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: !cc.ptr, i64}>>, %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i64 +// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_9:.*]] = arith.constant false +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_7]], %[[VAL_0]], %[[VAL_1]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_5]], %[[VAL_9]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_14:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_15:.*]] = cc.insert_value %[[VAL_6]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } +// CHECK: llvm.mlir.global external constant @function_hawaiian.kernelName("function_hawaiian\00") {addr_space = 0 : i32} -// CHECK: llvm.mlir.global external constant @function_hawaiian.kernelName("function +// CHECK-LABEL: func.func @function_hawaiian.returnOffset() -> i64 { +// CHECK: %[[VAL_0:.*]] = arith.constant 2147483647 : i64 +// CHECK: return %[[VAL_0]] : i64 +// CHECK: } // CHECK-LABEL: func.func @function_hawaiian.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_20]][%[[VAL_7]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i1, i64}>) -> i1 -// CHECK: %[[VAL_10:.*]] = cc.extract_value %[[VAL_3]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_11:.*]] = arith.constant 4 : i64 -// CHECK: %[[VAL_12:.*]] = arith.divsi %[[VAL_10]], %[[VAL_11]] : i64 -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.stdvec_init %[[VAL_13]], %[[VAL_12]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_8]] : -// CHECK: %[[VAL_15:.*]] = 
cc.compute_ptr %[[VAL_90]][%[[VAL_10]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_9]], %[[VAL_14]]) : (i1, !cc.stdvec) -> () +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.sizeof i32 : i64 +// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_8]] : !cc.ptr +// CHECK: %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_9]] : i64 +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.stdvec_init %[[VAL_12]], %[[VAL_11]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_14]]{{\[}}%[[VAL_10]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @__nvqpp__mlirgen__function_hawaiian(%[[VAL_7]], %[[VAL_13]]) : (i1, !cc.stdvec) -> () // CHECK: %[[VAL_16:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> // CHECK: return %[[VAL_16]] : !cc.struct<{!cc.ptr, i64}> // CHECK: } // CHECK-LABEL: func.func @function_hawaiian.argsCreator( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, %[[VAL_1:.*]]: !cc.ptr>) -> i64 { -// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{i1, i64}> -// CHECK: %[[VAL_90:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_90]][0] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr) -> !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_2]][0] : (!cc.struct<{i1, i64}>, i1) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_90]][1] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_9]] : !cc.ptr> -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_11]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr x ?>> +// CHECK: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr x ?>>) -> !cc.ptr> +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr x ?>>) -> !cc.ptr> +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_9:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_10:.*]] = cc.alloca i64 +// CHECK: %[[VAL_11:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_8]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> // CHECK: %[[VAL_14:.*]] = 
cc.load %[[VAL_12]] : !cc.ptr> // CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_13]] : !cc.ptr> // CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> i64 // CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_16]], %[[VAL_17]] : i64 -// CHECK: %[[VAL_19:.*]] = cc.insert_value %[[VAL_18]], %[[VAL_8]][1] : (!cc.struct<{i1, i64}>, i64) -> !cc.struct<{i1, i64}> -// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_3]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_11]] : i64 +// CHECK: %[[VAL_20:.*]] = call @malloc(%[[VAL_19]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_21:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_22:.*]] = cc.alloca !cc.ptr // CHECK: %[[VAL_23:.*]] = cc.sizeof !cc.struct<{i1, i64}> : i64 -// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_20]] : i64 -// CHECK: %[[VAL_25:.*]] = call @malloc(%[[VAL_24]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr> -// CHECK: cc.store %[[VAL_19]], %[[VAL_26]] : !cc.ptr> -// CHECK: %[[VAL_80:.*]] = cc.cast %[[VAL_25]] : -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_80]][%[[VAL_23]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.extract_value %[[VAL_19]][1] : (!cc.struct<{i1, i64}>) -> i64 -// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_90]][1] : (!cc.ptr x ?>>) -> !cc.ptr> -// CHECK: %[[VAL_30:.*]] = cc.load %[[VAL_29]] : !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_32:.*]] = arith.constant false -// CHECK: %[[VAL_33:.*]] = cc.compute_ptr %[[VAL_31]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_33]] : !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!cc.ptr) -> !cc.ptr -// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_27]], %[[VAL_35]], %[[VAL_28]], %[[VAL_32]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () -// CHECK: %[[VAL_83:.*]] = cc.cast %[[VAL_27]] : -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_83]][%[[VAL_28]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_25]], %[[VAL_1]] : !cc.ptr> -// CHECK: return %[[VAL_24]] : i64 +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_25:.*]] = cc.compute_ptr %[[VAL_24]]{{\[}}%[[VAL_23]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_21]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_9]], %[[VAL_26]] : !cc.ptr +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_8]][0] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_30:.*]] = cc.load %[[VAL_28]] : !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.load %[[VAL_29]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64 +// CHECK: cc.store %[[VAL_34]], %[[VAL_27]] : !cc.ptr +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_36:.*]] = cc.load %[[VAL_35]] : !cc.ptr> +// CHECK: %[[VAL_37:.*]] = arith.constant false +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_38]], %[[VAL_36]], %[[VAL_34]], %[[VAL_37]]) : 
(!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_39]]{{\[}}%[[VAL_34]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_20]], %[[VAL_1]] : !cc.ptr> +// CHECK: return %[[VAL_19]] : i64 // CHECK: } // CHECK-LABEL: llvm.func @function_hawaiian.kernelRegFunc() { @@ -161,6 +251,5 @@ __nvqpp__mlirgen__function_cargo = "pants"}} { // CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_1]], %[[VAL_3]]) : (!cc.ptr, !cc.ptr) -> () // CHECK: llvm.return // CHECK: } - // CHECK: llvm.mlir.global_ctors {ctors = [@function_hawaiian.kernelRegFunc], priorities = [17 : i32]} diff --git a/test/Quake/lambda_kernel_exec.qke b/test/Quake/lambda_kernel_exec.qke index 606b644ffe..aedb9564b5 100644 --- a/test/Quake/lambda_kernel_exec.qke +++ b/test/Quake/lambda_kernel_exec.qke @@ -15,7 +15,7 @@ // CHECK: llvm.call @cudaqRegisterLambdaName(%[[VAL_1]], %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> () module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHaveMultiple = "_ZZ4mainENK3$_1clEv", __nvqpp__mlirgen__lambda.main.test = "_ZZ4mainENK3$_0clEv"}} { - func.func @__nvqpp__mlirgen__lambda.main.test() attributes {"cudaq-entrypoint"} { + func.func @__nvqpp__mlirgen__lambda.main.test() attributes {"cudaq-entrypoint", no_this} { %c2_i32 = arith.constant 2 : i32 %0 = arith.extsi %c2_i32 : i32 to i64 %1 = quake.alloca !quake.veq[%0 : i64] @@ -54,7 +54,7 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHa // CHECK-NEXT: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!llvm.ptr>) -> !llvm.ptr // CHECK: llvm.call @cudaqRegisterLambdaName(%[[VAL_4]], %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> () - func.func @__nvqpp__mlirgen__lambda.main.canHaveMultiple() attributes {"cudaq-entrypoint"} { + func.func @__nvqpp__mlirgen__lambda.main.canHaveMultiple() attributes {"cudaq-entrypoint", no_this} { %c2_i32 = arith.constant 2 : i32 %0 = arith.extsi %c2_i32 : i32 to i64 %1 = quake.alloca !quake.veq[%0 : i64] diff --git a/test/Quake/return_vector.qke b/test/Quake/return_vector.qke index 90ccc90610..58bcd2f089 100644 --- a/test/Quake/return_vector.qke +++ b/test/Quake/return_vector.qke @@ -6,8 +6,7 @@ // the terms of the Apache License 2.0 which accompanies this distribution. // // ========================================================================== // -// RUN: cudaq-opt --add-dealloc --kernel-execution=codegen=1 --canonicalize %s \ -// RUN: | FileCheck %s +// RUN: cudaq-opt -add-dealloc -kernel-execution -canonicalize %s | FileCheck %s // NB: the mangled name map is required for the kernel-execution pass. 
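// For readers unfamiliar with the pass: the map pairs each device-side kernel
// (the `__nvqpp__mlirgen__` symbol) with the mangled name of the host-side
// stub that kernel-execution rewrites into argument marshaling plus a call to
// the launcher (here `hybridLaunchKernel`, as the CHECK lines below verify).
// A minimal sketch of the expected shape, using hypothetical kernel/stub
// names rather than the ones in this test:
//
//   module attributes {quake.mangled_name_map = {
//       __nvqpp__mlirgen__my_kernel = "_Z14my_kernel_stubi"}} {
//     func.func @__nvqpp__mlirgen__my_kernel(%arg0: i32) attributes {"cudaq-entrypoint"} {
//       return
//     }
//     // Host-side stub; the pass replaces its body with buffer packing and a
//     // call to the launcher.
//     func.func @_Z14my_kernel_stubi(%arg0: i32) {
//       return
//     }
//   }
//
// Without the attribute the pass cannot associate a kernel with its host stub,
// hence the NB above.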
module attributes{ quake.mangled_name_map = { @@ -29,61 +28,88 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec { -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 8 : i64 -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 256 : i64 -// CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: return %[[VAL_5]] : !cc.stdvec +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK: %[[VAL_1:.*]] = arith.constant 8 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 256 : i64 +// CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_3]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_4]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @test_0( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 4 : i64 -// CHECK: %[[VAL_4:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: %[[VAL_4:.*]] = arith.constant 8 : i64 +// CHECK: %[[VAL_5:.*]] = constant @test_0.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_7:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.alloca i64 +// CHECK: %[[VAL_10:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_11:.*]] = cc.alloca i8{{\[}}%[[VAL_10]] : i64] +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_13:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_14:.*]] 
= cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_14]] : !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_7]] : !cc.ptr> +// CHECK: %[[VAL_16:.*]] = cc.cast %[[VAL_15]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_16]], %[[VAL_6]] : i64 +// CHECK: cc.if(%[[VAL_17]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_15]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_18:.*]] = cc.func_ptr %[[VAL_5]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_20:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_22:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr>> +// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_27:.*]] = cc.cast %[[VAL_26]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_21]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.compute_ptr %[[VAL_21]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_27]], %[[VAL_29]] : !cc.ptr>> +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_22]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_31:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_31]] : !cc.ptr +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_31]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_32]], %[[VAL_30]] : !cc.ptr> +// CHECK: %[[VAL_33:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_34:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr> +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_36:.*]] = call @hybridLaunchKernel(%[[VAL_35]], %[[VAL_18]], %[[VAL_19]], %[[VAL_10]], %[[VAL_20]], %[[VAL_33]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_37:.*]] = cc.extract_value %[[VAL_36]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_38:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_38]], %[[VAL_6]] : i64 +// CHECK: cf.cond_br %[[VAL_39]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_40:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_40]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: 
%[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> -// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_42:.*]] = cc.compute_ptr %[[VAL_12]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_42]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_43:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_44:.*]] = cc.cast %[[VAL_43]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.load %[[VAL_44]] : !cc.ptr> +// CHECK: %[[VAL_46:.*]] = cc.compute_ptr %[[VAL_43]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.load %[[VAL_46]] : !cc.ptr +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_50]], %[[VAL_49]] : !cc.ptr> +// CHECK: %[[VAL_51:.*]] = cc.compute_ptr %[[VAL_48]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_52:.*]] = arith.muli %[[VAL_47]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_53:.*]] = cc.cast %[[VAL_45]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_53]]{{\[}}%[[VAL_52]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_54]], %[[VAL_51]] : !cc.ptr> +// CHECK: %[[VAL_55:.*]] = cc.compute_ptr %[[VAL_48]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_54]], %[[VAL_55]] : !cc.ptr> +// CHECK: call @free(%[[VAL_37]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } @@ -100,72 +126,150 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec { -// CHECK: %[[VAL_2:.*]] = arith.constant 9 : i64 -// CHECK: %[[VAL_3:.*]] = arith.constant 520 : i64 -// CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_3]]) : (i64) -> !cc.ptr -// CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_2]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: return +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { +// CHECK: %[[VAL_1:.*]] = arith.constant 9 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 520 : i64 +// CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.stdvec_init %[[VAL_3]], %[[VAL_1]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_4]] : !cc.stdvec // CHECK: } // CHECK-LABEL: func.func @test_1( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: 
i32) { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %[[VAL_1:.*]]: !cc.ptr, %[[VAL_2:.*]]: i32) { // CHECK: %[[VAL_3:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_4:.*]] = constant @test_1.thunk : (!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}> // CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_6:.*]] = cc.undef !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_7:.*]] = cc.insert_value %[[VAL_2]], %[[VAL_6]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>, i32) -> !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> -// CHECK: %[[VAL_8:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_9:.*]] = cc.alloca i8{{\[}}%[[VAL_8]] : i64] -// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: cc.store %[[VAL_7]], %[[VAL_10]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_11:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!llvm.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = call @altLaunchKernel(%[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_8]], %[[VAL_13]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64) -> !cc.struct<{!cc.ptr, i64}> -// CHECK: %[[VAL_17:.*]] = cc.extract_value %[[VAL_16]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> i64 -// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_5]] : i64 -// CHECK: cf.cond_br %[[VAL_19]], ^bb1, ^bb2 +// CHECK: %[[VAL_6:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_5]] : (i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_7]], %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_8:.*]] = cc.alloca i64 +// CHECK: %[[VAL_9:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_9]] : i64] +// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_12:.*]] = cc.alloca !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_2]], %[[VAL_13]] : !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_6]] : !cc.ptr> +// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_14]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_5]] : i64 +// CHECK: cc.if(%[[VAL_16]]) { +// CHECK: func.call @__nvqpp_vector_bool_free_temporary_initlists(%[[VAL_14]]) : (!cc.ptr) -> () +// CHECK: } +// CHECK: %[[VAL_17:.*]] = cc.func_ptr %[[VAL_4]] : ((!cc.ptr, i1) -> !cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: %[[VAL_20:.*]] = cc.alloca !cc.struct<{!cc.ptr>, !cc.ptr>, !cc.ptr>}> +// CHECK: %[[VAL_21:.*]] = cc.alloca !cc.array x 1> +// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_23:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_22]], %[[VAL_23]] : !cc.ptr>> +// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> i64 +// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_3]] : i64 
+// CHECK: %[[VAL_26:.*]] = cc.cast %[[VAL_25]] : (i64) -> !cc.ptr> +// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_27]] : !cc.ptr>> +// CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_20]][2] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr>> +// CHECK: cc.store %[[VAL_26]], %[[VAL_28]] : !cc.ptr>> +// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr x 1>>) -> !cc.ptr> +// CHECK: %[[VAL_30:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_2]], %[[VAL_30]] : !cc.ptr +// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_30]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_31]], %[[VAL_29]] : !cc.ptr> +// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_20]] : (!cc.ptr>, !cc.ptr>, !cc.ptr>}>>) -> !cc.ptr +// CHECK: %[[VAL_33:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr> +// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_33]] : (!llvm.ptr>) -> !cc.ptr +// CHECK: %[[VAL_35:.*]] = call @hybridLaunchKernel(%[[VAL_34]], %[[VAL_17]], %[[VAL_18]], %[[VAL_9]], %[[VAL_19]], %[[VAL_32]]) : (!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_36:.*]] = cc.extract_value %[[VAL_35]][0] : (!cc.struct<{!cc.ptr, i64}>) -> !cc.ptr +// CHECK: %[[VAL_37:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> i64 +// CHECK: %[[VAL_38:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_5]] : i64 +// CHECK: cf.cond_br %[[VAL_38]], ^bb1, ^bb2 // CHECK: ^bb1: -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_20]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_21]] : !cc.ptr, i64}>>) +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_36]] : (!cc.ptr) -> !cc.ptr, i64}>}>> +// CHECK: %[[VAL_40:.*]] = cc.compute_ptr %[[VAL_39]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> +// CHECK: cf.br ^bb3(%[[VAL_40]] : !cc.ptr, i64}>>) // CHECK: ^bb2: -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> -// CHECK: cf.br ^bb3(%[[VAL_23]] : !cc.ptr, i64}>>) -// CHECK: ^bb3(%[[VAL_24:.*]]: !cc.ptr, i64}>>): -// CHECK: %[[VAL_25:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr, i64}>>) -> !cc.ptr> -// CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr> -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr, i64}>>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.load %[[VAL_27]] : !cc.ptr -// CHECK: %[[VAL_29:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> -// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_31:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr> -// CHECK: %[[VAL_32:.*]] = cc.compute_ptr %[[VAL_29]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = arith.muli %[[VAL_28]], %[[VAL_3]] : i64 -// CHECK: %[[VAL_34:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_35:.*]] = cc.compute_ptr %[[VAL_34]]{{\[}}%[[VAL_33]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: cc.store %[[VAL_35]], %[[VAL_32]] : !cc.ptr> -// CHECK: %[[VAL_36:.*]] = cc.compute_ptr %[[VAL_29]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> -// CHECK: cc.store %[[VAL_35]], %[[VAL_36]] : !cc.ptr> -// CHECK: call @free(%[[VAL_17]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_41:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> 
+// CHECK: cf.br ^bb3(%[[VAL_41]] : !cc.ptr, i64}>>) +// CHECK: ^bb3(%[[VAL_42:.*]]: !cc.ptr, i64}>>): +// CHECK: %[[VAL_43:.*]] = cc.cast %[[VAL_42]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_44:.*]] = cc.load %[[VAL_43]] : !cc.ptr> +// CHECK: %[[VAL_45:.*]] = cc.compute_ptr %[[VAL_42]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_46:.*]] = cc.load %[[VAL_45]] : !cc.ptr +// CHECK: %[[VAL_47:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr, !cc.ptr, !cc.ptr}>> +// CHECK: %[[VAL_48:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_49:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr +// CHECK: cc.store %[[VAL_49]], %[[VAL_48]] : !cc.ptr> +// CHECK: %[[VAL_50:.*]] = cc.compute_ptr %[[VAL_47]][1] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: %[[VAL_51:.*]] = arith.muli %[[VAL_46]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_52:.*]] = cc.cast %[[VAL_44]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_53:.*]] = cc.compute_ptr %[[VAL_52]]{{\[}}%[[VAL_51]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: cc.store %[[VAL_53]], %[[VAL_50]] : !cc.ptr> +// CHECK: %[[VAL_54:.*]] = cc.compute_ptr %[[VAL_47]][2] : (!cc.ptr, !cc.ptr, !cc.ptr}>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_53]], %[[VAL_54]] : !cc.ptr> +// CHECK: call @free(%[[VAL_36]]) : (!cc.ptr) -> () // CHECK: return // CHECK: } } +// CHECK: func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: func.func private @cudaqRegisterArgsCreator(!cc.ptr, !cc.ptr) +// CHECK: llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} +// CHECK: func.func private @__cudaq_registerLinkableKernel(!cc.ptr, !cc.ptr, !cc.ptr) +// CHECK: func.func private @__cudaq_getLinkableKernelKey(!cc.ptr) -> i64 +// CHECK: func.func private @cudaqRegisterKernelName(!cc.ptr) +// CHECK: func.func private @free(!cc.ptr) +// CHECK: func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) +// CHECK: func.func private @__nvqpp_vector_bool_to_initializer_list(!cc.ptr, !cc.ptr, !cc.ptr}>>, !cc.ptr, !cc.array}>>, !cc.ptr>) +// CHECK: func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) + +// CHECK-LABEL: func.func private @__nvqpp_zeroDynamicResult() -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (i64) -> !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_3:.*]] = cc.insert_value %[[VAL_1]], %[[VAL_2]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_0]], %[[VAL_3]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: return %[[VAL_4]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } + +// CHECK-LABEL: func.func private @__nvqpp_createDynamicResult( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i64, +// CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, +// CHECK-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK: %[[VAL_4:.*]] = arith.constant false +// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_7:.*]] = arith.addi %[[VAL_1]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_8:.*]] = call @malloc(%[[VAL_7]]) : (i64) -> !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr) -> !cc.ptr> +// CHECK: call 
@llvm.memcpy.p0i8.p0i8.i64(%[[VAL_8]], %[[VAL_0]], %[[VAL_1]], %[[VAL_4]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr, i64}>>) -> !cc.ptr> +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_1]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_12]], %[[VAL_11]], %[[VAL_6]], %[[VAL_4]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: %[[VAL_13:.*]] = cc.undef !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_14:.*]] = cc.insert_value %[[VAL_8]], %[[VAL_13]][0] : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_15:.*]] = cc.insert_value %[[VAL_7]], %[[VAL_14]][1] : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_3]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr) -> !cc.ptr> +// CHECK: cc.store %[[VAL_12]], %[[VAL_17]] : !cc.ptr> +// CHECK: return %[[VAL_15]] : !cc.struct<{!cc.ptr, i64}> +// CHECK: } +// CHECK: llvm.mlir.global external constant @test_0.kernelName("test_0\00") {addr_space = 0 : i32} + +// CHECK-LABEL: func.func @test_0.returnOffset() -> i64 { +// CHECK: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 +// CHECK: return %[[VAL_0]] : i64 +// CHECK: } + // CHECK-LABEL: func.func @test_0.thunk( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr, +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr, i64}>}>> -// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 -// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}>) -> i32 +// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr // CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_0(%[[VAL_5]]) : (i32) -> !cc.stdvec // CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>}>>) -> !cc.ptr, i64}>> // CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr, i64}>>) -> !cc.ptr> @@ -174,19 +278,53 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>>) -> !cc.ptr, i64}>> // CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> [1] : i64 -// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_3]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr, i64, !cc.ptr, i64}>>, i64) -> !cc.struct<{!cc.ptr, i64}> // CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr, i64}> // CHECK: ^bb2: // CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr, i64}> // CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr, i64}> // CHECK: } +// CHECK-LABEL: func.func @test_0.argsCreator( +// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr> +// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr) -> !cc.ptr +// 
CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr<i32>
+// CHECK: %[[VAL_5:.*]] = cc.alloca i64
+// CHECK: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
+// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr<i8>
+// CHECK: %[[VAL_8:.*]] = cc.alloca !cc.ptr<i8>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_1]] : !cc.ptr<!cc.ptr<i8>>
+// CHECK: return %[[VAL_6]] : i64
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @test_0.kernelRegFunc() {
+// CHECK: %[[VAL_0:.*]] = func.constant @test_0.argsCreator : (!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.addressof @test_0.kernelName : !llvm.ptr<array<7 x i8>>
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!llvm.ptr<array<7 x i8>>) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_2]]) : (!cc.ptr<i8>) -> ()
+// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_0]] : ((!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_2]], %[[VAL_3]]) : (!cc.ptr<i8>, !cc.ptr<i8>) -> ()
+// CHECK: llvm.return
+// CHECK: }
+// CHECK: llvm.mlir.global_ctors {ctors = [@test_0.kernelRegFunc], priorities = [17 : i32]}
+// CHECK: llvm.mlir.global external constant @test_1.kernelName("test_1\00") {addr_space = 0 : i32}
+
+// CHECK-LABEL: func.func @test_1.returnOffset() -> i64 {
+// CHECK: %[[VAL_0:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> [1] : i64
+// CHECK: return %[[VAL_0]] : i64
+// CHECK: }
+
 // CHECK-LABEL: func.func @test_1.thunk(
-// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr<i8>, %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr<i8>, i64}> {
+// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr<i8>,
+// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr<i8>, i64}> {
// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<i8>) -> !cc.ptr<!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>>
-// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr<!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>>
-// CHECK: %[[VAL_4:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
-// CHECK: %[[VAL_5:.*]] = cc.extract_value %[[VAL_3]][0] : (!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>) -> i32
+// CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
+// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr<i32>
// CHECK: %[[VAL_6:.*]] = call @__nvqpp__mlirgen__test_1(%[[VAL_5]]) : (i32) -> !cc.stdvec<i1>
// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr<!cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>>) -> !cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>
// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>) -> !cc.ptr<!cc.ptr<i1>>
@@ -195,9 +333,37 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, i64}>>) -> !cc.ptr, i64}>>
// CHECK: %[[VAL_10:.*]] = cc.offsetof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> [1] : i64
-// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr<i8>, i64, !cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>
+// CHECK: %[[VAL_11:.*]] = call @__nvqpp_createDynamicResult(%[[VAL_0]], %[[VAL_3]], %[[VAL_9]], %[[VAL_10]]) : (!cc.ptr<i8>, i64, !cc.ptr<!cc.struct<{!cc.ptr<i1>, i64}>>, i64) -> !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: return %[[VAL_11]] : !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: ^bb2:
// CHECK: %[[VAL_12:.*]] = call @__nvqpp_zeroDynamicResult() : () -> !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: return %[[VAL_12]] : !cc.struct<{!cc.ptr<i8>, i64}>
// CHECK: }
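Both thunks above follow the same convention for dynamically sized return values: on the path where the i1 flag requests a heap result, the thunk passes the message buffer, its `cc.sizeof`, the embedded `{i1*, i64}` span, and that span's `cc.offsetof` to `__nvqpp_createDynamicResult`; the other path returns `__nvqpp_zeroDynamicResult`. For orientation, a minimal C++ model of what such a helper has to do, inferred from the malloc/memcpy/pointer-patch sequence pinned down at the LLVM level later in this diff; the signature and LP64 layout here are assumptions, not the runtime's actual declaration:

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Sketch of the {ptr, size} pair the thunk returns. Hypothetical shape;
// the real helper lives in the CUDA-Q runtime library.
struct DynamicResult {
  void *data;
  std::int64_t size;
};

// msg:     fixed-size message buffer (24 bytes here: i32 arg + {i1*, i64}).
// msgSize: cc.sizeof of the message struct (%VAL_3 in the thunk).
// span:    pointer to the {i1*, i64} span inside msg (%VAL_9).
// spanOff: cc.offsetof of that span within the struct (%VAL_10).
static DynamicResult createDynamicResult(char *msg, std::int64_t msgSize,
                                         char **span, std::int64_t spanOff) {
  char *vecData = span[0]; // the vector's data pointer
  std::int64_t vecSize;    // the vector's length in bytes
  std::memcpy(&vecSize, span + 1, sizeof(vecSize)); // LP64 layout assumed
  // Allocate one buffer holding the message struct plus the vector bytes.
  auto *buf = static_cast<char *>(std::malloc(msgSize + vecSize));
  std::memcpy(buf, msg, msgSize);
  std::memcpy(buf + msgSize, vecData, vecSize);
  // Patch the copied span so its data pointer targets the appended bytes.
  char *appended = buf + msgSize;
  std::memcpy(buf + spanOff, &appended, sizeof(appended));
  return {buf, msgSize + vecSize};
}
```

The `add i64 %n, 24` / double-memcpy / store-at-offset-8 sequence in the test_0.thunk LLVM checks further down is exactly this shape, with `msgSize` = 24 and `spanOff` = 8 for the `{i32, {i1*, i64}}` message struct.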
+
+// CHECK-LABEL: func.func @test_1.argsCreator(
+// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr<!cc.ptr<i8>>,
+// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr<!cc.ptr<i8>>) -> i64 {
+// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr<!cc.ptr<i8>>
+// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr<i32>
+// CHECK: %[[VAL_5:.*]] = cc.alloca i64
+// CHECK: %[[VAL_6:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}> : i64
+// CHECK: %[[VAL_7:.*]] = call @malloc(%[[VAL_6]]) : (i64) -> !cc.ptr<i8>
+// CHECK: %[[VAL_8:.*]] = cc.alloca !cc.ptr<i8>
+// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_7]] : (!cc.ptr<i8>) -> !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr<i32>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_1]] : !cc.ptr<!cc.ptr<i8>>
+// CHECK: return %[[VAL_6]] : i64
+// CHECK: }
+
+// CHECK-LABEL: llvm.func @test_1.kernelRegFunc() {
+// CHECK: %[[VAL_0:.*]] = func.constant @test_1.argsCreator : (!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.addressof @test_1.kernelName : !llvm.ptr<array<7 x i8>>
+// CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!llvm.ptr<array<7 x i8>>) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterKernelName(%[[VAL_2]]) : (!cc.ptr<i8>) -> ()
+// CHECK: %[[VAL_3:.*]] = cc.func_ptr %[[VAL_0]] : ((!cc.ptr<!cc.ptr<i8>>, !cc.ptr<!cc.ptr<i8>>) -> i64) -> !cc.ptr<i8>
+// CHECK: func.call @cudaqRegisterArgsCreator(%[[VAL_2]], %[[VAL_3]]) : (!cc.ptr<i8>, !cc.ptr<i8>) -> ()
+// CHECK: llvm.return
+// CHECK: }
+// CHECK: llvm.mlir.global_ctors {ctors = [@test_1.kernelRegFunc], priorities = [17 : i32]}
+
diff --git a/test/Translate/argument.qke b/test/Translate/argument.qke
index 6a3532805a..865a622a55 100644
--- a/test/Translate/argument.qke
+++ b/test/Translate/argument.qke
@@ -6,7 +6,7 @@
 // the terms of the Apache License 2.0 which accompanies this distribution.
 //
 // ========================================================================== //
-// RUN: cudaq-opt --kernel-execution=codegen=1 --canonicalize %s | \
+// RUN: cudaq-opt -kernel-execution -canonicalize %s | \
 // RUN: cudaq-translate --convert-to=qir | FileCheck %s

 // NB: the mangled name map is required for the kernel-execution pass.
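Before the QIR-level checks below: the `.argsCreator` / `.kernelRegFunc` pairs verified above are the standard static-registration idiom, with the constructor placed in `llvm.mlir.global_ctors` at priority 17. A hedged C++ equivalent of what the generated constructor does at library load time; the extern signatures are assumptions for illustration, since the generated code erases both arguments to `i8*`:

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Assumed signatures; only the names come from the checks above.
extern "C" void cudaqRegisterKernelName(const char *kernelName);
extern "C" void cudaqRegisterArgsCreator(const char *kernelName,
                                         void *argsCreator);

// Mirrors @test_1.argsCreator: pull the i32 argument out of the argument
// array, copy it into a malloc'ed message buffer sized by cc.sizeof
// (24 bytes for !cc.struct<{i32, !cc.struct<{!cc.ptr<i1>, i64}>}>), and
// report that size back to the caller.
static std::int64_t test_1_argsCreator(void **args, void **msg) {
  void *buf = std::malloc(24);
  std::memcpy(buf, args[0], sizeof(std::int32_t));
  *msg = buf;
  return 24;
}

// Mirrors @test_1.kernelRegFunc, which runs before main() exactly like a
// C++ static initializer (the MLIR pins ctor priority 17).
__attribute__((constructor)) static void test_1_kernelRegFunc() {
  cudaqRegisterKernelName("test_1");
  cudaqRegisterArgsCreator(
      "test_1", reinterpret_cast<void *>(&test_1_argsCreator));
}
```

Once registered, the runtime can look up a kernel by name and use its argsCreator to marshal caller-side arguments into the message buffer the corresponding thunk expects.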
@@ -31,7 +31,7 @@ func.func @test_0(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, !cc.std func.func @test_3(%0: !cc.ptr, %1: !cc.ptr, !cc.ptr, !cc.ptr}>, !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}>>) { return } -} // CHECK-LABEL: define void @__nvqpp__mlirgen__test_3({ { i16*, i64 }, { float*, i64 } } -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { // CHECK: %[[VAL_1:.*]] = extractvalue { { i16*, i64 }, { float*, i64 } } %[[VAL_0]], 0 // CHECK: %[[VAL_2:.*]] = extractvalue { i16*, i64 } %[[VAL_1]], 0 // CHECK: %[[VAL_3:.*]] = extractvalue { i16*, i64 } %[[VAL_1]], 1 @@ -202,7 +246,7 @@ func.func @test_3(%0: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr, %1: !cc.ptr !cc.stdvec { func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr}>> {llvm.sret = !cc.struct<{!cc.ptr, !cc.ptr, !cc.ptr}>}, %this: !cc.ptr, %2: i32) { return } - -// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0( -// CHECK-SAME: i32 %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = sext i32 %[[VAL_1]] to i64 -// CHECK: %[[VAL_3:.*]] = tail call %[[VAL_4:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_2]]) -// CHECK: %[[VAL_5:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_4]]* %[[VAL_3]]) -// CHECK: %[[VAL_6:.*]] = icmp sgt i64 %[[VAL_5]], 0 -// CHECK: br i1 %[[VAL_6]], label %[[VAL_7:.*]], label %[[VAL_8:.*]] -// CHECK: ._crit_edge.thread: ; preds = %[[VAL_9:.*]] -// CHECK: %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_5]], align 1 -// CHECK: br label %[[VAL_11:.*]] -// CHECK: .lr.ph: ; preds = %[[VAL_9]], %[[VAL_7]] -// CHECK: %[[VAL_12:.*]] = phi i64 [ %[[VAL_13:.*]], %[[VAL_7]] ], [ 0, %[[VAL_9]] ] -// CHECK: %[[VAL_14:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_4]]* %[[VAL_3]], i64 %[[VAL_12]]) -// CHECK: %[[VAL_15:.*]] = bitcast i8* %[[VAL_14]] to %[[VAL_16:.*]]** -// CHECK: %[[VAL_17:.*]] = load %[[VAL_16]]*, %[[VAL_16]]** %[[VAL_15]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_16]]* %[[VAL_17]]) -// CHECK: %[[VAL_13]] = add nuw nsw i64 %[[VAL_12]], 1 -// CHECK: %[[VAL_18:.*]] = icmp eq i64 %[[VAL_13]], %[[VAL_5]] -// CHECK: br i1 %[[VAL_18]], label %[[VAL_19:.*]], label %[[VAL_7]] -// CHECK: ._crit_edge: ; preds = %[[VAL_7]] -// CHECK: %[[VAL_20:.*]] = alloca i8, i64 %[[VAL_5]], align 1 -// CHECK: br i1 %[[VAL_6]], label %[[VAL_21:.*]], label %[[VAL_11]] -// CHECK: .lr.ph4: ; preds = %[[VAL_19]], %[[VAL_21]] -// CHECK: %[[VAL_22:.*]] = phi i64 [ %[[VAL_23:.*]], %[[VAL_21]] ], [ 0, %[[VAL_19]] ] -// CHECK: %[[VAL_24:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_4]]* %[[VAL_3]], i64 %[[VAL_22]]) -// CHECK: %[[VAL_25:.*]] = bitcast i8* %[[VAL_24]] to %[[VAL_16]]** -// CHECK: %[[VAL_26:.*]] = load %[[VAL_16]]*, %[[VAL_16]]** %[[VAL_25]], align 8 -// CHECK: %[[VAL_27:.*]] = tail call %[[VAL_28:.*]]* @__quantum__qis__mz(%[[VAL_16]]* %[[VAL_26]]) -// CHECK: %[[VAL_29:.*]] = bitcast %[[VAL_28]]* %[[VAL_27]] to i1* -// CHECK: %[[VAL_30:.*]] = load i1, i1* %[[VAL_29]], align 1 -// CHECK: %[[VAL_31:.*]] = getelementptr i8, i8* %[[VAL_20]], i64 %[[VAL_22]] -// CHECK: %[[VAL_32:.*]] = zext i1 %[[VAL_30]] to i8 -// CHECK: store i8 %[[VAL_32]], i8* %[[VAL_31]], align 1 -// CHECK: %[[VAL_23]] = add nuw nsw i64 %[[VAL_22]], 1 -// CHECK: %[[VAL_33:.*]] = icmp eq i64 %[[VAL_23]], %[[VAL_5]] -// CHECK: br i1 %[[VAL_33]], label %[[VAL_11]], label %[[VAL_21]] -// CHECK: ._crit_edge5: ; preds = 
%[[VAL_21]], %[[VAL_8]], %[[VAL_19]] -// CHECK: %[[VAL_34:.*]] = phi i8* [ %[[VAL_10]], %[[VAL_8]] ], [ %[[VAL_20]], %[[VAL_19]] ], [ %[[VAL_20]], %[[VAL_21]] ] -// CHECK: %[[VAL_35:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_34]], i64 %[[VAL_5]], i64 1) -// CHECK: %[[VAL_36:.*]] = bitcast i8* %[[VAL_35]] to i1* -// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_36]], 0 -// CHECK: %[[VAL_38:.*]] = insertvalue { i1*, i64 } %[[VAL_37]], i64 %[[VAL_5]], 1 -// CHECK: call void @__quantum__rt__qubit_release_array(%Array* %[[VAL_3]]) -// CHECK: ret { i1*, i64 } %[[VAL_38]] +// CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0(i32 +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = sext i32 %[[VAL_0]] to i64 +// CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_1]]) +// CHECK: %[[VAL_4:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_3]]* %[[VAL_2]]) +// CHECK: %[[VAL_5:.*]] = icmp sgt i64 %[[VAL_4]], 0 +// CHECK: br i1 %[[VAL_5]], label %[[VAL_6:.*]], label %[[VAL_7:.*]] +// CHECK: ._crit_edge.thread: ; preds = %[[VAL_8:.*]] +// CHECK: %[[VAL_9:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br label %[[VAL_10:.*]] +// CHECK: .lr.ph: ; preds = %[[VAL_8]], %[[VAL_6]] +// CHECK: %[[VAL_11:.*]] = phi i64 [ %[[VAL_12:.*]], %[[VAL_6]] ], [ 0, %[[VAL_8]] ] +// CHECK: %[[VAL_13:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_11]]) +// CHECK: %[[VAL_14:.*]] = bitcast i8* %[[VAL_13]] to %[[VAL_15:.*]]** +// CHECK: %[[VAL_16:.*]] = load %[[VAL_15]]*, %[[VAL_15]]** %[[VAL_14]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_15]]* %[[VAL_16]]) +// CHECK: %[[VAL_12]] = add nuw nsw i64 %[[VAL_11]], 1 +// CHECK: %[[VAL_17:.*]] = icmp eq i64 %[[VAL_12]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_17]], label %[[VAL_18:.*]], label %[[VAL_6]] +// CHECK: ._crit_edge: ; preds = %[[VAL_6]] +// CHECK: %[[VAL_19:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br i1 %[[VAL_5]], label %[[VAL_20:.*]], label %[[VAL_10]] +// CHECK: .lr.ph4: ; preds = %[[VAL_18]], %[[VAL_20]] +// CHECK: %[[VAL_21:.*]] = phi i64 [ %[[VAL_22:.*]], %[[VAL_20]] ], [ 0, %[[VAL_18]] ] +// CHECK: %[[VAL_23:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_21]]) +// CHECK: %[[VAL_24:.*]] = bitcast i8* %[[VAL_23]] to %[[VAL_15]]** +// CHECK: %[[VAL_25:.*]] = load %[[VAL_15]]*, %[[VAL_15]]** %[[VAL_24]], align 8 +// CHECK: %[[VAL_26:.*]] = tail call %[[VAL_27:.*]]* @__quantum__qis__mz(%[[VAL_15]]* %[[VAL_25]]) +// CHECK: %[[VAL_28:.*]] = bitcast %[[VAL_27]]* %[[VAL_26]] to i1* +// CHECK: %[[VAL_29:.*]] = load i1, i1* %[[VAL_28]], align 1 +// CHECK: %[[VAL_30:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 %[[VAL_21]] +// CHECK: %[[VAL_31:.*]] = zext i1 %[[VAL_29]] to i8 +// CHECK: store i8 %[[VAL_31]], i8* %[[VAL_30]], align 1 +// CHECK: %[[VAL_22]] = add nuw nsw i64 %[[VAL_21]], 1 +// CHECK: %[[VAL_32:.*]] = icmp eq i64 %[[VAL_22]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_32]], label %[[VAL_10]], label %[[VAL_20]] +// CHECK: ._crit_edge5: ; preds = %[[VAL_20]], %[[VAL_7]], %[[VAL_18]] +// CHECK: %[[VAL_33:.*]] = phi i8* [ %[[VAL_9]], %[[VAL_7]] ], [ %[[VAL_19]], %[[VAL_18]] ], [ %[[VAL_19]], %[[VAL_20]] ] +// CHECK: %[[VAL_34:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_33]], i64 %[[VAL_4]], i64 1) +// CHECK: %[[VAL_35:.*]] = bitcast i8* %[[VAL_34]] to i1* +// CHECK: %[[VAL_36:.*]] = insertvalue { i1*, i64 
} undef, i1* %[[VAL_35]], 0 +// CHECK: %[[VAL_37:.*]] = insertvalue { i1*, i64 } %[[VAL_36]], i64 %[[VAL_4]], 1 +// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_3]]* %[[VAL_2]]) +// CHECK: ret { i1*, i64 } %[[VAL_37]] // CHECK: } // CHECK-LABEL: define void @test_0({ i8*, i8*, i8* }* sret({ i8*, i8*, i8* }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]], i32 %[[VAL_2:.*]]) {{.*}}{ -// CHECK: %[[VAL_3:.*]] = alloca { i32, { i1*, i64 } }, align 8 +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]], i32 +// CHECK-SAME: %[[VAL_2:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_3:.*]] = alloca { i32, { i1*, i64 } }, align 4 // CHECK: %[[VAL_4:.*]] = bitcast { i32, { i1*, i64 } }* %[[VAL_3]] to i8* // CHECK: %[[VAL_5:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 0 -// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 8 -// CHECK: %[[VAL_6:.*]] = call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8) -// CHECK: %[[VAL_7:.*]] = extractvalue { i8*, i64 } %[[VAL_6]], 0 -// CHECK: %[[VAL_8:.*]] = icmp eq i8* %[[VAL_7]], null -// CHECK: %[[VAL_9:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 8 -// CHECK: %[[VAL_10:.*]] = bitcast i8* %[[VAL_9]] to { i1*, i64 }* -// CHECK: %[[VAL_11:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 -// CHECK: %[[VAL_12:.*]] = select i1 %[[VAL_8]], { i1*, i64 }* %[[VAL_11]], { i1*, i64 }* %[[VAL_10]] -// CHECK: %[[VAL_13:.*]] = bitcast { i1*, i64 }* %[[VAL_12]] to i8** -// CHECK: %[[VAL_14:.*]] = load i8*, i8** %[[VAL_13]], align 8 -// CHECK: %[[VAL_15:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 -// CHECK: %[[VAL_16:.*]] = getelementptr i8, i8* %[[VAL_7]], i64 16 -// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to i64* -// CHECK: %[[VAL_18:.*]] = select i1 %[[VAL_8]], i64* %[[VAL_15]], i64* %[[VAL_17]] -// CHECK: %[[VAL_19:.*]] = load i64, i64* %[[VAL_18]], align 4 -// CHECK: %[[VAL_20:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* -// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_20]], i8* %[[VAL_14]], i64 %[[VAL_19]]) -// CHECK: call void @free(i8* %[[VAL_7]]) +// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_5]], align 4 +// CHECK: %[[VAL_6:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_7:.*]] = alloca [1 x i8*], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %[[VAL_7]], i64 0, i64 0 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_8]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = ptrtoint [1 x i8*]* %[[VAL_7]] to i64 +// CHECK: %[[VAL_11:.*]] = add i64 %[[VAL_10]], 8 +// CHECK: %[[VAL_12:.*]] = inttoptr i64 %[[VAL_11]] to i8** +// CHECK: %[[VAL_13:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_12]], i8*** %[[VAL_13]], align 8 +// CHECK: %[[VAL_14:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_6]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_12]], i8*** %[[VAL_14]], align 8 +// CHECK: %[[VAL_15:.*]] = alloca i32, align 4 +// CHECK: store i32 %[[VAL_2]], i32* %[[VAL_15]], align 
4 +// CHECK: %[[VAL_16:.*]] = bitcast [1 x i8*]* %[[VAL_7]] to i32** +// CHECK: store i32* %[[VAL_15]], i32** %[[VAL_16]], align 8 +// CHECK: %[[VAL_17:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_6]] to i8* +// CHECK: %[[VAL_18:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_0.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_0.thunk to i8*), i8* nonnull %[[VAL_4]], i64 24, i64 8, i8* nonnull %[[VAL_17]]) +// CHECK: %[[VAL_19:.*]] = extractvalue { i8*, i64 } %[[VAL_18]], 0 +// CHECK: %[[VAL_20:.*]] = icmp eq i8* %[[VAL_19]], null +// CHECK: %[[VAL_21:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 8 +// CHECK: %[[VAL_22:.*]] = bitcast i8* %[[VAL_21]] to { i1*, i64 }* +// CHECK: %[[VAL_23:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1 +// CHECK: %[[VAL_24:.*]] = select i1 %[[VAL_20]], { i1*, i64 }* %[[VAL_23]], { i1*, i64 }* %[[VAL_22]] +// CHECK: %[[VAL_25:.*]] = bitcast { i1*, i64 }* %[[VAL_24]] to i8** +// CHECK: %[[VAL_26:.*]] = load i8*, i8** %[[VAL_25]], align 8 +// CHECK: %[[VAL_27:.*]] = getelementptr inbounds { i32, { i1*, i64 } }, { i32, { i1*, i64 } }* %[[VAL_3]], i64 0, i32 1, i32 1 +// CHECK: %[[VAL_28:.*]] = getelementptr i8, i8* %[[VAL_19]], i64 16 +// CHECK: %[[VAL_29:.*]] = bitcast i8* %[[VAL_28]] to i64* +// CHECK: %[[VAL_30:.*]] = select i1 %[[VAL_20]], i64* %[[VAL_27]], i64* %[[VAL_29]] +// CHECK: %[[VAL_31:.*]] = load i64, i64* %[[VAL_30]], align 4 +// CHECK: %[[VAL_32:.*]] = bitcast { i8*, i8*, i8* }* %[[VAL_0]] to i8* +// CHECK: call void @__nvqpp_initializer_list_to_vector_bool(i8* %[[VAL_32]], i8* %[[VAL_26]], i64 %[[VAL_31]]) +// CHECK: call void @free(i8* %[[VAL_19]]) // CHECK: ret void // CHECK: } @@ -152,35 +171,45 @@ func.func @test_1(%this: !cc.ptr) -> i16 { return %0 : i16 } -// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() -// CHECK: %[[VAL_1:.*]] = tail call %[[VAL_2:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) -// CHECK: %[[VAL_3:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast i8* %[[VAL_3]] to %[[VAL_5:.*]]** -// CHECK: %[[VAL_6:.*]] = load %[[VAL_5]]*, %[[VAL_5]]** %[[VAL_4]], align 8 -// CHECK: %[[VAL_7:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 1) -// CHECK: %[[VAL_8:.*]] = bitcast i8* %[[VAL_7]] to %[[VAL_5]]** -// CHECK: %[[VAL_9:.*]] = load %[[VAL_5]]*, %[[VAL_5]]** %[[VAL_8]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: tail call void (i64, void (%[[VAL_2]]*, %[[VAL_5]]*)*, ...) 
@invokeWithControlQubits(i64 1, void (%[[VAL_2]]*, %[[VAL_5]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_5]]* %[[VAL_6]], %[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_10:.*]] = tail call %[[VAL_11:.*]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_6]]) -// CHECK: %[[VAL_12:.*]] = bitcast %Result* %[[VAL_10]] to i1* -// CHECK: %[[VAL_13:.*]] = load i1, i1* %[[VAL_12]], align 1 -// CHECK: %[[VAL_14:.*]] = tail call %[[VAL_11]]* @__quantum__qis__mz(%[[VAL_5]]* %[[VAL_9]]) -// CHECK: %[[VAL_15:.*]] = bitcast %Result* %[[VAL_14]] to i1* -// CHECK: %[[VAL_16:.*]] = load i1, i1* %[[VAL_15]], align 1 -// CHECK: %[[VAL_20:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_13]], 0 -// CHECK: %[[VAL_19:.*]] = insertvalue { i1, i1 } %[[VAL_20]], i1 %[[VAL_16]], 1 -// CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_2]]* %[[VAL_1]]) -// CHECK: ret { i1, i1 } %[[VAL_19]] -// CHECK: } - -// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK-NEXT: %[[VAL_2:.*]] = alloca i16, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast i16* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_3]], i64 2, i64 0) -// CHECK: %[[VAL_4:.*]] = load i16, i16* %[[VAL_2]], align 8 -// CHECK: ret i16 %[[VAL_4]] +// CHECK-LABEL: define { i1, i1 } @__nvqpp__mlirgen__test_1() local_unnamed_addr { +// CHECK: %[[VAL_0:.*]] = tail call %[[VAL_1:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) +// CHECK: %[[VAL_2:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 0) +// CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to %[[VAL_4:.*]]** +// CHECK: %[[VAL_5:.*]] = load %[[VAL_4]]*, %[[VAL_4]]** %[[VAL_3]], align 8 +// CHECK: %[[VAL_6:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 1) +// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to %[[VAL_4]]** +// CHECK: %[[VAL_8:.*]] = load %[[VAL_4]]*, %[[VAL_4]]** %[[VAL_7]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_4]]* %[[VAL_5]]) +// CHECK: tail call void (i64, void (%[[VAL_1]]*, %[[VAL_4]]*)*, ...) 
@invokeWithControlQubits(i64 1, void (%[[VAL_1]]*, %[[VAL_4]]*)* nonnull @__quantum__qis__x__ctl, %[[VAL_4]]* %[[VAL_5]], %[[VAL_4]]* %[[VAL_8]]) +// CHECK: %[[VAL_9:.*]] = tail call %[[VAL_10:.*]]* @__quantum__qis__mz(%[[VAL_4]]* %[[VAL_5]]) +// CHECK: %[[VAL_11:.*]] = bitcast %[[VAL_10]]* %[[VAL_9]] to i1* +// CHECK: %[[VAL_12:.*]] = load i1, i1* %[[VAL_11]], align 1 +// CHECK: %[[VAL_13:.*]] = tail call %[[VAL_10]]* @__quantum__qis__mz(%[[VAL_4]]* %[[VAL_8]]) +// CHECK: %[[VAL_14:.*]] = bitcast %[[VAL_10]]* %[[VAL_13]] to i1* +// CHECK: %[[VAL_15:.*]] = load i1, i1* %[[VAL_14]], align 1 +// CHECK: %[[VAL_16:.*]] = insertvalue { i1, i1 } undef, i1 %[[VAL_12]], 0 +// CHECK: %[[VAL_17:.*]] = insertvalue { i1, i1 } %[[VAL_16]], i1 %[[VAL_15]], 1 +// CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_1]]* %[[VAL_0]]) +// CHECK: ret { i1, i1 } %[[VAL_17]] +// CHECK: } + +// CHECK-LABEL: define i16 @test_1(i8* nocapture readnone +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_2:.*]] = alloca i16 +// CHECK: %[[VAL_3:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_4:.*]] = bitcast i16* %[[VAL_2]] to i8* +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_1]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_6]], align 8 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_3]] to i8* +// CHECK: %[[VAL_10:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_1.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_1.thunk to i8*), i8* nonnull %[[VAL_4]], i64 2, i64 0, i8* nonnull %[[VAL_9]]) +// CHECK: %[[VAL_11:.*]] = load i16, i16* %[[VAL_2]] +// CHECK: ret i16 %[[VAL_11]] // CHECK: } // struct{i16, f32, f64, i64} -> sret ptr @@ -201,20 +230,32 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc return } -// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() +// CHECK-LABEL: define { i16, float, double, i64 } @__nvqpp__mlirgen__test_2() local_unnamed_addr {{.*}} { // CHECK: ret { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 } // CHECK: } // CHECK-LABEL: define void @test_2({ i16, float, double, i64 }* nocapture writeonly sret({ i16, float, double, i64 }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca { { i16, float, double, i64 } }, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast { { i16, float, double, i64 } }* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_3]], i64 24, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_4]], i8* noundef nonnull 
align 8 dereferenceable(24) %[[VAL_3]], i64 24, i1 false) +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_2:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_3:.*]] = alloca [24 x i8], align 1 +// CHECK: %[[VAL_4:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [24 x i8], [24 x i8]* %[[VAL_3]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_4]] to i8* +// CHECK: %[[VAL_11:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_2.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_2.thunk to i8*), i8* nonnull %[[VAL_5]], i64 24, i64 0, i8* nonnull %[[VAL_10]]) +// CHECK: %[[VAL_12:.*]] = bitcast { i16, float, double, i64 }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(24) %[[VAL_12]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_5]], i64 24, i1 false) // CHECK: ret void // CHECK: } + // array -> sret ptr func.func @__nvqpp__mlirgen__test_3() -> !cc.array { %rv = cc.undef !cc.array @@ -235,17 +276,28 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array> {llvm.sret = !cc.struct return } -// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() {{.*}}{ +// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_4() local_unnamed_addr {{.*}} { // CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 } // CHECK: } // CHECK-LABEL: define void @test_4({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone %[[VAL_1:.*]]) {{.*}}{ -// CHECK: %[[VAL_2:.*]] = alloca { i64, double }, align 8 -// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_2]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_3]], i64 16, i64 0) -// CHECK: %[[VAL_4:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_4]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i64 16, i1 false) +// CHECK-SAME: %[[VAL_0:.*]], i8* nocapture readnone +// CHECK-SAME: %[[VAL_1:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_2:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_3:.*]] = alloca [16 x i8], align 1 +// CHECK: %[[VAL_4:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[VAL_3]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, 
i8** }* %[[VAL_4]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_4]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_6]], i8*** %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_4]] to i8* +// CHECK: %[[VAL_11:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_4.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_4.thunk to i8*), i8* nonnull %[[VAL_5]], i64 16, i64 0, i8* nonnull %[[VAL_10]]) +// CHECK: %[[VAL_12:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_12]], i8* noundef nonnull align 1 dereferenceable(16) %[[VAL_5]], i64 16, i1 false) // CHECK: ret void // CHECK: } @@ -284,102 +347,114 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct return } -// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() {{.*}}{ +// CHECK-LABEL: define { i64, double } @__nvqpp__mlirgen__test_5() local_unnamed_addr {{.*}} { // CHECK: ret { i64, double } { i64 537892, double 0x40578DA858793DD9 } // CHECK: } // CHECK-LABEL: define void @test_5({ i64, double }* nocapture writeonly sret({ i64, double }) -// CHECK-SAME: %[[VAL_0:.*]]) {{.*}}{ -// CHECK: %[[VAL_1:.*]] = alloca { i64, double }, align 8 -// CHECK: %[[VAL_2:.*]] = bitcast { i64, double }* %[[VAL_1]] to i8* -// CHECK: call { i8*, i64 } @altLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_2]], i64 16, i64 0) -// CHECK: %[[VAL_3:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_3]], i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_2]], i64 16, i1 false) +// CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { +// CHECK: %[[VAL_1:.*]] = alloca [0 x i8*], align 8 +// CHECK: %[[VAL_2:.*]] = alloca [16 x i8], align 1 +// CHECK: %[[VAL_3:.*]] = alloca { i8**, i8**, i8** }, align 8 +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[VAL_2]], i64 0, i64 0 +// CHECK: %[[VAL_5:.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* %[[VAL_1]], i64 0, i64 0 +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 0 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_6]], align 8 +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 1 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds { i8**, i8**, i8** }, { i8**, i8**, i8** }* %[[VAL_3]], i64 0, i32 2 +// CHECK: store i8** %[[VAL_5]], i8*** %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = bitcast { i8**, i8**, i8** }* %[[VAL_3]] to i8* +// CHECK: %[[VAL_10:.*]] = call { i8*, i64 } @hybridLaunchKernel(i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @test_5.kernelName, i64 0, i64 0), i8* nonnull bitcast ({ i8*, i64 } (i8*, i1)* @test_5.thunk to i8*), i8* nonnull %[[VAL_4]], i64 16, i64 0, i8* nonnull %[[VAL_9]]) +// CHECK: 
%[[VAL_11:.*]] = bitcast { i64, double }* %[[VAL_0]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 8 dereferenceable(16) %[[VAL_11]], i8* noundef nonnull align 1 dereferenceable(16) %[[VAL_4]], i64 16, i1 false) // CHECK: ret void // CHECK: } } - //===----------------------------------------------------------------------===// -// CHECK-LABEL: define i64 @test_0.returnOffset() +// CHECK-LABEL: define i64 @test_0.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 8 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_0.thunk(i8* nocapture -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i32* // CHECK: %[[VAL_3:.*]] = load i32, i32* %[[VAL_2]], align 4 -// CHECK: %[[VAL_5:.*]] = sext i32 %[[VAL_3]] to i64 -// CHECK: %[[VAL_6:.*]] = tail call %[[VAL_7:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_5]]) -// CHECK: %[[VAL_8:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_7]]* %[[VAL_6]]) -// CHECK: %[[VAL_9:.*]] = icmp sgt i64 %[[VAL_8]], 0 -// CHECK: br i1 %[[VAL_9]], label %[[VAL_10:.*]], label %[[VAL_11:.*]] -// CHECK: ._crit_edge.thread: ; preds = %[[VAL_12:.*]] -// CHECK: %[[VAL_13:.*]] = alloca i8, i64 %[[VAL_8]], align 1 -// CHECK: br label %[[VAL_14:.*]] -// CHECK: .lr.ph: ; preds = %[[VAL_12]], %[[VAL_10]] -// CHECK: %[[VAL_15:.*]] = phi i64 [ %[[VAL_16:.*]], %[[VAL_10]] ], [ 0, %[[VAL_12]] ] -// CHECK: %[[VAL_17:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_15]]) -// CHECK: %[[VAL_18:.*]] = bitcast i8* %[[VAL_17]] to %[[VAL_19:.*]]** -// CHECK: %[[VAL_20:.*]] = load %[[VAL_19]]*, %[[VAL_19]]** %[[VAL_18]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_19]]* %[[VAL_20]]) -// CHECK: %[[VAL_16]] = add nuw nsw i64 %[[VAL_15]], 1 -// CHECK: %[[VAL_21:.*]] = icmp eq i64 %[[VAL_16]], %[[VAL_8]] -// CHECK: br i1 %[[VAL_21]], label %[[VAL_22:.*]], label %[[VAL_10]] -// CHECK: ._crit_edge: ; preds = %[[VAL_10]] -// CHECK: %[[VAL_23:.*]] = alloca i8, i64 %[[VAL_8]], align 1 -// CHECK: br i1 %[[VAL_9]], label %[[VAL_24:.*]], label %[[VAL_14]] -// CHECK: [[VAL_24]]: ; preds = %[[VAL_22]], %[[VAL_24]] -// CHECK: %[[VAL_25:.*]] = phi i64 [ %[[VAL_26:.*]], %[[VAL_24]] ], [ 0, %[[VAL_22]] ] -// CHECK: %[[VAL_27:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_7]]* %[[VAL_6]], i64 %[[VAL_25]]) -// CHECK: %[[VAL_28:.*]] = bitcast i8* %[[VAL_27]] to %[[VAL_19]]** -// CHECK: %[[VAL_29:.*]] = load %[[VAL_19]]*, %[[VAL_19]]** %[[VAL_28]], align 8 -// CHECK: %[[VAL_30:.*]] = tail call %[[VAL_31:.*]]* @__quantum__qis__mz(%[[VAL_19]]* %[[VAL_29]]) -// CHECK: %[[VAL_32:.*]] = bitcast %[[VAL_31]]* %[[VAL_30]] to i1* -// CHECK: %[[VAL_33:.*]] = load i1, i1* %[[VAL_32]], align 1 -// CHECK: %[[VAL_34:.*]] = getelementptr i8, i8* %[[VAL_23]], i64 %[[VAL_25]] -// CHECK: %[[VAL_35:.*]] = zext i1 %[[VAL_33]] to i8 -// CHECK: store i8 %[[VAL_35]], i8* %[[VAL_34]], align 1 -// CHECK: %[[VAL_26]] = add nuw nsw i64 %[[VAL_25]], 1 -// CHECK: %[[VAL_36:.*]] = icmp eq i64 %[[VAL_26]], %[[VAL_8]] -// CHECK: br i1 %[[VAL_36]], label %[[VAL_14]], label %[[VAL_24]] -// CHECK: [[VAL_14]]: ; preds = %[[VAL_24]], %[[VAL_11]], %[[VAL_22]] -// CHECK: %[[VAL_37:.*]] = phi i8* [ %[[VAL_13]], %[[VAL_11]] ], [ %[[VAL_23]], %[[VAL_22]] ], [ %[[VAL_23]], %[[VAL_24]] ] -// CHECK: %[[VAL_38:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_37]], i64 %[[VAL_8]], i64 
1) -// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_7]]* %[[VAL_6]]) -// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 -// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8** -// CHECK: store i8* %[[VAL_38]], i8** %[[VAL_51]], align 8 -// CHECK: %[[VAL_52:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16 -// CHECK: %[[VAL_53:.*]] = bitcast i8* %[[VAL_52]] to i64* -// CHECK: store i64 %[[VAL_8]], i64* %[[VAL_53]], align 8 +// CHECK: %[[VAL_4:.*]] = sext i32 %[[VAL_3]] to i64 +// CHECK: %[[VAL_5:.*]] = tail call %[[VAL_6:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_4]]) +// CHECK: %[[VAL_7:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_6]]* %[[VAL_5]]) +// CHECK: %[[VAL_8:.*]] = icmp sgt i64 %[[VAL_7]], 0 +// CHECK: br i1 %[[VAL_8]], label %[[VAL_9:.*]], label %[[VAL_10:.*]] +// CHECK: ._crit_edge.thread: ; preds = %[[VAL_11:.*]] +// CHECK: %[[VAL_12:.*]] = alloca i8, i64 %[[VAL_7]], align 1 +// CHECK: br label %[[VAL_13:.*]] +// CHECK: .lr.ph: ; preds = %[[VAL_11]], %[[VAL_9]] +// CHECK: %[[VAL_14:.*]] = phi i64 [ %[[VAL_15:.*]], %[[VAL_9]] ], [ 0, %[[VAL_11]] ] +// CHECK: %[[VAL_16:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_6]]* %[[VAL_5]], i64 %[[VAL_14]]) +// CHECK: %[[VAL_17:.*]] = bitcast i8* %[[VAL_16]] to %[[VAL_18:.*]]** +// CHECK: %[[VAL_19:.*]] = load %[[VAL_18]]*, %[[VAL_18]]** %[[VAL_17]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_18]]* %[[VAL_19]]) +// CHECK: %[[VAL_15]] = add nuw nsw i64 %[[VAL_14]], 1 +// CHECK: %[[VAL_20:.*]] = icmp eq i64 %[[VAL_15]], %[[VAL_7]] +// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_9]] +// CHECK: ._crit_edge: ; preds = %[[VAL_9]] +// CHECK: %[[VAL_22:.*]] = alloca i8, i64 %[[VAL_7]], align 1 +// CHECK: br i1 %[[VAL_8]], label %[[VAL_23:.*]], label %[[VAL_13]] +// CHECK: .lr.ph6: ; preds = %[[VAL_21]], %[[VAL_23]] +// CHECK: %[[VAL_24:.*]] = phi i64 [ %[[VAL_25:.*]], %[[VAL_23]] ], [ 0, %[[VAL_21]] ] +// CHECK: %[[VAL_26:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_6]]* %[[VAL_5]], i64 %[[VAL_24]]) +// CHECK: %[[VAL_27:.*]] = bitcast i8* %[[VAL_26]] to %[[VAL_18]]** +// CHECK: %[[VAL_28:.*]] = load %[[VAL_18]]*, %[[VAL_18]]** %[[VAL_27]], align 8 +// CHECK: %[[VAL_29:.*]] = tail call %[[VAL_30:.*]]* @__quantum__qis__mz(%[[VAL_18]]* %[[VAL_28]]) +// CHECK: %[[VAL_31:.*]] = bitcast %[[VAL_30]]* %[[VAL_29]] to i1* +// CHECK: %[[VAL_32:.*]] = load i1, i1* %[[VAL_31]], align 1 +// CHECK: %[[VAL_33:.*]] = getelementptr i8, i8* %[[VAL_22]], i64 %[[VAL_24]] +// CHECK: %[[VAL_34:.*]] = zext i1 %[[VAL_32]] to i8 +// CHECK: store i8 %[[VAL_34]], i8* %[[VAL_33]], align 1 +// CHECK: %[[VAL_25]] = add nuw nsw i64 %[[VAL_24]], 1 +// CHECK: %[[VAL_35:.*]] = icmp eq i64 %[[VAL_25]], %[[VAL_7]] +// CHECK: br i1 %[[VAL_35]], label %[[VAL_13]], label %[[VAL_23]] +// CHECK: ._crit_edge7: ; preds = %[[VAL_23]], %[[VAL_10]], %[[VAL_21]] +// CHECK: %[[VAL_36:.*]] = phi i8* [ %[[VAL_12]], %[[VAL_10]] ], [ %[[VAL_22]], %[[VAL_21]] ], [ %[[VAL_22]], %[[VAL_23]] ] +// CHECK: %[[VAL_37:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_36]], i64 %[[VAL_7]], i64 1) +// CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_6]]* %[[VAL_5]]) +// CHECK: %[[VAL_38:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 +// CHECK: %[[VAL_39:.*]] = bitcast i8* %[[VAL_38]] to i8** +// CHECK: store i8* %[[VAL_37]], i8** %[[VAL_39]], align 8 +// CHECK: %[[VAL_40:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 16 +// CHECK: %[[VAL_41:.*]] = 
bitcast i8* %[[VAL_40]] to i64* +// CHECK: store i64 %[[VAL_7]], i64* %[[VAL_41]], align 8 // CHECK: br i1 %[[VAL_1]], label %[[VAL_42:.*]], label %[[VAL_43:.*]] -// CHECK: [[VAL_43]]: ; preds = %[[VAL_14]], %[[VAL_42]] -// CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_14]] ] +// CHECK: common.ret: ; preds = %[[VAL_13]], %[[VAL_42]] +// CHECK: %[[VAL_44:.*]] = phi { i8*, i64 } [ %[[VAL_45:.*]], %[[VAL_42]] ], [ zeroinitializer, %[[VAL_13]] ] // CHECK: ret { i8*, i64 } %[[VAL_44]] -// CHECK: [[VAL_42]]: ; preds = %[[VAL_14]] -// CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_8]], 24 +// CHECK: 31: ; preds = %[[VAL_13]] +// CHECK: %[[VAL_46:.*]] = add i64 %[[VAL_7]], 24 // CHECK: %[[VAL_47:.*]] = call i8* @malloc(i64 %[[VAL_46]]) // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_47]], i8* noundef nonnull align 1 dereferenceable(24) %[[VAL_0]], i64 24, i1 false) // CHECK: %[[VAL_48:.*]] = getelementptr i8, i8* %[[VAL_47]], i64 24 -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[VAL_48]], i8* align 1 %[[VAL_38]], i64 %[[VAL_8]], i1 false) +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[VAL_48]], i8* align 1 %[[VAL_37]], i64 %[[VAL_7]], i1 false) // CHECK: %[[VAL_49:.*]] = insertvalue { i8*, i64 } undef, i8* %[[VAL_47]], 0 // CHECK: %[[VAL_45]] = insertvalue { i8*, i64 } %[[VAL_49]], i64 %[[VAL_46]], 1 +// CHECK: %[[VAL_50:.*]] = getelementptr i8, i8* %[[VAL_47]], i64 8 +// CHECK: %[[VAL_51:.*]] = bitcast i8* %[[VAL_50]] to i8** +// CHECK: store i8* %[[VAL_48]], i8** %[[VAL_51]], align 8 // CHECK: br label %[[VAL_43]] // CHECK: } // CHECK-LABEL: define i64 @test_0.argsCreator(i8** nocapture readonly -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8** %[[VAL_0]] to i32** // CHECK: %[[VAL_3:.*]] = load i32*, i32** %[[VAL_2]], align 8 // CHECK: %[[VAL_4:.*]] = load i32, i32* %[[VAL_3]], align 4 -// CHECK: %[[VAL_5:.*]] = insertvalue { i32, { i1*, i64 } } undef, i32 %[[VAL_4]], 0 -// CHECK: %[[VAL_6:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) -// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to { i32, { i1*, i64 } }* -// CHECK: store { i32, { i1*, i64 } } %[[VAL_5]], { i32, { i1*, i64 } }* %[[VAL_7]], align 8 -// CHECK: store i8* %[[VAL_6]], i8** %[[VAL_1]], align 8 +// CHECK: %[[VAL_5:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) +// CHECK: %[[VAL_6:.*]] = bitcast i8* %[[VAL_5]] to i32* +// CHECK: store i32 %[[VAL_4]], i32* %[[VAL_6]], align 4 +// CHECK: store i8* %[[VAL_5]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 24 // CHECK: } @@ -389,13 +464,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_1.returnOffset() +// CHECK-LABEL: define i64 @test_1.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_1.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 -// CHECK-SAME: %[[VAL_1:.*]]) { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) { // CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* @__quantum__rt__qubit_allocate_array(i64 2) // CHECK: %[[VAL_4:.*]] = tail call i8* @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 0) // CHECK: %[[VAL_5:.*]] = bitcast i8* 
%[[VAL_4]] to %[[VAL_6:.*]]** @@ -421,8 +496,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_1.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(2) i8* @malloc(i64 2) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 2 @@ -434,21 +509,21 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_2.returnOffset() +// CHECK-LABEL: define i64 @test_2.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_2.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to { i16, float, double, i64 }* // CHECK: store { i16, float, double, i64 } { i16 8, float 0x40159999A0000000, double 3.783000e+01, i64 1479 }, { i16, float, double, i64 }* %[[VAL_2]], align 8 // CHECK: ret { i8*, i64 } zeroinitializer // CHECK: } // CHECK-LABEL: define i64 @test_2.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(24) i8* @malloc(i64 24) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 24 @@ -460,12 +535,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_3.returnOffset() +// CHECK-LABEL: define i64 @test_3.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_3.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 5, i64* %[[VAL_2]], align 4 // CHECK: %[[VAL_3:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 8 @@ -484,8 +560,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_3.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(40) i8* @malloc(i64 40) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 40 @@ -497,12 +573,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_4.returnOffset() +// CHECK-LABEL: define i64 @test_4.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_4.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 537892, i64* %[[VAL_2]], align 4 // 
CHECK: %[[VAL_3:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 @@ -512,8 +589,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_4.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(16) i8* @malloc(i64 16) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 16 @@ -525,12 +602,13 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -// CHECK-LABEL: define i64 @test_5.returnOffset() +// CHECK-LABEL: define i64 @test_5.returnOffset() local_unnamed_addr {{.*}} { // CHECK: ret i64 0 // CHECK: } // CHECK-LABEL: define { i8*, i64 } @test_5.thunk(i8* nocapture writeonly -// CHECK-SAME: %[[VAL_0:.*]], i1 %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i1 +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = bitcast i8* %[[VAL_0]] to i64* // CHECK: store i64 537892, i64* %[[VAL_2]], align 4 // CHECK: %[[VAL_3:.*]] = getelementptr i8, i8* %[[VAL_0]], i64 8 @@ -540,8 +618,8 @@ func.func @test_5(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: } // CHECK-LABEL: define i64 @test_5.argsCreator(i8** nocapture readnone -// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly -// CHECK-SAME: %[[VAL_1:.*]]) #{{[0-9]+}} { +// CHECK-SAME: %[[VAL_0:.*]], i8** nocapture writeonly +// CHECK-SAME: %[[VAL_1:.*]]) {{.*}} { // CHECK: %[[VAL_2:.*]] = tail call dereferenceable_or_null(16) i8* @malloc(i64 16) // CHECK: store i8* %[[VAL_2]], i8** %[[VAL_1]], align 8 // CHECK: ret i64 16
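Taken together, the tail of this test pins down the degenerate form of the protocol for kernels with no arguments and a trivially copyable result (test_2 through test_5): `argsCreator` merely mallocs a buffer the size of the result slot (24, 40, and 16 bytes above) and reports that size, the thunk writes the constant result into the buffer and returns a `zeroinitializer` pair, and `returnOffset` is 0. A sketch of how a launcher could consume that triple; the struct and launcher are invented for illustration, and only the sizes and offsets come from the checks:

```cpp
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustration only. The real thunk also takes an i1 flag and returns a
// {i8*, i64} pair, which is zeroinitializer for these fixed-size results,
// so both are omitted here.
struct GeneratedKernel {
  std::int64_t (*argsCreator)(void **args, void **msg); // mallocs, returns size
  void (*thunk)(void *msg);                             // writes result into msg
  std::int64_t (*returnOffset)();                       // 0 for test_2..test_5
};

// Mimics the host stub: obtain the message buffer from argsCreator, run the
// thunk, then copy the result out at returnOffset, like the final memcpy in
// the @test_5 wrapper above.
static void launchAndCopyResult(const GeneratedKernel &k, void *resultOut,
                                std::size_t resultBytes) {
  void *msg = nullptr;
  k.argsCreator(nullptr, &msg); // e.g. malloc(16) for a {i64, double} result
  k.thunk(msg);                 // e.g. stores the constant {i64, double} pair
  std::memcpy(resultOut, static_cast<char *>(msg) + k.returnOffset(),
              resultBytes);
  std::free(msg);
}
```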