feat[next]: Enable GTIR dace backend in feature tests #1705

Status: Merged (30 commits from dace-gtir-backend into main, Nov 5, 2024). The diff below shows the changes from 28 of the 30 commits.

Commits
e73cf43 updates to GTIR dace backend (edopao, Oct 23, 2024)
a3c02e4 fix pre-commit (edopao, Oct 23, 2024)
6c9b3c5 Merge remote-tracking branch 'origin/main' into dace-gtir-backend (edopao, Oct 24, 2024)
4ffb5f7 Merge remote-tracking branch 'origin/main' into dace-gtir-backend (edopao, Oct 25, 2024)
2ea0417 remove extra call to domain inference (edopao, Oct 25, 2024)
cc24240 fix pre-commit (edopao, Oct 25, 2024)
df636ab Review comments (edopao, Oct 25, 2024)
d71b67f Merge remote-tracking branch 'origin/main' into dace-gtir-backend (edopao, Oct 25, 2024)
58b97f1 Merge branch 'main' into dace-gtir-backend (edopao, Oct 29, 2024)
50cd2ce Merge remote-tracking branch 'origin/main' into dace-gtir-backend (edopao, Nov 4, 2024)
5b4d2bf enable tests (edopao, Nov 4, 2024)
1ad3621 make dace.gtir backend non-cached (edopao, Nov 4, 2024)
21997ad Merge remote-tracking branch 'origin/main' into dace-gtir-backend (edopao, Nov 4, 2024)
489869c fix pytest marker (edopao, Nov 4, 2024)
27e5b90 fix pytest marker (1) (edopao, Nov 4, 2024)
445b782 fix parsing of tuple program args (edopao, Nov 4, 2024)
83b7c25 cleanup gtir tests (edopao, Nov 4, 2024)
ef82b1f fix for zero-dimensional arrays on gpu (edopao, Nov 4, 2024)
b885e74 Merge remote-tracking branch 'origin/main' into dace-gtir-backend (edopao, Nov 4, 2024)
0f3fd38 fix pre-commit (edopao, Nov 4, 2024)
a91ba4f fix for zero-dimensional arrays on gpu (1) (edopao, Nov 4, 2024)
88e27a2 fix for zero-dimensional arrays on gpu (2) (edopao, Nov 4, 2024)
8036125 try removing simplify pass for gpu errors (edopao, Nov 4, 2024)
66d3be7 Run simplify before gpu transformations (edopao, Nov 4, 2024)
ef1df14 fix for array symbols (edopao, Nov 4, 2024)
02a9391 ADd pytest marker for failing gpu tests (edopao, Nov 4, 2024)
6fcd8da Revert "ADd pytest marker for failing gpu tests" (edopao, Nov 4, 2024)
9ffc665 Keep gpu tests disabled (edopao, Nov 4, 2024)
d4973f5 edit code comments (edopao, Nov 5, 2024)
1afb1cb remove extra call to gt_simplify (edopao, Nov 5, 2024)
40 changes: 28 additions & 12 deletions src/gt4py/next/program_processors/runners/dace.py
@@ -8,8 +8,8 @@

import factory

from gt4py.next import allocators as next_allocators, backend
from gt4py.next.ffront import foast_to_gtir, past_to_itir
from gt4py.next import backend
from gt4py.next.ffront import foast_to_gtir, foast_to_past, past_to_itir
from gt4py.next.program_processors.runners.dace_fieldview import workflow as dace_fieldview_workflow
from gt4py.next.program_processors.runners.dace_iterator import workflow as dace_iterator_workflow
from gt4py.next.program_processors.runners.gtfn import GTFNBackendFactory
@@ -25,12 +25,12 @@ class Params:
),
)
auto_optimize = factory.Trait(
otf_workflow__translation__auto_optimize=True, name_temps="_opt"
otf_workflow__translation__auto_optimize=True, name_postfix="_opt"
)
use_field_canonical_representation: bool = False

name = factory.LazyAttribute(
lambda o: f"run_dace_{o.name_device}{o.name_temps}{o.name_cached}{o.name_postfix}"
lambda o: f"run_dace_{o.name_device}{o.name_temps}{o.name_cached}{o.name_postfix}.itir"
)

transforms = backend.DEFAULT_TRANSFORMS
@@ -45,12 +45,28 @@ class Params:
itir_cpu = run_dace_cpu
itir_gpu = run_dace_gpu

gtir_cpu = backend.Backend(
name="dace.gtir.cpu",
executor=dace_fieldview_workflow.DaCeWorkflowFactory(),
allocator=next_allocators.StandardCPUFieldBufferAllocator(),
transforms=backend.Transforms(

class DaCeFieldviewBackendFactory(GTFNBackendFactory):
class Params:
otf_workflow = factory.SubFactory(
dace_fieldview_workflow.DaCeWorkflowFactory,
device_type=factory.SelfAttribute("..device_type"),
auto_optimize=factory.SelfAttribute("..auto_optimize"),
)
auto_optimize = factory.Trait(name_postfix="_opt")

name = factory.LazyAttribute(
lambda o: f"run_dace_{o.name_device}{o.name_temps}{o.name_cached}{o.name_postfix}.gtir"
)

transforms = backend.Transforms(
past_to_itir=past_to_itir.past_to_itir_factory(to_gtir=True),
foast_to_itir=foast_to_gtir.adapted_foast_to_gtir_factory(cached=True),
),
)
foast_to_itir=foast_to_gtir.adapted_foast_to_gtir_factory(),
field_view_op_to_prog=foast_to_past.operator_to_program_factory(
foast_to_itir_step=foast_to_gtir.adapted_foast_to_gtir_factory()
),
)


gtir_cpu = DaCeFieldviewBackendFactory(cached=True, auto_optimize=False)
gtir_gpu = DaCeFieldviewBackendFactory(gpu=True, cached=True, auto_optimize=False)
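For orientation, here is a minimal usage sketch of the two new backend objects (my example, not part of the PR; the toy field operator and the exact `Field` annotation style are assumptions that may vary across gt4py versions):

```python
import numpy as np

import gt4py.next as gtx
from gt4py.next.program_processors.runners.dace import gtir_cpu

IDim = gtx.Dimension("IDim")


@gtx.field_operator
def copy(a: gtx.Field[gtx.Dims[IDim], gtx.float64]) -> gtx.Field[gtx.Dims[IDim], gtx.float64]:
    return a


inp = gtx.as_field([IDim], np.arange(10.0))
out = gtx.as_field([IDim], np.zeros(10))

# Run through the GTIR-based DaCe CPU backend instead of embedded execution.
copy.with_backend(gtir_cpu)(inp, out=out, offset_provider={})
```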
-------------------- next file --------------------
@@ -32,7 +32,7 @@ def _convert_arg(arg: Any, sdfg_param: str, use_field_canonical_representation:
# Note that 'ndarray.item()' always transforms the numpy scalar to a python scalar,
# which may change its precision. To avoid this, we use here the empty tuple as index
# for 'ndarray.__getitem__()'.
return arg.ndarray[()]
return arg.asnumpy()[()]
# field domain offsets are not supported
non_zero_offsets = [
(dim, dim_range)
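Side note on the `asnumpy()[()]` change above: indexing a zero-dimensional array with the empty tuple preserves the numpy scalar type, while `ndarray.item()` coerces to a Python scalar. A quick plain-numpy illustration (my example):

```python
import numpy as np

x = np.asarray(np.float32(0.1))  # zero-dimensional float32 array

x.item()  # 0.10000000149011612 -> plain Python float, the float32 identity is lost
x[()]     # np.float32(0.1)     -> empty-tuple indexing keeps the numpy scalar dtype
```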
@@ -88,10 +88,15 @@ def _get_shape_args(
for name, value in args.items():
for sym, size in zip(arrays[name].shape, value.shape, strict=True):
if isinstance(sym, dace.symbol):
assert sym.name not in shape_args
shape_args[sym.name] = size
if sym.name not in shape_args:
shape_args[sym.name] = size
elif shape_args[sym.name] != size:
# TODO(edopao): This case is only hit if all fields in a tuple have the same dims and sizes.
Review comment (Contributor): I am confused by the comment. Can you explain?

Reply (edopao): You are right, I can write it better:

The same shape symbol is used by all fields of a tuple, because the current assumption is that all fields in a tuple have the same dims and sizes. Therefore, this if-branch only exists to ensure that the array size (i.e. the value assigned to the shape symbol) is the same for all fields in a tuple.
TODO(edopao): change to `assert sym.name not in shape_args` to ensure that shape symbols are unique, once the assumption on tuples is removed.

raise ValueError(
f"Expected array size {sym.name} for arg {name} to be {shape_args[sym.name]}, got {size}."
)
elif sym != size:
raise RuntimeError(
raise ValueError(
f"Expected shape {arrays[name].shape} for arg {name}, got {value.shape}."
)
return shape_args
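To make the shared-shape-symbol discussion above concrete, here is a standalone sketch of the same consistency check (hypothetical names and data layout, my example; the real code iterates DaCe array descriptors):

```python
def collect_shape_args(
    field_shapes: dict[str, tuple[str, ...]],
    actual_shapes: dict[str, tuple[int, ...]],
) -> dict[str, int]:
    """Map symbolic shape names to concrete sizes, enforcing consistency.

    `field_shapes` maps each argument to the names of its shape symbols;
    `actual_shapes` maps it to the shape of the array actually passed in.
    """
    shape_args: dict[str, int] = {}
    for name, syms in field_shapes.items():
        for sym, size in zip(syms, actual_shapes[name], strict=True):
            if sym not in shape_args:
                shape_args[sym] = size
            elif shape_args[sym] != size:
                # Tuple fields share their shape symbols, so a repeated symbol
                # must resolve to the same size for every tuple element.
                raise ValueError(
                    f"Expected array size {sym} for arg {name} to be "
                    f"{shape_args[sym]}, got {size}."
                )
    return shape_args


# Two tuple elements share the symbol '__t_size_0' and must agree on its value:
assert collect_shape_args(
    {"t_0": ("__t_size_0",), "t_1": ("__t_size_0",)},
    {"t_0": (10,), "t_1": (10,)},
) == {"__t_size_0": 10}
```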
@@ -109,10 +114,15 @@ def _get_stride_args(
f"Stride ({stride_size} bytes) for argument '{sym}' must be a multiple of item size ({value.itemsize} bytes)."
)
if isinstance(sym, dace.symbol):
assert sym.name not in stride_args
stride_args[str(sym)] = stride
if sym.name not in stride_args:
stride_args[str(sym)] = stride
elif stride_args[sym.name] != stride:
# TODO(edopao): This case is only hit if all fields in a tuple have the same dims and sizes.
raise ValueError(
f"Expected array stride {sym.name} for arg {name} to be {stride_args[sym.name]}, got {stride}."
)
elif sym != stride:
raise RuntimeError(
raise ValueError(
f"Expected stride {arrays[name].strides} for arg {name}, got {value.strides}."
)
return stride_args
-------------------- next file --------------------
@@ -17,7 +17,7 @@
from dace.codegen.compiled_sdfg import _array_interface_ptr as get_array_interface_ptr

from gt4py._core import definitions as core_defs
from gt4py.next import common, config
from gt4py.next import common, config, utils as gtx_utils
from gt4py.next.otf import arguments, languages, stages, step_types, workflow
from gt4py.next.otf.compilation import cache
from gt4py.next.program_processors.runners.dace_common import dace_backend, utility as dace_utils
@@ -116,7 +116,7 @@ def decorated_program(
args = (*args, *arguments.iter_size_args(args))

if sdfg_program._lastargs:
kwargs = dict(zip(sdfg.arg_names, args, strict=True))
kwargs = dict(zip(sdfg.arg_names, gtx_utils.flatten_nested_tuple(args), strict=True))
kwargs.update(dace_backend.get_sdfg_conn_args(sdfg, offset_provider, on_gpu))

use_fast_call = True
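The fast-call path above zips the SDFG's flat `arg_names` against the flattened argument list. A sketch of the behavior assumed from `gtx_utils.flatten_nested_tuple` (hypothetical re-implementation, my example, not the actual gt4py code):

```python
from typing import Any


def flatten_nested_tuple(values: tuple[Any, ...]) -> tuple[Any, ...]:
    """Recursively unpack nested tuples into one flat tuple."""
    flat: list[Any] = []
    for value in values:
        if isinstance(value, tuple):
            flat.extend(flatten_nested_tuple(value))
        else:
            flat.append(value)
    return tuple(flat)


assert flatten_nested_tuple((1, (2, (3, 4)), 5)) == (1, 2, 3, 4, 5)
```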
-------------------- next file --------------------
@@ -217,6 +217,7 @@ def _add_storage(
name: str,
gt_type: ts.DataType,
transient: bool = True,
tuple_name: Optional[str] = None,
) -> list[tuple[str, ts.DataType]]:
"""
Add storage in the SDFG for a given GT4Py data symbol.
@@ -236,6 +237,7 @@
name: Symbol Name to be allocated.
gt_type: GT4Py symbol type.
transient: True when the data symbol has to be allocated as internal storage.
tuple_name: Must be set for tuple fields in order to use the same array shape and strides symbols.

Returns:
List of tuples '(data_name, gt_type)' where 'data_name' is the name of
@@ -250,7 +252,9 @@
name, gt_type, flatten=True
):
tuple_fields.extend(
self._add_storage(sdfg, symbolic_arguments, tname, tsymbol_type, transient)
self._add_storage(
sdfg, symbolic_arguments, tname, tsymbol_type, transient, tuple_name=name
)
)
return tuple_fields

@@ -260,16 +264,23 @@
return self._add_storage(sdfg, symbolic_arguments, name, gt_type.dtype, transient)
# handle default case: field with one or more dimensions
dc_dtype = dace_utils.as_dace_type(gt_type.dtype)
# use symbolic shape, which allows to invoke the program with fields of different size;
# and symbolic strides, which enables decoupling the memory layout from generated code.
sym_shape, sym_strides = self._make_array_shape_and_strides(name, gt_type.dims)
if tuple_name is None:
# Use symbolic shape, which allows to invoke the program with fields of different size;
# and symbolic strides, which enables decoupling the memory layout from generated code.
sym_shape, sym_strides = self._make_array_shape_and_strides(name, gt_type.dims)
else:
# All fields in a tuple must have the same dims and sizes,
# therefore we use the same shape and strides symbols based on 'tuple_name'.
sym_shape, sym_strides = self._make_array_shape_and_strides(
tuple_name, gt_type.dims
)
sdfg.add_array(name, sym_shape, dc_dtype, strides=sym_strides, transient=transient)

return [(name, gt_type)]

elif isinstance(gt_type, ts.ScalarType):
dc_dtype = dace_utils.as_dace_type(gt_type)
if name in symbolic_arguments:
if dace_utils.is_field_symbol(name) or name in symbolic_arguments:
if name in sdfg.symbols:
# Sometimes, when the field domain is implicitly derived from the
# field domain, the gt4py lowering adds the field size as a scalar
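A sketch of the symbol sharing that `tuple_name` enables (illustrative symbol names, assuming plain DaCe APIs, not the exact gt4py internals): every tuple element is registered with shape and stride symbols derived from the tuple's name, so all elements are constrained to the same sizes and layout.

```python
import dace


def make_array_shape_and_strides(name: str, ndims: int):
    # Illustrative naming scheme; the exact symbol names in gt4py may differ.
    shape = [dace.symbol(f"__{name}_size_{i}") for i in range(ndims)]
    strides = [dace.symbol(f"__{name}_stride_{i}") for i in range(ndims)]
    return shape, strides


sdfg = dace.SDFG("tuple_storage_example")

# Both tuple elements get symbols derived from the tuple name 't', not from
# their own names, so the SDFG constrains them to identical shape and strides.
shape, strides = make_array_shape_and_strides("t", 2)
sdfg.add_array("t_0", shape, dace.float64, strides=strides)
sdfg.add_array("t_1", shape, dace.float64, strides=strides)
```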
-------------------- next file --------------------
@@ -16,14 +16,16 @@
import factory

from gt4py._core import definitions as core_defs
from gt4py.next import common, config
from gt4py.next import allocators as gtx_allocators, common, config
from gt4py.next.iterator import ir as itir, transforms as itir_transforms
from gt4py.next.otf import languages, recipes, stages, step_types, workflow
from gt4py.next.otf.binding import interface
from gt4py.next.otf.languages import LanguageSettings
from gt4py.next.program_processors.runners.dace_common import workflow as dace_workflow
from gt4py.next.program_processors.runners.dace_fieldview import gtir_sdfg
from gt4py.next.type_system import type_translation as tt
from gt4py.next.program_processors.runners.dace_fieldview import (
gtir_sdfg,
transformations as gtx_transformations,
)


@dataclasses.dataclass(frozen=True)
@@ -33,7 +35,8 @@ class DaCeTranslator(
],
step_types.TranslationStep[languages.SDFG, languages.LanguageSettings],
):
device_type: core_defs.DeviceType = core_defs.DeviceType.CPU
device_type: core_defs.DeviceType
auto_optimize: bool

def _language_settings(self) -> languages.LanguageSettings:
return languages.LanguageSettings(
@@ -45,9 +48,19 @@ def generate_sdfg(
ir: itir.Program,
offset_provider: common.OffsetProvider,
column_axis: Optional[common.Dimension],
auto_opt: bool,
on_gpu: bool,
) -> dace.SDFG:
ir = itir_transforms.apply_fieldview_transforms(ir, offset_provider=offset_provider)
return gtir_sdfg.build_sdfg_from_gtir(ir=ir, offset_provider=offset_provider)
sdfg = gtir_sdfg.build_sdfg_from_gtir(ir, offset_provider=offset_provider)

if auto_opt:
gtx_transformations.gt_auto_optimize(sdfg, gpu=on_gpu)
elif on_gpu:
gtx_transformations.gt_simplify(sdfg)
gtx_transformations.gt_gpu_transformation(sdfg, try_removing_trivial_maps=False)

return sdfg

def __call__(
self, inp: stages.CompilableProgram
@@ -60,11 +73,13 @@ def __call__(
program,
inp.args.offset_provider,
inp.args.column_axis,
auto_opt=self.auto_optimize,
on_gpu=(self.device_type == gtx_allocators.CUPY_DEVICE),
)

param_types = tuple(
interface.Parameter(param, tt.from_value(arg))
for param, arg in zip(sdfg.arg_names, inp.args.args)
interface.Parameter(param, arg_type)
for param, arg_type in zip(sdfg.arg_names, inp.args.args)
)

module: stages.ProgramSource[languages.SDFG, languages.LanguageSettings] = (
@@ -98,10 +113,12 @@ class Params:
cmake_build_type: config.CMakeBuildType = factory.LazyFunction(
lambda: config.CMAKE_BUILD_TYPE
)
auto_optimize: bool = False

translation = factory.SubFactory(
DaCeTranslationStepFactory,
device_type=factory.SelfAttribute("..device_type"),
auto_optimize=factory.SelfAttribute("..auto_optimize"),
)
bindings = _no_bindings
compilation = factory.SubFactory(
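A hedged usage sketch for the extended workflow factory (the `device_type` and `auto_optimize` parameters come from the diff above; the rest is my assumption). With `auto_optimize=False` and a GPU device, `generate_sdfg` instead runs `gt_simplify` followed by `gt_gpu_transformation`, as the earlier hunk shows.

```python
from gt4py._core import definitions as core_defs
from gt4py.next.program_processors.runners.dace_fieldview import (
    workflow as dace_fieldview_workflow,
)

# CPU translation with the DaCe auto-optimizer enabled.
wf = dace_fieldview_workflow.DaCeWorkflowFactory(
    device_type=core_defs.DeviceType.CPU,
    auto_optimize=True,
)
```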
13 changes: 11 additions & 2 deletions tests/next_tests/definitions.py
@@ -71,6 +71,7 @@ class OptionalProgramBackendId(_PythonObjectIdMixin, str, enum.Enum):
DACE_CPU = "gt4py.next.program_processors.runners.dace.itir_cpu"
DACE_GPU = "gt4py.next.program_processors.runners.dace.itir_gpu"
GTIR_DACE_CPU = "gt4py.next.program_processors.runners.dace.gtir_cpu"
GTIR_DACE_GPU = "gt4py.next.program_processors.runners.dace.gtir_gpu"


class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum):
@@ -145,11 +146,14 @@ class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum):
(USES_TUPLE_ARGS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_TUPLE_RETURNS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_ZERO_DIMENSIONAL_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_INDEX_BUILTIN, XFAIL, UNSUPPORTED_MESSAGE),
(STARTS_FROM_GTIR_PROGRAM, SKIP, UNSUPPORTED_MESSAGE),
]
GTIR_DACE_SKIP_TEST_LIST = [
(ALL, SKIP, UNSUPPORTED_MESSAGE),
(USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_INDEX_BUILTIN, XFAIL, UNSUPPORTED_MESSAGE),
(USES_NEGATIVE_MODULO, XFAIL, UNSUPPORTED_MESSAGE),
(USES_SCAN, XFAIL, UNSUPPORTED_MESSAGE),
(USES_SPARSE_FIELDS_AS_OUTPUT, XFAIL, UNSUPPORTED_MESSAGE),
]
EMBEDDED_SKIP_LIST = [
(USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE),
Expand Down Expand Up @@ -177,6 +181,11 @@ class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum):
OptionalProgramBackendId.DACE_CPU: DACE_SKIP_TEST_LIST,
OptionalProgramBackendId.DACE_GPU: DACE_SKIP_TEST_LIST,
OptionalProgramBackendId.GTIR_DACE_CPU: GTIR_DACE_SKIP_TEST_LIST,
OptionalProgramBackendId.GTIR_DACE_GPU: GTIR_DACE_SKIP_TEST_LIST
+ [
# TODO(edopao): Enable when GPU codegen issues related to symbolic domain are fixed.
(ALL, XFAIL, UNSUPPORTED_MESSAGE),
],
ProgramBackendId.GTFN_CPU: GTFN_SKIP_TEST_LIST
+ [(USES_SCAN_NESTED, XFAIL, UNSUPPORTED_MESSAGE)],
ProgramBackendId.GTFN_CPU_IMPERATIVE: GTFN_SKIP_TEST_LIST
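For readers unfamiliar with the exclusion matrix, this is roughly how an entry such as `(ALL, XFAIL, UNSUPPORTED_MESSAGE)` is presumably applied per test (hypothetical helper, my sketch; the real logic lives in the next_tests fixtures, and `ALL`/`SKIP`/`XFAIL` are constants rather than strings):

```python
import pytest


def apply_skip_list(request: pytest.FixtureRequest, skip_list) -> None:
    # Each entry pairs a feature marker with an action and a reason message.
    for marker, action, message in skip_list:
        if marker == "all" or request.node.get_closest_marker(marker):
            if action == "skip":
                pytest.skip(message)
            else:  # "xfail"
                request.node.add_marker(pytest.mark.xfail(reason=message))
```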
-------------------- next file --------------------
@@ -32,7 +32,10 @@

try:
import dace
from gt4py.next.program_processors.runners.dace import run_dace_cpu, run_dace_gpu
from gt4py.next.program_processors.runners.dace import (
itir_cpu as run_dace_cpu,
itir_gpu as run_dace_gpu,
)
except ImportError:
dace: Optional[ModuleType] = None # type:ignore[no-redef]
run_dace_cpu: Optional[next_backend.Backend] = None
-------------------- next file --------------------
@@ -62,12 +62,16 @@ def __gt_allocator__(
next_tests.definitions.OptionalProgramBackendId.DACE_CPU,
marks=pytest.mark.requires_dace,
),
pytest.param(
next_tests.definitions.OptionalProgramBackendId.DACE_GPU,
marks=(pytest.mark.requires_dace, pytest.mark.requires_gpu),
),
pytest.param(
next_tests.definitions.OptionalProgramBackendId.GTIR_DACE_CPU,
marks=pytest.mark.requires_dace,
),
pytest.param(
next_tests.definitions.OptionalProgramBackendId.DACE_GPU,
next_tests.definitions.OptionalProgramBackendId.GTIR_DACE_GPU,
marks=(pytest.mark.requires_dace, pytest.mark.requires_gpu),
),
],
-------------------- next file --------------------
@@ -88,6 +88,7 @@ def index_program_shift(out, size):
)


@pytest.mark.starts_from_gtir_program
@pytest.mark.uses_index_builtin
def test_index_builtin_shift(program_processor):
program_processor, validate = program_processor