From d65c4943bdaa81799798200f8a4339e037157125 Mon Sep 17 00:00:00 2001
From: Kiuk Chung
Date: Fri, 29 Sep 2023 22:06:00 +0000
Subject: [PATCH] (re)enable torch.compile in the pytorch trainer for train, predict, and eval

---
 keras/backend/common/global_state.py    |  6 +++++
 keras/backend/torch/core.py             | 11 ++++++++-
 keras/backend/torch/numpy.py            | 19 ++++++++++++++-
 keras/backend/torch/random.py           |  3 +++
 keras/backend/torch/trainer.py          | 17 +++++++-------
 keras/layers/reshaping/flatten.py       |  7 +++++-
 keras/layers/reshaping/up_sampling2d.py | 17 +++++++++-----
 keras/testing/test_case.py              | 21 ++++++++++++++++-
 keras/trainers/epoch_iterator.py        |  2 +-
 keras/trainers/trainer.py               | 31 +++++++++++++------------
 keras/trainers/trainer_test.py          | 20 ++++++++++++++++
 keras/utils/naming_test.py              |  8 ++++++-
 requirements.txt                        |  4 ++--
 13 files changed, 129 insertions(+), 37 deletions(-)

diff --git a/keras/backend/common/global_state.py b/keras/backend/common/global_state.py
index 9ebcacd51a1..8d72538ebdc 100644
--- a/keras/backend/common/global_state.py
+++ b/keras/backend/common/global_state.py
@@ -77,3 +77,9 @@ def clear_session():
         from tensorflow.python.eager import context

         context.context().clear_kernel_cache()
+    elif backend.backend() == "torch":
+        import torch._dynamo as dynamo
+
+        # resets torchdynamo's cache so that cached guards, compiled
+        # functions, etc. do not persist between clear_session() calls
+        dynamo.reset()
diff --git a/keras/backend/torch/core.py b/keras/backend/torch/core.py
index 70ba7cda7b3..af26646561d 100644
--- a/keras/backend/torch/core.py
+++ b/keras/backend/torch/core.py
@@ -149,6 +149,7 @@ def convert_to_tensor(x, dtype=None, sparse=False):
         return torch.as_tensor(x, dtype=torch.int32, device=get_device())
     if isinstance(x, float):
         return torch.as_tensor(x, dtype=torch.float32, device=get_device())
+
     # Convert to np in case of any array-like that is not list or tuple.
     if not isinstance(x, (list, tuple)):
         x = np.array(x)
@@ -180,7 +181,15 @@ def transform(x):


 def is_tensor(x):
-    return torch.is_tensor(x)
+    # Using the built-in `isinstance` is recommended by pytorch
+    # over using torch.is_tensor
+    # see: https://pytorch.org/docs/stable/generated/torch.is_tensor.html
+    #
+    # Also, `torch.is_tensor()` causes issues with dynamo caching when
+    # a torch.Tensor and a numpy.ndarray of the same size, shape, and
+    # dtype are passed; if called on a Tensor first, the second call
+    # with an ndarray will return `True`, and vice versa.
+    return isinstance(x, torch.Tensor)


 def shape(x):
diff --git a/keras/backend/torch/numpy.py b/keras/backend/torch/numpy.py
index 17a2bf35c5a..a2290cf50c5 100644
--- a/keras/backend/torch/numpy.py
+++ b/keras/backend/torch/numpy.py
@@ -63,6 +63,9 @@ def mean(x, axis=None, keepdims=False):
     if axis == () or axis == []:
         # Torch handles the empty axis case differently from numpy.
         return x
+    elif isinstance(axis, int):
+        axis = (axis,)  # see [NB] below
+
     ori_dtype = standardize_dtype(x.dtype)
     # torch.mean only supports floating point inputs
     compute_dtype = dtypes.result_type(x.dtype, "float32")
@@ -70,8 +73,22 @@
         result_dtype = compute_dtype
     else:
         result_dtype = ori_dtype
+
+    # [NB] the python torch op torch.mean() is generated into
+    # `torch._C._VariableFunctions.pyi`, and the method
+    # signature is overloaded.
+    # Dynamo won't actually find the correct signature of
+    # `torch.mean()` if arguments are passed via kwargs,
+    # so we have to pass the arguments via positional args
+    # EXCEPT for those that are forced as kwargs via the `*`
+    # delimiter in the overloaded method signatures.
+    # Additionally, we have to create a singleton-tuple
+    # when `axis` is an int to match the existing fn signature.
     result = torch.mean(
-        x, axis=axis, keepdims=keepdims, dtype=to_torch_dtype(compute_dtype)
+        x,
+        axis,
+        keepdims,
+        dtype=to_torch_dtype(compute_dtype),
     )
     return cast(result, result_dtype)

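[Editor's aside, illustrative and not part of the patch: a minimal sketch of the calling convention the [NB] comment above describes, assuming the torch >= 2.1 dynamo behavior it reports; `reduce_mean` is a hypothetical helper, not a keras function.]

    import torch

    def reduce_mean(x):
        # Positional `dim` (as a singleton tuple) and `keepdim`, with the
        # keyword-only `dtype` passed by name, mirroring the overloaded
        # signature in `torch._C._VariableFunctions.pyi`.
        return torch.mean(x, (1,), True, dtype=torch.float32)

    compiled = torch.compile(reduce_mean)
    print(compiled(torch.ones(2, 3)))  # tensor([[1.], [1.]])
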
diff --git a/keras/backend/torch/random.py b/keras/backend/torch/random.py
index b4365c02dac..f85732ca37a 100644
--- a/keras/backend/torch/random.py
+++ b/keras/backend/torch/random.py
@@ -10,6 +10,9 @@
 from keras.random.seed_generator import make_default_seed


+# torch.Generator not supported with dynamo
+# see: https://github.com/pytorch/pytorch/issues/88576
+@torch.compiler.disable()
 def torch_seed_generator(seed):
     first_seed, second_seed = draw_seed(seed)
     device = get_device()
diff --git a/keras/backend/torch/trainer.py b/keras/backend/torch/trainer.py
index 34f0bd782d7..56f7eb6cf81 100644
--- a/keras/backend/torch/trainer.py
+++ b/keras/backend/torch/trainer.py
@@ -102,12 +102,7 @@ def one_step_on_data(data):
             return self.train_step(data)

         if self.jit_compile:
-            raise ValueError(
-                "`jit_compile` is not yet enabled for the PyTorch backend."
-            )
-            # Temporarily disabled torch compile due to failed unit tests.
-            # TODO: Uncomment the following line when unit tests passes.
-            # self.train_function = torch.compile(one_step_on_data)
+            self.train_function = torch.compile(one_step_on_data)
         else:
             self.train_function = one_step_on_data

@@ -127,7 +122,10 @@ def one_step_on_data(data):
             with torch.no_grad():
                 return self.test_step(data)

-        self.test_function = one_step_on_data
+        if self.jit_compile:
+            self.test_function = torch.compile(one_step_on_data)
+        else:
+            self.test_function = one_step_on_data

     def make_predict_function(self, force=False):
         if self.predict_function is not None and not force:
@@ -145,7 +143,10 @@ def one_step_on_data(data):
             with torch.no_grad():
                 return self.predict_step(data)

-        self.predict_function = one_step_on_data
+        if self.jit_compile:
+            self.predict_function = torch.compile(one_step_on_data)
+        else:
+            self.predict_function = one_step_on_data

     def _symbolic_build(self, data_batch):
         model_unbuilt = not all(layer.built for layer in self._flatten_layers())
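[Editor's aside, illustrative and not from the patch: the three make_*_function changes above share one compile-on-demand pattern, sketched here with the hypothetical names `build_step_function`, `step_fn`, and `jit`; it assumes a torch >= 2.1 install with a working inductor backend.]

    import torch

    def build_step_function(step_fn, jit):
        # Opt in to torchdynamo tracing (with the default "inductor"
        # backend) only when jit compilation was requested; otherwise
        # return the step function unchanged for eager execution.
        if jit:
            return torch.compile(step_fn)
        return step_fn

    train_step = build_step_function(lambda data: data * 2, jit=True)
    print(train_step(torch.arange(4)))  # tensor([0, 2, 4, 6])
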
diff --git a/keras/layers/reshaping/flatten.py b/keras/layers/reshaping/flatten.py
index da72ce961dd..0923f33da4b 100644
--- a/keras/layers/reshaping/flatten.py
+++ b/keras/layers/reshaping/flatten.py
@@ -57,7 +57,12 @@ def compute_output_shape(self, input_shape):
         non_batch_dims = input_shape[1:]
         if len(non_batch_dims) == 0:
             flattened_dim = 1
-        elif None in non_batch_dims:
+        elif any(d is None for d in non_batch_dims):
+            # NB: we cannot use the shorter `None in non_batch_dims` here
+            # b/c torchdynamo errors when calling `__contains__` with a
+            # constant (in this case `None`) operand, since it assumes the
+            # elements in the collection are also `ConstantVariable`s, but
+            # tensor shapes can be `SymNodeVariable`s (e.g. `SymInt`).
             flattened_dim = None
         else:
             flattened_dim = math.prod(non_batch_dims)
diff --git a/keras/layers/reshaping/up_sampling2d.py b/keras/layers/reshaping/up_sampling2d.py
index 0e593de09e7..969839f9ab7 100644
--- a/keras/layers/reshaping/up_sampling2d.py
+++ b/keras/layers/reshaping/up_sampling2d.py
@@ -1,5 +1,3 @@
-import numpy as np
-
 from keras import backend
 from keras import ops
 from keras.api_export import keras_export
@@ -149,10 +147,6 @@ def _resize_images(
     else:
         raise ValueError(f"Invalid `data_format` argument: {data_format}")

-    new_shape = x.shape[rows : cols + 1]
-    new_shape *= np.array([height_factor, width_factor])
-    new_shape = new_shape.tolist()
-
     if data_format == "channels_first":
         x = ops.transpose(x, [0, 2, 3, 1])
     # https://github.com/keras-team/keras/issues/294
@@ -161,6 +155,17 @@ def _resize_images(
         x = ops.repeat(x, height_factor, axis=1)
         x = ops.repeat(x, width_factor, axis=2)
     else:
+        # Multiply the height and width factors on each dim by hand
+        # (versus using element-wise multiplication by
+        # np.array([height_factor, width_factor]) and then list-ifying
+        # the tensor by calling `.tolist()`), since when running under
+        # torchdynamo `new_shape` will be traced as a symbolic variable
+        # (specifically a `FakeTensor`), which does not have a
+        # `tolist()` method.
+        new_shape = (
+            x.shape[rows] * height_factor,
+            x.shape[cols] * width_factor,
+        )
         x = ops.image.resize(x, new_shape, interpolation=interpolation)
     if data_format == "channels_first":
         x = ops.transpose(x, [0, 3, 1, 2])
diff --git a/keras/testing/test_case.py b/keras/testing/test_case.py
index 91684f6dab7..b4c738819ba 100644
--- a/keras/testing/test_case.py
+++ b/keras/testing/test_case.py
@@ -11,6 +11,7 @@ from keras import utils
 from keras.backend.common import is_float_dtype
 from keras.backend.common import standardize_dtype
+from keras.backend.common.global_state import clear_session
 from keras.backend.common.keras_tensor import KerasTensor
 from keras.models import Model
 from keras.utils import traceback_utils

@@ -24,6 +25,12 @@ def __init__(self, *args, **kwargs):
         if traceback_utils.is_traceback_filtering_enabled():
             traceback_utils.disable_traceback_filtering()

+    def setUp(self):
+        # Clear global state so that test cases are independent;
+        # required for the jit-enabled torch tests, since dynamo has
+        # a global cache for guards, compiled functions, etc.
+        clear_session()
+
     def get_temp_dir(self):
         temp_dir = tempfile.mkdtemp()
         self.addCleanup(lambda: shutil.rmtree(temp_dir))
@@ -329,7 +336,19 @@ def call(self, x):
         output_data = tree.map_structure(
             lambda x: backend.convert_to_numpy(x), output_data
         )
-        model.compile(optimizer="sgd", loss="mse", jit_compile=True)
+        # Test the "default" path for each backend by setting
+        # jit_compile="auto".
+        # For the tensorflow and jax backends, "auto" is jitted;
+        # for the torch backend, "auto" is eager.
+        #
+        # NB: for torch, jit_compile=True turns on torchdynamo,
+        # which may not always succeed in tracing depending
+        # on the model. Run your program with these env vars
+        # to get debug traces of dynamo:
+        #     TORCH_LOGS="+dynamo"
+        #     TORCHDYNAMO_VERBOSE=1
+        #     TORCHDYNAMO_REPORT_GUARD_FAILURES=1
+        model.compile(optimizer="sgd", loss="mse", jit_compile="auto")
         model.fit(input_data, output_data, verbose=0)

         # Build test.
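[Editor's aside, illustrative and not from the patch: one way to surface the dynamo debug traces mentioned in the run_layer_test comment above is to export the listed variables before launching the program; the sketch below sets them via `os.environ` instead, on the assumption that they are read no earlier than the first torch import.]

    import os

    # Same variables as in the run_layer_test comment above.
    os.environ["TORCH_LOGS"] = "+dynamo"
    os.environ["TORCHDYNAMO_VERBOSE"] = "1"
    os.environ["TORCHDYNAMO_REPORT_GUARD_FAILURES"] = "1"

    import keras  # noqa: E402  # imported after the env vars are set
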
diff --git a/keras/trainers/epoch_iterator.py b/keras/trainers/epoch_iterator.py
index 4e1a776d3dc..b5d7606c077 100644
--- a/keras/trainers/epoch_iterator.py
+++ b/keras/trainers/epoch_iterator.py
@@ -122,7 +122,7 @@ def enumerate_epoch(self, return_type="auto"):
             if buffer:
                 yield step - len(buffer) + 1, buffer
             if not self._num_batches:
-                # Infer the number of batches returned by the data_adater.
+                # Infer the number of batches returned by the data_adapter.
                 # Assumed static.
                 self._num_batches = step + 1
             self.data_adapter.on_epoch_end()
diff --git a/keras/trainers/trainer.py b/keras/trainers/trainer.py
index 8606b4cbf5d..b70a3a31bc4 100644
--- a/keras/trainers/trainer.py
+++ b/keras/trainers/trainer.py
@@ -104,7 +104,7 @@ def compile(
                 and to set it to `True` when debugging.
             steps_per_execution: Int. The number of batches to run
                 during each a single compiled function call. Running multiple
-                batches inside a single a single compiled function call can
+                batches inside a single compiled function call can
                 greatly improve performance on TPUs or small models with a
                 large Python overhead. At most, one full epoch will be run each
                 execution. If a number larger than the size of the epoch is
@@ -115,9 +115,12 @@ def compile(
                 each compiled function execution). Not supported with the
                 PyTorch backend.
             jit_compile: Bool or `"auto"`. Whether to use XLA compilation when
-                compiling a model. Not supported with the PyTorch backend.
-                If `"auto"`, XLA compilation will be enabled if the
-                the model supports it, and disabled otherwise.
+                compiling a model. For the `jax` and `tensorflow` backends,
+                `jit_compile="auto"` enables XLA compilation if the model
+                supports it, and disables it otherwise.
+                For the `torch` backend, `"auto"` defaults to eager
+                execution and `jit_compile=True` runs with `torch.compile`
+                using the `"inductor"` backend.
             auto_scale_loss: Bool. If `True` and the model dtype policy is
                 `"mixed_float16"`, the passed optimizer will be automatically
                 wrapped in a `LossScaleOptimizer`, which will dynamically
@@ -162,12 +165,7 @@ def compile(
                 "cannot also be True. Disabling `jit_compile`.",
                 stacklevel=2,
             )
-        if jit_compile and backend.backend() == "torch":
-            warnings.warn(
-                "`jit_compile` is not yet enabled for the PyTorch backend. "
-                "Proceeding with `jit_compile=False`."
-            )
-            jit_compile = False
+
         self.jit_compile = jit_compile
         self.run_eagerly = run_eagerly
         self.stop_training = False
@@ -194,7 +192,10 @@ def compile(
     def jit_compile(self):
         if self._jit_compile is None:
             # Value was never set. Resolve it now.
-            jit_compile = model_supports_jit(self)
+            # torch always runs in eager unless jit_compile is explicitly set
+            jit_compile = (
+                model_supports_jit(self) and backend.backend() != "torch"
+            )
             self._jit_compile = jit_compile
         return self._jit_compile

@@ -866,11 +867,11 @@ def _assert_compile_called(self, method_name=None):


 def resolve_auto_jit_compile(model):
+    if backend.backend() == "torch":
+        # jit_compile = "auto" with the pytorch backend defaults to eager
+        return False
+
     if model_supports_jit(model):
-        if backend.backend() == "torch":
-            # Torch defaults to eager mode
-            # until torch compile is reliable
-            return False
         return True
     return False

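[Editor's aside, illustrative and not from the patch: a usage sketch of the torch-backend semantics documented above, mirroring the assertions in the new trainer test below; it assumes KERAS_BACKEND=torch.]

    import keras

    model = keras.Sequential([keras.layers.Dense(1)])

    # "auto" resolves to eager on the torch backend.
    model.compile(optimizer="sgd", loss="mse", jit_compile="auto")
    assert not model.jit_compile

    # An explicit True opts the train/test/predict steps in to torch.compile.
    model.compile(optimizer="sgd", loss="mse", jit_compile=True)
    assert model.jit_compile
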
diff --git a/keras/trainers/trainer_test.py b/keras/trainers/trainer_test.py
index 80f108b3ae9..ee96a0e5f8c 100644
--- a/keras/trainers/trainer_test.py
+++ b/keras/trainers/trainer_test.py
@@ -148,6 +148,18 @@ def __init__(self, units):
         )
         self.assertEqual(len(model_weighted.metrics), 3)

+    @pytest.mark.skipif(
+        backend.backend() != "torch",
+        reason="torch backend runs in eager mode for jit_compile='auto'",
+    )
+    def test_compile_eager_vs_jit_torch(self):
+        model = ExampleModel(units=3)
+        model.compile(jit_compile="auto")
+        # the torch trainer enables/disables torch.compile based only on
+        # the value of model.jit_compile (not model.run_eagerly)
+        self.assertFalse(model.run_eagerly)
+        self.assertFalse(model.jit_compile)
+
     @parameterized.named_parameters(
         [
             ("eager", True, False, False),
@@ -292,6 +304,14 @@
         outputs = model.predict(x, batch_size=batch_size)
         self.assertAllClose(outputs, 4 * np.ones((100, 3)))

+    @parameterized.named_parameters(
+        [
+            ("eager", True, False),
+            ("graph_fn", False, False),
+            ("jit", False, True),
+        ]
+    )
+    def test_predict_flow_struct(self, run_eagerly, jit_compile):
         # Test with input/output structs
         model = StructModel(units=3)
         model.run_eagerly = run_eagerly
diff --git a/keras/utils/naming_test.py b/keras/utils/naming_test.py
index c5b0752a191..6be61fdbefe 100644
--- a/keras/utils/naming_test.py
+++ b/keras/utils/naming_test.py
@@ -61,7 +61,13 @@ def test_uniquify_already_uniquified_name(self):
         name = "unique_name"
         unique_name = naming.uniquify(name)
         new_unique_name = naming.uniquify(unique_name)
-        self.assertEqual(new_unique_name, unique_name)
+
+        # the first time `name` is uniquified, the same name is returned
+        self.assertEqual(name, unique_name)
+
+        # the second time `name` is uniquified, the result should differ
+        # from the first output
+        self.assertNotEqual(new_unique_name, unique_name)

     def test_to_snake_case_capital_after_any_character(self):
         name = "myVariableNameHere"
diff --git a/requirements.txt b/requirements.txt
index 39b351e3481..33eb38e34d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,8 +2,8 @@
 tf-nightly==2.15.0.dev20231009  # Pin a working nightly until rc0.

 # Torch.
-torch>=2.0.1
-torchvision>=0.15.1
+torch>=2.1.0
+torchvision>=0.16.0

 # Jax.
 jax[cpu]
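[Editor's aside, illustrative and not from the patch: the contract encoded by the corrected naming test above, as a standalone sketch; the "unique_name_1"-style suffix in the comment is an assumption about the generated name, not something the test asserts.]

    from keras.utils import naming

    first = naming.uniquify("unique_name")  # no clash yet -> "unique_name"
    second = naming.uniquify(first)         # clash -> a new name, e.g. "unique_name_1"
    assert first == "unique_name"
    assert second != first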