[ci/docker/air] Update ML/DL dependencies to latest releases / Remove…

… Py3.6 Docker images (ray-project#28808) This PR upgrades some of our dependencies to the latest releases. At the moment, we are testing with a lower bound of dependencies. This is good as we ensure compatibility with these versions, but at the same time we don't test compatibility with more recent releases. We prioritize backwards compatibility over compatibility with more recent versions. This PR: - Introduces a set of _legacy dependencies_ (the current versions in the CI). - This is a lower bound of dependencies that we guarantee compatibility with - The regular dependencies are upgraded to more recent releases - We remove Docker builds for Python 3.6, as this is incompatible with more recent versions of tensorflow/torch - Wheel building for 3.6 is not affected. Signed-off-by: Kai Fricke <[email protected]> Signed-off-by: Artur Niederfahrenhorst <[email protected]> Co-authored-by: Artur Niederfahrenhorst <[email protected]>
jkpjkpjkp · Oct 27, 2022 · 6b9a56d · 6b9a56d
1 parent 4f75404
commit 6b9a56d
Show file tree

Hide file tree

Showing 21 changed files with 104 additions and 62 deletions.
diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml
@@ -84,26 +84,6 @@
 #     # Upload to latest directory.
 #     - if [ "$BUILDKITE_BRANCH" == "master" ]; then python .buildkite/copy_files.py --destination wheels --path ./.whl; fi
 
-- label: ":docker: Build Images: py36 (1/2)"
-  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
-  instance_size: medium
-  commands:
-    - LINUX_WHEELS=1 ./ci/ci.sh build
-    - pip install -q docker aws_requests_auth boto3
-    - ./ci/env/env_info.sh
-    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/build/build-docker-images.py --py-versions py36 --device-types cpu cu101 cu102 cu110 --build-type BUILDKITE --build-base
-
-- label: ":docker: Build Images: py36 (2/2)"
-  conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
-  instance_size: medium
-  commands:
-    - LINUX_WHEELS=1 ./ci/ci.sh build
-    - pip install -q docker aws_requests_auth boto3
-    - ./ci/env/env_info.sh
-    - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi
-    - python ./ci/build/build-docker-images.py --py-versions py36 --device-types cu111 cu112 --build-type BUILDKITE --build-base
-
 - label: ":docker: Build Images: py37 (1/2)"
   conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
   instance_size: medium
@@ -612,16 +592,39 @@
 - label: ":cold_face: :python: Ray Python 3.6 ML compatibility tests"
   conditions:
     ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ]
-  instance_size: medium
+  instance_size: large
   commands:
     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
     - ./ci/env/install-minimal.sh 3.6
     - pip install -r python/requirements/ml/requirements_py36_compat.txt
     - pip install -U typing-extensions
     - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install horovod
     - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=compat_py36 
+      python/ray/tests/horovod/...
+      python/ray/tests/lightgbm/...
+      python/ray/tests/ml_py36_compat/...
+      python/ray/tests/xgboost/...
+
+
+- label: ":cold_face: :python: Ray Python legacy dependency ML compatibility tests"
+  conditions:
+    ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED"]
+  instance_size: large
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - ./ci/env/install-minimal.sh 3.7
+    - DATA_PROCESSING_TESTING=1 TUNE_TESTING=1 TRAIN_TESTING=1 ./ci/env/install-dependencies.sh
+    - pip install -r python/requirements/ml/requirements_legacy_compat.txt
+    - pip install -U typing-extensions
+    - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install horovod
+    - ./ci/env/env_info.sh
     - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=compat 
       python/ray/tests/horovod/...
       python/ray/tests/lightgbm/...
       python/ray/tests/ml_py36_compat/...
       python/ray/tests/xgboost/...
+      python/ray/tests/ray_lightning/...
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-gpu,-mosaic,-needs_credentials python/ray/air/...
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air,-mosaic,-gpu_only,-gpu,-needs_credentials python/ray/train/...
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air,-mosaic python/ray/data/...
diff --git a/ci/build/build-docker-images.py b/ci/build/build-docker-images.py
@@ -60,9 +60,9 @@
 
 # The CUDA version to use for the ML Docker image.
 # If changing the CUDA version in the below line, you should also change the base Docker
-# image being used in ~/.buildkite/Dockerfile.gpu to match the same image being used
+# image being used in ci/docker/base.gpu.Dockerfile to match the same image being used
 # here.
-ML_CUDA_VERSION = "cu112"
+ML_CUDA_VERSION = "cu116"
 
 DEFAULT_PYTHON_VERSION = "py37"
 
@@ -361,20 +361,24 @@ def prep_ray_ml():
     ml_requirements_files = [
         "python/requirements/ml/requirements_ml_docker.txt",
         "python/requirements/ml/requirements_dl.txt",
-        "python/requirements/ml/requirements_py36_compat.txt",
         "python/requirements/ml/requirements_tune.txt",
         "python/requirements/ml/requirements_rllib.txt",
         "python/requirements/ml/requirements_train.txt",
         "python/requirements/ml/requirements_upstream.txt",
     ]
+    # We don't need these in the ml docker image
+    ignore_requirements = [
+        "python/requirements/ml/requirements_legacy_compat.txt",
+        "python/requirements/ml/requirements_py36_compat.txt",
+    ]
 
     files_on_disk = glob.glob(f"{root_dir}/python/**/requirements*.txt", recursive=True)
     for file_on_disk in files_on_disk:
         rel = os.path.relpath(file_on_disk, start=root_dir)
         print(rel)
         if not rel.startswith("python/requirements/ml"):
             continue
-        elif rel not in ml_requirements_files:
+        elif rel not in ml_requirements_files and rel not in ignore_requirements:
             raise RuntimeError(
                 f"A new requirements file was found in the repository, but it has "
                 f"not been added to `build-docker-images.py` "

diff --git a/ci/docker/base.gpu.Dockerfile b/ci/docker/base.gpu.Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
 
 ARG REMOTE_CACHE_URL
 ARG BUILDKITE_PULL_REQUEST

diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh
@@ -413,11 +413,6 @@ install_pip_packages() {
     fi
   fi
 
-  # RLlib testing with TF 1.x.
-  if [ "${RLLIB_TESTING-}" = 1 ] && { [ -n "${TF_VERSION-}" ] || [ -n "${TFP_VERSION-}" ]; }; then
-    pip install --upgrade tensorflow-probability=="${TFP_VERSION}" tensorflow=="${TF_VERSION}"
-  fi
-
   # Inject our own mirror for the CIFAR10 dataset
   if [ "${TRAIN_TESTING-}" = 1 ] || [ "${TUNE_TESTING-}" = 1 ] ||  [ "${DOC_TESTING-}" = 1 ]; then
     SITE_PACKAGES=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')

diff --git a/python/ray/tests/horovod/BUILD b/python/ray/tests/horovod/BUILD
@@ -3,7 +3,7 @@ py_test(
     size = "medium",
     srcs = ["test_horovod.py"],
     deps = ["//:ray_lib"],
-    tags = ["team:ml", "compat", "exclusive"]
+    tags = ["team:ml", "compat", "compat_py36", "exclusive"]
 )
 
 
diff --git a/python/ray/tests/lightgbm/BUILD b/python/ray/tests/lightgbm/BUILD
@@ -3,15 +3,15 @@ py_test(
  size = "small",
  srcs = ["simple_example.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "compat", "exclusive"],
+ tags = ["team:ml", "compat", "compat_py36", "exclusive"],
 )
 
 py_test(
  name = "simple_tune",
  size="small",
  srcs = ["simple_tune.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "compat", "exclusive"]
+ tags = ["team:ml", "compat", "compat_py36", "exclusive"]
 )
 
 
diff --git a/python/ray/tests/ml_py36_compat/BUILD b/python/ray/tests/ml_py36_compat/BUILD
@@ -3,13 +3,13 @@ py_test(
  size = "medium",
  srcs = ["tune_hvd_keras.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "compat", "exclusive"],
+ tags = ["team:ml", "compat", "compat_py36", "exclusive"],
 )
 
 py_test(
  name = "tune_hvd_torch",
  size = "medium",
  srcs = ["tune_hvd_torch.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "compat", "exclusive"],
+ tags = ["team:ml", "compat", "compat_py36", "exclusive"],
 )
diff --git a/python/ray/tests/ray_lightning/BUILD b/python/ray/tests/ray_lightning/BUILD
@@ -3,13 +3,13 @@ py_test(
  size = "small",
  srcs = ["simple_example.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "exclusive"],
+ tags = ["team:ml", "compat", "exclusive"],
 )
 
 py_test(
  name = "simple_tune",
  size="small",
  srcs = ["simple_tune.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "exclusive"]
+ tags = ["team:ml", "compat", "exclusive"]
 )
diff --git a/python/ray/tests/xgboost/BUILD b/python/ray/tests/xgboost/BUILD
@@ -7,15 +7,15 @@ py_test(
  size = "small",
  srcs = ["simple_example.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "compat", "exclusive"],
+ tags = ["team:ml", "compat", "compat_py36", "exclusive"],
 )
 
 py_test(
  name = "simple_tune",
  size="small",
  srcs = ["simple_tune.py"],
  deps = ["//:ray_lib"],
- tags = ["team:ml", "compat", "exclusive"]
+ tags = ["team:ml", "compat", "compat_py36", "exclusive"]
 )
 
 
diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD
@@ -679,7 +679,7 @@ py_test(
 
 py_test(
     name = "mnist_ptl_mini",
-    size = "small",
+    size = "medium",
     srcs = ["examples/mnist_ptl_mini.py"],
     deps = [":tune_lib"],
     tags = ["team:ml", "exclusive", "example", "pytorch"],

diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py
@@ -921,8 +921,8 @@ def testConvertHEBO(self):
         self.assertEqual(config1, config2)
         self.assertIn(config1["a"], [2, 3, 4])
         self.assertIn(config1["b"]["x"], list(range(5)))
-        self.assertLess(1e-4, config1["b"]["z"])
-        self.assertLess(config1["b"]["z"], 1e-2)
+        self.assertLessEqual(1e-4, config1["b"]["z"])
+        self.assertLessEqual(config1["b"]["z"], 1e-2)
 
         searcher = HEBOSearch(metric="a", mode="max", random_state_seed=123)
         analysis = tune.run(

diff --git a/python/ray/tune/tests/test_tune_restore.py b/python/ray/tune/tests/test_tune_restore.py
@@ -462,7 +462,7 @@ def tearDown(self):
 
     def testPBTKeras(self):
         from ray.tune.examples.pbt_tune_cifar10_with_keras import Cifar10Model
-        from tensorflow.python.keras.datasets import cifar10
+        from tensorflow.keras.datasets import cifar10
 
         cifar10.load_data()
         validate_save_restore(Cifar10Model)

diff --git a/python/requirements/ml/requirements_dl.txt b/python/requirements/ml/requirements_dl.txt
@@ -1,15 +1,15 @@
 # These requirements are used for the CI and CPU-only Docker images so we install CPU only versions of torch.
 # For GPU Docker images, you should install requirements_ml_docker.txt afterwards.
 
-tensorflow==2.6.2
-tensorflow-probability==0.14.1
+tensorflow==2.9.0
+tensorflow-probability==0.17.0
 
 # If you make changes to the torch versions below, please also make the corresponding changes to `requirements_ml_docker.txt`!
 
-torch==1.9.0;sys_platform=="darwin"
-torchvision==0.10.0;sys_platform=="darwin"
+torch==1.12.1;sys_platform=="darwin"
+torchvision==0.13.1;sys_platform=="darwin"
 
 # On non-OSX machines only install CPU version of torch and torchvision
 -f https://download.pytorch.org/whl/torch_stable.html
-torch==1.9.0+cpu;sys_platform!="darwin"
-torchvision==0.10.0+cpu;sys_platform!="darwin"
+torch==1.12.1+cpu;sys_platform!="darwin"
+torchvision==0.13.1+cpu;sys_platform!="darwin"
diff --git a/python/requirements/ml/requirements_legacy_compat.txt b/python/requirements/ml/requirements_legacy_compat.txt
@@ -0,0 +1,31 @@
+# ATTENTION: THESE DEPENDENCIES SHOULD USUALLY NOT BE UPDATED!
+
+# Updating these dependencies means we remove official support for dependency releases older than
+# the specified version.
+
+# These are compatibility requirements to make sure certain workflows continue to work
+# with these dependency versions. They thus act as a lower bound for compatibility
+# with ML libraries.
+# Concretely, we set up a fresh Python 3.7 environment and
+# run the pipeline job in `Ray Python legacy dependency ML compatibility tests` with these dependencies installed.
+
+# ML libraries
+torch==1.9.0
+tensorflow==2.7.0
+tensorflow-probability==0.14.1
+keras==2.7.0
+
+# Torch addons
+torchvision==0.10.0
+
+pytorch-lightning==1.5.10
+
+# Upstream libraries
+lightgbm_ray==0.1.5
+xgboost_ray==0.1.10
+ray_lightning==0.2.0
+
+# Datasets
+pyarrow==6.0.1
+
+ray[tune,data]
diff --git a/python/requirements/ml/requirements_ml_docker.txt b/python/requirements/ml/requirements_ml_docker.txt
@@ -5,12 +5,12 @@ tblib
 
 # If you make changes to the torch versions, please also make the corresponding changes to `requirements_dl.txt`!
 -f https://download.pytorch.org/whl/torch_stable.html
-torch==1.9.0+cu111
-torchvision==0.10.0+cu111
+torch==1.12.1+cu116
+torchvision==0.13.1+cu116
 
--f https://data.pyg.org/whl/torch-1.9.0+cu111.html
+-f https://data.pyg.org/whl/torch-1.12.1+cu116.html
 torch-scatter==2.0.9
-torch-sparse==0.6.12
+torch-sparse==0.6.15
 
 # torch-geometric has to be installed after torch-scatter and torch-sparse.
 torch-geometric==2.0.3; python_version < '3.7'

diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt
@@ -22,7 +22,7 @@ supersuit==3.3.3; python_version >= '3.7'
 pybullet==3.2.0
 # For tests on RecSim and Kaggle envs.
 recsim==0.2.4
-tensorflow_estimator==2.6.0
+tensorflow_estimator==2.9.0
 # DeepMind's OpenSpiel
 open-spiel==1.0.2
 
@@ -34,8 +34,8 @@ higher==0.2.1
 pyglet==1.5.15
 imageio-ffmpeg==0.4.5
 # ONNX
-onnx==1.9.0
-onnxruntime==1.9.0
-tf2onnx==1.8.5
+onnx==1.12.0
+onnxruntime==1.12.0
+tf2onnx==1.12.1
 typer==0.6.1
 rich==12.0.1
diff --git a/python/requirements/ml/requirements_tune.txt b/python/requirements/ml/requirements_tune.txt
@@ -28,7 +28,7 @@ pytest-remotedata==0.3.2
 lightning-bolts==0.4.0
 pytorch-lightning==1.5.10
 shortuuid==1.0.1
-scikit-optimize==0.8.1
+scikit-optimize==0.9.0
 sigopt==7.5.0
 timm==0.4.5
 transformers==4.18.0; python_version <= '3.6'

diff --git a/rllib/BUILD b/rllib/BUILD
@@ -84,7 +84,7 @@ py_test(
     name = "learning_tests_cartpole_a2c_microbatch",
     main = "tests/run_regression_tests.py",
     tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
-    size = "medium",
+    size = "large",
     srcs = ["tests/run_regression_tests.py"],
     data = ["tuned_examples/a2c/cartpole-a2c-microbatch.yaml"],
     args = ["--yaml-dir=tuned_examples/a2c"]

diff --git a/rllib/algorithms/a3c/tests/test_a3c.py b/rllib/algorithms/a3c/tests/test_a3c.py
@@ -27,7 +27,7 @@ def test_a3c_compilation(self):
         num_iterations = 2
 
         # Test against all frameworks.
-        for _ in framework_iterator(config, with_eager_tracing=True):
+        for _ in framework_iterator(config, with_eager_tracing=False):
             for env in ["CartPole-v1", "Pendulum-v1", "PongDeterministic-v0"]:
                 print("env={}".format(env))
                 config.model["use_lstm"] = env == "CartPole-v1"

diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py
@@ -55,6 +55,7 @@
         "is particularly useful for timed tests."
     ),
 )
+
 # Obsoleted arg, use --framework=torch instead.
 parser.add_argument(
     "--torch", action="store_true", help="Runs all tests with PyTorch enabled."
@@ -108,7 +109,13 @@
             continue
 
         # Always run with eager-tracing when framework=tf2 if not in local-mode.
-        if args.framework == "tf2" and not args.local_mode:
+        # Ignore this if the yaml explicitly tells us to disable eager tracing
+        if (
+            args.framework == "tf2"
+            and not args.local_mode
+            and not exp["config"].get("eager_tracing") is False
+        ):
+
             exp["config"]["eager_tracing"] = True
 
         # Print out the actual config.

diff --git a/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml b/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml
@@ -12,3 +12,5 @@ cartpole-a2c-microbatch:
         rollout_fragment_length: 20
         microbatch_size: 40
         train_batch_size: 120
+        # When using tf>=2.8, eager tracing cannot be used
+        eager_tracing: False