From 6b9a56d28e1029741feaa864257d75824fe36622 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Thu, 27 Oct 2022 15:39:02 -0700 Subject: [PATCH] [ci/docker/air] Update ML/DL dependencies to latest releases / Remove Py3.6 Docker images (#28808) This PR upgrades some of our dependencies to the latest releases. At the moment, we are testing with a lower bound of dependencies. This is good as we ensure compatibility with these versions, but at the same time we don't test compatibility with more recent releases. We prioritize backwards compatibility over compatibility with more recent versions. This PR: - Introduces a set of _legacy dependencies_ (the current versions in the CI). - This is a lower bound of dependencies that we guarantee compatibility with - The regular dependencies are upgraded to more recent releases - We remove Docker builds for Python 3.6, as it is incompatible with more recent versions of tensorflow/torch - Wheel building for 3.6 is not affected. Signed-off-by: Kai Fricke Signed-off-by: Artur Niederfahrenhorst Co-authored-by: Artur Niederfahrenhorst --- .buildkite/pipeline.build.yml | 45 ++++++++++--------- ci/build/build-docker-images.py | 12 +++-- ci/docker/base.gpu.Dockerfile | 2 +- ci/env/install-dependencies.sh | 5 --- python/ray/tests/horovod/BUILD | 2 +- python/ray/tests/lightgbm/BUILD | 4 +- python/ray/tests/ml_py36_compat/BUILD | 4 +- python/ray/tests/ray_lightning/BUILD | 4 +- python/ray/tests/xgboost/BUILD | 4 +- python/ray/tune/BUILD | 2 +- python/ray/tune/tests/test_sample.py | 4 +- python/ray/tune/tests/test_tune_restore.py | 2 +- python/requirements/ml/requirements_dl.txt | 12 ++--- .../ml/requirements_legacy_compat.txt | 31 +++++++++++++ .../ml/requirements_ml_docker.txt | 8 ++-- python/requirements/ml/requirements_rllib.txt | 8 ++-- python/requirements/ml/requirements_tune.txt | 2 +- rllib/BUILD | 2 +- rllib/algorithms/a3c/tests/test_a3c.py | 2 +- rllib/tests/run_regression_tests.py | 9 +++- .../a2c/cartpole-a2c-microbatch.yaml | 2 
+ 21 files changed, 104 insertions(+), 62 deletions(-) create mode 100644 python/requirements/ml/requirements_legacy_compat.txt diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index 32f953810079..67f79624f55a 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -84,26 +84,6 @@ # # Upload to latest directory. # - if [ "$BUILDKITE_BRANCH" == "master" ]; then python .buildkite/copy_files.py --destination wheels --path ./.whl; fi -- label: ":docker: Build Images: py36 (1/2)" - conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] - instance_size: medium - commands: - - LINUX_WHEELS=1 ./ci/ci.sh build - - pip install -q docker aws_requests_auth boto3 - - ./ci/env/env_info.sh - - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/build/build-docker-images.py --py-versions py36 --device-types cpu cu101 cu102 cu110 --build-type BUILDKITE --build-base - -- label: ":docker: Build Images: py36 (2/2)" - conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] - instance_size: medium - commands: - - LINUX_WHEELS=1 ./ci/ci.sh build - - pip install -q docker aws_requests_auth boto3 - - ./ci/env/env_info.sh - - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - - python ./ci/build/build-docker-images.py --py-versions py36 --device-types cu111 cu112 --build-type BUILDKITE --build-base - - label: ":docker: Build Images: py37 (1/2)" conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] instance_size: medium @@ -612,7 +592,7 @@ - label: ":cold_face: :python: Ray Python 3.6 ML compatibility tests" conditions: ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] - instance_size: medium + instance_size: large commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - ./ci/env/install-minimal.sh 3.6 @@ -620,8 +600,31 
@@ - pip install -U typing-extensions - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install horovod - ./ci/env/env_info.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=compat_py36 + python/ray/tests/horovod/... + python/ray/tests/lightgbm/... + python/ray/tests/ml_py36_compat/... + python/ray/tests/xgboost/... + + +- label: ":cold_face: :python: Ray Python legacy dependency ML compatibility tests" + conditions: + ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED"] + instance_size: large + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - ./ci/env/install-minimal.sh 3.7 + - DATA_PROCESSING_TESTING=1 TUNE_TESTING=1 TRAIN_TESTING=1 ./ci/env/install-dependencies.sh + - pip install -r python/requirements/ml/requirements_legacy_compat.txt + - pip install -U typing-extensions + - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install horovod + - ./ci/env/env_info.sh - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=compat python/ray/tests/horovod/... python/ray/tests/lightgbm/... python/ray/tests/ml_py36_compat/... python/ray/tests/xgboost/... + python/ray/tests/ray_lightning/... + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-gpu,-mosaic,-needs_credentials python/ray/air/... + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air,-mosaic,-gpu_only,-gpu,-needs_credentials python/ray/train/... + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air,-mosaic python/ray/data/... 
diff --git a/ci/build/build-docker-images.py b/ci/build/build-docker-images.py index c3a907b2b09c..1888ec220715 100644 --- a/ci/build/build-docker-images.py +++ b/ci/build/build-docker-images.py @@ -60,9 +60,9 @@ # The CUDA version to use for the ML Docker image. # If changing the CUDA version in the below line, you should also change the base Docker -# image being used in ~/.buildkite/Dockerfile.gpu to match the same image being used +# image being used in ~/ci/docker/Dockerfile.gpu to match the same image being used # here. -ML_CUDA_VERSION = "cu112" +ML_CUDA_VERSION = "cu116" DEFAULT_PYTHON_VERSION = "py37" @@ -361,12 +361,16 @@ def prep_ray_ml(): ml_requirements_files = [ "python/requirements/ml/requirements_ml_docker.txt", "python/requirements/ml/requirements_dl.txt", - "python/requirements/ml/requirements_py36_compat.txt", "python/requirements/ml/requirements_tune.txt", "python/requirements/ml/requirements_rllib.txt", "python/requirements/ml/requirements_train.txt", "python/requirements/ml/requirements_upstream.txt", ] + # We don't need these in the ml docker image + ignore_requirements = [ + "python/requirements/ml/requirements_legacy_compat.txt", + "python/requirements/ml/requirements_py36_compat.txt", + ] files_on_disk = glob.glob(f"{root_dir}/python/**/requirements*.txt", recursive=True) for file_on_disk in files_on_disk: @@ -374,7 +378,7 @@ def prep_ray_ml(): print(rel) if not rel.startswith("python/requirements/ml"): continue - elif rel not in ml_requirements_files: + elif rel not in ml_requirements_files and rel not in ignore_requirements: raise RuntimeError( f"A new requirements file was found in the repository, but it has " f"not been added to `build-docker-images.py` " diff --git a/ci/docker/base.gpu.Dockerfile b/ci/docker/base.gpu.Dockerfile index 6d33fd2d9b25..b3f79557f831 100644 --- a/ci/docker/base.gpu.Dockerfile +++ b/ci/docker/base.gpu.Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu20.04 +FROM 
nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 ARG REMOTE_CACHE_URL ARG BUILDKITE_PULL_REQUEST diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh index e53ef3a608d5..25755e4758fb 100755 --- a/ci/env/install-dependencies.sh +++ b/ci/env/install-dependencies.sh @@ -413,11 +413,6 @@ install_pip_packages() { fi fi - # RLlib testing with TF 1.x. - if [ "${RLLIB_TESTING-}" = 1 ] && { [ -n "${TF_VERSION-}" ] || [ -n "${TFP_VERSION-}" ]; }; then - pip install --upgrade tensorflow-probability=="${TFP_VERSION}" tensorflow=="${TF_VERSION}" - fi - # Inject our own mirror for the CIFAR10 dataset if [ "${TRAIN_TESTING-}" = 1 ] || [ "${TUNE_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then SITE_PACKAGES=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') diff --git a/python/ray/tests/horovod/BUILD b/python/ray/tests/horovod/BUILD index 2a1a17e61a11..cacba4d5f40f 100644 --- a/python/ray/tests/horovod/BUILD +++ b/python/ray/tests/horovod/BUILD @@ -3,7 +3,7 @@ py_test( size = "medium", srcs = ["test_horovod.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "compat", "exclusive"] + tags = ["team:ml", "compat", "compat_py36", "exclusive"] ) diff --git a/python/ray/tests/lightgbm/BUILD b/python/ray/tests/lightgbm/BUILD index 04570b9aabd1..81124dc850e0 100644 --- a/python/ray/tests/lightgbm/BUILD +++ b/python/ray/tests/lightgbm/BUILD @@ -3,7 +3,7 @@ py_test( size = "small", srcs = ["simple_example.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "compat", "exclusive"], + tags = ["team:ml", "compat", "compat_py36", "exclusive"], ) py_test( @@ -11,7 +11,7 @@ py_test( size="small", srcs = ["simple_tune.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "compat", "exclusive"] + tags = ["team:ml", "compat", "compat_py36", "exclusive"] ) diff --git a/python/ray/tests/ml_py36_compat/BUILD b/python/ray/tests/ml_py36_compat/BUILD index ba32738a8071..d353d5ee2e90 100644 --- a/python/ray/tests/ml_py36_compat/BUILD +++ 
b/python/ray/tests/ml_py36_compat/BUILD @@ -3,7 +3,7 @@ py_test( size = "medium", srcs = ["tune_hvd_keras.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "compat", "exclusive"], + tags = ["team:ml", "compat", "compat_py36", "exclusive"], ) py_test( @@ -11,5 +11,5 @@ py_test( size = "medium", srcs = ["tune_hvd_torch.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "compat", "exclusive"], + tags = ["team:ml", "compat", "compat_py36", "exclusive"], ) diff --git a/python/ray/tests/ray_lightning/BUILD b/python/ray/tests/ray_lightning/BUILD index 618c3ab482ea..6b73fc9e1839 100644 --- a/python/ray/tests/ray_lightning/BUILD +++ b/python/ray/tests/ray_lightning/BUILD @@ -3,7 +3,7 @@ py_test( size = "small", srcs = ["simple_example.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "compat", "exclusive"], ) py_test( @@ -11,5 +11,5 @@ py_test( size="small", srcs = ["simple_tune.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "compat", "exclusive"] ) \ No newline at end of file diff --git a/python/ray/tests/xgboost/BUILD b/python/ray/tests/xgboost/BUILD index be39dec7738b..5a1db458e364 100644 --- a/python/ray/tests/xgboost/BUILD +++ b/python/ray/tests/xgboost/BUILD @@ -7,7 +7,7 @@ py_test( size = "small", srcs = ["simple_example.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "compat", "exclusive"], + tags = ["team:ml", "compat", "compat_py36", "exclusive"], ) py_test( @@ -15,7 +15,7 @@ py_test( size="small", srcs = ["simple_tune.py"], deps = ["//:ray_lib"], - tags = ["team:ml", "compat", "exclusive"] + tags = ["team:ml", "compat", "compat_py36", "exclusive"] ) diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index e87f47a66197..206cf53bcd51 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -679,7 +679,7 @@ py_test( py_test( name = "mnist_ptl_mini", - size = "small", + size = "medium", srcs = ["examples/mnist_ptl_mini.py"], deps = [":tune_lib"], tags = ["team:ml", "exclusive", 
"example", "pytorch"], diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py index 24e2655645c9..a1b1b5e8d829 100644 --- a/python/ray/tune/tests/test_sample.py +++ b/python/ray/tune/tests/test_sample.py @@ -921,8 +921,8 @@ def testConvertHEBO(self): self.assertEqual(config1, config2) self.assertIn(config1["a"], [2, 3, 4]) self.assertIn(config1["b"]["x"], list(range(5))) - self.assertLess(1e-4, config1["b"]["z"]) - self.assertLess(config1["b"]["z"], 1e-2) + self.assertLessEqual(1e-4, config1["b"]["z"]) + self.assertLessEqual(config1["b"]["z"], 1e-2) searcher = HEBOSearch(metric="a", mode="max", random_state_seed=123) analysis = tune.run( diff --git a/python/ray/tune/tests/test_tune_restore.py b/python/ray/tune/tests/test_tune_restore.py index 3a7baf7ae44d..333aca7f1d13 100644 --- a/python/ray/tune/tests/test_tune_restore.py +++ b/python/ray/tune/tests/test_tune_restore.py @@ -462,7 +462,7 @@ def tearDown(self): def testPBTKeras(self): from ray.tune.examples.pbt_tune_cifar10_with_keras import Cifar10Model - from tensorflow.python.keras.datasets import cifar10 + from tensorflow.keras.datasets import cifar10 cifar10.load_data() validate_save_restore(Cifar10Model) diff --git a/python/requirements/ml/requirements_dl.txt b/python/requirements/ml/requirements_dl.txt index 04d41897a8be..b62c2bb8ae2e 100644 --- a/python/requirements/ml/requirements_dl.txt +++ b/python/requirements/ml/requirements_dl.txt @@ -1,15 +1,15 @@ # These requirements are used for the CI and CPU-only Docker images so we install CPU only versions of torch. # For GPU Docker images, you should install requirements_ml_docker.txt afterwards. -tensorflow==2.6.2 -tensorflow-probability==0.14.1 +tensorflow==2.9.0 +tensorflow-probability==0.17.0 # If you make changes to the torch versions below, please also make the corresponding changes to `requirements_ml_docker.txt`! 
-torch==1.9.0;sys_platform=="darwin" -torchvision==0.10.0;sys_platform=="darwin" +torch==1.12.1;sys_platform=="darwin" +torchvision==0.13.1;sys_platform=="darwin" # On non-OSX machines only install CPU version of torch and torchvision -f https://download.pytorch.org/whl/torch_stable.html -torch==1.9.0+cpu;sys_platform!="darwin" -torchvision==0.10.0+cpu;sys_platform!="darwin" +torch==1.12.1+cpu;sys_platform!="darwin" +torchvision==0.13.1+cpu;sys_platform!="darwin" diff --git a/python/requirements/ml/requirements_legacy_compat.txt b/python/requirements/ml/requirements_legacy_compat.txt new file mode 100644 index 000000000000..90021db8f802 --- /dev/null +++ b/python/requirements/ml/requirements_legacy_compat.txt @@ -0,0 +1,31 @@ +# ATTENTION: THESE DEPENDENCIES SHOULD USUALLY NOT BE UPDATED! + +# Updating these dependencies means we remove official support for dependency releases older than +# the specified version. + +# These are compatibility requirements to make sure certain workflows continue to work +# with these dependency versions. They thus act as a lower bound for compatibility +# with ML libraries. +# Concretely, we set up a fresh Python 3.7 environment and +# run the pipeline job in `Ray Python legacy dependency ML compatibility tests` with these dependencies installed. 
+ +# ML libraries +torch==1.9.0 +tensorflow==2.7.0 +tensorflow-probability==0.14.1 +keras==2.7.0 + +# Torch addons +torchvision==0.10.0 + +pytorch-lightning==1.5.10 + +# Upstream libraries +lightgbm_ray==0.1.5 +xgboost_ray==0.1.10 +ray_lightning==0.2.0 + +# Datasets +pyarrow==6.0.1 + +ray[tune,data] diff --git a/python/requirements/ml/requirements_ml_docker.txt b/python/requirements/ml/requirements_ml_docker.txt index c38150ad56c2..1f11e096ccd4 100644 --- a/python/requirements/ml/requirements_ml_docker.txt +++ b/python/requirements/ml/requirements_ml_docker.txt @@ -5,12 +5,12 @@ tblib # If you make changes to the torch versions, please also make the corresponding changes to `requirements_dl.txt`! -f https://download.pytorch.org/whl/torch_stable.html -torch==1.9.0+cu111 -torchvision==0.10.0+cu111 +torch==1.12.1+cu116 +torchvision==0.13.1+cu116 --f https://data.pyg.org/whl/torch-1.9.0+cu111.html +-f https://data.pyg.org/whl/torch-1.12.1+cu116.html torch-scatter==2.0.9 -torch-sparse==0.6.12 +torch-sparse==0.6.15 # torch-geometric has to be installed after torch-scatter and torch-sparse. torch-geometric==2.0.3; python_version < '3.7' diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 66491196841a..278cddf73d9f 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -22,7 +22,7 @@ supersuit==3.3.3; python_version >= '3.7' pybullet==3.2.0 # For tests on RecSim and Kaggle envs. 
recsim==0.2.4 -tensorflow_estimator==2.6.0 +tensorflow_estimator==2.9.0 # DeepMind's OpenSpiel open-spiel==1.0.2 @@ -34,8 +34,8 @@ higher==0.2.1 pyglet==1.5.15 imageio-ffmpeg==0.4.5 # ONNX -onnx==1.9.0 -onnxruntime==1.9.0 -tf2onnx==1.8.5 +onnx==1.12.0 +onnxruntime==1.12.0 +tf2onnx==1.12.1 typer==0.6.1 rich==12.0.1 diff --git a/python/requirements/ml/requirements_tune.txt b/python/requirements/ml/requirements_tune.txt index b2e4e0c19f23..b4b3362fb0c7 100644 --- a/python/requirements/ml/requirements_tune.txt +++ b/python/requirements/ml/requirements_tune.txt @@ -28,7 +28,7 @@ pytest-remotedata==0.3.2 lightning-bolts==0.4.0 pytorch-lightning==1.5.10 shortuuid==1.0.1 -scikit-optimize==0.8.1 +scikit-optimize==0.9.0 sigopt==7.5.0 timm==0.4.5 transformers==4.18.0; python_version <= '3.6' diff --git a/rllib/BUILD b/rllib/BUILD index 58c1c988d199..6e6a09b9be13 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -84,7 +84,7 @@ py_test( name = "learning_tests_cartpole_a2c_microbatch", main = "tests/run_regression_tests.py", tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"], - size = "medium", + size = "large", srcs = ["tests/run_regression_tests.py"], data = ["tuned_examples/a2c/cartpole-a2c-microbatch.yaml"], args = ["--yaml-dir=tuned_examples/a2c"] diff --git a/rllib/algorithms/a3c/tests/test_a3c.py b/rllib/algorithms/a3c/tests/test_a3c.py index 49fdff3327af..1d9aa5402581 100644 --- a/rllib/algorithms/a3c/tests/test_a3c.py +++ b/rllib/algorithms/a3c/tests/test_a3c.py @@ -27,7 +27,7 @@ def test_a3c_compilation(self): num_iterations = 2 # Test against all frameworks. 
- for _ in framework_iterator(config, with_eager_tracing=True): + for _ in framework_iterator(config, with_eager_tracing=False): for env in ["CartPole-v1", "Pendulum-v1", "PongDeterministic-v0"]: print("env={}".format(env)) config.model["use_lstm"] = env == "CartPole-v1" diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index 996d9708a3ca..a4e6923af518 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -55,6 +55,7 @@ "is particularly useful for timed tests." ), ) + # Obsoleted arg, use --framework=torch instead. parser.add_argument( "--torch", action="store_true", help="Runs all tests with PyTorch enabled." @@ -108,7 +109,13 @@ continue # Always run with eager-tracing when framework=tf2 if not in local-mode. - if args.framework == "tf2" and not args.local_mode: + # Ignore this if the yaml explicitly tells us to disable eager tracing + if ( + args.framework == "tf2" + and not args.local_mode + and not exp["config"].get("eager_tracing") is False + ): + exp["config"]["eager_tracing"] = True # Print out the actual config. diff --git a/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml b/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml index c72ea3d749a1..ac6b6ef688af 100644 --- a/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml +++ b/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml @@ -12,3 +12,5 @@ cartpole-a2c-microbatch: rollout_fragment_length: 20 microbatch_size: 40 train_batch_size: 120 + # When using tf>=2.8, eager tracing can not be used + eager_tracing: False