diff --git a/.github/workflows/nightly_release_testing.yaml b/.github/workflows/nightly_release_testing.yaml index 9fa0f8868..2f35fe692 100644 --- a/.github/workflows/nightly_release_testing.yaml +++ b/.github/workflows/nightly_release_testing.yaml @@ -66,6 +66,7 @@ jobs: DEN_TESTER_TOKEN: ${{ secrets.DEN_TESTER_PROD_TOKEN }} DEN_TESTER_USERNAME: ${{ secrets.DEN_TESTER_USERNAME }} API_SERVER_URL: ${{ env.API_SERVER_URL }} + EKS_ARN: ${{ secrets.EKS_ARN }} - name: Run cluster and not on-demand tests env: @@ -104,6 +105,7 @@ jobs: DEN_TESTER_TOKEN: ${{ secrets.DEN_TESTER_PROD_TOKEN }} DEN_TESTER_USERNAME: ${{ secrets.DEN_TESTER_USERNAME }} API_SERVER_URL: ${{ env.API_SERVER_URL }} + EKS_ARN: ${{ secrets.EKS_ARN }} - name: Run on-demand aws tests env: @@ -145,6 +147,7 @@ jobs: DEN_TESTER_TOKEN: ${{ secrets.DEN_TESTER_PROD_TOKEN }} DEN_TESTER_USERNAME: ${{ secrets.DEN_TESTER_USERNAME }} API_SERVER_URL: ${{ env.API_SERVER_URL }} + EKS_ARN: ${{ secrets.EKS_ARN }} - name: Run on-demand aws tests env: @@ -184,6 +187,7 @@ jobs: DEN_TESTER_TOKEN: ${{ secrets.DEN_TESTER_PROD_TOKEN }} DEN_TESTER_USERNAME: ${{ secrets.DEN_TESTER_USERNAME }} API_SERVER_URL: ${{ env.API_SERVER_URL }} + EKS_ARN: ${{ secrets.EKS_ARN }} - name: Run on-demand gcp tests env: @@ -222,6 +226,7 @@ jobs: DEN_TESTER_TOKEN: ${{ secrets.DEN_TESTER_PROD_TOKEN }} DEN_TESTER_USERNAME: ${{ secrets.DEN_TESTER_USERNAME }} API_SERVER_URL: ${{ env.API_SERVER_URL }} + EKS_ARN: ${{ secrets.EKS_ARN }} - name: Run kubernetes tests env: @@ -268,6 +273,7 @@ jobs: DEN_TESTER_TOKEN: ${{ secrets.DEN_TESTER_PROD_TOKEN }} DEN_TESTER_USERNAME: ${{ secrets.DEN_TESTER_USERNAME }} API_SERVER_URL: ${{ env.API_SERVER_URL }} + EKS_ARN: ${{ secrets.EKS_ARN }} - name: Wait to check cluster status run: sleep 600 # 10 minutes diff --git a/tests/conftest.py b/tests/conftest.py index 337228f2d..c1e7acab3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -235,6 +235,10 @@ def event_loop(): from tests.fixtures.on_demand_cluster_fixtures import ( a10g_gpu_cluster, # noqa: F401 + den_launched_ondemand_aws_docker_cluster, # noqa: F401 + den_launched_ondemand_aws_k8s_cluster, # noqa: F401 + den_launched_ondemand_gcp_k8s_cluster, # noqa: F401 + den_launcher_v100_gpu_cluster, # noqa: F401 k80_gpu_cluster, # noqa: F401 multinode_cpu_docker_conda_cluster, # noqa: F401 multinode_gpu_cluster, # noqa: F401 diff --git a/tests/fixtures/on_demand_cluster_fixtures.py b/tests/fixtures/on_demand_cluster_fixtures.py index 33621641b..c0882d4a1 100644 --- a/tests/fixtures/on_demand_cluster_fixtures.py +++ b/tests/fixtures/on_demand_cluster_fixtures.py @@ -1,3 +1,4 @@ +import os from pathlib import Path import pytest @@ -5,6 +6,7 @@ import runhouse as rh from runhouse.constants import DEFAULT_HTTPS_PORT +from runhouse.resources.hardware.utils import LauncherType from runhouse.resources.images.image import Image from tests.conftest import init_args @@ -36,14 +38,30 @@ def setup_test_cluster(args, request, create_env=False): @pytest.fixture( params=[ "ondemand_aws_docker_cluster", + "den_launched_ondemand_aws_docker_cluster", "ondemand_gcp_cluster", "ondemand_k8s_cluster", "ondemand_k8s_docker_cluster", "v100_gpu_cluster", + "den_launcher_v100_gpu_cluster", "k80_gpu_cluster", "a10g_gpu_cluster", + "den_launched_ondemand_aws_k8s_cluster", + "den_launched_ondemand_gcp_k8s_cluster", + ], + ids=[ + "aws_cpu", + "aws_gpu_den_launcher", + "gcp_cpu", + "k8s_cpu", + "k8s_docker_cpu", + "v100", + "v100_den_launcher", + "k80", + "a10g", + "aws_k8_den_launcher", + "gcp_k8_den_launcher", ], - ids=["aws_cpu", "gcp_cpu", "k8s_cpu", "k8s_docker_cpu", "v100", "k80", "a10g"], ) def ondemand_cluster(request): return request.getfixturevalue(request.param) @@ -71,6 +89,25 @@ def ondemand_aws_docker_cluster(request): return cluster +@pytest.fixture(scope="session") +def den_launched_ondemand_aws_docker_cluster(request): + """ + Note: Also used to test docker and default env with alternate Ray version. + """ + args = { + "name": "aws-cpu-den", + "instance_type": "CPU:2+", + "provider": "aws", + "image_id": "docker:rayproject/ray:latest-py311-cpu", + "region": "us-east-2", + "image": Image(name="default_image").install_packages(["ray==2.30.0"]), + "sky_kwargs": {"launch": {"retry_until_up": True}}, + "launcher": LauncherType.DEN, + } + cluster = setup_test_cluster(args, request, create_env=True) + return cluster + + @pytest.fixture(scope="session") def ondemand_aws_https_cluster_with_auth(request, test_rns_folder): args = { @@ -129,6 +166,42 @@ def ondemand_k8s_cluster(request): return cluster +@pytest.fixture(scope="session") +def den_launched_ondemand_aws_k8s_cluster(request): + kube_config_path = Path.home() / ".kube" / "config" + + if not kube_config_path.exists(): + pytest.skip("no kubeconfig found") + + args = { + "name": "k8s-cpu-den", + "provider": "kubernetes", + "instance_type": "CPU:1", + "launcher": LauncherType.DEN, + "context": os.getenv("EKS_ARN"), + } + cluster = setup_test_cluster(args, request) + return cluster + + +@pytest.fixture(scope="session") +def den_launched_ondemand_gcp_k8s_cluster(request): + kube_config_path = Path.home() / ".kube" / "config" + + if not kube_config_path.exists(): + pytest.skip("no kubeconfig found") + + args = { + "name": "k8s-cpu-den", + "provider": "kubernetes", + "instance_type": "CPU:1", + "launcher": LauncherType.DEN, + "context": "gke_testing", + } + cluster = setup_test_cluster(args, request) + return cluster + + @pytest.fixture(scope="session") def ondemand_k8s_docker_cluster(request): kube_config_path = Path.home() / ".kube" / "config" @@ -140,7 +213,6 @@ def ondemand_k8s_docker_cluster(request): "name": "k8s-docker-cpu", "provider": "kubernetes", "instance_type": "CPU:1", - "memory": ".2", "image": Image(name="default_image").from_docker( "rayproject/ray:latest-py311-cpu" ), @@ -156,7 +228,19 @@ def v100_gpu_cluster(request): "instance_type": "V100:1", "provider": "aws", } - cluster = setup_test_cluster(args, request) + cluster = setup_test_cluster(args, request, create_env=True) + return cluster + + +@pytest.fixture(scope="session") +def den_launcher_v100_gpu_cluster(request): + args = { + "name": "rh-v100-den", + "instance_type": "V100:1", + "provider": "aws", + "launcher": LauncherType.DEN, + } + cluster = setup_test_cluster(args, request, create_env=True) return cluster diff --git a/tests/fixtures/static_cluster_fixtures.py b/tests/fixtures/static_cluster_fixtures.py index 55494d927..fef9fd3d6 100644 --- a/tests/fixtures/static_cluster_fixtures.py +++ b/tests/fixtures/static_cluster_fixtures.py @@ -20,8 +20,9 @@ def setup_static_cluster( compute_type: computeType = computeType.cpu, ): instance_type = "CPU:4" if compute_type == computeType.cpu else "g5.xlarge" + launcher = launcher if launcher else LauncherType.LOCAL cluster = rh.cluster( - f"aws-{compute_type}-password", + f"{launcher}-aws-{compute_type}-password", instance_type=instance_type, provider="aws", launcher=launcher, diff --git a/tests/test_resources/test_clusters/test_cluster.py b/tests/test_resources/test_clusters/test_cluster.py index 2ff4b8e57..c98054c71 100644 --- a/tests/test_resources/test_clusters/test_cluster.py +++ b/tests/test_resources/test_clusters/test_cluster.py @@ -44,7 +44,6 @@ set_output_env_vars, ) - """ TODO: 1) In subclasses, test factory methods create same type as parent 2) In subclasses, use monkeypatching to make sure `up()` is called for various methods if the server is not up @@ -131,9 +130,7 @@ class TestCluster(tests.test_resources.test_resource.TestResource): } MINIMAL = {"cluster": ["static_cpu_pwd_cluster"]} RELEASE = { - "cluster": [ - "static_cpu_pwd_cluster", - ] + "cluster": ["static_cpu_pwd_cluster", "static_cpu_pwd_cluster_den_launcher"] } MAXIMAL = { "cluster": [ @@ -142,6 +139,7 @@ class TestCluster(tests.test_resources.test_resource.TestResource): "docker_cluster_pwd_ssh_no_auth", "static_cpu_pwd_cluster", "multinode_cpu_docker_conda_cluster", + "static_gpu_pwd_cluster_den_launcher", ] } diff --git a/tests/test_resources/test_clusters/test_on_demand_cluster.py b/tests/test_resources/test_clusters/test_on_demand_cluster.py index 5c6372a54..d7e628999 100644 --- a/tests/test_resources/test_clusters/test_on_demand_cluster.py +++ b/tests/test_resources/test_clusters/test_on_demand_cluster.py @@ -87,10 +87,13 @@ class TestOnDemandCluster(tests.test_resources.test_clusters.test_cluster.TestCl RELEASE = { "cluster": [ "ondemand_aws_docker_cluster", + "den_launched_ondemand_aws_docker_cluster", "ondemand_gcp_cluster", "ondemand_aws_https_cluster_with_auth", "ondemand_k8s_cluster", "ondemand_k8s_docker_cluster", + "den_launched_ondemand_aws_k8s_cluster", + "den_launched_ondemand_gcp_k8s_cluster", ] } MAXIMAL = { @@ -101,6 +104,7 @@ class TestOnDemandCluster(tests.test_resources.test_clusters.test_cluster.TestCl "ondemand_k8s_docker_cluster", "ondemand_aws_https_cluster_with_auth", "v100_gpu_cluster", + "den_launcher_v100_gpu_cluster", "k80_gpu_cluster", "a10g_gpu_cluster", "static_cpu_pwd_cluster",