diff --git a/.github/workflows/nightly_release_testing.yaml b/.github/workflows/nightly_release_testing.yaml
index 2f35fe692..834d10964 100644
--- a/.github/workflows/nightly_release_testing.yaml
+++ b/.github/workflows/nightly_release_testing.yaml
@@ -38,13 +38,6 @@ jobs:
         run: pytest --level release tests -k "not cluster" --detached
         timeout-minutes: 60
 
-      - name: Teardown all clusters
-        if: always()
-        run: |
-          sky status
-          sky down --all -y
-          sky status
-
   cluster-tests:
     runs-on: ubuntu-latest
     permissions:
@@ -74,16 +67,9 @@
           KITCHEN_TESTER_USERNAME: ${{ secrets.KITCHEN_TESTER_USERNAME }}
           ORG_MEMBER_TOKEN: ${{ secrets.ORG_MEMBER_PROD_TOKEN }}
           ORG_MEMBER_USERNAME: ${{ secrets.ORG_MEMBER_USERNAME }}
-        run: pytest --level release tests -k "clustertest and not ondemand" --detached
+        run: pytest --level release tests -k "cluster and not ondemand and not TestMultiNodeCluster" --detached
         timeout-minutes: 60
 
-      - name: Teardown all cluster-tests clusters
-        if: always()
-        run: |
-          sky status
-          sky down --all -y
-          sky status
-
   ondemand-aws-tests:
     runs-on: ubuntu-latest
     permissions:
@@ -119,13 +105,6 @@
         run: pytest --level release tests -k "ondemand_aws_https_cluster_with_auth" --detached
         timeout-minutes: 60
 
-      - name: Teardown all ondemand-aws-tests clusters
-        if: always()
-        run: |
-          sky status
-          sky down --all -y
-          sky status
-
   ondemand-aws-multinode-tests:
     runs-on: ubuntu-latest
     permissions:
@@ -158,14 +137,6 @@
         run: pytest --level release tests -k "TestMultiNodeCluster" --detached
         timeout-minutes: 60
 
-      - name: Teardown all ondemand-aws-multinode clusters
-        if: always()
-        run: |
-          sky status
-          sky down --all -y
-          sky status
-
-
   ondemand-gcp-tests:
     runs-on: ubuntu-latest
     permissions:
@@ -198,13 +169,6 @@
         run: pytest --level release tests -k "ondemand_gcp_cluster" --detached
         timeout-minutes: 60
 
-      - name: Teardown all ondemand-gcp-tests clusters
-        if: always()
-        run: |
-          sky status
-          sky down --all -y
-          sky status
-
   kubernetes-tests:
     runs-on: ubuntu-latest
     permissions:
@@ -237,15 +201,9 @@
         run: pytest --level release tests -k "ondemand_k8s_cluster" --detached
         timeout-minutes: 60
 
-      - name: Teardown all kubernetes-tests clusters
-        if: always()
-        run: |
-          sky status
-          sky down --all -y
-          sky status
-
-  check-cluster-status:
+  teardown-clusters:
     if: always()
+    runs-on: ubuntu-latest
     needs:
       - not-cluster-tests
       - cluster-tests
@@ -253,6 +211,31 @@
       - ondemand-gcp-tests
       - kubernetes-tests
       - ondemand-aws-multinode-tests
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v3
+
+      - name: Setup Release Testing
+        uses: ./.github/workflows/setup_release_testing
+        with:
+          KUBECONFIG: ${{ secrets.KUBECONFIG }}
+          AWS_OSS_ROLE_ARN: ${{ secrets.AWS_OSS_ROLE_ARN }}
+          DEV_AWS_ACCESS_KEY: ${{ secrets.DEV_AWS_ACCESS_KEY }}
+          DEV_AWS_SECRET_KEY: ${{ secrets.DEV_AWS_SECRET_KEY }}
+          GCP_SERVICE_ACCOUNT_KEY: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}
+          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
+          DEN_TESTER_TOKEN: ${{ secrets.DEN_TESTER_PROD_TOKEN }}
+          DEN_TESTER_USERNAME: ${{ secrets.DEN_TESTER_USERNAME }}
+          API_SERVER_URL: ${{ env.API_SERVER_URL }}
+          EKS_ARN: ${{ secrets.EKS_ARN }}
+
+      - name: Teardown clusters
+        run: runhouse cluster down -a -y
+
+  check-cluster-status:
+    if: always()
+    needs:
+      - teardown-clusters
     runs-on: ubuntu-latest
     permissions:
       id-token: write
diff --git a/.github/workflows/setup_release_testing/action.yaml b/.github/workflows/setup_release_testing/action.yaml
index 72c339f0c..9e4664a3a 100644
--- a/.github/workflows/setup_release_testing/action.yaml
+++ b/.github/workflows/setup_release_testing/action.yaml
@@ -75,7 +75,7 @@ runs:
 
     - name: Install python packages & dependencies
       run: |
-        pip install runhouse[aws,gcp,kubernetes]
+        pip install git+https://github.com/run-house/runhouse.git@temp-launcher-testing#egg=runhouse[aws,gcp,kubernetes]
         pip install -r tests/requirements.txt
       shell: bash
 
diff --git a/tests/fixtures/on_demand_cluster_fixtures.py b/tests/fixtures/on_demand_cluster_fixtures.py
index c0882d4a1..a7b6639e5 100644
--- a/tests/fixtures/on_demand_cluster_fixtures.py
+++ b/tests/fixtures/on_demand_cluster_fixtures.py
@@ -90,12 +90,12 @@ def ondemand_aws_docker_cluster(request):
 
 
 @pytest.fixture(scope="session")
-def den_launched_ondemand_aws_docker_cluster(request):
+def den_launched_ondemand_aws_docker_cluster(request, test_rns_folder):
     """
     Note: Also used to test docker and default env with alternate Ray version.
     """
     args = {
-        "name": "aws-cpu-den",
+        "name": f"{test_rns_folder}-aws-cpu-den",
         "instance_type": "CPU:2+",
         "provider": "aws",
         "image_id": "docker:rayproject/ray:latest-py311-cpu",
@@ -167,14 +167,14 @@ def ondemand_k8s_cluster(request):
 
 
 @pytest.fixture(scope="session")
-def den_launched_ondemand_aws_k8s_cluster(request):
+def den_launched_ondemand_aws_k8s_cluster(request, test_rns_folder):
     kube_config_path = Path.home() / ".kube" / "config"
 
     if not kube_config_path.exists():
         pytest.skip("no kubeconfig found")
 
     args = {
-        "name": "k8s-cpu-den",
+        "name": f"{test_rns_folder}-k8s-cpu-den",
         "provider": "kubernetes",
         "instance_type": "CPU:1",
         "launcher": LauncherType.DEN,
@@ -185,14 +185,14 @@ def den_launched_ondemand_aws_k8s_cluster(request):
 
 
 @pytest.fixture(scope="session")
-def den_launched_ondemand_gcp_k8s_cluster(request):
+def den_launched_ondemand_gcp_k8s_cluster(request, test_rns_folder):
     kube_config_path = Path.home() / ".kube" / "config"
 
     if not kube_config_path.exists():
         pytest.skip("no kubeconfig found")
 
     args = {
-        "name": "k8s-cpu-den",
+        "name": f"{test_rns_folder}-k8s-cpu-den",
         "provider": "kubernetes",
         "instance_type": "CPU:1",
         "launcher": LauncherType.DEN,
@@ -233,9 +233,9 @@ def v100_gpu_cluster(request):
 
 
 @pytest.fixture(scope="session")
-def den_launcher_v100_gpu_cluster(request):
+def den_launcher_v100_gpu_cluster(request, test_rns_folder):
     args = {
-        "name": "rh-v100-den",
+        "name": f"{test_rns_folder}-rh-v100-den",
         "instance_type": "V100:1",
         "provider": "aws",
         "launcher": LauncherType.DEN,
diff --git a/tests/fixtures/static_cluster_fixtures.py b/tests/fixtures/static_cluster_fixtures.py
index fef9fd3d6..361975b5d 100644
--- a/tests/fixtures/static_cluster_fixtures.py
+++ b/tests/fixtures/static_cluster_fixtures.py
@@ -7,6 +7,7 @@
 from runhouse.resources.hardware.utils import LauncherType
 
 from tests.conftest import init_args
+from tests.fixtures.resource_fixtures import create_folder_path
 from tests.utils import test_env
 
 
@@ -21,8 +22,9 @@ def setup_static_cluster(
 ):
     instance_type = "CPU:4" if compute_type == computeType.cpu else "g5.xlarge"
     launcher = launcher if launcher else LauncherType.LOCAL
+    cluster_name = f"{create_folder_path()}-{launcher}-aws-{compute_type}-password"
     cluster = rh.cluster(
-        f"{launcher}-aws-{compute_type}-password",
+        name=cluster_name,
         instance_type=instance_type,
         provider="aws",
         launcher=launcher,
@@ -53,7 +55,7 @@ def setup_static_cluster(
         "ssh_private_key": "~/.ssh/sky-key",
         "password": "cluster-pass",
     }
-    args = dict(name="static-cpu-password", host=[cluster.head_ip], ssh_creds=ssh_creds)
+    args = dict(name=cluster_name, host=[cluster.head_ip], ssh_creds=ssh_creds)
     c = rh.cluster(**args).save()
     c.restart_server(resync_rh=True)
     init_args[id(c)] = args