From e70032dd6e4c65608f83f64978f5077f307b0edc Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 19:22:19 -0700 Subject: [PATCH 01/16] add oom-prevention pipeline --- .github/workflows/tests.yaml | 1396 +++++++++-------- apps/oom-prevention/.gitignore | 2 + apps/oom-prevention/Dockerfile | 9 + apps/oom-prevention/requirements.txt | 3 + apps/oom-prevention/run.py | 103 ++ .../docker-compose-memory-constrained.yml | 107 ++ oom-prevention.sh | 39 + 7 files changed, 971 insertions(+), 688 deletions(-) create mode 100644 apps/oom-prevention/.gitignore create mode 100644 apps/oom-prevention/Dockerfile create mode 100644 apps/oom-prevention/requirements.txt create mode 100644 apps/oom-prevention/run.py create mode 100644 apps/weaviate/docker-compose-memory-constrained.yml create mode 100755 oom-prevention.sh diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index dc324465..23546f24 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -1,7 +1,7 @@ name: Chaos tests env: - WEAVIATE_VERSION: preview-prepare-release-v1-24-0-rc-0-027c42c + WEAVIATE_VERSION: preview-don-t-merge-wip-memory-investigations-eabedd0 MINIMUM_WEAVIATE_VERSION: 1.15.0 # this is used as the start in the upgrade journey test on: workflow_call: @@ -28,692 +28,712 @@ jobs: steps: - uses: actions/checkout@v2 - uses: psf/black@stable - ann-benchmarks-sift-aws: - name: "[bench AWS] SIFT1M pq=false" - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} - AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} - DATASET: sift-128-euclidean - DISTANCE: l2-squared - REQUIRED_RECALL: 0.999 - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - id: 'gcs_auth' - name: 'Authenticate to Google Cloud' - uses: 'google-github-actions/auth@v1' - with: - credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: Run chaos test - if: always() - run: ./ann_benchmark_aws.sh - - id: 'upload-files' - uses: 'google-github-actions/upload-cloud-storage@v1' - with: - path: 'results' - destination: 'ann-pipelines/github-action-runs' - glob: '*.json' - ann-benchmarks-glove-aws: - name: "[bench AWS] Glove100 pq=false" - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} - AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} - DATASET: glove-100-angular - DISTANCE: cosine - REQUIRED_RECALL: 0.965 - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - id: 'gcs_auth' - name: 'Authenticate to Google Cloud' - uses: 'google-github-actions/auth@v1' - with: - credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: Run chaos test - if: always() - run: ./ann_benchmark_aws.sh - - id: 'upload-files' - uses: 'google-github-actions/upload-cloud-storage@v1' - with: - path: 'results' - destination: 'ann-pipelines/github-action-runs' - glob: '*.json' - ann-benchmarks-pq-sift-aws: - name: "[bench AWS] SIFT1M pq=true" - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} - AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} - DATASET: sift-128-euclidean - DISTANCE: l2-squared - REQUIRED_RECALL: 0.992 - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - id: 'gcs_auth' - name: 'Authenticate to Google Cloud' - uses: 'google-github-actions/auth@v1' - with: - credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: Run chaos test - if: always() - run: ./ann_benchmark_compression_aws.sh - - id: 'upload-files' - uses: 'google-github-actions/upload-cloud-storage@v1' - with: - path: 'results' - destination: 'ann-pipelines/github-action-runs' - glob: '*.json' - ann-benchmarks-pq-glove-aws: - name: "[bench AWS] Glove100 pq=true" - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} - AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} - DATASET: glove-100-angular - DISTANCE: cosine - REQUIRED_RECALL: 0.89 - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - id: 'gcs_auth' - name: 'Authenticate to Google Cloud' - uses: 'google-github-actions/auth@v1' - with: - credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: Run chaos test - if: always() - run: ./ann_benchmark_compression_aws.sh - - id: 'upload-files' - uses: 'google-github-actions/upload-cloud-storage@v1' - with: - path: 'results' - destination: 'ann-pipelines/github-action-runs' - glob: '*.json' - ann-benchmarks-sift-gcp: - name: "[bench GCP] SIFT1M pq=false" - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - DATASET: sift-128-euclidean - DISTANCE: l2-squared - REQUIRED_RECALL: 0.999 - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - id: 'gcs_auth' - name: 'Authenticate to Google Cloud' - uses: 'google-github-actions/auth@v1' - with: - credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: Run chaos test - if: always() - run: ./ann_benchmark_gcp.sh - - id: 'upload-files' - uses: 'google-github-actions/upload-cloud-storage@v1' - with: - path: 'results' - destination: 'ann-pipelines/github-action-runs' - glob: '*.json' - ann-benchmarks-pq-sift-gcp: - name: "[bench GCP] SIFT1M pq=true" - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - DATASET: sift-128-euclidean - DISTANCE: l2-squared - REQUIRED_RECALL: 0.992 - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - id: 'gcs_auth' - name: 'Authenticate to Google Cloud' - uses: 'google-github-actions/auth@v1' - with: - credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: Run chaos test - if: always() - run: ./ann_benchmark_compression_gcp.sh - - id: 'upload-files' - uses: 'google-github-actions/upload-cloud-storage@v1' - with: - path: 'results' - destination: 'ann-pipelines/github-action-runs' - glob: '*.json' - batch-import-many-classes: - name: One class reveices long and expensive batches, user tries to create and delete 100s of classes in parallel - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./batch_import_many_classes.sh - upgrade-journey: - name: Rolling updates in multi-node setup from min to target version - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Set up Go - uses: actions/setup-go@v3 - with: - go-version: '1.20' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./upgrade_journey.sh - replicated-imports-with-choas-killing: - name: Replicated imports with chaos killing - runs-on: ubuntu-latest-8-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./replication_importing_while_crashing.sh - replicated-imports-with-backup: - name: Replicated imports with backup loop - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./replication_importing_with_backup.sh - replication-tunable-consistency: - name: Replication tunable consistency - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./replication_tunable_consistency.sh - counting-while-compacting: - name: Counting while compacting - runs-on: ubuntu-latest-8-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./counting_while_compacting.sh - segfault-on-batch-ref: - name: Segfault on batch ref - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./segfault_batch_ref.sh - import-with-kills: - name: Import during constant kills/crashes - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./import_while_crashing.sh - heave-imports-crashing: - name: Heavy object store imports while crashing - runs-on: ubuntu-latest-8-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./import_while_crashing_no_vector.sh - segfault-filtered-search: - name: Segfault on filtered vector search (race with hash bucket compaction) - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./segfault_filtered_vector_search.sh - backup-restore-crud: - name: Backup & Restore CRUD - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./backup_and_restore_crud.sh - backup-restore-crud-multi-node: - name: Backup & Restore multi node CRUD - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./backup_and_restore_multi_node_crud.sh - backup-restore-version-compat: - name: Backup & Restore version compatibility - runs-on: ubuntu-latest-8-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./backup_and_restore_version_compatibility.sh - compare-recall: - name: Compare Recall after import to after restart - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./compare_recall_after_restart.sh - concurrent-read-write: - name: Concurrent inverted index read/write - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./concurrent_inverted_index_read_write.sh - consecutive-create-update: - name: Consecutive create and update operations - runs-on: ubuntu-latest - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./consecutive_create_and_update_operations.sh - batch-insert-mismatch: - name: Batch insert mismatch - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./consecutive_create_and_update_operations.sh - rest-patch-restart: - name: REST PATCH requests stop working after restart - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./rest_patch_stops_working_after_restart.sh - delete-recreate-updates: - name: Delete and recreate class with frequent updates - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./delete_and_recreate_class.sh - geo-crash: - name: Preventing crashing of geo properties during HNSW maintenance cycles - runs-on: ubuntu-latest-8-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./geo_crash.sh - compaction-roaringset: - name: Preventing panic on compaction of roaringsets - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./compaction_roaringset.sh - multi-node-references: - name: Large batches with many cross-refs on a multi-node cluster - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./multi_node_ref_imports.sh - multi-tenancy-concurrent-imports: - name: Concurrent clients importing into multi-node cluster - runs-on: ubuntu-latest-8-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./multi_tenancy_concurrent_importing.sh - multi_tenancy_activate_deactivate: - name: Activate and deactivate tenants' shards - runs-on: ubuntu-latest-4-cores - timeout-minutes: 60 - env: - PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} - steps: - - uses: actions/checkout@v3 - # - name: Polar Signals Continuous Profiling - # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 - # with: - # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} - # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' - - name: Login to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{secrets.DOCKER_USERNAME}} - password: ${{secrets.DOCKER_PASSWORD}} - - name: Run chaos test - run: ./multi_tenancy_activate_deactivate.sh - goroutine_leak_class_delete: - name: Check for degraded performance from goroutine leak on class delete + # ann-benchmarks-sift-aws: + # name: "[bench AWS] SIFT1M pq=false" + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} + # AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} + # DATASET: sift-128-euclidean + # DISTANCE: l2-squared + # REQUIRED_RECALL: 0.999 + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - id: 'gcs_auth' + # name: 'Authenticate to Google Cloud' + # uses: 'google-github-actions/auth@v1' + # with: + # credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} + # - name: 'Set up Cloud SDK' + # uses: 'google-github-actions/setup-gcloud@v1' + # - name: Run chaos test + # if: always() + # run: ./ann_benchmark_aws.sh + # - id: 'upload-files' + # uses: 'google-github-actions/upload-cloud-storage@v1' + # with: + # path: 'results' + # destination: 'ann-pipelines/github-action-runs' + # glob: '*.json' + # ann-benchmarks-glove-aws: + # name: "[bench AWS] Glove100 pq=false" + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} + # AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} + # DATASET: glove-100-angular + # DISTANCE: cosine + # REQUIRED_RECALL: 0.965 + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - id: 'gcs_auth' + # name: 'Authenticate to Google Cloud' + # uses: 'google-github-actions/auth@v1' + # with: + # credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} + # - name: 'Set up Cloud SDK' + # uses: 'google-github-actions/setup-gcloud@v1' + # - name: Run chaos test + # if: always() + # run: ./ann_benchmark_aws.sh + # - id: 'upload-files' + # uses: 'google-github-actions/upload-cloud-storage@v1' + # with: + # path: 'results' + # destination: 'ann-pipelines/github-action-runs' + # glob: '*.json' + # ann-benchmarks-pq-sift-aws: + # name: "[bench AWS] SIFT1M pq=true" + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} + # AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} + # DATASET: sift-128-euclidean + # DISTANCE: l2-squared + # REQUIRED_RECALL: 0.992 + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - id: 'gcs_auth' + # name: 'Authenticate to Google Cloud' + # uses: 'google-github-actions/auth@v1' + # with: + # credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} + # - name: 'Set up Cloud SDK' + # uses: 'google-github-actions/setup-gcloud@v1' + # - name: Run chaos test + # if: always() + # run: ./ann_benchmark_compression_aws.sh + # - id: 'upload-files' + # uses: 'google-github-actions/upload-cloud-storage@v1' + # with: + # path: 'results' + # destination: 'ann-pipelines/github-action-runs' + # glob: '*.json' + # ann-benchmarks-pq-glove-aws: + # name: "[bench AWS] Glove100 pq=true" + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY}} + # AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} + # DATASET: glove-100-angular + # DISTANCE: cosine + # REQUIRED_RECALL: 0.89 + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - id: 'gcs_auth' + # name: 'Authenticate to Google Cloud' + # uses: 'google-github-actions/auth@v1' + # with: + # credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} + # - name: 'Set up Cloud SDK' + # uses: 'google-github-actions/setup-gcloud@v1' + # - name: Run chaos test + # if: always() + # run: ./ann_benchmark_compression_aws.sh + # - id: 'upload-files' + # uses: 'google-github-actions/upload-cloud-storage@v1' + # with: + # path: 'results' + # destination: 'ann-pipelines/github-action-runs' + # glob: '*.json' + # ann-benchmarks-sift-gcp: + # name: "[bench GCP] SIFT1M pq=false" + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # DATASET: sift-128-euclidean + # DISTANCE: l2-squared + # REQUIRED_RECALL: 0.999 + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - id: 'gcs_auth' + # name: 'Authenticate to Google Cloud' + # uses: 'google-github-actions/auth@v1' + # with: + # credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} + # - name: 'Set up Cloud SDK' + # uses: 'google-github-actions/setup-gcloud@v1' + # - name: Run chaos test + # if: always() + # run: ./ann_benchmark_gcp.sh + # - id: 'upload-files' + # uses: 'google-github-actions/upload-cloud-storage@v1' + # with: + # path: 'results' + # destination: 'ann-pipelines/github-action-runs' + # glob: '*.json' + # ann-benchmarks-pq-sift-gcp: + # name: "[bench GCP] SIFT1M pq=true" + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # DATASET: sift-128-euclidean + # DISTANCE: l2-squared + # REQUIRED_RECALL: 0.992 + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - id: 'gcs_auth' + # name: 'Authenticate to Google Cloud' + # uses: 'google-github-actions/auth@v1' + # with: + # credentials_json: ${{secrets.GCP_SERVICE_ACCOUNT_BENCHMARKS}} + # - name: 'Set up Cloud SDK' + # uses: 'google-github-actions/setup-gcloud@v1' + # - name: Run chaos test + # if: always() + # run: ./ann_benchmark_compression_gcp.sh + # - id: 'upload-files' + # uses: 'google-github-actions/upload-cloud-storage@v1' + # with: + # path: 'results' + # destination: 'ann-pipelines/github-action-runs' + # glob: '*.json' + # batch-import-many-classes: + # name: One class reveices long and expensive batches, user tries to create and delete 100s of classes in parallel + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./batch_import_many_classes.sh + # upgrade-journey: + # name: Rolling updates in multi-node setup from min to target version + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Set up Go + # uses: actions/setup-go@v3 + # with: + # go-version: '1.20' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./upgrade_journey.sh + # replicated-imports-with-choas-killing: + # name: Replicated imports with chaos killing + # runs-on: ubuntu-latest-8-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./replication_importing_while_crashing.sh + # replicated-imports-with-backup: + # name: Replicated imports with backup loop + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./replication_importing_with_backup.sh + # replication-tunable-consistency: + # name: Replication tunable consistency + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./replication_tunable_consistency.sh + # counting-while-compacting: + # name: Counting while compacting + # runs-on: ubuntu-latest-8-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./counting_while_compacting.sh + # segfault-on-batch-ref: + # name: Segfault on batch ref + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./segfault_batch_ref.sh + # import-with-kills: + # name: Import during constant kills/crashes + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./import_while_crashing.sh + # heave-imports-crashing: + # name: Heavy object store imports while crashing + # runs-on: ubuntu-latest-8-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./import_while_crashing_no_vector.sh + # segfault-filtered-search: + # name: Segfault on filtered vector search (race with hash bucket compaction) + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./segfault_filtered_vector_search.sh + # backup-restore-crud: + # name: Backup & Restore CRUD + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./backup_and_restore_crud.sh + # backup-restore-crud-multi-node: + # name: Backup & Restore multi node CRUD + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./backup_and_restore_multi_node_crud.sh + # backup-restore-version-compat: + # name: Backup & Restore version compatibility + # runs-on: ubuntu-latest-8-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./backup_and_restore_version_compatibility.sh + # compare-recall: + # name: Compare Recall after import to after restart + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./compare_recall_after_restart.sh + # concurrent-read-write: + # name: Concurrent inverted index read/write + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./concurrent_inverted_index_read_write.sh + # consecutive-create-update: + # name: Consecutive create and update operations + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./consecutive_create_and_update_operations.sh + # batch-insert-mismatch: + # name: Batch insert mismatch + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./consecutive_create_and_update_operations.sh + # rest-patch-restart: + # name: REST PATCH requests stop working after restart + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./rest_patch_stops_working_after_restart.sh + # delete-recreate-updates: + # name: Delete and recreate class with frequent updates + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./delete_and_recreate_class.sh + # geo-crash: + # name: Preventing crashing of geo properties during HNSW maintenance cycles + # runs-on: ubuntu-latest-8-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./geo_crash.sh + # compaction-roaringset: + # name: Preventing panic on compaction of roaringsets + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./compaction_roaringset.sh + # multi-node-references: + # name: Large batches with many cross-refs on a multi-node cluster + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./multi_node_ref_imports.sh + # multi-tenancy-concurrent-imports: + # name: Concurrent clients importing into multi-node cluster + # runs-on: ubuntu-latest-8-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./multi_tenancy_concurrent_importing.sh + # multi_tenancy_activate_deactivate: + # name: Activate and deactivate tenants' shards + # runs-on: ubuntu-latest-4-cores + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./multi_tenancy_activate_deactivate.sh + # goroutine_leak_class_delete: + # name: Check for degraded performance from goroutine leak on class delete + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # env: + # PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} + # steps: + # - uses: actions/checkout@v3 + # # - name: Polar Signals Continuous Profiling + # # uses: polarsignals/gh-actions-ps-profiling@v0.0.1 + # # with: + # # polarsignals_cloud_token: ${{ secrets.POLARSIGNALS_TOKEN }} + # # labels: 'job=${{ github.job }};gh_run_id=${{ github.run_id }}' + # - name: Login to Docker Hub + # uses: docker/login-action@v2 + # with: + # username: ${{secrets.DOCKER_USERNAME}} + # password: ${{secrets.DOCKER_PASSWORD}} + # - name: Run chaos test + # run: ./goroutine_leak_on_class_delete.sh + oom-prevention: + name: Import & Delete into HNSW on memory-constrained cluster runs-on: ubuntu-latest timeout-minutes: 60 env: @@ -731,7 +751,7 @@ jobs: username: ${{secrets.DOCKER_USERNAME}} password: ${{secrets.DOCKER_PASSWORD}} - name: Run chaos test - run: ./goroutine_leak_on_class_delete.sh + run: ./oom-prevention.sh # Commented only because this chaos pipeline was able to interrupt save operation # just in the middle of it being performed and since Weaviate doesn't have a transaction # mechanism implemented then this was causing an error which is a different error then diff --git a/apps/oom-prevention/.gitignore b/apps/oom-prevention/.gitignore new file mode 100644 index 00000000..120d711a --- /dev/null +++ b/apps/oom-prevention/.gitignore @@ -0,0 +1,2 @@ +sphere.1M.jsonl +sphere.1M.jsonl.tar.gz diff --git a/apps/oom-prevention/Dockerfile b/apps/oom-prevention/Dockerfile new file mode 100644 index 00000000..4fe1723d --- /dev/null +++ b/apps/oom-prevention/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.10-slim-bullseye +RUN apt-get update && apt-get install -y curl unzip git + +WORKDIR /workdir + +COPY requirements.txt . +RUN pip3 install -r requirements.txt + +COPY run.py ./ diff --git a/apps/oom-prevention/requirements.txt b/apps/oom-prevention/requirements.txt new file mode 100644 index 00000000..e3a0d94e --- /dev/null +++ b/apps/oom-prevention/requirements.txt @@ -0,0 +1,3 @@ +numpy==1.22.2 +loguru==0.5.3 +weaviate-client>=4.5.1 diff --git a/apps/oom-prevention/run.py b/apps/oom-prevention/run.py new file mode 100644 index 00000000..ea16bbf8 --- /dev/null +++ b/apps/oom-prevention/run.py @@ -0,0 +1,103 @@ +import os +import tarfile +import requests +import weaviate +import weaviate.classes as wvc +import json +import time +from loguru import logger + +delete_threshold = 12000 # ever n objects run a delete cycle + + +def download_file_if_not_exists(filename, url): + """ + Checks if the specified file exists in the current working directory. + If not, downloads the file from the given URL using loguru for logging. + + Parameters: + - filename: The name of the file to check and download. + - url: The URL from which to download the file if it's not present. + """ + if not os.path.exists(filename): + logger.info(f"{filename} not found, downloading from {url}...") + response = requests.get(url, stream=True) + if response.status_code == 200: + with open(filename, "wb") as f: + f.write(response.content) + logger.info(f"Downloaded {filename} successfully.") + extract_tarball("sphere.1M.jsonl") + else: + logger.error(f"Failed to download {filename}. HTTP Status Code: {response.status_code}") + else: + logger.info(f"{filename} already exists in the current working directory.") + + +def extract_tarball(filepath, target_dir="."): + """ + Extracts a gzipped tarball to the specified target directory. + If target_dir is not specified, extracts to the current working directory. + + Parameters: + - filepath: The path to the gzipped tarball to be extracted. + - target_dir: The directory where the files will be extracted. + """ + try: + with tarfile.open(filepath, "r:gz") as tar: + tar.extractall(path=target_dir) + logger.info(f"Extracted {filepath} to {target_dir} successfully.") + except Exception as e: + logger.error(f"Failed to extract {filepath}. Error: {e}") + + +def import_dataset(client: weaviate.WeaviateClient, file_path: str): + client.collections.delete("SphereOOM") + col = client.collections.create( + "SphereOOM", + vector_index_config=wvc.config.Configure.VectorIndex.hnsw(cleanup_interval_seconds=30), + ) + + with open(file_path, "r") as file: + with col.batch.dynamic() as batch: + i = 0 + for line in file: + obj = json.loads(line) + batch.add_object( + properties={ + "title": obj["title"], + "raw": obj["raw"], + "i": i, + }, + vector=obj["vector"], + uuid=obj["id"], + ) + + if i % 1000 == 0: + err_count = batch.number_errors + logger.info(f"Progress {i}, control: objects={len(col)}, errors={err_count}") + + if i % delete_threshold == 0: + batch.flush() + upper_bound = (i / delete_threshold) * 1000 + start_time = time.time() + del_res = col.data.delete_many( + where=wvc.query.Filter.by_property("i").less_or_equal(upper_bound) + ) + took = time.time() - start_time + logger.info( + f"Successfully deleted {del_res.successful} out of {del_res.matches} in {took:.2f}s" + ) + + i += 1 + + +def main(): + download_file_if_not_exists( + "sphere.1M.jsonl", + "https://storage.googleapis.com/sphere-demo/sphere.1M.jsonl.tar.gz", + ) + client = weaviate.connect_to_local() + import_dataset(client, "sphere.1M.jsonl") + + +main() diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml new file mode 100644 index 00000000..907fd6a8 --- /dev/null +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -0,0 +1,107 @@ +--- +version: '3.9' +services: + weaviate-node-1: + init: true + command: + - --host + - 0.0.0.0 + - --port + - '8080' + - --scheme + - http + image: semitechnologies/weaviate:$WEAVIATE_VERSION + ports: + - 8080:8080 + - 6060:6060 + - 50051:50051 + restart: no + volumes: + - ./data-node-1:/var/lib/weaviate + environment: + LOG_LEVEL: 'debug' + QUERY_DEFAULTS_LIMIT: 25 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' + PERSISTENCE_DATA_PATH: '/var/lib/weaviate' + DEFAULT_VECTORIZER_MODULE: 'none' + CLUSTER_HOSTNAME: 'node1' + CLUSTER_GOSSIP_BIND_PORT: '7100' + CLUSTER_DATA_BIND_PORT: '7101' + PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' + REPLICATION_MINIMUM_FACTOR: 3 + deploy: + resources: + limits: + # used in a chaos pipeline that tries to get Weaviate + # OOM-killed + memory: 2048M + weaviate-node-2: + init: true + command: + - --host + - 0.0.0.0 + - --port + - '8080' + - --scheme + - http + image: semitechnologies/weaviate:$WEAVIATE_VERSION + ports: + - 8081:8080 + - 6061:6060 + - 50052:50051 + restart: no + volumes: + - ./data-node-2:/var/lib/weaviate + environment: + LOG_LEVEL: 'debug' + QUERY_DEFAULTS_LIMIT: 25 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' + PERSISTENCE_DATA_PATH: '/var/lib/weaviate' + DEFAULT_VECTORIZER_MODULE: 'none' + CLUSTER_HOSTNAME: 'node2' + CLUSTER_GOSSIP_BIND_PORT: '7100' + CLUSTER_DATA_BIND_PORT: '7101' + PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' + REPLICATION_MINIMUM_FACTOR: 3 + CLUSTER_JOIN: 'weaviate-node-1:7100' + deploy: + resources: + limits: + # used in a chaos pipeline that tries to get Weaviate + # OOM-killed + memory: 2048M + weaviate-node-3: + init: true + command: + - --host + - 0.0.0.0 + - --port + - '8080' + - --scheme + - http + image: semitechnologies/weaviate:$WEAVIATE_VERSION + ports: + - 8082:8080 + - 6062:6060 + - 50053:50051 + restart: no + volumes: + - ./data-node-3:/var/lib/weaviate + environment: + LOG_LEVEL: 'debug' + QUERY_DEFAULTS_LIMIT: 25 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' + PERSISTENCE_DATA_PATH: '/var/lib/weaviate' + DEFAULT_VECTORIZER_MODULE: 'none' + CLUSTER_HOSTNAME: 'node3' + CLUSTER_GOSSIP_BIND_PORT: '7100' + CLUSTER_DATA_BIND_PORT: '7101' + PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' + REPLICATION_MINIMUM_FACTOR: 3 + CLUSTER_JOIN: 'weaviate-node-1:7100' + deploy: + resources: + limits: + # used in a chaos pipeline that tries to get Weaviate + # OOM-killed + memory: 2048M diff --git a/oom-prevention.sh b/oom-prevention.sh new file mode 100755 index 00000000..2fe4962f --- /dev/null +++ b/oom-prevention.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +set -e + +function wait_weaviate() { + echo "Wait for Weaviate to be ready" + for _ in {1..120}; do + if curl -sf -o /dev/null localhost:$1; then + echo "Weaviate is ready" + break + fi + + echo "Weaviate is not ready on $1, trying again in 1s" + sleep 1 + done +} + +echo "Building all required containers" +( cd apps/oom-prevention/ && docker build -t importer . ) + +echo "Starting Weaviate..." +docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml up -d weaviate-node-1 +wait_weaviate 8080 +docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml up -d weaviate-node-2 +wait_weaviate 8081 +docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml up -d weaviate-node-3 +wait_weaviate 8082 + +echo "Run import script in foreground..." +if ! docker run \ + -e 'ORIGIN=http://localhost:8080' \ + --network host \ + -t importer python3 run.py; then + echo "Importer failed, printing latest Weaviate logs..." + docker-compose -f apps/weaviate/docker-compose.yml logs weaviate + exit 1 +fi + +echo "Passed!" From a87c30163edbb322641849abdd12a9e561e613e8 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 19:26:29 -0700 Subject: [PATCH 02/16] fix docker compose string --- apps/weaviate/docker-compose-memory-constrained.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml index 907fd6a8..efa4fcae 100644 --- a/apps/weaviate/docker-compose-memory-constrained.yml +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -15,7 +15,7 @@ services: - 8080:8080 - 6060:6060 - 50051:50051 - restart: no + restart: 'no' volumes: - ./data-node-1:/var/lib/weaviate environment: @@ -49,7 +49,7 @@ services: - 8081:8080 - 6061:6060 - 50052:50051 - restart: no + restart: 'no' volumes: - ./data-node-2:/var/lib/weaviate environment: @@ -84,7 +84,7 @@ services: - 8082:8080 - 6062:6060 - 50053:50051 - restart: no + restart: 'no' volumes: - ./data-node-3:/var/lib/weaviate environment: From f4107fa131bd9578fff8c4791bc275dffa9a6092 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 19:31:37 -0700 Subject: [PATCH 03/16] fix filepath for tarball --- apps/oom-prevention/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/oom-prevention/run.py b/apps/oom-prevention/run.py index ea16bbf8..41ab6819 100644 --- a/apps/oom-prevention/run.py +++ b/apps/oom-prevention/run.py @@ -26,7 +26,7 @@ def download_file_if_not_exists(filename, url): with open(filename, "wb") as f: f.write(response.content) logger.info(f"Downloaded {filename} successfully.") - extract_tarball("sphere.1M.jsonl") + extract_tarball("sphere.1M.jsonl.tar.gz") else: logger.error(f"Failed to download {filename}. HTTP Status Code: {response.status_code}") else: From edbc0967fb8d7d596700c4356754c54c4903e6bd Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 19:35:43 -0700 Subject: [PATCH 04/16] don't extract into same file --- apps/oom-prevention/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/oom-prevention/run.py b/apps/oom-prevention/run.py index 41ab6819..4669dfbd 100644 --- a/apps/oom-prevention/run.py +++ b/apps/oom-prevention/run.py @@ -23,7 +23,7 @@ def download_file_if_not_exists(filename, url): logger.info(f"{filename} not found, downloading from {url}...") response = requests.get(url, stream=True) if response.status_code == 200: - with open(filename, "wb") as f: + with open(filename + ".tar.gz", "wb") as f: f.write(response.content) logger.info(f"Downloaded {filename} successfully.") extract_tarball("sphere.1M.jsonl.tar.gz") From 0a8c459735611e96f4ed9b6a7c7ed5c0e22b2070 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 19:40:51 -0700 Subject: [PATCH 05/16] run on larger machines --- .github/workflows/tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 23546f24..080f9bec 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -734,7 +734,7 @@ jobs: # run: ./goroutine_leak_on_class_delete.sh oom-prevention: name: Import & Delete into HNSW on memory-constrained cluster - runs-on: ubuntu-latest + runs-on: ubuntu-latest-8-cores timeout-minutes: 60 env: PERSISTENCE_LSM_ACCESS_STRATEGY: ${{inputs.lsm_access_strategy}} From 6ddb374ca260e52977f6d3633195bfab174ba9c2 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 19:46:43 -0700 Subject: [PATCH 06/16] fix log-on-failure --- oom-prevention.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/oom-prevention.sh b/oom-prevention.sh index 2fe4962f..ae6c1b81 100755 --- a/oom-prevention.sh +++ b/oom-prevention.sh @@ -32,7 +32,12 @@ if ! docker run \ --network host \ -t importer python3 run.py; then echo "Importer failed, printing latest Weaviate logs..." - docker-compose -f apps/weaviate/docker-compose.yml logs weaviate + echo "Node 1:" + docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml.yml logs weaviate-node-1 + echo "Node 2:" + docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml.yml logs weaviate-node-2 + echo "Node 3:" + docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml.yml logs weaviate-node-3 exit 1 fi From b2ca9623e3c27a97740202786873389ccb6df9bd Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 19:53:04 -0700 Subject: [PATCH 07/16] another attempt at fixing the log-on-failure --- oom-prevention.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/oom-prevention.sh b/oom-prevention.sh index ae6c1b81..4c9fd405 100755 --- a/oom-prevention.sh +++ b/oom-prevention.sh @@ -33,11 +33,11 @@ if ! docker run \ -t importer python3 run.py; then echo "Importer failed, printing latest Weaviate logs..." echo "Node 1:" - docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml.yml logs weaviate-node-1 + docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml logs weaviate-node-1 echo "Node 2:" - docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml.yml logs weaviate-node-2 + docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml logs weaviate-node-2 echo "Node 3:" - docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml.yml logs weaviate-node-3 + docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml logs weaviate-node-3 exit 1 fi From dda7656deab45687c9831670286bed46490f3996 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 20:01:07 -0700 Subject: [PATCH 08/16] set GOMEMLIMIT correctly --- apps/weaviate/docker-compose-memory-constrained.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml index efa4fcae..daeec9eb 100644 --- a/apps/weaviate/docker-compose-memory-constrained.yml +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -29,6 +29,7 @@ services: CLUSTER_DATA_BIND_PORT: '7101' PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 + GOMEMLIMIT: 2048Mi deploy: resources: limits: @@ -64,6 +65,7 @@ services: PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 CLUSTER_JOIN: 'weaviate-node-1:7100' + GOMEMLIMIT: 2048Mi deploy: resources: limits: @@ -99,6 +101,7 @@ services: PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 CLUSTER_JOIN: 'weaviate-node-1:7100' + GOMEMLIMIT: 2048Mi deploy: resources: limits: From 5208aea01886e858caa0368d028f4094cbe324ec Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 20:02:15 -0700 Subject: [PATCH 09/16] don't use log-level debug --- apps/weaviate/docker-compose-memory-constrained.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml index daeec9eb..78505009 100644 --- a/apps/weaviate/docker-compose-memory-constrained.yml +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -19,7 +19,6 @@ services: volumes: - ./data-node-1:/var/lib/weaviate environment: - LOG_LEVEL: 'debug' QUERY_DEFAULTS_LIMIT: 25 AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' PERSISTENCE_DATA_PATH: '/var/lib/weaviate' @@ -54,7 +53,6 @@ services: volumes: - ./data-node-2:/var/lib/weaviate environment: - LOG_LEVEL: 'debug' QUERY_DEFAULTS_LIMIT: 25 AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' PERSISTENCE_DATA_PATH: '/var/lib/weaviate' @@ -90,7 +88,6 @@ services: volumes: - ./data-node-3:/var/lib/weaviate environment: - LOG_LEVEL: 'debug' QUERY_DEFAULTS_LIMIT: 25 AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' PERSISTENCE_DATA_PATH: '/var/lib/weaviate' From d3d1498bf8b95138ae230d556f5b1380c10b4b98 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 20:04:10 -0700 Subject: [PATCH 10/16] fix unit on GOMEMLIMIT --- apps/weaviate/docker-compose-memory-constrained.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml index 78505009..e8455862 100644 --- a/apps/weaviate/docker-compose-memory-constrained.yml +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -28,7 +28,7 @@ services: CLUSTER_DATA_BIND_PORT: '7101' PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 - GOMEMLIMIT: 2048Mi + GOMEMLIMIT: 2048MiB deploy: resources: limits: @@ -63,7 +63,7 @@ services: PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 CLUSTER_JOIN: 'weaviate-node-1:7100' - GOMEMLIMIT: 2048Mi + GOMEMLIMIT: 2048MiB deploy: resources: limits: @@ -98,7 +98,7 @@ services: PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 CLUSTER_JOIN: 'weaviate-node-1:7100' - GOMEMLIMIT: 2048Mi + GOMEMLIMIT: 2048MiB deploy: resources: limits: From c53f4638c109b76f130b671d204a0e0e6a54dcf0 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 20:26:54 -0700 Subject: [PATCH 11/16] add more docker debug info --- oom-prevention.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/oom-prevention.sh b/oom-prevention.sh index 4c9fd405..2dd99a15 100755 --- a/oom-prevention.sh +++ b/oom-prevention.sh @@ -38,6 +38,9 @@ if ! docker run \ docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml logs weaviate-node-2 echo "Node 3:" docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml logs weaviate-node-3 + docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml ps + docker stats + exit 1 fi From e4e8f058164efe207919e3a0067597f60d1e409a Mon Sep 17 00:00:00 2001 From: etiennedi Date: Tue, 19 Mar 2024 22:08:03 -0700 Subject: [PATCH 12/16] use more verbose image --- .github/workflows/tests.yaml | 2 +- oom-prevention.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 080f9bec..733ab8a9 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -1,7 +1,7 @@ name: Chaos tests env: - WEAVIATE_VERSION: preview-don-t-merge-wip-memory-investigations-eabedd0 + WEAVIATE_VERSION: preview-don-t-merge-wip-memory-investigations-7081e28 MINIMUM_WEAVIATE_VERSION: 1.15.0 # this is used as the start in the upgrade journey test on: workflow_call: diff --git a/oom-prevention.sh b/oom-prevention.sh index 2dd99a15..d40cdf20 100755 --- a/oom-prevention.sh +++ b/oom-prevention.sh @@ -39,7 +39,6 @@ if ! docker run \ echo "Node 3:" docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml logs weaviate-node-3 docker-compose -f apps/weaviate/docker-compose-memory-constrained.yml ps - docker stats exit 1 fi From 31be04aab6d2d4201eca3bbac30769e789c8ebd5 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Wed, 20 Mar 2024 15:07:29 -0700 Subject: [PATCH 13/16] try setting GOMEMLIMIT 100MB before container limit --- apps/weaviate/docker-compose-memory-constrained.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml index e8455862..2f196cb0 100644 --- a/apps/weaviate/docker-compose-memory-constrained.yml +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -28,7 +28,7 @@ services: CLUSTER_DATA_BIND_PORT: '7101' PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 - GOMEMLIMIT: 2048MiB + GOMEMLIMIT: 1948MiB deploy: resources: limits: @@ -63,7 +63,7 @@ services: PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 CLUSTER_JOIN: 'weaviate-node-1:7100' - GOMEMLIMIT: 2048MiB + GOMEMLIMIT: 1948MiB deploy: resources: limits: @@ -98,7 +98,7 @@ services: PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 CLUSTER_JOIN: 'weaviate-node-1:7100' - GOMEMLIMIT: 2048MiB + GOMEMLIMIT: 1948MiB deploy: resources: limits: From 440d38f403ffac3c326adb803491221b525e2792 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Thu, 21 Mar 2024 06:13:53 -0700 Subject: [PATCH 14/16] remove batch deletes (to see if they influence OOMkills) --- apps/oom-prevention/run.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/apps/oom-prevention/run.py b/apps/oom-prevention/run.py index 4669dfbd..2fca061f 100644 --- a/apps/oom-prevention/run.py +++ b/apps/oom-prevention/run.py @@ -76,17 +76,17 @@ def import_dataset(client: weaviate.WeaviateClient, file_path: str): err_count = batch.number_errors logger.info(f"Progress {i}, control: objects={len(col)}, errors={err_count}") - if i % delete_threshold == 0: - batch.flush() - upper_bound = (i / delete_threshold) * 1000 - start_time = time.time() - del_res = col.data.delete_many( - where=wvc.query.Filter.by_property("i").less_or_equal(upper_bound) - ) - took = time.time() - start_time - logger.info( - f"Successfully deleted {del_res.successful} out of {del_res.matches} in {took:.2f}s" - ) + # if i % delete_threshold == 0: + # batch.flush() + # upper_bound = (i / delete_threshold) * 1000 + # start_time = time.time() + # del_res = col.data.delete_many( + # where=wvc.query.Filter.by_property("i").less_or_equal(upper_bound) + # ) + # took = time.time() - start_time + # logger.info( + # f"Successfully deleted {del_res.successful} out of {del_res.matches} in {took:.2f}s" + # ) i += 1 From b4d7b72cbaae230ffc51fdda02dae2eee5fd3091 Mon Sep 17 00:00:00 2001 From: etiennedi Date: Thu, 21 Mar 2024 07:02:25 -0700 Subject: [PATCH 15/16] try adjusting unit theory is that docker compose uses MB while regular docker uses MiB --- apps/weaviate/docker-compose-memory-constrained.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml index 2f196cb0..b5d03476 100644 --- a/apps/weaviate/docker-compose-memory-constrained.yml +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -34,7 +34,7 @@ services: limits: # used in a chaos pipeline that tries to get Weaviate # OOM-killed - memory: 2048M + memory: 2148M weaviate-node-2: init: true command: @@ -69,7 +69,7 @@ services: limits: # used in a chaos pipeline that tries to get Weaviate # OOM-killed - memory: 2048M + memory: 2148M weaviate-node-3: init: true command: @@ -104,4 +104,4 @@ services: limits: # used in a chaos pipeline that tries to get Weaviate # OOM-killed - memory: 2048M + memory: 2148M From e60b2d2b8bdc564fe48a8351d3c6b7813ee715fa Mon Sep 17 00:00:00 2001 From: etiennedi Date: Thu, 21 Mar 2024 14:22:48 -0700 Subject: [PATCH 16/16] reenable deletes, explain memory settings --- apps/oom-prevention/run.py | 22 +++++++++---------- .../docker-compose-memory-constrained.yml | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/apps/oom-prevention/run.py b/apps/oom-prevention/run.py index 2fca061f..4669dfbd 100644 --- a/apps/oom-prevention/run.py +++ b/apps/oom-prevention/run.py @@ -76,17 +76,17 @@ def import_dataset(client: weaviate.WeaviateClient, file_path: str): err_count = batch.number_errors logger.info(f"Progress {i}, control: objects={len(col)}, errors={err_count}") - # if i % delete_threshold == 0: - # batch.flush() - # upper_bound = (i / delete_threshold) * 1000 - # start_time = time.time() - # del_res = col.data.delete_many( - # where=wvc.query.Filter.by_property("i").less_or_equal(upper_bound) - # ) - # took = time.time() - start_time - # logger.info( - # f"Successfully deleted {del_res.successful} out of {del_res.matches} in {took:.2f}s" - # ) + if i % delete_threshold == 0: + batch.flush() + upper_bound = (i / delete_threshold) * 1000 + start_time = time.time() + del_res = col.data.delete_many( + where=wvc.query.Filter.by_property("i").less_or_equal(upper_bound) + ) + took = time.time() - start_time + logger.info( + f"Successfully deleted {del_res.successful} out of {del_res.matches} in {took:.2f}s" + ) i += 1 diff --git a/apps/weaviate/docker-compose-memory-constrained.yml b/apps/weaviate/docker-compose-memory-constrained.yml index b5d03476..3b81f9a0 100644 --- a/apps/weaviate/docker-compose-memory-constrained.yml +++ b/apps/weaviate/docker-compose-memory-constrained.yml @@ -28,13 +28,13 @@ services: CLUSTER_DATA_BIND_PORT: '7101' PERSISTENCE_LSM_ACCESS_STRATEGY: '${PERSISTENCE_LSM_ACCESS_STRATEGY}' REPLICATION_MINIMUM_FACTOR: 3 - GOMEMLIMIT: 1948MiB + GOMEMLIMIT: 1948MiB # This assumes 2048MiB-100MiB for binary, etc. deploy: resources: limits: # used in a chaos pipeline that tries to get Weaviate # OOM-killed - memory: 2148M + memory: 2148M # This is MB, not MiB, so it's basically a 2048MiB limit weaviate-node-2: init: true command: