bug 1904569: add remaining d2g-enabled worker pools for translations GPU workers #264

Draft · wants to merge 1 commit into base: main
worker-pools.yml: 216 changes (214 additions, 2 deletions)
@@ -2047,6 +2047,46 @@ pools:
          guestAccelerators:
            - acceleratorCount: 4
              acceleratorType: nvidia-tesla-v100
  - pool_id: '{pool-group}/b-linux-v100-gpu-d2g'
    description: Worker for machine learning and other high GPU tasks
    owner: [email protected]
    variants:
      - pool-group: translations-1
    email_on_error: true
    provider_id:
      by-chain-of-trust:
        trusted: fxci-level3-gcp
        default: fxci-level1-gcp
    config:
      lifecycle:
        # low inactivity timeout because these workers are very expensive
        queueInactivityTimeout: 1800
      worker-config:
        genericWorker:
          config:
            # 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
            maxTaskRunTime: 2592900
            enableInteractive: true
            d2gConfig:
              enableD2G: true
              allowGPUs: true
              containerEngine: docker
              headlessTasks: true
            ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
      minCapacity: 0
      maxCapacity: 200
      implementation: generic-worker/worker-runner-linux-multi
      regions: [us-central1, us-west1, us-east1]
      image: ubuntu-2404-headless
      instance_types:
        - minCpuPlatform: Intel Skylake
          disks:
            - <<: *persistent-disk
              diskSizeGb: 75
          machine_type: n1-highmem-8
          guestAccelerators:
            - acceleratorCount: 1
              acceleratorType: nvidia-tesla-v100
  - pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4'
    description: Worker for machine learning and other high GPU tasks
    owner: [email protected]
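All of the new pools share the same generic-worker settings: a 30-minute queueInactivityTimeout (1800 s), a maxTaskRunTime of 2592900 s (30 days × 86400 s/day = 2592000 s, plus the 900 s buffer the in-file comment links to), and a d2gConfig block. D2G is generic-worker's translator that rewrites docker-worker-style task payloads into native generic-worker payloads, so task payloads written for docker-worker can run on these pools. A minimal sketch of such a payload (the image and command here are illustrative, not taken from this change):

    # hypothetical docker-worker-style payload; with enableD2G, generic-worker
    # translates it into a native payload before running the task
    payload:
      image: ubuntu:24.04
      command:
        - /bin/bash
        - '-c'
        - nvidia-smi        # the container sees the GPUs because allowGPUs is true
      maxRunTime: 3600

containerEngine: docker selects Docker rather than Podman as the engine D2G targets, and headlessTasks: true matches the ubuntu-2404-headless images these pools boot.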
@@ -2080,7 +2120,7 @@ pools:
      maxCapacity: 128
      implementation: generic-worker/worker-runner-linux-multi
      regions: [us-central1, us-west1, us-east1]
-     image: ubuntu-2404-headless-alpha
+     image: ubuntu-2404-headless
      instance_types:
        - minCpuPlatform: Intel Skylake
          disks:
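The <<: *persistent-disk lines throughout this file use a YAML merge key: each disk entry inherits the mapping stored at the persistent-disk anchor defined elsewhere in worker-pools.yml and then overrides or extends it. A standalone illustration, with assumed anchor contents since the real definition is outside this diff:

    # the anchor itself (contents assumed here, for illustration only)
    disk-defaults: &persistent-disk
      type: PERSISTENT
      autoDelete: true
    # a pool's disk entry merges the anchor and adds its own size
    disks:
      - <<: *persistent-disk
        diskSizeGb: 300

This is why each pool below only needs to state diskSizeGb.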
@@ -2123,14 +2163,186 @@ pools:
      maxCapacity: 128
      implementation: generic-worker/worker-runner-linux-multi
      regions: [us-central1, us-west1, us-east1]
-     image: ubuntu-2404-headless-alpha
+     image: ubuntu-2404-headless
      instance_types:
        - minCpuPlatform: Intel Skylake
          disks:
            - <<: *persistent-disk
              diskSizeGb: 300
          # 40 CPUs, 256GB RAM
          machine_type: n1-custom-40-262144
          guestAccelerators:
            - acceleratorCount: 4
              acceleratorType: nvidia-tesla-v100
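All of the 4-GPU pools use the same custom machine shape. GCE custom machine type names encode the shape directly as <family>-custom-<vCPUs>-<memoryMB>, which is how the in-file comment can be checked: 262144 MB / 1024 = 256 GB.

    # 40 vCPUs, 262144 MB of RAM (262144 / 1024 = 256 GB)
    machine_type: n1-custom-40-262144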
  - pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-300gb-standard'
    description: Worker for machine learning tasks that require standard VMs
    owner: [email protected]
    variants:
      - pool-group: translations-1
    email_on_error: true
    provider_id:
      by-chain-of-trust:
        trusted: fxci-level3-gcp
        default: fxci-level1-gcp
    config:
      lifecycle:
        # low inactivity timeout because these workers are very expensive
        queueInactivityTimeout: 1800
      worker-config:
        genericWorker:
          config:
            # 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
            maxTaskRunTime: 2592900
            enableInteractive: true
            d2gConfig:
              enableD2G: true
              allowGPUs: true
              containerEngine: docker
              headlessTasks: true
            ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
      minCapacity: 0
      maxCapacity: 50
      implementation: generic-worker/worker-runner-linux-multi
      regions: [us-central1, us-west1]
      image: ubuntu-2404-headless
      instance_types:
        - minCpuPlatform: Intel Skylake
          disks:
            - <<: *persistent-disk
              diskSizeGb: 300
          # 40 CPUs, 256GB RAM
          machine_type: n1-custom-40-262144
          scheduling: standard
          guestAccelerators:
            - acceleratorCount: 4
              acceleratorType: nvidia-tesla-v100
  - pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb-standard'
    description: Worker for machine learning tasks that require standard VMs
    owner: [email protected]
    variants:
      - pool-group: translations-1
    email_on_error: true
    provider_id:
      by-chain-of-trust:
        trusted: fxci-level3-gcp
        default: fxci-level1-gcp
    config:
      lifecycle:
        # low inactivity timeout because these workers are very expensive
        queueInactivityTimeout: 1800
      worker-config:
        genericWorker:
          config:
            # 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
            maxTaskRunTime: 2592900
            enableInteractive: true
            d2gConfig:
              enableD2G: true
              allowGPUs: true
              containerEngine: docker
              headlessTasks: true
            ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
      minCapacity: 0
      maxCapacity: 50
      implementation: generic-worker/worker-runner-linux-multi
      regions: [us-central1, us-west1]
      image: ubuntu-2404-headless
      instance_types:
        - minCpuPlatform: Intel Skylake
          disks:
            - <<: *persistent-disk
              diskSizeGb: 1024
          # 40 CPUs, 256GB RAM
          machine_type: n1-custom-40-262144
          scheduling: standard
          guestAccelerators:
            - acceleratorCount: 4
              acceleratorType: nvidia-tesla-v100
  - pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb'
    description: Worker for machine learning and other high GPU tasks
    owner: [email protected]
    variants:
      - pool-group: translations-1
    email_on_error: true
    provider_id:
      by-chain-of-trust:
        trusted: fxci-level3-gcp
        default: fxci-level1-gcp
    config:
      lifecycle:
        # low inactivity timeout because these workers are very expensive
        queueInactivityTimeout: 1800
      worker-config:
        genericWorker:
          config:
            # 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
            maxTaskRunTime: 2592900
            enableInteractive: true
            d2gConfig:
              enableD2G: true
              allowGPUs: true
              containerEngine: docker
              headlessTasks: true
            ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
      minCapacity: 0
      # We use 4 GPUs per instance across 4 regions with a limit of 128
      # GPUs per region at any given time. 4 regions * 128 GPUs = 512 total
      # GPUs; 512 GPUs / 4 per instance = 128 instances possibly running at once.
      maxCapacity: 128
      implementation: generic-worker/worker-runner-linux-multi
      regions: [us-central1, us-west1, us-east1]
      image: ubuntu-2404-headless
      instance_types:
        - minCpuPlatform: Intel Skylake
          disks:
            - <<: *persistent-disk
              diskSizeGb: 1024
          # 40 CPUs, 256GB RAM
          machine_type: n1-custom-40-262144
          guestAccelerators:
            - acceleratorCount: 4
              acceleratorType: nvidia-tesla-v100
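This pool is the counterpart of b-linux-v100-gpu-d2g-4-1tb-standard above; the two entries added here are identical except for the fields below. (That pools without an explicit scheduling key land on preemptible/Spot capacity is an assumption; the default is not visible in this diff.)

    # b-linux-v100-gpu-d2g-4-1tb-standard, relative to b-linux-v100-gpu-d2g-4-1tb:
    description: Worker for machine learning tasks that require standard VMs
    maxCapacity: 50                    # vs 128
    regions: [us-central1, us-west1]   # vs [us-central1, us-west1, us-east1]
    scheduling: standard               # absent from the non-standard pool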
  - pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-2tb'
    description: Worker for machine learning and other high GPU tasks
    owner: [email protected]
    variants:
      - pool-group: translations-1
    email_on_error: true
    provider_id:
      by-chain-of-trust:
        trusted: fxci-level3-gcp
        default: fxci-level1-gcp
    config:
      lifecycle:
        # low inactivity timeout because these workers are very expensive
        queueInactivityTimeout: 1800
      worker-config:
        genericWorker:
          config:
            # 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
            maxTaskRunTime: 2592900
            enableInteractive: true
            d2gConfig:
              enableD2G: true
              allowGPUs: true
              containerEngine: docker
              headlessTasks: true
            ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
      minCapacity: 0
      # We use 4 GPUs per instance across 4 regions with a limit of 128
      # GPUs per region at any given time. 4 regions * 128 GPUs = 512 total
      # GPUs; 512 GPUs / 4 per instance = 128 instances possibly running at once.
      maxCapacity: 128
      implementation: generic-worker/worker-runner-linux-multi
      regions: [us-central1, us-west1, us-east1]
      image: ubuntu-2404-headless
      instance_types:
        - minCpuPlatform: Intel Skylake
          disks:
            - <<: *persistent-disk
              diskSizeGb: 2048
          # 40 CPUs, 256GB RAM
          machine_type: n1-custom-40-262144
          guestAccelerators:
            - acceleratorCount: 4
              acceleratorType: nvidia-tesla-v100
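In summary, the commit adds five pools, all running nvidia-tesla-v100 accelerators on the ubuntu-2404-headless image and differing only in machine shape, disk size, scheduling, and capacity:

    pool_id suffix            GPUs  machine_type         disk    scheduling  maxCapacity  regions
    ...-d2g                   1     n1-highmem-8         75 GB   (default)   200          us-central1, us-west1, us-east1
    ...-d2g-4-300gb-standard  4     n1-custom-40-262144  300 GB  standard    50           us-central1, us-west1
    ...-d2g-4-1tb-standard    4     n1-custom-40-262144  1 TB    standard    50           us-central1, us-west1
    ...-d2g-4-1tb             4     n1-custom-40-262144  1 TB    (default)   128          us-central1, us-west1, us-east1
    ...-d2g-4-2tb             4     n1-custom-40-262144  2 TB    (default)   128          us-central1, us-west1, us-east1

It also switches the two pre-existing 4-GPU pools from the ubuntu-2404-headless-alpha image to ubuntu-2404-headless.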