forked from GoogleCloudPlatform/cluster-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
htc-slurm-v5-legacy.yaml
165 lines (146 loc) · 5.15 KB
/
htc-slurm-v5-legacy.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# Copyright 2022 Google LLC
# Copyright (C) SchedMD LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# This blueprint provisions a cluster using the Slurm scheduler configured to
# efficiently run many short duration, loosely-coupled (non-MPI) jobs. See also:
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md
# https://slurm.schedmd.com/high_throughput.html

blueprint_name: htc-slurm

vars:
  project_id:  ## Set GCP Project ID Here ##
  deployment_name: htc-slurm
  region: us-west4
  zone: us-west4-c
  # By default, public IPs are set in the login and controller to allow easier
  # SSH access. To turn this behavior off, set this to true.
  disable_public_ips: false
  # Stage `community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/*` into the deployment folder.
  # If you move the blueprint, make sure the relative path is correct.
  staged_configs: $(ghpc_stage("../modules/scheduler/schedmd-slurm-gcp-v5-controller/etc"))

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

deployment_groups:
- group: primary
  modules:
  # Source is an embedded module, denoted by "modules/*" without ./, ../, /
  # as a prefix. To refer to a local or community module, prefix with ./, ../ or /
  - id: network1
    source: modules/network/vpc

  - id: homefs
    source: modules/file-system/filestore
    use: [network1]
    settings:
      local_mount: /home

  - id: projectsfs
    source: modules/file-system/filestore
    use: [network1]
    settings:
      filestore_tier: HIGH_SCALE_SSD
      size_gb: 10240
      local_mount: /projects

  # This file system has an associated license cost.
  # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud
  - id: scratchfs
    source: community/modules/file-system/DDN-EXAScaler
    use: [network1]
    settings:
      local_mount: /scratch

  # The compute partition is designed for performance.
  # Use:
  # `srun -N 4 -p compute <<Command>>` for any node in the partition.
  # `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60.
  - id: compute_node_group_c2s60
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      name: c2s60
      node_count_dynamic_max: 200
      bandwidth_tier: gvnic_enabled

  - id: compute_node_group_c2s30
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      name: c2s30
      node_count_dynamic_max: 200
      machine_type: c2-standard-30
      bandwidth_tier: gvnic_enabled

  - id: compute_partition
    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
    use:
    - network1
    - homefs
    - scratchfs
    - projectsfs
    - compute_node_group_c2s60
    - compute_node_group_c2s30
    settings:
      partition_name: compute
      enable_placement: false
      exclusive: false

  # The lowcost partition is designed to run at a lower cost and without additional quota
  # Use:
  # `srun -N 4 <<Command>>` for any node in the partition.
  # `srun -N 4 --mincpus 2` for node group n2s4.
  - id: low_cost_node_group_n2s2
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      name: n2s2
      machine_type: n2-standard-2
      node_count_dynamic_max: 10
      bandwidth_tier: gvnic_enabled

  - id: low_cost_node_group_n2s4
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      name: n2s4
      machine_type: n2-standard-4
      node_count_dynamic_max: 10
      bandwidth_tier: gvnic_enabled

  - id: low_cost_partition
    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
    use:
    - network1
    - homefs
    - scratchfs
    - projectsfs
    - low_cost_node_group_n2s2
    - low_cost_node_group_n2s4
    settings:
      is_default: true
      partition_name: lowcost
      enable_placement: false
      exclusive: false

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
    use:
    - network1
    - homefs
    - scratchfs
    - projectsfs
    - low_cost_partition
    - compute_partition
    settings:
      machine_type: c2-standard-8
      disable_controller_public_ips: $(vars.disable_public_ips)
      slurm_conf_tpl: $(vars.staged_configs)/htc-slurm.conf.tpl
      slurmdbd_conf_tpl: $(vars.staged_configs)/htc-slurmdbd.conf.tpl

  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
    use:
    - network1
    - slurm_controller
    settings:
      machine_type: n2-standard-4
      disable_login_public_ips: $(vars.disable_public_ips)

  - id: hpc_dashboard
    source: modules/monitoring/dashboard
    outputs: [instructions]