# z2jh.yml: Zero to JupyterHub deployment playbook (forked from dandi/dandi-hub)
---
- hosts: all
  user: ec2-user
  environment:
    - AWS_REGION: "{{ aws_region }}"
    - KOPS_STATE_STORE: s3://{{ namespace }}-s3
  tasks:
    ## create working directory
    - name: Create working directory
      file:
        path: /home/ec2-user/{{ namespace }}
        state: directory
    ## install packages ##
    - name: Check to see if pip is already installed
      command: "/usr/bin/pip-3 --version"
      ignore_errors: true
      register: pip_is_installed
      changed_when: false
    - block:
        - name: Download get-pip.py
          get_url:
            url: https://bootstrap.pypa.io/get-pip.py
            dest: /tmp
        - name: Install pip
          # the check above targets the Python 3 pip, and current get-pip.py requires Python 3
          command: "python3 /tmp/get-pip.py"
          register: pip_log
          become: yes
        - name: Delete get-pip.py
          file:
            state: absent
            path: /tmp/get-pip.py
      when: pip_is_installed.rc != 0
    - name: Install boto3
      pip:
        name: boto3
        executable: /usr/bin/pip-3
      become: yes
    - name: Check for kops package
      command: which kops
      failed_when: false
      changed_when: false
      register: kops
    - block:
        - name: Download kops
          shell: wget -O kops https://github.com/kubernetes/kops/releases/download/v1.25.4/kops-linux-amd64
          args:
            chdir: /home/ec2-user/{{ namespace }}
            creates: kops
        - name: Modify perms to install kops
          file:
            dest: /home/ec2-user/{{ namespace }}/kops
            mode: "+x"
        - name: Install kops
          command: mv /home/ec2-user/{{ namespace }}/kops /usr/local/bin
          args:
            creates: /usr/local/bin/kops
          become: yes
      when: kops.rc
    - name: Check for kubectl package
      command: which kubectl
      failed_when: false
      changed_when: false
      register: kubectl
    - block:
        - name: Download kubectl
          shell: wget -O kubectl https://storage.googleapis.com/kubernetes-release/release/v1.25.8/bin/linux/amd64/kubectl
          args:
            chdir: /home/ec2-user/{{ namespace }}
            creates: /home/ec2-user/{{ namespace }}/kubectl
        - name: Modify perms to install kubectl
          file:
            dest: /home/ec2-user/{{ namespace }}/kubectl
            mode: "+x"
        - name: Install kubectl
          command: mv /home/ec2-user/{{ namespace }}/kubectl /usr/local/bin
          args:
            creates: /usr/local/bin/kubectl
          become: yes
      when: kubectl.rc
    - name: Check for helm package
      command: which helm
      failed_when: false
      changed_when: false
      register: helm
    - name: Install helm
      shell: curl https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get-helm-3 | bash
      when: helm.rc
    ## Create kubernetes (k8s) cluster
    - name: Provision S3 bucket for kops state store
      aws_s3:
        bucket: "{{ namespace }}-s3"
        mode: create
      register: s3_log
    - name: Create SSH key
      command: ssh-keygen -f /home/ec2-user/.ssh/id_rsa -C '' -N ''
      args:
        creates: /home/ec2-user/.ssh/id_rsa
    - name: Set kubectl context
      command: kops export kubecfg --admin --name {{ namespace }}.k8s.local
      ignore_errors: yes
    - name: Check to see if Kubernetes is already installed
      command: kubectl cluster-info
      failed_when: False
      changed_when: False
      register: kci_log
    - name: Gather availability zone facts
      aws_az_info:
      register: az_facts
    - name: Define Kubernetes cluster config
      shell: >
        kops create cluster {{ namespace }}.k8s.local
        --authorization RBAC
        --cloud aws
        --container-runtime containerd
        --master-size t3a.large
        --master-volume-size 50
        --node-size {{ node_size }}
        --zones {{ az_facts.availability_zones | map(attribute='zone_name') | list | join(',') }}
        --node-volume-size {{ node_volume_size }}
        --dry-run
        -oyaml > cluster-{{ namespace }}.yaml
      args:
        chdir: /home/ec2-user/{{ namespace }}
      register: kops_output
      when: kci_log.rc
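    # The dry run above only renders the cluster spec into cluster-{{ namespace }}.yaml;
    # nothing is created yet, so the spec can still be amended (e.g. the GPU settings in
    # the next task) before it is registered with "kops create -f".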
    - name: Add GPU option to cluster config
      lineinfile:
        path: /home/ec2-user/{{ namespace }}/cluster-{{ namespace }}.yaml
        insertafter: "containerRuntime: containerd"
        state: present
        line: "{{ item }}"
      with_items:
        - '      enabled: true'
        - '    nvidiaGPU:'
        - '  containerd:'
      when: kci_log.rc
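    # Each item is inserted directly after the "containerRuntime: containerd" line, so the
    # list above is deliberately in reverse order. The resulting cluster spec should read:
    #   containerRuntime: containerd
    #   containerd:
    #     nvidiaGPU:
    #       enabled: true
    # (The indent depths assume the standard kops cluster spec layout.)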
    - name: Register Kubernetes cluster
      command: kops create -f cluster-{{ namespace }}.yaml
      args:
        chdir: /home/ec2-user/{{ namespace }}
      when: kci_log.rc
    - name: Create Kubernetes cluster resources
      command: kops update cluster {{ namespace }}.k8s.local --yes
      args:
        chdir: /home/ec2-user/{{ namespace }}
      register: k8s_log
      when: kci_log.rc
    - name: Set kubectl context
      command: kops export kubecfg --admin --name {{ namespace }}.k8s.local
    - name: Verify that kops setup is complete - WARNING - long run time.
      shell: kops validate cluster
      args:
        executable: /bin/bash
      register: kops_result
      until: kops_result.stdout.find('is ready') != -1
      retries: 20
      delay: 30
      changed_when: False
    # Create filesystem for k8s
    - ec2_group_info:
        filters:
          group_name: masters.{{ namespace }}.k8s.local
      register: sg_master_facts
    - ec2_group_info:
        filters:
          group_name: nodes.{{ namespace }}.k8s.local
      register: sg_nodes_facts
    - ec2_vpc_net_info:
        filters:
          "tag:Name": "{{ namespace }}.k8s.local"
      register: vpc_facts
    - ec2_vpc_subnet_info:
        filters:
          vpc-id: "{{ vpc_facts['vpcs'][0]['vpc_id'] }}"
      register: subnet_facts
    - efs_info:
      register: efs_facts
    # use the jinja map function to return the subnet_ids as a list
    - name: Collect subnet ids as a list
      set_fact:
        subnet_ids: "{{ subnet_facts.subnets|map(attribute='id')|list }}"
    - name: Create NFS Security Group
      ec2_group:
        name: "{{ namespace }}.nfs"
        description: "{{ namespace }}.nfs"
        vpc_id: "{{ vpc_facts['vpcs'][0]['vpc_id'] }}"
        tags:
          name: "{{ namespace }}.nfs"
        rules:
          - proto: tcp
            from_port: 2049
            to_port: 2049
            group_id: "{{ sg_master_facts['security_groups'][0]['group_id'] }}"
          - proto: tcp
            from_port: 2049
            to_port: 2049
            group_id: "{{ sg_nodes_facts['security_groups'][0]['group_id'] }}"
        rules_egress:
          - proto: all
            cidr_ip: 0.0.0.0/0
      register: nfs_sg_log
    - name: Prepare vars file from template
      template:
        src: target_vars.yaml.j2
        dest: /tmp/tempvars.yaml
      delegate_to: localhost
    - name: Include vars
      include_vars: "/tmp/tempvars.yaml"
    # Provide all subnets from your Kubernetes VPC (each AZ)
    - name: Provision EFS
      efs:
        state: present
        name: "{{ namespace }}-efs"
        tags:
          Name: "{{ namespace }}-efs"
        targets: "{{ target_list }}"
      register: efs_log
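    # target_list is rendered by target_vars.yaml.j2, which is not part of this file. For the
    # efs module it is expected to be a list of mount-target dicts, presumably one per subnet,
    # along these lines (a sketch, not the authoritative format):
    #   target_list:
    #     - subnet_id: subnet-xxxxxxxx
    #       security_groups: ["{{ nfs_sg_log.group_id }}"]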
    - name: Store EFS ID
      set_fact:
        efs_id: "{{ efs_log['efs']['mount_targets'][0]['file_system_id'] }}"
    - name: Check for kube namespace
      command: kubectl get namespace {{ namespace }}
      failed_when: False
      changed_when: False
      register: get_kube_namespace
    - name: Create kube namespace
      shell: kubectl create namespace {{ namespace }}
      args:
        chdir: /home/ec2-user/{{ namespace }}
      when: get_kube_namespace.rc
    - name: Check if nodes yaml file exists
      stat:
        path: /home/ec2-user/{{ namespace }}/nodes1.yaml
      register: nodes_yaml
    - block:
        - name: Create spot-ig file
          template:
            src: spot-ig.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/spot-ig.yaml
        - name: Apply spot-ig config
          command: kops create -f /home/ec2-user/{{ namespace }}/spot-ig.yaml
          args:
            chdir: /home/ec2-user/{{ namespace }}
        - name: Create gpu spot-ig file
          template:
            src: spot-ig-gpu.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/spot-ig-gpu.yaml
        - name: Apply spot-ig-gpu config
          command: kops create -f /home/ec2-user/{{ namespace }}/spot-ig-gpu.yaml
          args:
            chdir: /home/ec2-user/{{ namespace }}
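        # spot-ig.yaml.j2 and spot-ig-gpu.yaml.j2 are separate templates not reproduced in
        # this playbook. As a rough sketch only, a kops spot instance group typically looks
        # like the following; all field values here are placeholders, not project settings:
        #   apiVersion: kops.k8s.io/v1alpha2
        #   kind: InstanceGroup
        #   metadata:
        #     labels:
        #       kops.k8s.io/cluster: {{ namespace }}.k8s.local
        #     name: spot-nodes
        #   spec:
        #     role: Node
        #     machineType: {{ node_size }}
        #     maxPrice: "1.00"
        #     minSize: 0
        #     maxSize: 2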
        - name: Create nodes1 file
          template:
            src: nodes1.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/nodes1.yaml
        - name: Create nodes2 file
          template:
            src: nodes2.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/nodes2.yaml
        - name: Create nodes3 file
          template:
            src: nodes3.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/nodes3.yaml
        - name: Apply nodes1 config
          command: kops replace -f /home/ec2-user/{{ namespace }}/nodes1.yaml
          args:
            chdir: /home/ec2-user/{{ namespace }}
        - name: Apply nodes2 config
          command: kops replace -f /home/ec2-user/{{ namespace }}/nodes2.yaml
          args:
            chdir: /home/ec2-user/{{ namespace }}
        - name: Apply nodes3 config
          command: kops replace -f /home/ec2-user/{{ namespace }}/nodes3.yaml
          args:
            chdir: /home/ec2-user/{{ namespace }}
        - name: Apply update to cluster
          command: kops update cluster {{ namespace }}.k8s.local --yes
          args:
            chdir: /home/ec2-user/{{ namespace }}
        - name: Apply rolling update to cluster
          command: kops rolling-update cluster --yes
          register: kops_ru_result
          until: kops_ru_result.stdout.find('Rolling update completed for cluster') != -1
          retries: 10
          delay: 60
          args:
            chdir: /home/ec2-user/{{ namespace }}
          failed_when:
            - kops_ru_result.rc != 0
            - '"No rolling-update required." not in kops_ru_result.stdout'
      when: nodes_yaml.stat.exists == False
    ## Check and configure persistent volume ##
    - name: Check for persistent volume
      command: kubectl get pv
      register: pv_log
      changed_when: False
    - block:
        - name: Create pv_efs.yaml config file
          template:
            src: pv_efs.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/pv_efs.yaml
        - name: Add efs driver charts
          command: helm repo add aws-efs-csi-driver https://kubernetes-sigs.github.io/aws-efs-csi-driver/
          args:
            chdir: /home/ec2-user/{{ namespace }}
        - name: Update Helm chart repos
          command: helm repo update
          args:
            chdir: /home/ec2-user/{{ namespace }}
        - name: Add efs driver
          shell: >
            helm upgrade --install aws-efs-csi-driver
            --namespace kube-system aws-efs-csi-driver/aws-efs-csi-driver
        - name: Create storageclass.yaml config file
          template:
            src: storageclass.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/storageclass.yaml
        - name: Apply storageclass config file
          command: kubectl apply -f /home/ec2-user/{{ namespace }}/storageclass.yaml
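        # storageclass.yaml.j2 is a separate template, not shown here. For the EFS CSI driver
        # it is presumably something close to the following; the class name and any parameters
        # are assumptions, so check the actual template:
        #   apiVersion: storage.k8s.io/v1
        #   kind: StorageClass
        #   metadata:
        #     name: efs-sc
        #   provisioner: efs.csi.aws.com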
        - name: Apply kube pv config
          command: kubectl --namespace={{ namespace }} apply -f /home/ec2-user/{{ namespace }}/pv_efs.yaml
          register: kube_config_1
          changed_when: kube_config_1.stdout.find('unchanged') == -1
          args:
            chdir: /home/ec2-user/{{ namespace }}
      when: pv_log.stdout.find('efs-persist') == -1
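    # pv_efs.yaml.j2 is also a separate template. Judging from the 'efs-persist' check above,
    # it presumably defines a PersistentVolume named efs-persist backed by the EFS filesystem
    # ({{ efs_id }}), roughly along these lines (sizes and mount mechanism are illustrative,
    # not confirmed by this file):
    #   apiVersion: v1
    #   kind: PersistentVolume
    #   metadata:
    #     name: efs-persist
    #   spec:
    #     capacity:
    #       storage: 100Gi
    #     accessModes: [ReadWriteMany]
    #     csi:
    #       driver: efs.csi.aws.com
    #       volumeHandle: fs-xxxxxxxx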
    - name: Check for persistent volume claim
      command: kubectl get pvc --namespace={{ namespace }}
      register: pvc_log
      changed_when: False
    ## Configure persistent volume claim ##
    - block:
        - name: Add pod file
          template:
            src: pod.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/pod.yaml
        - name: Create pvc_efs.yaml config file
          template:
            src: pvc_efs.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/pvc_efs.yaml
        - name: Apply kube pvc config
          command: kubectl --namespace={{ namespace }} apply -f /home/ec2-user/{{ namespace }}/pvc_efs.yaml
          register: kube_config_1
          changed_when: kube_config_1.stdout.find('unchanged') == -1
          args:
            chdir: /home/ec2-user/{{ namespace }}
      when: pvc_log.stdout.find('efs-persist') == -1
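    # pvc_efs.yaml.j2 (not shown) presumably creates the matching claim, again judging from
    # the 'efs-persist' check; names and sizes below are placeholders only:
    #   apiVersion: v1
    #   kind: PersistentVolumeClaim
    #   metadata:
    #     name: efs-persist
    #   spec:
    #     accessModes: [ReadWriteMany]
    #     resources:
    #       requests:
    #         storage: 10Gi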
    - name: Check if autoscaler yaml file exists
      stat:
        path: /home/ec2-user/{{ namespace }}/cluster-autoscaler-multi-asg.yaml
      register: autoscaler_yaml
    - block:
        - name: Attach policy to nodes
          command: aws iam attach-role-policy --policy-arn arn:aws:iam::278212569472:policy/ig-policy --role-name nodes.{{ namespace }}.k8s.local
        - name: Create autoscale config file
          template:
            src: cluster-autoscaler-multi-asg.yaml.j2
            dest: /home/ec2-user/{{ namespace }}/cluster-autoscaler-multi-asg.yaml
        - name: Apply autoscaling to cluster
          command: kubectl apply -f /home/ec2-user/{{ namespace }}/cluster-autoscaler-multi-asg.yaml
          args:
            chdir: /home/ec2-user/{{ namespace }}
      when: autoscaler_yaml.stat.exists == False
    - name: Add eks charts
      command: helm repo add eks https://aws.github.io/eks-charts
      args:
        chdir: /home/ec2-user/{{ namespace }}
    - name: Update Helm chart repos
      command: helm repo update
      args:
        chdir: /home/ec2-user/{{ namespace }}
    - name: Add termination handler
      shell: >
        helm upgrade --install aws-node-termination-handler
        --namespace kube-system
        eks/aws-node-termination-handler
    # configure and create jupyterhub
    - name: Create security token
      command: openssl rand -hex 32
      register: os
    - name: Create config file for JupyterHub
      template:
        src: config.yaml.j2
        dest: /home/ec2-user/{{ namespace }}/config.yaml
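    # config.yaml.j2 is the Zero to JupyterHub values file and is not reproduced here. The
    # token registered as 'os' above is presumably wired in as the proxy secret, e.g.
    # something like:
    #   proxy:
    #     secretToken: "{{ os.stdout }}"
    # Treat this as a sketch; the chart values actually used come from the template.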
    - name: Add JupyterHub Helm Chart repo
      command: helm repo add {{ helm_chart_repo_name }} {{ helm_chart_repo_url }}
      args:
        chdir: /home/ec2-user/{{ namespace }}
    - name: Update Helm chart repos
      command: helm repo update
      args:
        chdir: /home/ec2-user/{{ namespace }}
    - name: Check for JupyterHub helm release
      # list in the target namespace so an existing release is actually detected
      command: helm list --namespace={{ namespace }}
      register: hl_log
      changed_when: False
    - name: Install JupyterHub release
      shell: >
        helm install {{ namespace }}-jupyterhub {{ helm_chart_repo_name }}/jupyterhub
        --debug
        --version={{ jupyterhub_chart_version }}
        --namespace={{ namespace }}
        --timeout=10m
        -f config.yaml
      args:
        chdir: /home/ec2-user/{{ namespace }}
      when: hl_log.stdout.find(namespace + '-jupyterhub') == -1
    ## Wait for proxy to spin up ##
    - name: Sleep for 30 seconds and continue with play
      wait_for: timeout=30
      delegate_to: localhost
    - name: Get URL for public proxy
      command: kubectl --namespace={{ namespace }} describe svc proxy-public
      register: pp_log
      until: pp_log.stdout.find('Ensured load balancer') != -1
      retries: 10
      delay: 60
    - debug: msg="Navigate browser to {{ pp_log.stdout_lines[20] }} to begin using JupyterHub!"
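    # The hard-coded stdout_lines[20] assumes the load balancer hostname sits at a fixed
    # position in the "kubectl describe" output; a more robust alternative (not used above)
    # would be to query it directly, e.g.:
    #   kubectl --namespace={{ namespace }} get svc proxy-public \
    #     -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'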