-
Notifications
You must be signed in to change notification settings - Fork 0
/
loop.yaml
134 lines (134 loc) · 4.19 KB
/
loop.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
---
# Kubeflow Experiment that grid-searches seed x dataset x algorithm x model
# for experiments/run_expt.py from the Graph-Distribution-Shift repository.
# Each trial runs as a batch/v1 Job on a g4dn GPU node; metrics are parsed
# from /tmp/output.log inside the training container.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  name: jiuhai-experiment01
  namespace: default
spec:
  # NOTE(review): the search space below has 3 seeds x 8 datasets x
  # 8 algorithms x 1 model = 192 grid points, but at most 64 trials are
  # allowed, so only part of the grid can run. Confirm this is intended.
  maxTrialCount: 64
  parallelTrialCount: 64
  maxFailedTrialCount: 32
  resumePolicy: LongRunning

  objective:
    type: minimize
    goal: -10000
    objectiveMetricName: objective
    additionalMetricNames:
      - rocauc

  algorithm:
    algorithmName: grid
    algorithmSettings: []

  parameters:
    # Numeric bounds are kept as quoted strings, as in the original manifest.
    - name: seed
      parameterType: int
      feasibleSpace:
        min: '1'
        max: '3'
        step: '1'
    - name: dataset
      parameterType: categorical
      feasibleSpace:
        list:
          - ogb-molpcba
          - ogb-molhiv
          - ogbg-ppa
          - UPFD
          - SBM-Isolation
          - SBM-Environment
          - RotatedMNIST
          - ColoredMNIST
    - name: algorithm
      parameterType: categorical
      feasibleSpace:
        list:
          - ERM
          - groupDRO
          - IRM
          - deepCORAL
          - FLAG
          - MLDG
          - DANN
          - DANN-G
    - name: modelname
      parameterType: categorical
      feasibleSpace:
        list:
          - gin

  metricsCollectorSpec:
    collector:
      kind: File
    source:
      filter:
        # Matches "<name> : <number>" lines; quoted so the brackets, pipe and
        # backslashes are never interpreted as YAML syntax.
        metricsFormat:
          - '([\w|-]+)\s*:\s*((-?\d+)(\.\d+)?)'
      fileSystemPath:
        # Must stay in sync with the `tee` target in the training script.
        path: /tmp/output.log
        kind: File

  trialTemplate:
    primaryContainerName: training-container
    # Quoted because the expressions contain '#', '|' and '"' characters.
    successCondition: 'status.conditions.#(type=="Complete")#|#(status=="True")#'
    failureCondition: 'status.conditions.#(type=="Failed")#|#(status=="True")#'
    retain: false
    # Each entry exposes one search parameter to the trialSpec below as
    # ${trialParameters.<name>}.
    trialParameters:
      - name: dataset
        reference: dataset
        description: ''
      - name: modelname
        reference: modelname
        description: ''
      - name: algorithm
        reference: algorithm
        description: ''
      - name: seed
        reference: seed
        description: ''
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      metadata:
        generateName: jiuhai-gpu-pod
      spec:
        template:
          spec:
            serviceAccountName: dgl-s3-user
            restartPolicy: Never
            volumes:
              # This is an FSx folder on the cluster, which has high
              # performance.
              - name: asail-data
                persistentVolumeClaim:
                  claimName: asail-k8s-data-claim
            affinity:
              nodeAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  nodeSelectorTerms:
                    - matchExpressions:
                        # NOTE(review): the beta.* instance-type label is
                        # deprecated upstream in favour of
                        # node.kubernetes.io/instance-type; left unchanged
                        # because the cluster version is not visible here.
                        - key: beta.kubernetes.io/instance-type
                          operator: In
                          values:
                            - g4dn.4xlarge
                            - g4dn.8xlarge
                            - g4dn.12xlarge
                            - g4dn.metal
            containers:
              - name: training-container
                image: public.ecr.aws/s1o7b3d9/asail-public-dev:jiuhai-pytorch1.8-cuda10.2
                env:
                  # The clone URL previously carried credentials inline.
                  # They are now read from a Secret instead.
                  # TODO: create this Secret in the experiment namespace; the
                  # name and key used here are placeholders.
                  - name: GITHUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: github-credentials
                        key: token
                # Script outline: clone and install the repo, link the shared
                # data/log folders from the mounted volume, run one training
                # configuration while mirroring its output to
                # /tmp/output.log, then archive that log on the volume.
                #
                # NOTE(review): the exit status of the `python ... | tee`
                # pipeline is tee's, and the script's final status is cp's,
                # so a crashed training run can still look successful.
                #
                # NOTE(review): the trailing backslash on the last script
                # line is preserved from the original; presumably it lets a
                # command appended by the metrics-collector injection
                # continue that line. Confirm before removing it.
                args:
                  - 'bash'
                  - '-c'
                  - |
                    git clone "https://JiuhaiChen:$GITHUB_TOKEN@github.com/johnding1996/Graph-Distribution-Shift.git"
                    cd Graph-Distribution-Shift
                    rm .python-version
                    pip install -e .
                    ln -s /data/jiuhai/s3data/data data
                    ln -s /data/jiuhai/s3data/logs logs
                    mkdir tmp
                    python experiments/run_expt.py \
                      --dataset ${trialParameters.dataset} \
                      --algorithm ${trialParameters.algorithm} \
                      --model ${trialParameters.modelname} \
                      --root_dir data \
                      --n_epochs 300 \
                      --seed ${trialParameters.seed} 2>&1 | tee /tmp/output.log
                    cp /tmp/output.log \
                      /data/jiuhai/s3data/logs/output_${trialParameters.dataset}_${trialParameters.algorithm}_${trialParameters.modelname}_${trialParameters.seed}.log \
                resources:
                  # requests:
                  #   memory: 1000Mi
                  limits:
                    nvidia.com/gpu: 1
                    # memory: 1000Mi
                volumeMounts:
                  - name: asail-data
                    mountPath: /data