benchmark.py
import os
import subprocess
import pickle
from utils.datasets import DATASETS_INFO
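
# Benchmark driver: builds the command line for the chosen variant from
# general_config, submits one sbatch job per (random seed, dataset) pair,
# and records the resulting Slurm job ids.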
general_config = {
    "depth": 4,
    "min_in_leaf": 50,
    "train_data_limit": 10_000,
    "round_limit": 4,
    "memory_limit": 250,
    "thread_limit": 8,
    "time_limit": 8 * 3600,
    "mip_focus": 1,
    "mip_heuristics": 0.8,
    "random_runs": 10,
}
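
# Exactly one `configuration` block below should be active at a time; uncomment
# the desired variant and comment out the rest.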
# for Direct
# configuration = {
#     "variant": "direct",
#     "base_dir": "benchmark/direct/runname",
#     "shortcut": "D",
#     "script_path": "direct.py",
#     "params": [],
# }
# for Warmstarted using CART
configuration = {
    "variant": "sklearn_start",
    "base_dir": "benchmark/warmstart/runname",
    "shortcut": "W",
    "script_path": "sklearn_warmstart.py",
    "params": ["-init hint"],
}
# for Gradual increase of depth
# configuration = {
#     "variant": "gradual_increase",
#     "base_dir": "benchmark/gradual/runname",
#     "shortcut": "G",
#     "script_path": "gradual_depth_increase.py",
#     "params": ["-init hint"],
# }
# for OCT direct
# configuration = {
#     "variant": "OCT",
#     "base_dir": "benchmark/OCT/runname",
#     "shortcut": "O",
#     "script_path": "oct.py",
#     "params": [],
# }
# for OCT warm-started
# configuration = {
#     "variant": "OCT",
#     "base_dir": "benchmark/OCT/warm_runname",
#     "shortcut": "Ow",
#     "script_path": "oct.py",
#     "params": ["-warm"],
# }
# for halving - not presented in the paper
# configuration = {
#     "variant": "halving",
#     "base_dir": "benchmark/halving/runname",
#     "shortcut": "H",
#     "script_path": "halving.py",
#     "params": ["-u 1 -l 0.5 -prec 0.001"],  # all datasets are binary classification; otherwise the lower bound should be 1/K
# }
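
# Shared command prefix: the batch wrapper script, the variant's entry point,
# and flags derived from general_config; dataset-specific flags are appended
# per job below.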
base_command = [
    "run_python_batch.script",
    configuration['script_path'],
    f"-d {general_config['depth']}",
    f"-max {general_config['train_data_limit']}",
    f"-t {general_config['time_limit']}",
    f"-m {general_config['memory_limit']}",
    f"-thr {general_config['thread_limit']}",
    f"-r {general_config['round_limit']}",
    f"-focus {general_config['mip_focus']}",
    f"-heur {general_config['mip_heuristics']}",
    f"-lmin {general_config['min_in_leaf']}",
] + configuration["params"]
os.makedirs(configuration["base_dir"], exist_ok=True)
# save the run configuration alongside the results
with open(configuration["base_dir"] + "/config.pickle", "wb") as f:
    pickle.dump((general_config, configuration, DATASETS_INFO), f)
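
# One job per (random seed, dataset) combination; (job name, job id, log file)
# triples are collected in `jobs` and written out at the end.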
jobs = []
for rand_seed in range(general_config["random_runs"]):
    for dataset_type in DATASETS_INFO:
        for dataset_name, dataset_info in DATASETS_INFO[dataset_type].items():
# for rand_seed, dataset_type, dataset_name in [  # Uncomment for selective extra runs
#     (0, "categorical", "albert"),
# ]:
#     if True:
#         if True:
#             dataset_info = DATASETS_INFO[dataset_type][dataset_name]
            dataset_path = dataset_info["path"]
            # memory (GB) requested from the scheduler; assumed to default to the configured memory_limit
            max_memory = general_config["memory_limit"]
            if dataset_info["n_features"] > 30 or (dataset_info["n_points"] > 10000 / 0.8 and dataset_info["n_features"] > 20):
                max_memory = 128
            res_path = os.path.join(configuration["base_dir"], dataset_type, dataset_name)
            os.makedirs(res_path, exist_ok=True)
            command = base_command + [
                f"--dataset_path {dataset_path}",
                f"--dataset_type {dataset_type}",
                f"--results_dir {res_path}",
                f"-seed {rand_seed}",
            ]
            # call to the cluster manager (Slurm)
            job_name = f"{configuration['shortcut']}_{rand_seed}_{dataset_type[0]}_{dataset_name}"
            outfile = f"{res_path}/run{rand_seed}.out"
            precommand = [
                "sbatch",
                "--parsable",
                f"--mem={max_memory * 1024}",
                f"--out={outfile}",
                f"--job-name={job_name}",
                f"--cpus-per-task={general_config['thread_limit']}",
            ]
            result = subprocess.run(precommand + command, stdout=subprocess.PIPE, encoding='ascii')
            # with --parsable, sbatch writes the job id to stdout, which is useful later
            jobs.append((job_name, result.stdout.strip(), outfile))
with open(configuration["base_dir"] + "/jobs", "w") as f:
    for item in jobs:
        f.write(",".join(item) + "\n")
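
# A minimal sketch (an assumption, not part of the original workflow) of how the
# "jobs" file written above could be read back to check job status with Slurm's
# sacct; each line has the form "<job_name>,<job_id>,<outfile>":
#
#     with open(configuration["base_dir"] + "/jobs") as f:
#         for line in f:
#             job_name, job_id, outfile = line.strip().split(",")
#             state = subprocess.run(
#                 ["sacct", "-j", job_id, "-n", "-X", "--format=State"],
#                 stdout=subprocess.PIPE, encoding="ascii",
#             ).stdout.strip()
#             print(f"{job_name} ({job_id}): {state} -> {outfile}")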