forked from materialsproject/matbench
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmy_python_file.py.py
66 lines (46 loc) · 2.25 KB
/
my_python_file.py.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
Code for training and recording the matbench_v0.1 random forest benchmark.
The ML pipeline is placed within the Automatminer pipeline code infrastructure for convenience.
All training and inference was done on a single 128-core HPC node.
Reduce the number of parallel jobs (n_jobs) to lower memory usage on consumer machines.
"""
if __name__ == '__main__':
    # Script entry point: fit a random forest through a MatPipe on every fold
    # of the matbench_jdft2d task, record the predictions with matbench, and
    # write the benchmark object to disk.
    #
    # All imports sit under the entry-point guard, so they only run when this
    # file is executed as a script.
    from automatminer import MatPipe
    from automatminer.automl.adaptors import SinglePipelineAdaptor, TPOTAdaptor
    from automatminer.featurization import AutoFeaturizer
    from automatminer.preprocessing import DataCleaner, FeatureReducer
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    from matbench.bench import MatbenchBenchmark
    from multiprocessing import set_start_method

    # Use the "spawn" start method for multiprocessing; force=True replaces
    # any start method that has already been set.
    set_start_method("spawn", force=True)

    # The learner is a single 500-estimator Random Forest model; a regressor
    # and a classifier are both supplied to the adaptor.
    learner = SinglePipelineAdaptor(
        regressor=RandomForestRegressor(n_estimators=500),
        classifier=RandomForestClassifier(n_estimators=500),
    )
    pipe_config = {
        "learner": learner,
        # Empty reducer list: no feature reducers are configured.
        "reducer": FeatureReducer(reducers=[]),
        "cleaner": DataCleaner(
            feature_na_method="mean",
            max_na_frac=0.01,
            na_method_fit="drop",
            na_method_transform="mean",
        ),
        # n_jobs controls featurizer parallelism; lower it to use less memory.
        "autofeaturizer": AutoFeaturizer(n_jobs=8, preset="debug"),
    }
    pipe = MatPipe(**pipe_config)

    # autoload=False: the dataset is loaded explicitly with task.load() below.
    mb = MatbenchBenchmark(autoload=False)

    # Only a single task is benchmarked here. Iterate over ``mb.tasks``
    # instead to run every task in the benchmark.
    task = mb.matbench_jdft2d
    print(task)
    task.load()

    # NOTE(review): the indentation of this script was lost, so it is assumed
    # that the numbered checkpoint is written once per fold (inside the loop).
    # Confirm it was not meant to run once after all folds have finished.
    for checkpoint_index, fold in enumerate(task.folds):
        df_train = task.get_train_and_val_data(fold, as_type="df")

        # Fit the RF with matpipe
        pipe.fit(df_train, task.metadata.target)

        # The test frame is requested without the target column; predictions
        # are read from the "<target> predicted" column of the pipe output.
        df_test = task.get_test_data(fold, include_target=False, as_type="df")
        predictions = pipe.predict(df_test)[f"{task.metadata.target} predicted"]

        # A single configuration is used
        params = {'note': 'single config; see benchmark user metadata'}
        task.record(fold, predictions, params=params)

        # Intermediate checkpoint: results_0.json.gz, results_1.json.gz, ...
        mb.to_file(f"results_{checkpoint_index}.json.gz")

    # Save your results
    mb.to_file("results.json.gz")