Updates in experiments and documentation
Roman Joeres authored and committed Nov 4, 2023
1 parent 3042a46 commit 811a0ac
Showing 16 changed files with 20,134 additions and 379 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -34,3 +34,8 @@ viz_molnet.py
/experiments/stiming.png
/experiments/timing.png
/runs/
/experiments/PDBBind/datasail/
/experiments/MPP/lohi/
/experiments/PDBBind/deepchem/
/experiments/PDBBind/graphpart/
/experiments/PDBBind/lohi/
4 changes: 2 additions & 2 deletions README.md
@@ -29,15 +29,15 @@ DataSAIL is installable from [conda](https://anaconda.org/kalininalab/datasail)
using

````shell
conda create -n sail -c conda-forge -c kalininalab -c bioconda MPP
conda create -n sail -c conda-forge -c kalininalab -c bioconda -c mosek DataSAIL
conda activate sail
pip install grakel
````

to install it into a new empty environment or

````shell
conda install -c conda-forge -c kalininalab -c bioconda MPP
conda install -c conda-forge -c kalininalab -c bioconda -c mosek DataSAIL
pip install grakel
````

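A quick sanity check after either install path is a bare import of the two packages. This is an illustrative sketch, not part of the commit; it assumes the `datasail.sail.datasail` entry point that the experiment scripts below import.

````python
# Hedged sanity check (not part of this commit): verify the conda/pip install worked.
# Assumes the `datasail.sail.datasail` entry point used by experiments/MPP/split.py.
from datasail.sail import datasail  # core splitting function
import grakel                       # pip-installed dependency needed for WLK clustering

print(datasail)  # importing without an ImportError is the actual check
````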
6 changes: 3 additions & 3 deletions docs/index.rst
@@ -10,18 +10,18 @@ The code for Data Splitting Against Information Leakage, short DataSAIL, is available
Quick Start
===========

DataSAIL currently only runs in Python 3.10. Therefore, you have to install it into a Python 3.10 environment. For
DataSAIL currently runs in any officially supported version of Python (3.7, 3.8, 3.9, 3.10, 3.11). For
conda, this can be created by running

.. code-block:: shell
conda create -n datasail python=3.10
conda create -n datasail
In contrast to what is described on the conda website, the command to install DataSAIL into the newly created environment is

.. code-block:: shell
mamba install -c kalininalab -c conda-forge -c bioconda datasail
mamba install -c kalininalab -c mosek -c conda-forge -c bioconda datasail
pip install grakel
The second command is necessary to run WLK clustering, as the grakel library is not available on conda for Python 3.10.
1 change: 0 additions & 1 deletion environment.yml
@@ -22,6 +22,5 @@ dependencies:
- pytest-cov
- pytest-cases
- rdkit
- mosek::mosek
- pip:
- grakel
21 changes: 21 additions & 0 deletions experiments/MPP/check.py
@@ -0,0 +1,21 @@
from pathlib import Path
import os

import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# extract the data and save them in a CSV file
tb_path = Path("experiments") / "MPP" / "lohi" / "cdata" / "freesolv" / "lohi" / f"split_0" / "fold_0" / \
"model_0"
tb_file = tb_path / list(sorted(filter(
lambda x: str(x).startswith("events"), os.listdir(tb_path)
)))[-1]
print("File:", tb_file)
ea = EventAccumulator(str(tb_file))
ea.Reload()
for long, short in [("validation_", "val"), ("test_", "test")]:
print([m for m in filter(lambda x: x.startswith(long), ea.Tags()["scalars"])])
for metric in filter(lambda x: x.startswith(long), ea.Tags()["scalars"]):
print("metric", [e.value for e in ea.Scalars(metric)])
# dfs[short][f"{tech}_{metric}_split_{run}"] = [e.value for e in ea.Scalars(metric)]
# print(df)
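The two commented-out lines hint at collecting the extracted scalars into a table. A minimal sketch of that step, continuing the variables defined in check.py above (the output file name `metrics.csv` is an assumption, not part of the commit):

````python
# Hedged sketch of the collection step hinted at by the commented-out lines above.
records = {metric: [e.value for e in ea.Scalars(metric)] for metric in ea.Tags()["scalars"]}
# Wrap each list in a Series so columns of different lengths (e.g. 50 validation
# epochs vs. a single test evaluation) align on the index with NaN padding.
df = pd.DataFrame({name: pd.Series(values) for name, values in records.items()})
df.to_csv(tb_path / "metrics.csv", index=False)
print(df)
````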
109 changes: 75 additions & 34 deletions experiments/MPP/split.py
@@ -1,11 +1,15 @@
import os
import sys
from pathlib import Path

from rdkit import Chem
import deepchem as dc
from datasail.sail import datasail
import lohi_splitter as lohi

from experiments.utils import splitters, mpp_datasets, dc2pd, RUNS
from experiments.utils import SPLITTERS, mpp_datasets, dc2pd, RUNS, telegram

count = 0


def split_w_datasail(name):
@@ -14,50 +18,51 @@ def split_w_datasail(name):
df = dc2pd(dataset, name)

for tech in ["I1e", "C1e"]:
for run in range(RUNS):
try:
try:

with open(base / tech / "start.txt", "w") as start:
print("Start", file=start)

e_splits, _, _ = datasail(
techniques=[tech],
splits=[8, 2],
names=["train", "test"],
runs=RUNS,
solver="SCIP",
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist())
)

for run in range(RUNS):
path = base / tech / f"split_{run}"
os.makedirs(path, exist_ok=True)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

e_splits, _, _ = datasail(
techniques=[tech],
splits=[8, 2],
names=["train", "test"],
runs=RUNS,
solver="SCIP",
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist())
)

train = list(df["ID"].apply(lambda x: e_splits[tech][run].get(x, "") == "train"))
test = list(df["ID"].apply(lambda x: e_splits[tech][run].get(x, "") == "test"))
df[train].to_csv(path / "train.csv", index=False)
df[test].to_csv(path / "test.csv", index=False)
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)


def split_w_deepchem(name):
base = Path('experiments') / 'MPP' / 'deepchem' / 'sdata' / name

dataset = mpp_datasets[name][0](featurizer=dc.feat.DummyFeaturizer(), splitter=None)[1][0]
if name[:2] != "qm":
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if Chem.MolFromSmiles(smiles) is not None]
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if
Chem.MolFromSmiles(smiles) is not None]
dataset = dataset.select(valid_ids)

for run in range(RUNS):
for tech in splitters:
for tech in SPLITTERS:
try:
path = base / tech / f"split_{run}"
os.makedirs(path, exist_ok=True)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

train_set, test_set = splitters[tech].train_test_split(dataset, frac_train=0.8)
train_set, test_set = SPLITTERS[tech].train_test_split(dataset, frac_train=0.8)

dc2pd(train_set, name).to_csv(path / "train.csv", index=False)
dc2pd(test_set, name).to_csv(path / "test.csv", index=False)
@@ -66,19 +71,55 @@ def split_w_deepchem(name):
dataset = dataset.complete_shuffle()


def full_main():
for ds_name in mpp_datasets:
if ds_name in ["pdbbind", "pcba"]:
continue
split_w_datasail(ds_name)
split_w_deepchem(ds_name)
def split_w_lohi(name):
base = Path('experiments') / 'MPP' / 'lohi' / 'sdata' / name

dataset = mpp_datasets[name][0](featurizer=dc.feat.DummyFeaturizer(), splitter=None)[1][0]
if name[:2] != "qm":
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if
Chem.MolFromSmiles(smiles) is not None]
dataset = dataset.select(valid_ids)

for run in range(RUNS):
try:
path = base / "lohi" / f"split_{run}"
os.makedirs(path, exist_ok=True)
df = dc2pd(dataset, name)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

train_test_partition = lohi.hi_train_test_split(
smiles=list(df["SMILES"]),
similarity_threshold=0.4,
train_min_frac=0.7,
test_min_frac=0.1,
coarsening_threshold=0.4,
max_mip_gap=0.1,
verbose=False,
)

df.iloc[train_test_partition[0]].to_csv(path / "train.csv", index=False)
df.iloc[train_test_partition[1]].to_csv(path / "test.csv", index=False)
global count
count += 1
telegram(f"[MPP {count} / 70] Splitting finished for MPP - lohi - {name} - Run {run + 1} / 5")
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)
dataset = dataset.complete_shuffle()


def scnd_main():
for ds_name in ["hiv", "bace", "bbbp", "tox21", "toxcast", "sider", "clintox"]:
split_w_datasail(ds_name)
split_w_deepchem(ds_name)
def main():
for ds_name in sorted(list(mpp_datasets.keys()), key=lambda x: mpp_datasets[x][3]):
if ds_name in ["pdbbind", "pcba"]:
continue
# split_w_datasail(ds_name)
# split_w_deepchem(ds_name)
split_w_lohi(ds_name)


if __name__ == '__main__':
scnd_main()
if len(sys.argv) > 1:
split_w_lohi(sys.argv[1])
else:
main()
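With the new `__main__` block, the LoHi splitting can also be driven for a single dataset. A hedged usage sketch, assuming the repository root is on `PYTHONPATH` so the `experiments.*` imports resolve:

````python
# Hedged usage sketch (not part of the commit): run the LoHi split for one dataset.
from experiments.MPP.split import split_w_lohi

# Equivalent to `python experiments/MPP/split.py freesolv`; output lands in
# experiments/MPP/lohi/sdata/freesolv/lohi/split_<run>/{train,test}.csv
split_w_lohi("freesolv")
````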
59 changes: 28 additions & 31 deletions experiments/MPP/train.py
@@ -1,31 +1,27 @@
import os
import shutil
from pathlib import Path

import chemprop
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

from experiments.utils import mpp_datasets, RUNS, MPP_EPOCHS, telegram


count = 0


def train(model, name):
dfs = {"val": pd.DataFrame({"rows": list(range(50))}), "test": pd.DataFrame({"rows": [0]})}
# store the results in training, validation, and test files
for tech in [x for x in os.listdir(f"experiments/MPP/{model}/cdata/{name}") if os.path.isdir(f"experiments/MPP/{model}/cdata/{name}/{x}")]:
cpath = Path("experiments") / "MPP" / model / "cdata" / name
for tech in [x for x in os.listdir(cpath) if os.path.isdir(cpath / x)]:
for run in range(RUNS):
print(tech, "-", run)
try:
print("Check folder:", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/", end="\t")
print(os.path.exists("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/"))
if os.path.exists("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/"):
print("Delete folder")
shutil.rmtree("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/", ignore_errors=True)

train_df = pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv")
test_df = pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv")
path = cpath / tech / f"split_{run}"
train_df = pd.read_csv(path / "train.csv")
test_df = pd.read_csv(path / "test.csv")
train_nunique = train_df.nunique()
test_nunique = test_df.nunique()
train_dropable = train_nunique[train_nunique == 1].index
@@ -36,19 +32,19 @@ def train(model, name):
test_df.drop(train_dropable, axis=1, inplace=True)
train_df.drop(test_dropable, axis=1, inplace=True)
test_df.drop(test_dropable, axis=1, inplace=True)
train_df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv", index=False)
test_df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv", index=False)
train_df.to_csv(path / "train.csv", index=False)
test_df.to_csv(path / "test.csv", index=False)

# train the D-MPNN model
targets = list(pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv").columns)
targets = list(pd.read_csv(path / "train.csv").columns)
targets.remove("SMILES")
targets.remove("ID")
arguments = [
"--data_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv",
"--separate_val_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv",
"--separate_test_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv",
"--data_path", str(path / "train.csv"),
"--separate_val_path", str(path / "test.csv"),
"--separate_test_path", str(path / "test.csv"),
"--dataset_type", mpp_datasets[name][1],
"--save_dir", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/",
"--save_dir", str(path),
"--quiet", "--epochs", str(MPP_EPOCHS),
"--smiles_columns", "SMILES",
"--target_columns", *targets,
@@ -61,11 +57,13 @@ def train(model, name):
del args

# extract the data and save them in a CSV file
tb_file = f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/model_0/" + list(sorted(filter(
lambda x: x.startswith("events"), os.listdir(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/model_0/")
tb_path = Path("experiments") / "MPP" / model / "cdata" / name / tech / f"split_{run}" / "fold_0" / \
"model_0"
tb_file = tb_path / list(sorted(filter(
lambda x: x.startswith("events"), os.listdir(tb_path)
)))[-1]
print("File:", tb_file)
ea = EventAccumulator(tb_file)
ea = EventAccumulator(str(tb_file))
ea.Reload()
for long, short in [("validation_", "val"), ("test_", "test")]:
print([m for m in filter(lambda x: x.startswith(long), ea.Tags()["scalars"])])
@@ -77,18 +75,17 @@

global count
count += 1
telegram(f"[MPP {count} / 105] Training finished for MPP - {model} - {name} - {tech} - Run {run}/4")
telegram(f"[MPP {count} / 55] Training finished for MPP - lohi - {name} - Run {run + 1} / 5")
except Exception as e:
print(e)
for split, df in dfs.items():
print("Saving:", df.shape, "to", f"experiments/MPP/{model}/cdata/{name}/{split}_metrics.tsv")
df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{split}_metrics.tsv", sep="\t", index=False)

save_path = Path("experiments") / "MPP" / model / "cdata" / name / f"{split}_metrics.tsv"
print("Saving:", df.shape, "to", save_path)
df.to_csv(save_path, sep="\t", index=False)

# for dataset in ["freesolv", "esol", "sider", "clintox", "bace", "bbbp", "lipophilicity", "qm7", "tox21", "toxcast", "qm8", "hiv", "muv", "qm9"]:
for dataset in ["qm7", "qm8", "qm9"]:
# for dataset in ["tox21", "toxcast", "hiv", "muv"]:
for tool in ["datasail", "deepchem"]:
print(dataset, "-", tool)
train(tool, dataset)

for dataset in sorted(list(mpp_datasets.keys()), key=lambda x: mpp_datasets[x][3]):
if dataset in {"qm9", "muv", "bace"}:
continue
print(dataset, "-", "lohi")
train("lohi", dataset)