Updates in experiments and documentation #9

Merged · 1 commit · Nov 4, 2023

5 changes: 5 additions & 0 deletions .gitignore
@@ -34,3 +34,8 @@ viz_molnet.py
/experiments/stiming.png
/experiments/timing.png
/runs/
/experiments/PDBBind/datasail/
/experiments/MPP/lohi/
/experiments/PDBBind/deepchem/
/experiments/PDBBind/graphpart/
/experiments/PDBBind/lohi/
4 changes: 2 additions & 2 deletions README.md
@@ -29,15 +29,15 @@ DataSAIL is installable from [conda](https://anaconda.org/kalininalab/datasail)
using

````shell
conda create -n sail -c conda-forge -c kalininalab -c bioconda MPP
conda create -n sail -c conda-forge -c kalininalab -c bioconda -c mosek DataSAIL
conda activate sail
pip install grakel
````

to install it into a new empty environment or

````shell
conda install -c conda-forge -c kalininalab -c bioconda MPP
conda install -c conda-forge -c kalininalab -c bioconda -c mosek DataSAIL
pip install grakel
````
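A quick way to check that the environment is usable is a short import test. This is a minimal sketch that only assumes the two entry points used by the experiment scripts in this repository (`datasail.sail.datasail` and `grakel`):

````python
# Minimal sanity check for the freshly created environment; both imports are
# the ones used by the experiment scripts in this repository.
from datasail.sail import datasail  # splitting entry point
import grakel                       # required for WLK clustering

print("DataSAIL ready:", callable(datasail))
````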

6 changes: 3 additions & 3 deletions docs/index.rst
@@ -10,18 +10,18 @@ The code for Data Splitting Against Information Leakage, short DataSAIL, is avai
Quick Start
===========

DataSAIL currently only runs in Python 3.10. Therefore, you have to install it into a Python 3.10 environment. For
DataSAIL currently runs in any officially supported version of Python (3.7, 3.8, 3.9, 3.10, 3.11). For
conda, this can be created by running

.. code-block:: shell

conda create -n datasail python=3.10
conda create -n datasail

Contrary to the instructions on the conda website, the command to install DataSAIL into your newly created environment is

.. code-block:: shell

mamba install -c kalininalab -c conda-forge -c bioconda datasail
mamba install -c kalininalab -c mosek -c conda-forge -c bioconda datasail
pip install grakel

The second command is necessary to run WLK clustering, as the grakel library is not available on conda for Python 3.10.
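Once installed, a split can be requested directly from Python. The following is a minimal sketch based on the invocation in ``experiments/MPP/split.py`` of this repository; the two example SMILES and the 80/20 split are illustrative placeholders, not part of the library documentation:

.. code-block:: python

    from datasail.sail import datasail

    # Identity-based one-dimensional split (I1e) of a tiny molecule set,
    # mirroring the keyword arguments used in experiments/MPP/split.py.
    e_splits, _, _ = datasail(
        techniques=["I1e"],
        splits=[8, 2],
        names=["train", "test"],
        runs=1,
        solver="SCIP",
        e_type="M",
        e_data={"mol1": "CCO", "mol2": "c1ccccc1"},  # mapping: ID -> SMILES
    )
    print(e_splits["I1e"][0])  # maps every ID to "train" or "test" for run 0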
1 change: 0 additions & 1 deletion environment.yml
@@ -22,6 +22,5 @@ dependencies:
- pytest-cov
- pytest-cases
- rdkit
- mosek::mosek
- pip:
- grakel
21 changes: 21 additions & 0 deletions experiments/MPP/check.py
@@ -0,0 +1,21 @@
from pathlib import Path
import os

import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# extract the data and save them in a CSV file
tb_path = Path("experiments") / "MPP" / "lohi" / "cdata" / "freesolv" / "lohi" / f"split_0" / "fold_0" / \
"model_0"
tb_file = tb_path / list(sorted(filter(
lambda x: str(x).startswith("events"), os.listdir(tb_path)
)))[-1]
print("File:", tb_file)
ea = EventAccumulator(str(tb_file))
ea.Reload()
for long, short in [("validation_", "val"), ("test_", "test")]:
print([m for m in filter(lambda x: x.startswith(long), ea.Tags()["scalars"])])
for metric in filter(lambda x: x.startswith(long), ea.Tags()["scalars"]):
print("metric", [e.value for e in ea.Scalars(metric)])
# dfs[short][f"{tech}_{metric}_split_{run}"] = [e.value for e in ea.Scalars(metric)]
# print(df)
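The commented-out lines at the end sketch how the scalars were meant to be collected into a DataFrame. A possible completion, using only names already defined in the script, could look like the block below; the `val_metrics.csv` file name and the `tech`/`run` placeholders are illustrative assumptions, not taken from the repository:

````python
# Hypothetical completion of the commented-out export above: one column per
# validation metric, one row per logged step, padded with NaN where lengths differ.
tech, run = "lohi", 0  # illustrative placeholders for technique and run index
records = {
    f"{tech}_{metric}_split_{run}": pd.Series([e.value for e in ea.Scalars(metric)])
    for metric in ea.Tags()["scalars"] if metric.startswith("validation_")
}
pd.DataFrame(records).to_csv(tb_path / "val_metrics.csv", index=False)
````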
109 changes: 75 additions & 34 deletions experiments/MPP/split.py
@@ -1,11 +1,15 @@
import os
import sys
from pathlib import Path

from rdkit import Chem
import deepchem as dc
from datasail.sail import datasail
import lohi_splitter as lohi

from experiments.utils import splitters, mpp_datasets, dc2pd, RUNS
from experiments.utils import SPLITTERS, mpp_datasets, dc2pd, RUNS, telegram

count = 0


def split_w_datasail(name):
Expand All @@ -14,50 +18,51 @@ def split_w_datasail(name):
df = dc2pd(dataset, name)

for tech in ["I1e", "C1e"]:
for run in range(RUNS):
try:
try:

with open(base / tech / "start.txt", "w") as start:
print("Start", file=start)

e_splits, _, _ = datasail(
techniques=[tech],
splits=[8, 2],
names=["train", "test"],
runs=RUNS,
solver="SCIP",
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist())
)

for run in range(RUNS):
path = base / tech / f"split_{run}"
os.makedirs(path, exist_ok=True)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

e_splits, _, _ = datasail(
techniques=[tech],
splits=[8, 2],
names=["train", "test"],
runs=RUNS,
solver="SCIP",
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist())
)

train = list(df["ID"].apply(lambda x: e_splits[tech][run].get(x, "") == "train"))
test = list(df["ID"].apply(lambda x: e_splits[tech][run].get(x, "") == "test"))
df[train].to_csv(path / "train.csv", index=False)
df[test].to_csv(path / "test.csv", index=False)
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)


def split_w_deepchem(name):
base = Path('experiments') / 'MPP' / 'deepchem' / 'sdata' / name

dataset = mpp_datasets[name][0](featurizer=dc.feat.DummyFeaturizer(), splitter=None)[1][0]
if name[:2] != "qm":
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if Chem.MolFromSmiles(smiles) is not None]
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if
Chem.MolFromSmiles(smiles) is not None]
dataset = dataset.select(valid_ids)

for run in range(RUNS):
for tech in splitters:
for tech in SPLITTERS:
try:
path = base / tech / f"split_{run}"
os.makedirs(path, exist_ok=True)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

train_set, test_set = splitters[tech].train_test_split(dataset, frac_train=0.8)
train_set, test_set = SPLITTERS[tech].train_test_split(dataset, frac_train=0.8)

dc2pd(train_set, name).to_csv(path / "train.csv", index=False)
dc2pd(test_set, name).to_csv(path / "test.csv", index=False)
@@ -66,19 +71,55 @@ def split_w_deepchem(name):
dataset = dataset.complete_shuffle()


def full_main():
for ds_name in mpp_datasets:
if ds_name in ["pdbbind", "pcba"]:
continue
split_w_datasail(ds_name)
split_w_deepchem(ds_name)
def split_w_lohi(name):
base = Path('experiments') / 'MPP' / 'lohi' / 'sdata' / name

dataset = mpp_datasets[name][0](featurizer=dc.feat.DummyFeaturizer(), splitter=None)[1][0]
if name[:2] != "qm":
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if
Chem.MolFromSmiles(smiles) is not None]
dataset = dataset.select(valid_ids)

for run in range(RUNS):
try:
path = base / "lohi" / f"split_{run}"
os.makedirs(path, exist_ok=True)
df = dc2pd(dataset, name)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

train_test_partition = lohi.hi_train_test_split(
smiles=list(df["SMILES"]),
similarity_threshold=0.4,
train_min_frac=0.7,
test_min_frac=0.1,
coarsening_threshold=0.4,
max_mip_gap=0.1,
verbose=False,
)

df.iloc[train_test_partition[0]].to_csv(path / "train.csv", index=False)
df.iloc[train_test_partition[1]].to_csv(path / "test.csv", index=False)
global count
count += 1
telegram(f"[MPP {count} / 70] Splitting finished for MPP - lohi - {name} - Run {run + 1} / 5")
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)
dataset = dataset.complete_shuffle()


def scnd_main():
for ds_name in ["hiv", "bace", "bbbp", "tox21", "toxcast", "sider", "clintox"]:
split_w_datasail(ds_name)
split_w_deepchem(ds_name)
def main():
for ds_name in sorted(list(mpp_datasets.keys()), key=lambda x: mpp_datasets[x][3]):
if ds_name in ["pdbbind", "pcba"]:
continue
# split_w_datasail(ds_name)
# split_w_deepchem(ds_name)
split_w_lohi(ds_name)


if __name__ == '__main__':
scnd_main()
if len(sys.argv) > 1:
split_w_lohi(sys.argv[1])
else:
main()
59 changes: 28 additions & 31 deletions experiments/MPP/train.py
@@ -1,31 +1,27 @@
import os
import shutil
from pathlib import Path

import chemprop
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

from experiments.utils import mpp_datasets, RUNS, MPP_EPOCHS, telegram


count = 0


def train(model, name):
dfs = {"val": pd.DataFrame({"rows": list(range(50))}), "test": pd.DataFrame({"rows": [0]})}
# store the results in training, validation, and test files
for tech in [x for x in os.listdir(f"experiments/MPP/{model}/cdata/{name}") if os.path.isdir(f"experiments/MPP/{model}/cdata/{name}/{x}")]:
cpath = Path("experiments") / "MPP" / model / "cdata" / name
for tech in [x for x in os.listdir(cpath) if os.path.isdir(cpath / x)]:
for run in range(RUNS):
print(tech, "-", run)
try:
print("Check folder:", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/", end="\t")
print(os.path.exists("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/"))
if os.path.exists("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/"):
print("Delete folder")
shutil.rmtree("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/", ignore_errors=True)

train_df = pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv")
test_df = pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv")
path = cpath / tech / f"split_{run}"
train_df = pd.read_csv(path / "train.csv")
test_df = pd.read_csv(path / "test.csv")
train_nunique = train_df.nunique()
test_nunique = test_df.nunique()
train_dropable = train_nunique[train_nunique == 1].index
@@ -36,19 +32,19 @@ def train(model, name):
test_df.drop(train_dropable, axis=1, inplace=True)
train_df.drop(test_dropable, axis=1, inplace=True)
test_df.drop(test_dropable, axis=1, inplace=True)
train_df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv", index=False)
test_df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv", index=False)
train_df.to_csv(path / "train.csv", index=False)
test_df.to_csv(path / "test.csv", index=False)

# train the D-MPNN model
targets = list(pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv").columns)
targets = list(pd.read_csv(path / "train.csv").columns)
targets.remove("SMILES")
targets.remove("ID")
arguments = [
"--data_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv",
"--separate_val_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv",
"--separate_test_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv",
"--data_path", str(path / "train.csv"),
"--separate_val_path", str(path / "test.csv"),
"--separate_test_path", str(path / "test.csv"),
"--dataset_type", mpp_datasets[name][1],
"--save_dir", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/",
"--save_dir", str(path),
"--quiet", "--epochs", str(MPP_EPOCHS),
"--smiles_columns", "SMILES",
"--target_columns", *targets,
@@ -61,11 +57,13 @@ def train(model, name):
del args

# extract the data and save them in a CSV file
tb_file = f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/model_0/" + list(sorted(filter(
lambda x: x.startswith("events"), os.listdir(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/model_0/")
tb_path = Path("experiments") / "MPP" / model / "cdata" / name / tech / f"split_{run}" / "fold_0" / \
"model_0"
tb_file = tb_path / list(sorted(filter(
lambda x: x.startswith("events"), os.listdir(tb_path)
)))[-1]
print("File:", tb_file)
ea = EventAccumulator(tb_file)
ea = EventAccumulator(str(tb_file))
ea.Reload()
for long, short in [("validation_", "val"), ("test_", "test")]:
print([m for m in filter(lambda x: x.startswith(long), ea.Tags()["scalars"])])
@@ -77,18 +75,17 @@ def train(model, name):

global count
count += 1
telegram(f"[MPP {count} / 105] Training finished for MPP - {model} - {name} - {tech} - Run {run}/4")
telegram(f"[MPP {count} / 55] Training finished for MPP - lohi - {name} - Run {run + 1} / 5")
except Exception as e:
print(e)
for split, df in dfs.items():
print("Saving:", df.shape, "to", f"experiments/MPP/{model}/cdata/{name}/{split}_metrics.tsv")
df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{split}_metrics.tsv", sep="\t", index=False)

save_path = Path("experiments") / "MPP" / model / "cdata" / name / f"{split}_metrics.tsv"
print("Saving:", df.shape, "to", save_path)
df.to_csv(save_path, sep="\t", index=False)

# for dataset in ["freesolv", "esol", "sider", "clintox", "bace", "bbbp", "lipophilicity", "qm7", "tox21", "toxcast", "qm8", "hiv", "muv", "qm9"]:
for dataset in ["qm7", "qm8", "qm9"]:
# for dataset in ["tox21", "toxcast", "hiv", "muv"]:
for tool in ["datasail", "deepchem"]:
print(dataset, "-", tool)
train(tool, dataset)

for dataset in sorted(list(mpp_datasets.keys()), key=lambda x: mpp_datasets[x][3]):
if dataset in {"qm9", "muv", "bace"}:
continue
print(dataset, "-", "lohi")
train("lohi", dataset)