Updates in experiments and documentation #9

Merged · 1 commit · Nov 4, 2023

5 changes: 5 additions & 0 deletions .gitignore
@@ -34,3 +34,8 @@ viz_molnet.py
/experiments/stiming.png
/experiments/timing.png
/runs/
/experiments/PDBBind/datasail/
/experiments/MPP/lohi/
/experiments/PDBBind/deepchem/
/experiments/PDBBind/graphpart/
/experiments/PDBBind/lohi/
4 changes: 2 additions & 2 deletions README.md
@@ -29,15 +29,15 @@ DataSAIL is installable from [conda](https://anaconda.org/kalininalab/datasail)
using

````shell
conda create -n sail -c conda-forge -c kalininalab -c bioconda MPP
conda create -n sail -c conda-forge -c kalininalab -c bioconda -c mosek DataSAIL
conda activate sail
pip install grakel
````

to install it into a new empty environment or

````shell
conda install -c conda-forge -c kalininalab -c bioconda MPP
conda install -c conda-forge -c kalininalab -c bioconda -c mosek DataSAIL
pip install grakel
````
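A quick way to check that the environment is usable is a short import test. This is a minimal sketch that only assumes the two entry points used by the experiment scripts in this repository (`datasail.sail.datasail` and `grakel`):

````python
# Minimal sanity check for the freshly created environment; both imports are
# the ones used by the experiment scripts in this repository.
from datasail.sail import datasail  # splitting entry point
import grakel                       # required for WLK clustering

print("DataSAIL ready:", callable(datasail))
````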

6 changes: 3 additions & 3 deletions docs/index.rst
@@ -10,18 +10,18 @@ The code for Data Splitting Against Information Leakage, short DataSAIL, is avai
Quick Start
===========

DataSAIL currently only runs in Python 3.10. Therefore, you have to install it into a Python 3.10 environment. For
DataSAIL currently runs in any officially supported version of Python (3.7, 3.8, 3.9, 3.10, 3.11). For
conda, this can be created by running

.. code-block:: shell

conda create -n datasail python=3.10
conda create -n datasail

Contrary to the instructions on the conda website, the command to install DataSAIL into your newly created environment is

.. code-block:: shell

mamba install -c kalininalab -c conda-forge -c bioconda datasail
mamba install -c kalininalab -c mosek -c conda-forge -c bioconda datasail
pip install grakel

The second command is necessary to run WLK clustering, as the grakel library is not available on conda for Python 3.10.
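Once installed, a split can be requested directly from Python. The following is a minimal sketch based on the invocation in ``experiments/MPP/split.py`` of this repository; the two example SMILES and the 80/20 split are illustrative placeholders, not part of the library documentation:

.. code-block:: python

    from datasail.sail import datasail

    # Identity-based one-dimensional split (I1e) of a tiny molecule set,
    # mirroring the keyword arguments used in experiments/MPP/split.py.
    e_splits, _, _ = datasail(
        techniques=["I1e"],
        splits=[8, 2],
        names=["train", "test"],
        runs=1,
        solver="SCIP",
        e_type="M",
        e_data={"mol1": "CCO", "mol2": "c1ccccc1"},  # mapping: ID -> SMILES
    )
    print(e_splits["I1e"][0])  # maps every ID to "train" or "test" for run 0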
1 change: 0 additions & 1 deletion environment.yml
@@ -22,6 +22,5 @@ dependencies:
- pytest-cov
- pytest-cases
- rdkit
- mosek::mosek
- pip:
- grakel
21 changes: 21 additions & 0 deletions experiments/MPP/check.py
@@ -0,0 +1,21 @@
from pathlib import Path
import os

import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# extract the data and save them in a CSV file
tb_path = Path("experiments") / "MPP" / "lohi" / "cdata" / "freesolv" / "lohi" / f"split_0" / "fold_0" / \
"model_0"
tb_file = tb_path / list(sorted(filter(
lambda x: str(x).startswith("events"), os.listdir(tb_path)
)))[-1]
print("File:", tb_file)
ea = EventAccumulator(str(tb_file))
ea.Reload()
for long, short in [("validation_", "val"), ("test_", "test")]:
print([m for m in filter(lambda x: x.startswith(long), ea.Tags()["scalars"])])
for metric in filter(lambda x: x.startswith(long), ea.Tags()["scalars"]):
print("metric", [e.value for e in ea.Scalars(metric)])
# dfs[short][f"{tech}_{metric}_split_{run}"] = [e.value for e in ea.Scalars(metric)]
# print(df)
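The commented-out lines at the end sketch how the scalars were meant to be collected into a DataFrame. A possible completion, using only names already defined in the script, could look like the block below; the `val_metrics.csv` file name and the `tech`/`run` placeholders are illustrative assumptions, not taken from the repository:

````python
# Hypothetical completion of the commented-out export above: one column per
# validation metric, one row per logged step, padded with NaN where lengths differ.
tech, run = "lohi", 0  # illustrative placeholders for technique and run index
records = {
    f"{tech}_{metric}_split_{run}": pd.Series([e.value for e in ea.Scalars(metric)])
    for metric in ea.Tags()["scalars"] if metric.startswith("validation_")
}
pd.DataFrame(records).to_csv(tb_path / "val_metrics.csv", index=False)
````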
109 changes: 75 additions & 34 deletions experiments/MPP/split.py
@@ -1,11 +1,15 @@
import os
import sys
from pathlib import Path

from rdkit import Chem
import deepchem as dc
from datasail.sail import datasail
import lohi_splitter as lohi

from experiments.utils import splitters, mpp_datasets, dc2pd, RUNS
from experiments.utils import SPLITTERS, mpp_datasets, dc2pd, RUNS, telegram

count = 0


def split_w_datasail(name):
Expand All @@ -14,50 +18,51 @@ def split_w_datasail(name):
df = dc2pd(dataset, name)

for tech in ["I1e", "C1e"]:
for run in range(RUNS):
try:
try:

with open(base / tech / "start.txt", "w") as start:
print("Start", file=start)

e_splits, _, _ = datasail(
techniques=[tech],
splits=[8, 2],
names=["train", "test"],
runs=RUNS,
solver="SCIP",
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist())
)

for run in range(RUNS):
path = base / tech / f"split_{run}"
os.makedirs(path, exist_ok=True)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

e_splits, _, _ = datasail(
techniques=[tech],
splits=[8, 2],
names=["train", "test"],
runs=RUNS,
solver="SCIP",
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist())
)

train = list(df["ID"].apply(lambda x: e_splits[tech][run].get(x, "") == "train"))
test = list(df["ID"].apply(lambda x: e_splits[tech][run].get(x, "") == "test"))
df[train].to_csv(path / "train.csv", index=False)
df[test].to_csv(path / "test.csv", index=False)
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)


def split_w_deepchem(name):
base = Path('experiments') / 'MPP' / 'deepchem' / 'sdata' / name

dataset = mpp_datasets[name][0](featurizer=dc.feat.DummyFeaturizer(), splitter=None)[1][0]
if name[:2] != "qm":
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if Chem.MolFromSmiles(smiles) is not None]
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if
Chem.MolFromSmiles(smiles) is not None]
dataset = dataset.select(valid_ids)

for run in range(RUNS):
for tech in splitters:
for tech in SPLITTERS:
try:
path = base / tech / f"split_{run}"
os.makedirs(path, exist_ok=True)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

train_set, test_set = splitters[tech].train_test_split(dataset, frac_train=0.8)
train_set, test_set = SPLITTERS[tech].train_test_split(dataset, frac_train=0.8)

dc2pd(train_set, name).to_csv(path / "train.csv", index=False)
dc2pd(test_set, name).to_csv(path / "test.csv", index=False)
@@ -66,19 +71,55 @@ def split_w_deepchem(name):
dataset = dataset.complete_shuffle()


def full_main():
for ds_name in mpp_datasets:
if ds_name in ["pdbbind", "pcba"]:
continue
split_w_datasail(ds_name)
split_w_deepchem(ds_name)
def split_w_lohi(name):
base = Path('experiments') / 'MPP' / 'lohi' / 'sdata' / name

dataset = mpp_datasets[name][0](featurizer=dc.feat.DummyFeaturizer(), splitter=None)[1][0]
if name[:2] != "qm":
valid_ids = [i for i, smiles in enumerate(list(dataset.to_dataframe()["X"])) if
Chem.MolFromSmiles(smiles) is not None]
dataset = dataset.select(valid_ids)

for run in range(RUNS):
try:
path = base / "lohi" / f"split_{run}"
os.makedirs(path, exist_ok=True)
df = dc2pd(dataset, name)

with open(path / "start.txt", "w") as start:
print("Start", file=start)

train_test_partition = lohi.hi_train_test_split(
smiles=list(df["SMILES"]),
similarity_threshold=0.4,
train_min_frac=0.7,
test_min_frac=0.1,
coarsening_threshold=0.4,
max_mip_gap=0.1,
verbose=False,
)

df.iloc[train_test_partition[0]].to_csv(path / "train.csv", index=False)
df.iloc[train_test_partition[1]].to_csv(path / "test.csv", index=False)
global count
count += 1
telegram(f"[MPP {count} / 70] Splitting finished for MPP - lohi - {name} - Run {run + 1} / 5")
except Exception as e:
print("=" * 80 + f"\n{e}\n" + "=" * 80)
dataset = dataset.complete_shuffle()


def scnd_main():
for ds_name in ["hiv", "bace", "bbbp", "tox21", "toxcast", "sider", "clintox"]:
split_w_datasail(ds_name)
split_w_deepchem(ds_name)
def main():
for ds_name in sorted(list(mpp_datasets.keys()), key=lambda x: mpp_datasets[x][3]):
if ds_name in ["pdbbind", "pcba"]:
continue
# split_w_datasail(ds_name)
# split_w_deepchem(ds_name)
split_w_lohi(ds_name)


if __name__ == '__main__':
scnd_main()
if len(sys.argv) > 1:
split_w_lohi(sys.argv[1])
else:
main()
59 changes: 28 additions & 31 deletions experiments/MPP/train.py
@@ -1,31 +1,27 @@
import os
import shutil
from pathlib import Path

import chemprop
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

from experiments.utils import mpp_datasets, RUNS, MPP_EPOCHS, telegram


count = 0


def train(model, name):
dfs = {"val": pd.DataFrame({"rows": list(range(50))}), "test": pd.DataFrame({"rows": [0]})}
# store the results in training, validation, and test files
for tech in [x for x in os.listdir(f"experiments/MPP/{model}/cdata/{name}") if os.path.isdir(f"experiments/MPP/{model}/cdata/{name}/{x}")]:
cpath = Path("experiments") / "MPP" / model / "cdata" / name
for tech in [x for x in os.listdir(cpath) if os.path.isdir(cpath / x)]:
for run in range(RUNS):
print(tech, "-", run)
try:
print("Check folder:", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/", end="\t")
print(os.path.exists("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/"))
if os.path.exists("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/"):
print("Delete folder")
shutil.rmtree("experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/", ignore_errors=True)

train_df = pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv")
test_df = pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv")
path = cpath / tech / f"split_{run}"
train_df = pd.read_csv(path / "train.csv")
test_df = pd.read_csv(path / "test.csv")
train_nunique = train_df.nunique()
test_nunique = test_df.nunique()
train_dropable = train_nunique[train_nunique == 1].index
@@ -36,19 +32,19 @@ def train(model, name):
test_df.drop(train_dropable, axis=1, inplace=True)
train_df.drop(test_dropable, axis=1, inplace=True)
test_df.drop(test_dropable, axis=1, inplace=True)
train_df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv", index=False)
test_df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv", index=False)
train_df.to_csv(path / "train.csv", index=False)
test_df.to_csv(path / "test.csv", index=False)

# train the D-MPNN model
targets = list(pd.read_csv(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv").columns)
targets = list(pd.read_csv(path / "train.csv").columns)
targets.remove("SMILES")
targets.remove("ID")
arguments = [
"--data_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/train.csv",
"--separate_val_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv",
"--separate_test_path", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/test.csv",
"--data_path", str(path / "train.csv"),
"--separate_val_path", str(path / "test.csv"),
"--separate_test_path", str(path / "test.csv"),
"--dataset_type", mpp_datasets[name][1],
"--save_dir", f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/",
"--save_dir", str(path),
"--quiet", "--epochs", str(MPP_EPOCHS),
"--smiles_columns", "SMILES",
"--target_columns", *targets,
@@ -61,11 +57,13 @@ def train(model, name):
del args

# extract the data and save them in a CSV file
tb_file = f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/model_0/" + list(sorted(filter(
lambda x: x.startswith("events"), os.listdir(f"experiments/MPP/{model}/cdata/{name}/{tech}/split_{run}/fold_0/model_0/")
tb_path = Path("experiments") / "MPP" / model / "cdata" / name / tech / f"split_{run}" / "fold_0" / \
"model_0"
tb_file = tb_path / list(sorted(filter(
lambda x: x.startswith("events"), os.listdir(tb_path)
)))[-1]
print("File:", tb_file)
ea = EventAccumulator(tb_file)
ea = EventAccumulator(str(tb_file))
ea.Reload()
for long, short in [("validation_", "val"), ("test_", "test")]:
print([m for m in filter(lambda x: x.startswith(long), ea.Tags()["scalars"])])
@@ -77,18 +75,17 @@ def train(model, name):

global count
count += 1
telegram(f"[MPP {count} / 105] Training finished for MPP - {model} - {name} - {tech} - Run {run}/4")
telegram(f"[MPP {count} / 55] Training finished for MPP - lohi - {name} - Run {run + 1} / 5")
except Exception as e:
print(e)
for split, df in dfs.items():
print("Saving:", df.shape, "to", f"experiments/MPP/{model}/cdata/{name}/{split}_metrics.tsv")
df.to_csv(f"experiments/MPP/{model}/cdata/{name}/{split}_metrics.tsv", sep="\t", index=False)

save_path = Path("experiments") / "MPP" / model / "cdata" / name / f"{split}_metrics.tsv"
print("Saving:", df.shape, "to", save_path)
df.to_csv(save_path, sep="\t", index=False)

# for dataset in ["freesolv", "esol", "sider", "clintox", "bace", "bbbp", "lipophilicity", "qm7", "tox21", "toxcast", "qm8", "hiv", "muv", "qm9"]:
for dataset in ["qm7", "qm8", "qm9"]:
# for dataset in ["tox21", "toxcast", "hiv", "muv"]:
for tool in ["datasail", "deepchem"]:
print(dataset, "-", tool)
train(tool, dataset)

for dataset in sorted(list(mpp_datasets.keys()), key=lambda x: mpp_datasets[x][3]):
if dataset in {"qm9", "muv", "bace"}:
continue
print(dataset, "-", "lohi")
train("lohi", dataset)