Weighting of clusters in loss function
Roman Joeres authored and committed Mar 27, 2024
1 parent 6390166 commit acb93d5
Showing 12 changed files with 132 additions and 64 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yaml
@@ -10,11 +10,13 @@ on:
- main
- dev
- dev_1.0
- dev_1.0_weighting
pull_request:
branches:
- main
- dev
- dev_1.0
- dev_1.0_weighting
workflow_dispatch: # makes the workflow manually start-able

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
38 changes: 24 additions & 14 deletions datasail/cluster/ecfp.py
@@ -19,26 +19,31 @@ def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
"""
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

if dataset.type != "M":
raise ValueError("ECFP with Tanimoto-scores can only be applied to molecular data.")

scaffolds = {}
LOGGER.info("Start ECFP clustering")

invalid_mols = []
scaffolds = {}
for name in dataset.names:
scaffold = read_molecule_encoding(dataset.data[name])
if scaffold is None:
mol = Chem.MolFromSmiles(dataset.data[name])
# scaffold = read_molecule_encoding(dataset.data[name])
# if scaffold is None:
if mol is None:
bo, bc = "{", "}"
LOGGER.warning(f"RDKit cannot parse {name} {bo}{dataset.data[name]}{bc}")
invalid_mols.append(name)
continue
try:
scaffolds[name] = MakeScaffoldGeneric(scaffold)
except MolSanitizeException:
LOGGER.warning(f"RDKit cannot parse {name} ({dataset.data[name]})")
invalid_mols.append(name)
continue
scaffolds[name] = mol
# try:
# scaffolds[name] = MakeScaffoldGeneric(scaffold)
# except MolSanitizeException:
# LOGGER.warning(f"RDKit cannot parse {name} ({dataset.data[name]})")
# invalid_mols.append(name)
# continue

for invalid_name in invalid_mols: # obsolete code?
dataset.names.remove(invalid_name)
dataset.data.pop(invalid_name)
@@ -48,15 +53,20 @@ def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
poppable.append(key)
for pop in poppable:
dataset.id_map.pop(pop)

fps = []
dataset.cluster_names = list(set(Chem.MolToSmiles(s) for s in list(scaffolds.values())))
for scaffold in dataset.cluster_names:
fps.append(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(scaffold), 2, nBits=1024))
# dataset.cluster_names = list(set(Chem.MolToSmiles(s) for s in list(scaffolds.values())))
# for scaffold in dataset.cluster_names:
# fps.append(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(scaffold), 2, nBits=1024))
dataset.cluster_names = dataset.names
for name in dataset.names:
fps.append(AllChem.GetMorganFingerprintAsBitVect(scaffolds[name], 2, nBits=1024))

LOGGER.info(f"Reduced {len(dataset.names)} molecules to {len(dataset.cluster_names)}")
LOGGER.info("Compute Tanimoto Coefficients")

run(dataset, fps, method)

dataset.cluster_map = dict((name, Chem.MolToSmiles(scaffolds[name])) for name in dataset.names)
dataset.cluster_map = {name: name for name in dataset.names}
dataset.cluster_weights = {name: 1 for name in dataset.names}
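
In effect, the rewritten run_ecfp skips the generic-scaffold step: every parseable molecule is fingerprinted directly, becomes its own cluster, and gets weight 1. A condensed, self-contained sketch of that flow with made-up input data (not the DataSAIL function itself):

```python
# Condensed sketch of the new per-molecule fingerprint flow; `data` is made-up example input.
from rdkit import Chem
from rdkit.Chem import AllChem

data = {"mol1": "CCO", "mol2": "c1ccccc1", "bad": "not_a_smiles"}

mols, invalid_mols = {}, []
for name, smiles in data.items():
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # unparseable entries are logged and dropped, as in the commit
        print(f"RDKit cannot parse {name} ({smiles})")
        invalid_mols.append(name)
        continue
    mols[name] = mol

names = [n for n in data if n not in invalid_mols]
fps = [AllChem.GetMorganFingerprintAsBitVect(mols[n], 2, nBits=1024) for n in names]

# every molecule is its own cluster with unit weight
cluster_names = list(names)
cluster_map = {n: n for n in names}
cluster_weights = {n: 1 for n in names}
```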

10 changes: 7 additions & 3 deletions datasail/cluster/vectors.py
@@ -8,17 +8,18 @@
from datasail.reader.utils import DataSet
from datasail.settings import LOGGER

# TODO: Exclude all those that do not automatically scale between 0 and 1
SIM_OPTIONS = Literal[
"allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "mcconnaughey", "onbit", "rogotgoldberg",
"russel", "sokal"
"russel", "sokal", "tanimoto"
]

# produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
# boolean only: dice, kulczynski1, rogerstanimoto, russelrao, sokalmichener, sokalsneath, yule
# matching == hamming, manhattan == cityblock (unofficial)
DIST_OPTIONS = Literal[
"canberra", "chebyshev", "cityblock", "euclidean", "hamming", "jaccard", "mahalanobis", "manhattan", "matching",
"minkowski", "sqeuclidean", "tanimoto"
"minkowski", "sqeuclidean"
]


@@ -52,6 +53,8 @@ def get_rdkit_fct(method: SIM_OPTIONS):
return DataStructs.BulkRogotGoldbergSimilarity
if method == "russel":
return DataStructs.BulkRusselSimilarity
if method == "tanimoto":
return DataStructs.BulkTanimotoSimilarity
if method == "sokal":
return DataStructs.BulkSokalSimilarity
raise ValueError(f"Unknown method {method}")
@@ -182,7 +185,8 @@ def run(
method: The similarity measure to use.
"""
if method in get_args(SIM_OPTIONS):
dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method))
# dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method))
dataset.cluster_similarity = rdkit_sim(fps, method)
elif method in get_args(DIST_OPTIONS):
if method == "mahalanobis" and len(fps) <= len(fps[0]):
raise ValueError(
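With "tanimoto" moved from DIST_OPTIONS into SIM_OPTIONS and the min-max rescaling removed, the clustering now works on raw Tanimoto coefficients, which already lie in [0, 1]. A small illustration of what the BulkTanimotoSimilarity branch computes for Morgan bit fingerprints (example molecules are made up):

```python
# Illustration of pairwise Tanimoto similarities on Morgan fingerprints; SMILES are made up.
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

smiles = ["CCO", "CCN", "c1ccccc1"]
fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, nBits=1024) for s in smiles]

# BulkTanimotoSimilarity(fp, fps) returns the similarity of fp to every fingerprint in fps
n = len(fps)
sim = np.ones((n, n))
for i in range(n):
    sim[i, :] = DataStructs.BulkTanimotoSimilarity(fps[i], fps)

print(sim)  # symmetric, 1.0 on the diagonal, all values already in [0, 1]
```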
34 changes: 28 additions & 6 deletions datasail/solver/cluster_1d.py
@@ -6,6 +6,7 @@
import numpy as np

from datasail.solver.utils import solve, cluster_y_constraints, compute_limits, stratification_constraints
from experiments.ablation import david


def solve_c1(
@@ -47,7 +48,7 @@ def solve_c1(
min_lim = compute_limits(epsilon, sum(weights), splits)

x = cvxpy.Variable((len(splits), len(clusters)), boolean=True) # 19
y = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(clusters))] # 20
# y = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(clusters))] # 20

constraints = [cvxpy.sum(x, axis=0) == np.ones((len(clusters)))] # 16

@@ -57,14 +58,35 @@
if s_matrix is not None:
constraints.append(stratification_constraints(s_matrix, splits, delta, x))

constraints += cluster_y_constraints(clusters, y, x, splits) # 18
# constraints += cluster_y_constraints(clusters, y, x, splits) # 18

intra_weights = similarities if similarities is not None else np.max(distances) - distances
# tmp = [[intra_weights[e1, e2] * y[e1][e2] for e2 in range(e1)] for e1 in range(len(clusters))] # 15

# Because of different weights tmp != len(clusters) * (len(clusters) - 1) / 2
tmp = [[weights[e1] * weights[e2] * intra_weights[e1, e2] * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(len(splits))])) for e2 in range(e1 + 1, len(clusters))] for e1 in range(len(clusters))] # 15

intra_weights = similarities if similarities is not None else distances
tmp = [[intra_weights[e1, e2] * y[e1][e2] for e2 in range(e1)] for e1 in range(len(clusters))] # 15
loss = cvxpy.sum([t for tmp_list in tmp for t in tmp_list])
if distances is not None:
loss = -loss
# if distances is not None:
# loss = -loss
# loss += cvxpy.sum([cvxpy.sum([y[e1][e2] for e2 in range(e1)]) for e1 in range(len(clusters))]) # 14
problem = solve(loss, constraints, max_sec, solver, log_file)
# print("============= Evaluation =============")
# y_mat = np.full((len(clusters), len(clusters)), 0)
# w_mat = np.full((len(clusters), len(clusters)), 0)
# for e1 in range(len(clusters)):
# w_mat[e1, e1] = weights[e1] ** 2
# for e2 in range(e1):
# y_mat[e1, e2] = np.max([x[s, e1].value - x[s, e2].value for s in range(len(splits))])
# w_mat[e1, e2] = weights[e1] * weights[e2]
# y_mat[e2, e1] = y_mat[e1, e2]
# w_mat[e2, e1] = w_mat[e1, e2]
# print(problem.value)
# weights = np.array(weights).reshape(-1, 1)
# print(david.eval(np.array([
# [1 if x[0, i].value > 0.1 else -1] for i in range(len(clusters))
# ]), similarities, weights @ weights.T)) # , y_mat=y_mat, w_mat=w_mat))
# print("======================================")

return None if problem is None else {
e: names[s] for s in range(len(splits)) for i, e in enumerate(clusters) if x[s, i].value > 0.1
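The dropped y variables are replaced by a direct convex expression: cvxpy.max over splits of x[s, e1] - x[s, e2] equals 1 exactly when clusters e1 and e2 are assigned to different splits (each cluster sits in exactly one split, so it is 0 otherwise), and each separated pair is charged weights[e1] * weights[e2] * intra_weights[e1, e2]. A toy instance of the same construction, with assumed weights, similarities, and split sizes (requires a mixed-integer-capable solver such as SCIP or GUROBI, as DataSAIL uses):

```python
# Toy sketch (assumed data) of the weighted pairwise leakage term introduced in solve_c1.
import cvxpy
import numpy as np

weights = np.array([3.0, 1.0, 2.0])                 # per-cluster weights (e.g. cluster sizes)
sim = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.2],
                [0.1, 0.2, 1.0]])                   # pairwise cluster similarities
splits = [0.8, 0.2]
n, epsilon = len(weights), 0.1

x = cvxpy.Variable((len(splits), n), boolean=True)
constraints = [cvxpy.sum(x, axis=0) == np.ones(n)]  # each cluster goes to exactly one split
total = weights.sum()
for s, frac in enumerate(splits):                   # rough split-size balance
    constraints += [weights @ x[s, :] >= (frac - epsilon) * total]

# max_s(x[s, e1] - x[s, e2]) is 1 iff clusters e1 and e2 land in different splits,
# so each separated pair is penalised by weights[e1] * weights[e2] * sim[e1, e2]
pair_terms = [
    weights[e1] * weights[e2] * sim[e1, e2]
    * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(len(splits))]))
    for e1 in range(n) for e2 in range(e1 + 1, n)
]
loss = cvxpy.sum(cvxpy.hstack(pair_terms))

problem = cvxpy.Problem(cvxpy.Minimize(loss), constraints)
problem.solve(solver=cvxpy.SCIP)  # any installed MIP solver works here
print(x.value, problem.value)
```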
26 changes: 16 additions & 10 deletions datasail/solver/cluster_2d.py
@@ -12,10 +12,12 @@ def solve_c2(
e_s_matrix: Optional[np.ndarray],
e_similarities: Optional[np.ndarray],
e_distances: Optional[np.ndarray],
e_weights: Optional[np.ndarray],
f_clusters: List[Union[str, int]],
f_s_matrix: Optional[np.ndarray],
f_similarities: Optional[np.ndarray],
f_distances: Optional[np.ndarray],
f_weights: Optional[np.ndarray],
inter: np.ndarray,
delta: float,
epsilon: float,
@@ -35,10 +37,12 @@
e_s_matrix: Stratification for the e-dataset
e_similarities: Pairwise similarity matrix of clusters in the order of their names
e_distances: Pairwise distance matrix of clusters in the order of their names
e_weights: Weights of the clusters in the order of their names in e_clusters
f_clusters: List of cluster names to split from the f-dataset
f_s_matrix: Stratification for the f-dataset
f_similarities: Pairwise similarity matrix of clusters in the order of their names
f_distances: Pairwise distance matrix of clusters in the order of their names
f_weights: Weights of the clusters in the order of their names in f_clusters
inter: Matrix storing the amount of interactions between the entities in the e-clusters and f-clusters
delta: Additive bound for stratification imbalance
epsilon: Additive bound for exceeding the requested split size
@@ -58,14 +62,16 @@
x_f = cvxpy.Variable((len(splits), len(f_clusters)), boolean=True)
x_i = {(e, f): cvxpy.Variable(len(splits), boolean=True) for e in range(len(e_clusters)) for f in
range(len(f_clusters)) if inter[e, f] != 0}
y_e = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(e_clusters))]
y_f = [[cvxpy.Variable(1, boolean=True) for _ in range(f)] for f in range(len(f_clusters))]
# y_e = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(e_clusters))]
# y_f = [[cvxpy.Variable(1, boolean=True) for _ in range(f)] for f in range(len(f_clusters))]

# check if the cluster relations are uniform
e_intra_weights = e_similarities if e_similarities is not None else e_distances
f_intra_weights = f_similarities if f_similarities is not None else f_distances
e_uniform = e_intra_weights is None or np.allclose(e_intra_weights, np.ones_like(e_intra_weights))
f_uniform = f_intra_weights is None or np.allclose(f_intra_weights, np.ones_like(f_intra_weights))
e_intra_weights = e_similarities if e_similarities is not None else 1 - e_distances
f_intra_weights = f_similarities if f_similarities is not None else 1 - f_distances
e_uniform = e_intra_weights is None or np.allclose(e_intra_weights, np.ones_like(e_intra_weights)) or \
np.allclose(e_intra_weights, np.zeros_like(e_intra_weights))
f_uniform = f_intra_weights is None or np.allclose(f_intra_weights, np.ones_like(f_intra_weights)) or \
np.allclose(f_intra_weights, np.zeros_like(f_intra_weights))

def index(x, y):
return (x, y) if (x, y) in x_i else None
@@ -83,12 +89,12 @@ def index(x, y):
interaction_contraints(e_clusters, f_clusters, x_i, constraints, splits, x_e, x_f, min_lim, lambda key: inter[key],
index)

constraints += cluster_y_constraints(e_clusters, y_e, x_e, splits) + \
cluster_y_constraints(f_clusters, y_f, x_f, splits)
# constraints += cluster_y_constraints(e_clusters, y_e, x_e, splits) + \
# cluster_y_constraints(f_clusters, y_f, x_f, splits)

# inter_loss = (np.sum(inter) - sum(cvxpy.sum(x) for x in x_i.values())) / np.sum(inter)
e_loss = leakage_loss(e_uniform, e_intra_weights, y_e, e_clusters, e_similarities)
f_loss = leakage_loss(f_uniform, f_intra_weights, y_f, f_clusters, f_similarities)
e_loss = leakage_loss(e_uniform, e_intra_weights, x_e, e_clusters, e_weights, len(splits))
f_loss = leakage_loss(f_uniform, f_intra_weights, x_f, f_clusters, f_weights, len(splits))

problem = solve(e_loss + f_loss, constraints, max_sec, solver, log_file)

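solve_c2 now converts distances to similarities via 1 - distances and treats a metric as uniform when it is (near-)constant all-ones or all-zeros, in which case the corresponding leakage term carries no information and is skipped. A minimal sketch of that check, with assumed inputs:

```python
# Minimal sketch of the uniformity shortcut; inputs are assumed example matrices.
import numpy as np

def is_uniform(similarities, distances):
    """True when the cluster metric carries no signal, so the leakage loss can be skipped."""
    if similarities is not None:
        intra = similarities
    elif distances is not None:
        intra = 1 - distances
    else:
        return True
    return np.allclose(intra, np.ones_like(intra)) or np.allclose(intra, np.zeros_like(intra))

print(is_uniform(np.ones((3, 3)), None))                     # True: all pairs equally similar
print(is_uniform(None, np.zeros((3, 3))))                    # True: 1 - 0 gives all ones
print(is_uniform(np.array([[1.0, 0.2], [0.2, 1.0]]), None))  # False: real structure to respect
```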
2 changes: 2 additions & 0 deletions datasail/solver/solve.py
@@ -196,11 +196,13 @@ def run_solver(
for c in e_dataset.cluster_names]) if e_dataset.cluster_stratification is not None else None,
e_similarities=e_dataset.cluster_similarity,
e_distances=e_dataset.cluster_distance,
e_weights=np.array([e_dataset.cluster_weights.get(c, 1) for c in e_dataset.cluster_names]),
f_clusters=f_dataset.cluster_names,
f_s_matrix=np.stack([f_dataset.cluster_stratification.get(c, np.zeros(len(dataset.classes)))
for c in f_dataset.cluster_names]) if f_dataset.cluster_stratification is not None else None,
f_similarities=f_dataset.cluster_similarity,
f_distances=f_dataset.cluster_distance,
f_weights=np.array([f_dataset.cluster_weights.get(c, 1) for c in f_dataset.cluster_names]),
inter=cluster_inter,
delta=delta,
epsilon=epsilon,
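The run_solver call now also hands per-cluster weight vectors to solve_c2, built from each dataset's cluster_weights dict with a default of 1 for unlisted clusters. A tiny illustration with hypothetical cluster names:

```python
# Hypothetical illustration of building the per-cluster weight vector passed to solve_c2.
import numpy as np

cluster_weights = {"c1": 5, "c3": 2}      # clusters missing from the dict default to weight 1
cluster_names = ["c1", "c2", "c3"]

# np.array builds the vector from values (np.ndarray would interpret the list as a shape)
e_weights = np.array([cluster_weights.get(c, 1) for c in cluster_names])
print(e_weights)  # [5 1 2]
```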
20 changes: 11 additions & 9 deletions datasail/solver/utils.py
@@ -390,28 +390,30 @@ def collect_results_2d(
def leakage_loss(
uniform: bool,
intra_weights,
y,
x,
clusters,
similarities
weights,
num_splits: int,
):
"""
Compute the leakage loss for the cluster-based double-cold splitting.
Args:
uniform: Boolean flag if the cluster metric is uniform
intra_weights: Weights of the intra-cluster edges
y: Helper variables
x: Variables of the optimization problem
clusters: List of cluster names
similarities: Pairwise similarity matrix of clusters in the order of their names
weights: Weights of the clusters
num_splits: Number of splits
Returns:
Loss describing the leakage between clusters
"""
if uniform:
return 0
else:
if similarities is None:
intra_weights = 1 - intra_weights
tmp = [intra_weights[c1, c2] * y[c1][c2] for c1 in range(len(clusters)) for c2 in range(c1)]
e_loss = cvxpy.sum(tmp)
return e_loss
# tmp = [intra_weights[c1, c2] * y[c1][c2] for c1 in range(len(clusters)) for c2 in range(c1)]
# e_loss = cvxpy.sum(tmp)
tmp = [[weights[e1] * weights[e2] * intra_weights[e1, e2] * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(num_splits)])) for e2 in range(e1 + 1, len(clusters))] for e1 in range(len(clusters))]
loss = cvxpy.sum([t for tmp_list in tmp for t in tmp_list])
return loss
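
Because each column of x sums to one, the max over splits of x[s, e1] - x[s, e2] acts as a pairwise "assigned to different splits" indicator, which is what the new weights[e1] * weights[e2] * intra_weights[e1, e2] terms multiply. A quick numeric sanity check with a fixed assignment (no solver involved, the assignment matrix is made up):

```python
# Numeric check that max_s(x[s, e1] - x[s, e2]) flags pairs placed in different splits.
import numpy as np

x = np.array([[1, 1, 0],    # split 0 holds clusters 0 and 1
              [0, 0, 1]])   # split 1 holds cluster 2

def separated(e1, e2):
    return max(x[s, e1] - x[s, e2] for s in range(x.shape[0]))

print(separated(0, 1))  # 0 -> same split, the pair adds nothing to the loss
print(separated(0, 2))  # 1 -> different splits, the pair adds weights[0] * weights[2] * intra_weights[0, 2]
```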
10 changes: 5 additions & 5 deletions experiments/MPP/split.py
@@ -49,7 +49,7 @@ def split_w_datasail(base_path: Path, name: str, techniques: List[str], solver:
techniques=techniques,
splits=[8, 2],
names=["train", "test"],
runs=1,
runs=5,
solver=solver,
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist()),
@@ -164,8 +164,8 @@ def split(full_path, name, solver="GUROBI"):
Split the MoleculeNet datasets using different techniques.
"""
split_w_datasail(full_path / "datasail" / name, name, techniques=["I1e", "C1e"], solver=solver)
split_w_deepchem(full_path / "deepchem" / name, name, techniques=SPLITTERS.keys())
split_w_lohi(full_path / "lohi" / name, name)
# split_w_deepchem(full_path / "deepchem" / name, name, techniques=SPLITTERS.keys())
# split_w_lohi(full_path / "lohi" / name, name)


def specific():
@@ -182,9 +182,9 @@ def specific():
if __name__ == '__main__':
if len(sys.argv) == 1:
specific()
if len(sys.argv) == 2:
elif len(sys.argv) == 2:
split_all(Path(sys.argv[1]))
elif len(sys.argv) >= 3:
elif len(sys.argv) == 3:
split(Path(sys.argv[1]), sys.argv[2])
elif len(sys.argv) >= 4:
split(Path(sys.argv[1]), sys.argv[2], sys.argv[3])
28 changes: 23 additions & 5 deletions experiments/MPP/visualize.py
@@ -26,26 +26,41 @@ def compute_il(name, tools, techniques):
data=dict(df[["ID", "SMILES"]].values.tolist()),
id_map={x: x for x in df["ID"].tolist()},
)
# print(len(dataset.names))
dataset.cluster_names, dataset.cluster_map, dataset.cluster_similarity, dataset.cluster_weights = run_ecfp(
dataset
)
# print(dataset.cluster_similarity.shape)
output = {}
if "deepchem" in tools:
print("Use v0.3")
root = Path("/") / "scratch" / "SCRATCH_SAS" / "roman" / "DataSAIL" / "v03" / "MPP"
else:
print("Use v1.0")
root = Path("/") / "home" / "rjo21" / "Desktop" / "DataSAIL" / "v1.0_test"
for tool in tools:
if tool not in output:
output[tool] = {}
for technique in techniques:
# print(technique)
if technique not in TECHNIQUES[tool]:
continue
if technique not in output[tool]:
output[tool][technique] = []
for run in range(RUNS):
base = Path("/") / "scratch" / "SCRATCH_SAS" / "roman" / "DataSAIL" / "v03" / "MPP" / tool / name / technique / f"split_{run}"
base = root / tool / name / technique / f"split_{run}"
train_ids = pd.read_csv(base / "train.csv")["ID"]
test_ids = pd.read_csv(base / "test.csv")["ID"]
df["assi"] = df["ID"].apply(lambda x: 1 if x in train_ids.values else -1 if x in test_ids.values else None)
df["assi"] = df["ID"].apply(lambda x: 1 if x in train_ids.values else -1 if x in test_ids.values else 0)
df.dropna(subset=["assi"], inplace=True)
il = david.eval(df["assi"].to_numpy().reshape(-1, 1), dataset.cluster_similarity)
il = david.eval(
df["assi"].to_numpy().reshape(-1, 1),
dataset.cluster_similarity,
[dataset.cluster_weights[c] for c in dataset.cluster_names],
)
output[tool][technique].append(il)
# print(output)
# break
return output


@@ -184,12 +199,15 @@ def comp_all_il():
except Exception as e:
print(f"Failed for {name}")
print(e)
with open("il.pkl", "wb") as f:
with open(f"il.pkl", "wb") as f:
pickle.dump(output, f)


if __name__ == '__main__':
comp_all_il()
# print("ESOL :", compute_il("esol", ["datasail"], ["I1e", "C1e"]))
print("FreeSolv:", compute_il("freesolv", ["datasail"], ["I1e", "C1e"]))
print("FreeSolv:", compute_il("freesolv", ["deepchem", "lohi"], TECHNIQUES["deepchem"] + TECHNIQUES["lohi"]))
# comp_all_il()
# compute_il("esol", ["datasail"], ["I1e", "C1e"])
# plot_double(Path(sys.argv[1]), ["QM8", "Tox21"])
# heatmap_plot(Path(sys.argv[1]))
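
david.eval itself is not part of this diff; judging only from how it is called here (an assignment vector of 1/-1/0, the cluster similarity matrix, and per-cluster weights), a weighted leakage score could be read as the weight-scaled similarity that crosses the train/test boundary, normalised by the total weighted similarity. The sketch below is that interpretation for illustration only, not DataSAIL's actual implementation:

```python
# Hedged sketch of a weighted leakage score with the same call shape as david.eval in this diff.
# This is an interpretation for illustration, not DataSAIL's actual implementation.
import numpy as np

def weighted_leakage(assignment, similarity, weights):
    assignment = np.asarray(assignment).reshape(-1)
    weights = np.asarray(weights, dtype=float)
    # pairs on different sides (train vs. test); unassigned entries (0) never trigger this
    separated = (assignment[:, None] * assignment[None, :]) == -1
    pair_weights = weights[:, None] * weights[None, :]
    off_diag = ~np.eye(len(weights), dtype=bool)
    leaked = np.sum(similarity * pair_weights * separated) / 2   # each pair counted once
    total = np.sum(similarity * pair_weights * off_diag) / 2
    return leaked / total if total > 0 else 0.0

assi = [1, 1, -1]  # two clusters in train, one in test
sim = np.array([[1.0, 0.9, 0.1], [0.9, 1.0, 0.3], [0.1, 0.3, 1.0]])
print(weighted_leakage(assi, sim, [2, 1, 1]))  # fraction of weighted similarity crossing the split
```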