Weighting of clusters in loss function
Roman Joeres authored and committed Mar 27, 2024
1 parent 6390166 commit acb93d5
Showing 12 changed files with 132 additions and 64 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yaml
@@ -10,11 +10,13 @@ on:
- main
- dev
- dev_1.0
- dev_1.0_weighting
pull_request:
branches:
- main
- dev
- dev_1.0
- dev_1.0_weighting
workflow_dispatch: # makes the workflow manually start-able

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
38 changes: 24 additions & 14 deletions datasail/cluster/ecfp.py
@@ -19,26 +19,31 @@ def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
"""
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

if dataset.type != "M":
raise ValueError("ECFP with Tanimoto-scores can only be applied to molecular data.")

scaffolds = {}
LOGGER.info("Start ECFP clustering")

invalid_mols = []
scaffolds = {}
for name in dataset.names:
scaffold = read_molecule_encoding(dataset.data[name])
if scaffold is None:
mol = Chem.MolFromSmiles(dataset.data[name])
# scaffold = read_molecule_encoding(dataset.data[name])
# if scaffold is None:
if mol is None:
bo, bc = "{", "}"
LOGGER.warning(f"RDKit cannot parse {name} {bo}{dataset.data[name]}{bc}")
invalid_mols.append(name)
continue
try:
scaffolds[name] = MakeScaffoldGeneric(scaffold)
except MolSanitizeException:
LOGGER.warning(f"RDKit cannot parse {name} ({dataset.data[name]})")
invalid_mols.append(name)
continue
scaffolds[name] = mol
# try:
# scaffolds[name] = MakeScaffoldGeneric(scaffold)
# except MolSanitizeException:
# LOGGER.warning(f"RDKit cannot parse {name} ({dataset.data[name]})")
# invalid_mols.append(name)
# continue

for invalid_name in invalid_mols: # obsolete code?
dataset.names.remove(invalid_name)
dataset.data.pop(invalid_name)
@@ -48,15 +53,20 @@ def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
poppable.append(key)
for pop in poppable:
dataset.id_map.pop(pop)

fps = []
dataset.cluster_names = list(set(Chem.MolToSmiles(s) for s in list(scaffolds.values())))
for scaffold in dataset.cluster_names:
fps.append(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(scaffold), 2, nBits=1024))
# dataset.cluster_names = list(set(Chem.MolToSmiles(s) for s in list(scaffolds.values())))
# for scaffold in dataset.cluster_names:
# fps.append(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(scaffold), 2, nBits=1024))
dataset.cluster_names = dataset.names
for name in dataset.names:
fps.append(AllChem.GetMorganFingerprintAsBitVect(scaffolds[name], 2, nBits=1024))

LOGGER.info(f"Reduced {len(dataset.names)} molecules to {len(dataset.cluster_names)}")
LOGGER.info("Compute Tanimoto Coefficients")

run(dataset, fps, method)

dataset.cluster_map = dict((name, Chem.MolToSmiles(scaffolds[name])) for name in dataset.names)
dataset.cluster_map = {name: name for name in dataset.names}
dataset.cluster_weights = {name: 1 for name in dataset.names}
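
In effect, the rewritten run_ecfp skips the generic-scaffold step: every parseable molecule is fingerprinted directly, becomes its own cluster, and gets weight 1. A condensed, self-contained sketch of that flow with made-up input data (not the DataSAIL function itself):

```python
# Condensed sketch of the new per-molecule fingerprint flow; `data` is made-up example input.
from rdkit import Chem
from rdkit.Chem import AllChem

data = {"mol1": "CCO", "mol2": "c1ccccc1", "bad": "not_a_smiles"}

mols, invalid_mols = {}, []
for name, smiles in data.items():
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # unparseable entries are logged and dropped, as in the commit
        print(f"RDKit cannot parse {name} ({smiles})")
        invalid_mols.append(name)
        continue
    mols[name] = mol

names = [n for n in data if n not in invalid_mols]
fps = [AllChem.GetMorganFingerprintAsBitVect(mols[n], 2, nBits=1024) for n in names]

# every molecule is its own cluster with unit weight
cluster_names = list(names)
cluster_map = {n: n for n in names}
cluster_weights = {n: 1 for n in names}
```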

10 changes: 7 additions & 3 deletions datasail/cluster/vectors.py
@@ -8,17 +8,18 @@
from datasail.reader.utils import DataSet
from datasail.settings import LOGGER

# TODO: Exclude all those that do not automatically scale between 0 and 1
SIM_OPTIONS = Literal[
"allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "mcconnaughey", "onbit", "rogotgoldberg",
"russel", "sokal"
"russel", "sokal", "tanimoto"
]

# produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
# boolean only: dice, kulczynski1, rogerstanimoto, russelrao, sokalmichener, sokalsneath, yule
# matching == hamming, manhattan == cityblock (unofficial)
DIST_OPTIONS = Literal[
"canberra", "chebyshev", "cityblock", "euclidean", "hamming", "jaccard", "mahalanobis", "manhattan", "matching",
"minkowski", "sqeuclidean", "tanimoto"
"minkowski", "sqeuclidean"
]


@@ -52,6 +53,8 @@ def get_rdkit_fct(method: SIM_OPTIONS):
return DataStructs.BulkRogotGoldbergSimilarity
if method == "russel":
return DataStructs.BulkRusselSimilarity
if method == "tanimoto":
return DataStructs.BulkTanimotoSimilarity
if method == "sokal":
return DataStructs.BulkSokalSimilarity
raise ValueError(f"Unknown method {method}")
@@ -182,7 +185,8 @@ def run(
method: The similarity measure to use.
"""
if method in get_args(SIM_OPTIONS):
dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method))
# dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method))
dataset.cluster_similarity = rdkit_sim(fps, method)
elif method in get_args(DIST_OPTIONS):
if method == "mahalanobis" and len(fps) <= len(fps[0]):
raise ValueError(
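With "tanimoto" moved from DIST_OPTIONS into SIM_OPTIONS and the min-max rescaling removed, the clustering now works on raw Tanimoto coefficients, which already lie in [0, 1]. A small illustration of what the BulkTanimotoSimilarity branch computes for Morgan bit fingerprints (example molecules are made up):

```python
# Illustration of pairwise Tanimoto similarities on Morgan fingerprints; SMILES are made up.
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

smiles = ["CCO", "CCN", "c1ccccc1"]
fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, nBits=1024) for s in smiles]

# BulkTanimotoSimilarity(fp, fps) returns the similarity of fp to every fingerprint in fps
n = len(fps)
sim = np.ones((n, n))
for i in range(n):
    sim[i, :] = DataStructs.BulkTanimotoSimilarity(fps[i], fps)

print(sim)  # symmetric, 1.0 on the diagonal, all values already in [0, 1]
```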
34 changes: 28 additions & 6 deletions datasail/solver/cluster_1d.py
@@ -6,6 +6,7 @@
import numpy as np

from datasail.solver.utils import solve, cluster_y_constraints, compute_limits, stratification_constraints
from experiments.ablation import david


def solve_c1(
@@ -47,7 +48,7 @@ def solve_c1(
min_lim = compute_limits(epsilon, sum(weights), splits)

x = cvxpy.Variable((len(splits), len(clusters)), boolean=True) # 19
y = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(clusters))] # 20
# y = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(clusters))] # 20

constraints = [cvxpy.sum(x, axis=0) == np.ones((len(clusters)))] # 16

@@ -57,14 +58,35 @@
if s_matrix is not None:
constraints.append(stratification_constraints(s_matrix, splits, delta, x))

constraints += cluster_y_constraints(clusters, y, x, splits) # 18
# constraints += cluster_y_constraints(clusters, y, x, splits) # 18

intra_weights = similarities if similarities is not None else np.max(distances) - distances
# tmp = [[intra_weights[e1, e2] * y[e1][e2] for e2 in range(e1)] for e1 in range(len(clusters))] # 15

# Because of different weights tmp != len(clusters) * (len(clusters) - 1) / 2
tmp = [[weights[e1] * weights[e2] * intra_weights[e1, e2] * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(len(splits))])) for e2 in range(e1 + 1, len(clusters))] for e1 in range(len(clusters))] # 15

intra_weights = similarities if similarities is not None else distances
tmp = [[intra_weights[e1, e2] * y[e1][e2] for e2 in range(e1)] for e1 in range(len(clusters))] # 15
loss = cvxpy.sum([t for tmp_list in tmp for t in tmp_list])
if distances is not None:
loss = -loss
# if distances is not None:
# loss = -loss
# loss += cvxpy.sum([cvxpy.sum([y[e1][e2] for e2 in range(e1)]) for e1 in range(len(clusters))]) # 14
problem = solve(loss, constraints, max_sec, solver, log_file)
# print("============= Evaluation =============")
# y_mat = np.full((len(clusters), len(clusters)), 0)
# w_mat = np.full((len(clusters), len(clusters)), 0)
# for e1 in range(len(clusters)):
# w_mat[e1, e1] = weights[e1] ** 2
# for e2 in range(e1):
# y_mat[e1, e2] = np.max([x[s, e1].value - x[s, e2].value for s in range(len(splits))])
# w_mat[e1, e2] = weights[e1] * weights[e2]
# y_mat[e2, e1] = y_mat[e1, e2]
# w_mat[e2, e1] = w_mat[e1, e2]
# print(problem.value)
# weights = np.array(weights).reshape(-1, 1)
# print(david.eval(np.array([
# [1 if x[0, i].value > 0.1 else -1] for i in range(len(clusters))
# ]), similarities, weights @ weights.T)) # , y_mat=y_mat, w_mat=w_mat))
# print("======================================")

return None if problem is None else {
e: names[s] for s in range(len(splits)) for i, e in enumerate(clusters) if x[s, i].value > 0.1
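The dropped y variables are replaced by a direct convex expression: cvxpy.max over splits of x[s, e1] - x[s, e2] equals 1 exactly when clusters e1 and e2 are assigned to different splits (each cluster sits in exactly one split, so it is 0 otherwise), and each separated pair is charged weights[e1] * weights[e2] * intra_weights[e1, e2]. A toy instance of the same construction, with assumed weights, similarities, and split sizes (requires a mixed-integer-capable solver such as SCIP or GUROBI, as DataSAIL uses):

```python
# Toy sketch (assumed data) of the weighted pairwise leakage term introduced in solve_c1.
import cvxpy
import numpy as np

weights = np.array([3.0, 1.0, 2.0])                 # per-cluster weights (e.g. cluster sizes)
sim = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.2],
                [0.1, 0.2, 1.0]])                   # pairwise cluster similarities
splits = [0.8, 0.2]
n, epsilon = len(weights), 0.1

x = cvxpy.Variable((len(splits), n), boolean=True)
constraints = [cvxpy.sum(x, axis=0) == np.ones(n)]  # each cluster goes to exactly one split
total = weights.sum()
for s, frac in enumerate(splits):                   # rough split-size balance
    constraints += [weights @ x[s, :] >= (frac - epsilon) * total]

# max_s(x[s, e1] - x[s, e2]) is 1 iff clusters e1 and e2 land in different splits,
# so each separated pair is penalised by weights[e1] * weights[e2] * sim[e1, e2]
pair_terms = [
    weights[e1] * weights[e2] * sim[e1, e2]
    * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(len(splits))]))
    for e1 in range(n) for e2 in range(e1 + 1, n)
]
loss = cvxpy.sum(cvxpy.hstack(pair_terms))

problem = cvxpy.Problem(cvxpy.Minimize(loss), constraints)
problem.solve(solver=cvxpy.SCIP)  # any installed MIP solver works here
print(x.value, problem.value)
```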
26 changes: 16 additions & 10 deletions datasail/solver/cluster_2d.py
@@ -12,10 +12,12 @@ def solve_c2(
e_s_matrix: Optional[np.ndarray],
e_similarities: Optional[np.ndarray],
e_distances: Optional[np.ndarray],
e_weights: Optional[np.ndarray],
f_clusters: List[Union[str, int]],
f_s_matrix: Optional[np.ndarray],
f_similarities: Optional[np.ndarray],
f_distances: Optional[np.ndarray],
f_weights: Optional[np.ndarray],
inter: np.ndarray,
delta: float,
epsilon: float,
@@ -35,10 +37,12 @@
e_s_matrix: Stratification for the e-dataset
e_similarities: Pairwise similarity matrix of clusters in the order of their names
e_distances: Pairwise distance matrix of clusters in the order of their names
e_weights: Weights of the clusters in the order of their names in e_clusters
f_clusters: List of cluster names to split from the f-dataset
f_s_matrix: Stratification for the f-dataset
f_similarities: Pairwise similarity matrix of clusters in the order of their names
f_distances: Pairwise distance matrix of clusters in the order of their names
f_weights: Weights of the clusters in the order of their names in f_clusters
inter: Matrix storing the amount of interactions between the entities in the e-clusters and f-clusters
delta: Additive bound for stratification imbalance
epsilon: Additive bound for exceeding the requested split size
@@ -58,14 +62,16 @@
x_f = cvxpy.Variable((len(splits), len(f_clusters)), boolean=True)
x_i = {(e, f): cvxpy.Variable(len(splits), boolean=True) for e in range(len(e_clusters)) for f in
range(len(f_clusters)) if inter[e, f] != 0}
y_e = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(e_clusters))]
y_f = [[cvxpy.Variable(1, boolean=True) for _ in range(f)] for f in range(len(f_clusters))]
# y_e = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(e_clusters))]
# y_f = [[cvxpy.Variable(1, boolean=True) for _ in range(f)] for f in range(len(f_clusters))]

# check if the cluster relations are uniform
e_intra_weights = e_similarities if e_similarities is not None else e_distances
f_intra_weights = f_similarities if f_similarities is not None else f_distances
e_uniform = e_intra_weights is None or np.allclose(e_intra_weights, np.ones_like(e_intra_weights))
f_uniform = f_intra_weights is None or np.allclose(f_intra_weights, np.ones_like(f_intra_weights))
e_intra_weights = e_similarities if e_similarities is not None else 1 - e_distances
f_intra_weights = f_similarities if f_similarities is not None else 1 - f_distances
e_uniform = e_intra_weights is None or np.allclose(e_intra_weights, np.ones_like(e_intra_weights)) or \
np.allclose(e_intra_weights, np.zeros_like(e_intra_weights))
f_uniform = f_intra_weights is None or np.allclose(f_intra_weights, np.ones_like(f_intra_weights)) or \
np.allclose(f_intra_weights, np.zeros_like(f_intra_weights))

def index(x, y):
return (x, y) if (x, y) in x_i else None
@@ -83,12 +89,12 @@ def index(x, y):
interaction_contraints(e_clusters, f_clusters, x_i, constraints, splits, x_e, x_f, min_lim, lambda key: inter[key],
index)

constraints += cluster_y_constraints(e_clusters, y_e, x_e, splits) + \
cluster_y_constraints(f_clusters, y_f, x_f, splits)
# constraints += cluster_y_constraints(e_clusters, y_e, x_e, splits) + \
# cluster_y_constraints(f_clusters, y_f, x_f, splits)

# inter_loss = (np.sum(inter) - sum(cvxpy.sum(x) for x in x_i.values())) / np.sum(inter)
e_loss = leakage_loss(e_uniform, e_intra_weights, y_e, e_clusters, e_similarities)
f_loss = leakage_loss(f_uniform, f_intra_weights, y_f, f_clusters, f_similarities)
e_loss = leakage_loss(e_uniform, e_intra_weights, x_e, e_clusters, e_weights, len(splits))
f_loss = leakage_loss(f_uniform, f_intra_weights, x_f, f_clusters, f_weights, len(splits))

problem = solve(e_loss + f_loss, constraints, max_sec, solver, log_file)

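solve_c2 now converts distances to similarities via 1 - distances and treats a metric as uniform when it is (near-)constant all-ones or all-zeros, in which case the corresponding leakage term carries no information and is skipped. A minimal sketch of that check, with assumed inputs:

```python
# Minimal sketch of the uniformity shortcut; inputs are assumed example matrices.
import numpy as np

def is_uniform(similarities, distances):
    """True when the cluster metric carries no signal, so the leakage loss can be skipped."""
    if similarities is not None:
        intra = similarities
    elif distances is not None:
        intra = 1 - distances
    else:
        return True
    return np.allclose(intra, np.ones_like(intra)) or np.allclose(intra, np.zeros_like(intra))

print(is_uniform(np.ones((3, 3)), None))                     # True: all pairs equally similar
print(is_uniform(None, np.zeros((3, 3))))                    # True: 1 - 0 gives all ones
print(is_uniform(np.array([[1.0, 0.2], [0.2, 1.0]]), None))  # False: real structure to respect
```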
2 changes: 2 additions & 0 deletions datasail/solver/solve.py
@@ -196,11 +196,13 @@ def run_solver(
for c in e_dataset.cluster_names]) if e_dataset.cluster_stratification is not None else None,
e_similarities=e_dataset.cluster_similarity,
e_distances=e_dataset.cluster_distance,
e_weights=np.array([e_dataset.cluster_weights.get(c, 1) for c in e_dataset.cluster_names]),
f_clusters=f_dataset.cluster_names,
f_s_matrix=np.stack([f_dataset.cluster_stratification.get(c, np.zeros(len(dataset.classes)))
for c in f_dataset.cluster_names]) if f_dataset.cluster_stratification is not None else None,
f_similarities=f_dataset.cluster_similarity,
f_distances=f_dataset.cluster_distance,
f_weights=np.array([f_dataset.cluster_weights.get(c, 1) for c in f_dataset.cluster_names]),
inter=cluster_inter,
delta=delta,
epsilon=epsilon,
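The run_solver call now also hands per-cluster weight vectors to solve_c2, built from each dataset's cluster_weights dict with a default of 1 for unlisted clusters. A tiny illustration with hypothetical cluster names:

```python
# Hypothetical illustration of building the per-cluster weight vector passed to solve_c2.
import numpy as np

cluster_weights = {"c1": 5, "c3": 2}      # clusters missing from the dict default to weight 1
cluster_names = ["c1", "c2", "c3"]

# np.array builds the vector from values (np.ndarray would interpret the list as a shape)
e_weights = np.array([cluster_weights.get(c, 1) for c in cluster_names])
print(e_weights)  # [5 1 2]
```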
20 changes: 11 additions & 9 deletions datasail/solver/utils.py
@@ -390,28 +390,30 @@ def collect_results_2d(
def leakage_loss(
uniform: bool,
intra_weights,
y,
x,
clusters,
similarities
weights,
num_splits: int,
):
"""
Compute the leakage loss for the cluster-based double-cold splitting.
Args:
uniform: Boolean flag if the cluster metric is uniform
intra_weights: Weights of the intra-cluster edges
y: Helper variables
x: Variables of the optimization problem
clusters: List of cluster names
similarities: Pairwise similarity matrix of clusters in the order of their names
weights: Weights of the clusters
num_splits: Number of splits
Returns:
Loss describing the leakage between clusters
"""
if uniform:
return 0
else:
if similarities is None:
intra_weights = 1 - intra_weights
tmp = [intra_weights[c1, c2] * y[c1][c2] for c1 in range(len(clusters)) for c2 in range(c1)]
e_loss = cvxpy.sum(tmp)
return e_loss
# tmp = [intra_weights[c1, c2] * y[c1][c2] for c1 in range(len(clusters)) for c2 in range(c1)]
# e_loss = cvxpy.sum(tmp)
tmp = [[weights[e1] * weights[e2] * intra_weights[e1, e2] * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(num_splits)])) for e2 in range(e1 + 1, len(clusters))] for e1 in range(len(clusters))]
loss = cvxpy.sum([t for tmp_list in tmp for t in tmp_list])
return loss
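
Because each column of x sums to one, the max over splits of x[s, e1] - x[s, e2] acts as a pairwise "assigned to different splits" indicator, which is what the new weights[e1] * weights[e2] * intra_weights[e1, e2] terms multiply. A quick numeric sanity check with a fixed assignment (no solver involved, the assignment matrix is made up):

```python
# Numeric check that max_s(x[s, e1] - x[s, e2]) flags pairs placed in different splits.
import numpy as np

x = np.array([[1, 1, 0],    # split 0 holds clusters 0 and 1
              [0, 0, 1]])   # split 1 holds cluster 2

def separated(e1, e2):
    return max(x[s, e1] - x[s, e2] for s in range(x.shape[0]))

print(separated(0, 1))  # 0 -> same split, the pair adds nothing to the loss
print(separated(0, 2))  # 1 -> different splits, the pair adds weights[0] * weights[2] * intra_weights[0, 2]
```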
10 changes: 5 additions & 5 deletions experiments/MPP/split.py
@@ -49,7 +49,7 @@ def split_w_datasail(base_path: Path, name: str, techniques: List[str], solver:
techniques=techniques,
splits=[8, 2],
names=["train", "test"],
runs=1,
runs=5,
solver=solver,
e_type="M",
e_data=dict(df[["ID", "SMILES"]].values.tolist()),
@@ -164,8 +164,8 @@ def split(full_path, name, solver="GUROBI"):
Split the MoleculeNet datasets using different techniques.
"""
split_w_datasail(full_path / "datasail" / name, name, techniques=["I1e", "C1e"], solver=solver)
split_w_deepchem(full_path / "deepchem" / name, name, techniques=SPLITTERS.keys())
split_w_lohi(full_path / "lohi" / name, name)
# split_w_deepchem(full_path / "deepchem" / name, name, techniques=SPLITTERS.keys())
# split_w_lohi(full_path / "lohi" / name, name)


def specific():
@@ -182,9 +182,9 @@ def specific():
if __name__ == '__main__':
if len(sys.argv) == 1:
specific()
if len(sys.argv) == 2:
elif len(sys.argv) == 2:
split_all(Path(sys.argv[1]))
elif len(sys.argv) >= 3:
elif len(sys.argv) == 3:
split(Path(sys.argv[1]), sys.argv[2])
elif len(sys.argv) >= 4:
split(Path(sys.argv[1]), sys.argv[2], sys.argv[3])
28 changes: 23 additions & 5 deletions experiments/MPP/visualize.py
@@ -26,26 +26,41 @@ def compute_il(name, tools, techniques):
data=dict(df[["ID", "SMILES"]].values.tolist()),
id_map={x: x for x in df["ID"].tolist()},
)
# print(len(dataset.names))
dataset.cluster_names, dataset.cluster_map, dataset.cluster_similarity, dataset.cluster_weights = run_ecfp(
dataset
)
# print(dataset.cluster_similarity.shape)
output = {}
if "deepchem" in tools:
print("Use v0.3")
root = Path("/") / "scratch" / "SCRATCH_SAS" / "roman" / "DataSAIL" / "v03" / "MPP"
else:
print("Use v1.0")
root = Path("/") / "home" / "rjo21" / "Desktop" / "DataSAIL" / "v1.0_test"
for tool in tools:
if tool not in output:
output[tool] = {}
for technique in techniques:
# print(technique)
if technique not in TECHNIQUES[tool]:
continue
if technique not in output[tool]:
output[tool][technique] = []
for run in range(RUNS):
base = Path("/") / "scratch" / "SCRATCH_SAS" / "roman" / "DataSAIL" / "v03" / "MPP" / tool / name / technique / f"split_{run}"
base = root / tool / name / technique / f"split_{run}"
train_ids = pd.read_csv(base / "train.csv")["ID"]
test_ids = pd.read_csv(base / "test.csv")["ID"]
df["assi"] = df["ID"].apply(lambda x: 1 if x in train_ids.values else -1 if x in test_ids.values else None)
df["assi"] = df["ID"].apply(lambda x: 1 if x in train_ids.values else -1 if x in test_ids.values else 0)
df.dropna(subset=["assi"], inplace=True)
il = david.eval(df["assi"].to_numpy().reshape(-1, 1), dataset.cluster_similarity)
il = david.eval(
df["assi"].to_numpy().reshape(-1, 1),
dataset.cluster_similarity,
[dataset.cluster_weights[c] for c in dataset.cluster_names],
)
output[tool][technique].append(il)
# print(output)
# break
return output


@@ -184,12 +199,15 @@ def comp_all_il():
except Exception as e:
print(f"Failed for {name}")
print(e)
with open("il.pkl", "wb") as f:
with open(f"il.pkl", "wb") as f:
pickle.dump(output, f)


if __name__ == '__main__':
comp_all_il()
# print("ESOL :", compute_il("esol", ["datasail"], ["I1e", "C1e"]))
print("FreeSolv:", compute_il("freesolv", ["datasail"], ["I1e", "C1e"]))
print("FreeSolv:", compute_il("freesolv", ["deepchem", "lohi"], TECHNIQUES["deepchem"] + TECHNIQUES["lohi"]))
# comp_all_il()
# compute_il("esol", ["datasail"], ["I1e", "C1e"])
# plot_double(Path(sys.argv[1]), ["QM8", "Tox21"])
# heatmap_plot(Path(sys.argv[1]))
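
david.eval itself is not part of this diff; judging only from how it is called here (an assignment vector of 1/-1/0, the cluster similarity matrix, and per-cluster weights), a weighted leakage score could be read as the weight-scaled similarity that crosses the train/test boundary, normalised by the total weighted similarity. The sketch below is that interpretation for illustration only, not DataSAIL's actual implementation:

```python
# Hedged sketch of a weighted leakage score with the same call shape as david.eval in this diff.
# This is an interpretation for illustration, not DataSAIL's actual implementation.
import numpy as np

def weighted_leakage(assignment, similarity, weights):
    assignment = np.asarray(assignment).reshape(-1)
    weights = np.asarray(weights, dtype=float)
    # pairs on different sides (train vs. test); unassigned entries (0) never trigger this
    separated = (assignment[:, None] * assignment[None, :]) == -1
    pair_weights = weights[:, None] * weights[None, :]
    off_diag = ~np.eye(len(weights), dtype=bool)
    leaked = np.sum(similarity * pair_weights * separated) / 2   # each pair counted once
    total = np.sum(similarity * pair_weights * off_diag) / 2
    return leaked / total if total > 0 else 0.0

assi = [1, 1, -1]  # two clusters in train, one in test
sim = np.array([[1.0, 0.9, 0.1], [0.9, 1.0, 0.3], [0.1, 0.3, 1.0]])
print(weighted_leakage(assi, sim, [2, 1, 1]))  # fraction of weighted similarity crossing the split
```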