Add wl4s2v2 and update args and readme
dongkwan-kim committed Jun 26, 2024
1 parent 9e9ff19 commit a3993ae
Showing 4 changed files with 149 additions and 47 deletions.
48 changes: 41 additions & 7 deletions WL4S/README.md
@@ -4,20 +4,54 @@ This is a very small follow-up work of S2N.

## Install

This repository has been confirmed to work on `nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04`
and `nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04`. GPUs are not required.

```bash
bash install.sh
```

## Datasets

Set `--dataset_path`.

Dataset files (`raw`) can be downloaded from https://github.com/mims-harvard/SubGNN.
Additionally, `raw/glass_embeddings.pth` can be downloaded from https://github.com/Xi-yuanWang/GLASS/tree/main/Emb.
Then, `SubgraphDataset.process` automatically generates the `processed_{dataset}_42_False_undirected` folder.

```
ls /mnt/nas2/GNN-DATA/SUBGRAPH
COMPONENT CORENESS CUTRATIO DENSITY EMUSER HPOMETAB HPONEURO PPIBP
ls /mnt/nas2/GNN-DATA/SUBGRAPH/PPIBP
processed_PPIBP_42_False_undirected raw
ls /mnt/nas2/GNN-DATA/SUBGRAPH/PPIBP/raw
degree_sequence.txt edge_list.txt ego_graphs.txt gin_embeddings.pth glass_embeddings.pth graphsaint_gcn_embeddings.pth shortest_path_matrix.npy similarities subgraphs.pth
ls /mnt/nas2/GNN-DATA/SUBGRAPH/PPIBP/processed_PPIBP_42_False_undirected
args.txt data.pt global_gin.pt global_glass.pt global_graphsaint_gcn.pt meta.pt pre_filter.pt pre_transform.pt
```
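For reference, a minimal Python sketch to check that the `raw` files listed above are in place before the first run. The file names mirror the listing; the comment about `SubgraphDataset.process` restates the note above (the dataset class itself is not imported here).

```python
# Sanity-check the raw/ layout before the first run; standard library only.
from pathlib import Path

dataset_path = Path("/mnt/nas2/GNN-DATA/SUBGRAPH")  # the value passed via --dataset_path
raw_dir = dataset_path / "PPIBP" / "raw"
required = [
    "degree_sequence.txt", "edge_list.txt", "ego_graphs.txt",
    "gin_embeddings.pth", "glass_embeddings.pth", "graphsaint_gcn_embeddings.pth",
    "shortest_path_matrix.npy", "similarities", "subgraphs.pth",
]
missing = [name for name in required if not (raw_dir / name).exists()]
print("missing raw files:", missing or "none")
# On the first run, SubgraphDataset.process then writes
# PPIBP/processed_PPIBP_42_False_undirected/ next to raw/.
```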

## Run

```bash
python wl4s.py --MODE hp_search_for_models --dataset_name PPIBP
python wl4s.py --MODE hp_search_for_models --dataset_name EMUser
# For HPOMetab, search hparams with one run.
python wl4s.py --MODE hp_search_for_models --dataset_name HPOMetab --runs 1
# For PPIBP, use k=D (i.e., inf; connected) with precomputed kernels, then run SVC (C=1.28) with 2 seeds.
python wl4s.py --MODE run_one --dataset_name PPIBP --stype connected --dtype kernel --wl_cumcat False --hist_norm True --runs 2 --C 1.28
# For EMUser, use k=0 (separated) with histogram features, then run SVC with a linear kernel.
python wl4s.py --MODE run_one --dataset_name EMUser --stype separated --dtype histogram --kernel linear --wl_cumcat False --hist_norm False --runs 3 --C 0.08

python wl4s.py --MODE hp_search_syn
python wl4s.py --MODE hp_search_real
python wl4s2.py --MODE hp_search_syn
python wl4s2.py --MODE hp_search_real
# For PPIBP, mix k=0 and k=D with coefficients a_c=0.99 (connected) and a_s=0.01 (separated).
python3 wl4s2v2.py --MODE run_one --dataset_name PPIBP --wl_cumcat False --hist_norm True --a_c 0.99 --a_s 0.01 --runs 3 --C 1.28
# Search hparams for k=0&D
python3 wl4s2v2.py --MODE hp_search_real
python3 wl4s2v2.py --MODE hp_search_syn

python wl4s_k.py
# Run wl4s for all k in [0, 1, 2, D]
python3 wl4s_k.py --MODE real_k
python3 wl4s_k.py --MODE syn_k
# Pre-compute kernels for PPIBP and EMUser (if k > 0, this requires substantial time and memory)
python3 wl4s_k.py --MODE real_precomputation
```
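As a rough, self-contained illustration of what the two `--dtype` settings hand to scikit-learn (toy random data, not this repository's WL pipeline): `histogram` feeds feature vectors to the SVM directly, while `kernel` passes precomputed Gram matrices to `SVC(kernel="precomputed")`.

```python
# Illustrative only: random stand-ins for WL histograms, not the repo's pipeline.
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X_train, X_test = rng.random((80, 16)), rng.random((20, 16))  # WL histogram stand-ins
y_train, y_test = rng.integers(0, 2, 80), rng.integers(0, 2, 20)

# --dtype histogram: features go straight into the SVM (cf. --kernel linear above).
clf = SVC(C=0.08, kernel="linear").fit(X_train, y_train)
print("histogram acc:", clf.score(X_test, y_test))

# --dtype kernel: build Gram matrices once, then fit on them.
K_train = X_train @ X_train.T   # (n_train, n_train)
K_test = X_test @ X_train.T     # (n_test, n_train)
svc = SVC(C=1.28, kernel="precomputed").fit(K_train, y_train)
print("precomputed-kernel acc:", svc.score(K_test, y_test))
```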
29 changes: 16 additions & 13 deletions WL4S/wl4s.py
@@ -26,20 +26,23 @@

DATASETS_REAL = ["PPIBP", "EMUser", "HPOMetab", "HPONeuro"]
DATASETS_SYN = ["Component", "Density", "Coreness", "CutRatio"]
MODEL_KWARGS_KEY = ["C", "kernel", "dual"]

parser = argparse.ArgumentParser()
parser.add_argument('--MODE', type=str, default="hp_search_for_models")
parser.add_argument('--MODE', type=str, default="run_one")
parser.add_argument('--dataset_name', type=str, default="PPIBP",
choices=["PPIBP", "HPOMetab", "EMUser", "HPONeuro", "Density", "Component", "Coreness", "CutRatio"])
parser.add_argument('--stype', type=str, default="connected", choices=["connected", "separated"])
parser.add_argument('--dtype', type=str, default="histogram", choices=["histogram", "kernel"])
parser.add_argument('--dtype', type=str, default="kernel", choices=["histogram", "kernel"])
parser.add_argument('--wl_layers', type=int, default=5)
parser.add_argument('--wl_cumcat', type=str2bool, default=False)
parser.add_argument('--hist_norm', type=str2bool, default=True)
parser.add_argument('--k_to_sample', type=int, default=None)
parser.add_argument('--ratio_samples', type=float, default=1.0, help="Only when k_to_sample != 0")
parser.add_argument('--model', type=str, default="LinearSVC")
parser.add_argument('--runs', type=int, default=2)
parser.add_argument('--model', type=str, default="SVC", choices=["LinearSVC", "SVC"])
parser.add_argument('--C', type=float, default=1.0)
parser.add_argument('--kernel', type=str, default="precomputed")
parser.add_argument('--runs', type=int, default=10)
parser.add_argument('--dataset_path', type=str, default="/mnt/nas2/GNN-DATA/SUBGRAPH")


@@ -230,7 +233,8 @@ def experiment(args, h_or_k_list, splits, all_y, **model_kwargs):

def run_one(args, data_func=get_data_and_model):
h_or_k_list, splits, all_y = data_func(args)
experiment(args, h_or_k_list, splits, all_y)
model_kwargs = {k: getattr(args, k) for k in MODEL_KWARGS_KEY if hasattr(args, k)}
experiment(args, h_or_k_list, splits, all_y, **model_kwargs)


def plot_tsne_all(args, data_func=get_data_and_model, path="../_figures", extension="png"):
@@ -273,11 +277,12 @@ def init_args(method_name):
print(f"Compute WL hists: {stype} & norm={hist_norm}")
stype_and_norm_to_data_and_model[(stype, hist_norm)] = data_func(args)

Path(file_dir).mkdir(exist_ok=True)
file_path = Path(file_dir) / f"{args.dataset_name}{log_postfix}.csv"
for i, model_kwargs in enumerate(kwargs_list):
print(model_kwargs)
for k in model_kwargs.copy():
if k in args.__dict__:
if k not in MODEL_KWARGS_KEY:
setattr(args, k, model_kwargs.pop(k))
h_or_k_list, splits, all_y = stype_and_norm_to_data_and_model[(args.stype, args.hist_norm)]
results = experiment(args, h_or_k_list, splits, all_y, **model_kwargs)
@@ -307,14 +312,16 @@ def hp_search_syn(args, hparam_space, more_hparam_space, data_func=get_data_and_

HPARAM_SPACE = {
"stype": ["connected", "separated"],
"wl_cumcat": [True, False],
"wl_cumcat": [False, True],
"hist_norm": [False, True],
"model": ["LinearSVC"],
"model": ["SVC"],
"kernel": ["precomputed"],
"dtype": ["kernel"],
}
Cx100 = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576]
MORE_HPARAM_SPACE = {
"C": [c / 100 for c in Cx100],
"dual": [True, False],
"dual": [True, False], # for SVC
}

__args__ = parser.parse_args()
@@ -328,7 +335,3 @@ def hp_search_syn(args, hparam_space, more_hparam_space, data_func=get_data_and_
run_one(__args__)
elif __args__.MODE == "hp_search_for_models":
hp_search_for_models(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE)
elif __args__.MODE == "hp_search_real":
hp_search_real(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE)
elif __args__.MODE == "hp_search_syn":
hp_search_syn(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE)
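For orientation, a hedged sketch of how a grid like `HPARAM_SPACE` plus `MORE_HPARAM_SPACE` can be expanded into the `kwargs_list` the search loop iterates over; the actual expansion inside `wl4s.py` is not shown in this diff and may differ.

```python
# Hedged sketch of a full Cartesian expansion of the hyperparameter grid.
from itertools import product

HPARAM_SPACE = {
    "stype": ["connected", "separated"],
    "wl_cumcat": [False, True],
    "hist_norm": [False, True],
    "model": ["SVC"],
    "kernel": ["precomputed"],
    "dtype": ["kernel"],
}
MORE_HPARAM_SPACE = {"C": [c / 100 for c in [8, 16, 32, 64, 128]]}

space = {**HPARAM_SPACE, **MORE_HPARAM_SPACE}
kwargs_list = [dict(zip(space, values)) for values in product(*space.values())]
print(len(kwargs_list), "configurations, e.g.", kwargs_list[0])
```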
58 changes: 58 additions & 0 deletions WL4S/wl4s2v2.py
@@ -0,0 +1,58 @@
from wl4s import parser, hp_search_for_models, hp_search_real, get_data_and_model, hp_search_syn, run_one


def get_data_mixed_kernels(args):
args.stype = "separated"
k_list_s, splits_s, y_s = get_data_and_model(args)

args.stype = "connected"
k_list_c, splits_c, y_c = get_data_and_model(args)

k_list_new = []
for kt_c, kt_s in zip(k_list_c, k_list_s):
kt_new = tuple([(args.a_c * k_c + args.a_s * k_s) for k_c, k_s in zip(kt_c, kt_s)])
k_list_new.append(kt_new)

return k_list_new, splits_c, y_c


if __name__ == '__main__':

parser.add_argument("--a_c", type=float, default=0.9, help="a_c parameter")
parser.add_argument("--a_s", type=float, default=0.1, help="a_s parameter")

HPARAM_SPACE = {
"stype": [None], # NOTE: None
"wl_cumcat": [False, True],
"hist_norm": [False, True],
"model": ["SVC"],
"kernel": ["precomputed"],
"dtype": ["kernel"],
}
Cx100 = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576]
MORE_HPARAM_SPACE = {
"C": [c / 100 for c in Cx100],
}
WL4S2_KWS = dict(
data_func=get_data_mixed_kernels, # NOTE: important
file_dir="../_logs_wl4s2",
)

__args__ = parser.parse_args()
__args__.dtype = "kernel"

if __args__.MODE == "run_one":
run_one(__args__)
else:
for _a_c, _a_s in [
(0.99, 0.01), (0.9, 0.1), (0.5, 0.1),
(0.01, 0.99), (0.1, 0.9), (0.1, 0.5),
]:
__args__.a_c, __args__.a_s = _a_c, _a_s

if __args__.MODE == "hp_search_for_models":
hp_search_for_models(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **WL4S2_KWS)
elif __args__.MODE == "hp_search_real":
hp_search_real(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **WL4S2_KWS)
elif __args__.MODE == "hp_search_syn":
hp_search_syn(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **WL4S2_KWS)
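`get_data_mixed_kernels` combines the connected and separated Gram matrices as `a_c * K_c + a_s * K_s`; below is a self-contained toy sketch of feeding such a mixture to `SVC(kernel="precomputed")`, using random positive semi-definite matrices rather than this repository's WL kernels.

```python
# Toy illustration of mixing two precomputed kernels with a_c / a_s weights.
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)

def random_psd_kernel(n, d=16):
    F = rng.random((n, d))
    return F @ F.T  # positive semi-definite Gram matrix

n_train, n_test = 80, 20
y_train = rng.integers(0, 2, n_train)
y_test = rng.integers(0, 2, n_test)

K_c = random_psd_kernel(n_train + n_test)  # "connected" kernel stand-in
K_s = random_psd_kernel(n_train + n_test)  # "separated" kernel stand-in

a_c, a_s = 0.99, 0.01                      # as in --a_c / --a_s
K = a_c * K_c + a_s * K_s                  # mixed kernel

svc = SVC(C=1.28, kernel="precomputed")
svc.fit(K[:n_train, :n_train], y_train)
print("acc:", svc.score(K[n_train:, :n_train], y_test))
```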
61 changes: 34 additions & 27 deletions WL4S/wl4s_k.py
@@ -1,6 +1,6 @@
import gc

from wl4s import parser, hp_search_for_models, precompute_all_kernels
from wl4s import parser, hp_search_for_models, precompute_all_kernels, hp_search_syn, hp_search_real

if __name__ == '__main__':
HPARAM_SPACE = {
@@ -12,28 +12,33 @@
MORE_HPARAM_SPACE = {
"C": [c / 100 for c in Cx100],
}
DATA_TO_RATIO_SAMPLES = {"HPOMetab": 1400 / 2400, "HPONeuro": 1400 / 4000}
DATA_TO_RATIO_SAMPLES = {"HPOMetab": 1400 / 2400, "HPONeuro": 1400 / 4000} # for sliced

__args__ = parser.parse_args()
__args__.stype = "separated"

MODE = "real_large_precomputation"

if MODE == "syn":
MODE = __args__.MODE
if MODE == "syn_k":
__args__.stype = "separated"
HPARAM_SPACE = {**HPARAM_SPACE, "model": ["LinearSVC"]}
MORE_HPARAM_SPACE = {**MORE_HPARAM_SPACE, "dual": [True, False]}
for k_to_sample in [None, 1, 2]:
__args__.k_to_sample = k_to_sample
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_{k_to_sample or 0}")
for dataset_name in ["Component", "Density", "Coreness", "CutRatio"]:
__args__.dataset_name = dataset_name
hp_search_for_models(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)
hp_search_syn(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)

__args__.stype = "connected"
HPARAM_SPACE["stype"] = ["connected"]
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_inf")
hp_search_syn(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)

else:
HPARAM_SPACE = {**HPARAM_SPACE, "model": ["SVC"], "kernel": ["precomputed"], "dtype": ["kernel"]}
HPARAM_SPACE = {
**HPARAM_SPACE,
"model": ["SVC"], "kernel": ["precomputed"], "dtype": ["kernel"],
}
__args__.dtype = "kernel"

if MODE == "real_small_precomputation":
if MODE == "real_precomputation":
for k_to_sample in [None, 1, 2]:
for dataset_name in ["PPIBP", "EMUser"]:
__args__.k_to_sample = k_to_sample
@@ -43,7 +48,8 @@
precompute_all_kernels(__args__)
gc.collect()

elif MODE == "real_large_precomputation":
elif MODE == "sliced_real_precomputation":
# NOTE: DATA_TO_RATIO_SAMPLES exists
for k_to_sample in [2, 1, None]:
for dataset_name in ["HPONeuro", "HPOMetab"]:
__args__.k_to_sample = k_to_sample
@@ -54,36 +60,37 @@
precompute_all_kernels(__args__)
gc.collect()

elif MODE == "real_small_k":
for k_to_sample in [None, 1, 2]:
for dataset_name in ["PPIBP", "EMUser"]:
elif MODE == "real_k":
for dataset_name in ["PPIBP", "EMUser", "HPOMetab", "HPONeuro"]:
k_to_sample_list = [None] if dataset_name in ["HPOMetab", "HPONeuro"] else [None, 1, 2]
for k_to_sample in k_to_sample_list:
__args__.k_to_sample = k_to_sample
__args__.dataset_name = dataset_name
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_{k_to_sample or 0}")
hp_search_for_models(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)

elif MODE == "real_large_k":
__args__.stype = "connected"
HPARAM_SPACE["stype"] = ["connected"]
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_inf")
hp_search_real(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)

elif MODE == "sliced_real_k":
# NOTE: DATA_TO_RATIO_SAMPLES exists
for k_to_sample in [None, 1, 2]:
for dataset_name in ["HPONeuro", "HPOMetab"]:
__args__.k_to_sample = k_to_sample
__args__.dataset_name = dataset_name
__args__.ratio_samples = DATA_TO_RATIO_SAMPLES[dataset_name]
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_{k_to_sample or 0}")
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_sliced_{k_to_sample or 0}")
hp_search_for_models(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)

elif MODE == "real_small_k_inf":
__args__.stype = "connected"
HPARAM_SPACE["stype"] = ["connected"]
for dataset_name in ["PPIBP", "EMUser"]:
__args__.dataset_name = dataset_name
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_inf")
hp_search_for_models(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)

elif MODE == "real_large_k_inf":
__args__.stype = "connected"
HPARAM_SPACE["stype"] = ["connected"]
for dataset_name in ["HPONeuro", "HPOMetab"]:
__args__.dataset_name = dataset_name
__args__.ratio_samples = DATA_TO_RATIO_SAMPLES[dataset_name]
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_inf")
kws = dict(file_dir="../_logs_wl4s_k", log_postfix=f"_sliced_inf")
hp_search_for_models(__args__, HPARAM_SPACE, MORE_HPARAM_SPACE, **kws)

else:
raise ValueError(f"Not supported MODE: {MODE}")
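Since precomputation for k > 0 is the expensive step, one common pattern is to cache each Gram matrix on disk and reload it during the hyperparameter search. A hedged sketch follows; the file naming and compute callback are illustrative, and `precompute_all_kernels` in this repository may store its results differently.

```python
# Hedged sketch: cache an expensive Gram matrix on disk and reuse it across runs.
from pathlib import Path
import numpy as np

def load_or_compute_kernel(cache_dir, tag, compute_fn):
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    path = cache_dir / f"kernel_{tag}.npy"
    if path.exists():
        return np.load(path)   # reuse the previously computed kernel
    K = compute_fn()           # expensive: e.g., a WL kernel for k > 0
    np.save(path, K)
    return K

# Usage with a toy compute function standing in for the real WL kernel:
rng = np.random.default_rng(0)
K = load_or_compute_kernel("../_kernels", "PPIBP_k1",
                           lambda: (lambda F: F @ F.T)(rng.random((100, 16))))
print(K.shape)
```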
