From 483277a0634a6cd257d7279422b2a24e55bf53bd Mon Sep 17 00:00:00 2001 From: Devesh Sarda <32046390+sarda-devesh@users.noreply.github.com> Date: Sun, 12 Nov 2023 11:38:43 -0600 Subject: [PATCH] Add in edge weights to preprocessor (#148) * Added in colab data loader * Support both type and weight * Not working code * Works on the custom dataset * Ran tox * Fixed flake8 issues * Hopefully fixed linter issues * Wrote torch tests * Added in pandas converter tests * Performed linter checks * Run check lint outside of the container * working with pandas example * Work on OGBL loader * Passed all tests * Ran autoformat with updated tox * Fixed test_generate issue --------- Co-authored-by: Devesh Sarda Co-authored-by: Jason Mohoney --- docs/examples/config/nc_custom.rst | 3 +- docs/examples/python/lp_custom.rst | 3 +- examples/docker/cpu_ubuntu/dockerfile | 2 +- examples/python/custom_lp.py | 3 +- examples/python/custom_nc_graphsage.py | 3 +- src/cpp/src/common/util.cpp | 1 + src/cpp/src/reporting/reporting.cpp | 3 +- src/cpp/src/storage/buffer.cpp | 2 + src/python/tools/configuration/constants.py | 7 + src/python/tools/marius_preprocess.py | 47 +- .../partitioners/torch_partitioner.py | 56 +- .../converters/readers/pandas_readers.py | 74 +- .../preprocess/converters/torch_constants.py | 12 + .../preprocess/converters/torch_converter.py | 406 +++++-- .../converters/writers/torch_writer.py | 35 + src/python/tools/preprocess/custom.py | 13 +- .../tools/preprocess/datasets/__init__.py | 1 + src/python/tools/preprocess/datasets/fb15k.py | 3 + .../tools/preprocess/datasets/fb15k_237.py | 3 + .../tools/preprocess/datasets/freebase86m.py | 4 +- .../tools/preprocess/datasets/friendster.py | 3 +- .../tools/preprocess/datasets/livejournal.py | 3 +- .../tools/preprocess/datasets/ogb_mag240m.py | 3 + .../preprocess/datasets/ogb_wikikg90mv2.py | 4 +- .../preprocess/datasets/ogbl_citation2.py | 3 + .../tools/preprocess/datasets/ogbl_collab.py | 127 +++ .../tools/preprocess/datasets/ogbl_ppa.py | 3 + .../tools/preprocess/datasets/ogbl_wikikg2.py | 3 + .../tools/preprocess/datasets/ogbn_arxiv.py | 2 + .../preprocess/datasets/ogbn_papers100m.py | 3 + .../preprocess/datasets/ogbn_products.py | 2 + .../tools/preprocess/datasets/twitter.py | 3 +- .../preprocessing/test_torch_converter.py | 565 +++++++++- test/test_data/generate.py | 22 +- test/test_data/train_edges_weights.txt | 1000 +++++++++++++++++ tox.ini | 1 + 36 files changed, 2232 insertions(+), 196 deletions(-) create mode 100644 src/python/tools/preprocess/converters/torch_constants.py create mode 100644 src/python/tools/preprocess/datasets/ogbl_collab.py create mode 100644 test/test_data/train_edges_weights.txt diff --git a/docs/examples/config/nc_custom.rst b/docs/examples/config/nc_custom.rst index 5f7b9336..19dbf8c5 100644 --- a/docs/examples/config/nc_custom.rst +++ b/docs/examples/config/nc_custom.rst @@ -117,7 +117,8 @@ Let's borrow the provided ``examples/python/custom_nc_graphsage.py`` and modify output_dir=self.output_directory, train_edges=self.input_edge_list_file, num_partitions=num_partitions, - columns=[0, 1], + src_column = 0, + dst_column = 1, remap_ids=remap_ids, sequential_train_nodes=sequential_train_nodes, delim=",", diff --git a/docs/examples/python/lp_custom.rst b/docs/examples/python/lp_custom.rst index 76454d7c..866f2d2f 100644 --- a/docs/examples/python/lp_custom.rst +++ b/docs/examples/python/lp_custom.rst @@ -66,7 +66,8 @@ Making a new dataset class requires writing two methods: converter = converter( 
output_dir=self.output_directory, train_edges=self.input_train_edges_file, - columns = [0,1], # col 0 is src and col 1 dst node in input csv + src_column = 0, # col 0 is src and col 1 dst node in input csv + dst_column = 1, delim=",", # CSV delimitor is "," splits = splits, # Splitting the data in train, valid and test remap_ids=remap_ids # Remapping the raw entity ids into random integers diff --git a/examples/docker/cpu_ubuntu/dockerfile b/examples/docker/cpu_ubuntu/dockerfile index 9edbfad2..5054d8f5 100644 --- a/examples/docker/cpu_ubuntu/dockerfile +++ b/examples/docker/cpu_ubuntu/dockerfile @@ -25,7 +25,7 @@ RUN sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/ RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake # install pytorch -RUN python3 -m pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html +RUN python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu RUN mkdir /working_dir WORKDIR /working_dir \ No newline at end of file diff --git a/examples/python/custom_lp.py b/examples/python/custom_lp.py index 4dcba5db..417bb928 100644 --- a/examples/python/custom_lp.py +++ b/examples/python/custom_lp.py @@ -38,7 +38,8 @@ def preprocess(self, remap_ids=True, splits=None): converter = converter( output_dir=self.output_directory, train_edges=self.input_train_edges_file, - columns=[0, 1], # col 0 is src and col 1 dst node in input csv + src_column=0, # col 0 is src and col 1 dst node in input csv + dst_column=1, delim=",", # CSV delimitor is "," splits=splits, # Splitting the data in train, valid and test remap_ids=remap_ids, # Remapping the raw entity ids into random integers diff --git a/examples/python/custom_nc_graphsage.py b/examples/python/custom_nc_graphsage.py index c9eaf405..29b8c8fe 100644 --- a/examples/python/custom_nc_graphsage.py +++ b/examples/python/custom_nc_graphsage.py @@ -112,7 +112,8 @@ def preprocess( output_dir=self.output_directory, train_edges=self.input_edge_list_file, num_partitions=num_partitions, - columns=[0, 1], + src_column=0, + dst_column=1, remap_ids=remap_ids, sequential_train_nodes=sequential_train_nodes, delim=",", diff --git a/src/cpp/src/common/util.cpp b/src/cpp/src/common/util.cpp index 249831bb..300eae92 100644 --- a/src/cpp/src/common/util.cpp +++ b/src/cpp/src/common/util.cpp @@ -6,6 +6,7 @@ #include +#include #include #include "reporting/logger.h" diff --git a/src/cpp/src/reporting/reporting.cpp b/src/cpp/src/reporting/reporting.cpp index deb4e738..98b57f79 100644 --- a/src/cpp/src/reporting/reporting.cpp +++ b/src/cpp/src/reporting/reporting.cpp @@ -1,9 +1,10 @@ // // Created by Jason Mohoney on 8/24/21. 
//
-
#include "reporting/reporting.h"
+#include
+
#include "configuration/constants.h"
#include "reporting/logger.h"
diff --git a/src/cpp/src/storage/buffer.cpp b/src/cpp/src/storage/buffer.cpp
index 4cdd1a8c..1034d81e 100644
--- a/src/cpp/src/storage/buffer.cpp
+++ b/src/cpp/src/storage/buffer.cpp
@@ -8,8 +8,10 @@
#include
#include
+#include
#include
#include
+#include
#include
#include "configuration/constants.h"
diff --git a/src/python/tools/configuration/constants.py b/src/python/tools/configuration/constants.py
index 61a0b167..6d4512da 100644
--- a/src/python/tools/configuration/constants.py
+++ b/src/python/tools/configuration/constants.py
@@ -14,6 +14,7 @@ class PathConstants:
node_mapping_file: str = "node_mapping.txt"
relation_mapping_file: str = "relation_mapping.txt"
edge_file_name: str = "edges"
+ edge_weight_file_name: str = "edges_weights"
node_file_name: str = "nodes"
features_file_name: str = "features"
labels_file_name: str = "labels"
@@ -23,8 +24,14 @@ class PathConstants:
file_ext: str = ".bin"
train_edges_path: str = edges_directory + training_file_prefix + edge_file_name + file_ext
+ train_edges_weights_path: str = edges_directory + training_file_prefix + edge_weight_file_name + file_ext
+
valid_edges_path: str = edges_directory + validation_file_prefix + edge_file_name + file_ext
+ valid_edges_weights_path: str = edges_directory + validation_file_prefix + edge_weight_file_name + file_ext
+
test_edges_path: str = edges_directory + test_file_prefix + edge_file_name + file_ext
+ test_edges_weights_path: str = edges_directory + test_file_prefix + edge_weight_file_name + file_ext
+
train_edge_buckets_path: str = edges_directory + training_file_prefix + partition_offsets_file
valid_edge_buckets_path: str = edges_directory + validation_file_prefix + partition_offsets_file
test_edge_buckets_path: str = edges_directory + test_file_prefix + partition_offsets_file
diff --git a/src/python/tools/marius_preprocess.py b/src/python/tools/marius_preprocess.py
index 6295249a..4d0bb330 100644
--- a/src/python/tools/marius_preprocess.py
+++ b/src/python/tools/marius_preprocess.py
@@ -11,6 +11,7 @@
ogb_mag240m,
ogb_wikikg90mv2,
ogbl_citation2,
+ ogbl_collab,
ogbl_ppa,
ogbl_wikikg2,
ogbn_arxiv,
@@ -91,13 +92,39 @@ def set_args():
)
parser.add_argument(
- "--columns",
- metavar="columns",
- nargs="*",
+ "--src_column",
+ metavar="src_column",
required=False,
type=int,
- default=[0, 1, 2],
- help="List of column ids of input delimited files which denote the src node, edge-type, and dst node of edges.",
+ default=None,
+ help="The column id of the src column",
+ )
+
+ parser.add_argument(
+ "--dst_column",
+ metavar="dst_column",
+ required=False,
+ type=int,
+ default=None,
+ help="The column id of the dst column",
+ )
+
+ parser.add_argument(
+ "--edge_type_column",
+ metavar="edge_type_column",
+ required=False,
+ type=int,
+ default=None,
+ help="The column id which denotes the edge type column",
+ )
+
+ parser.add_argument(
+ "--edge_weight_column",
+ metavar="edge_weight_column",
+ required=False,
+ type=int,
+ default=None,
+ help="The column id which denotes the edge weight column",
)
return parser
@@ -106,6 +133,8 @@ def set_args():
def main():
parser = set_args()
args = parser.parse_args()
+ if args.dataset == "custom" and (args.src_column is None or args.dst_column is None):
+ parser.error("When using a custom dataset, src column and dst column must be specified")
if args.output_directory == "":
args.output_directory = args.dataset
@@ -127,10 +156,12 @@
"OGBN_PAPERS100M": ogbn_papers100m.OGBNPapers100M, "OGB_WIKIKG90MV2": ogb_wikikg90mv2.OGBWikiKG90Mv2, "OGB_MAG240M": ogb_mag240m.OGBMag240M, + "OGBL_COLLAB": ogbl_collab.OGBLCollab, } dataset = dataset_dict.get(args.dataset.upper()) if dataset is not None: + print("Using existing dataset of", args.dataset.upper()) dataset = dataset(args.output_directory, spark=args.spark) dataset.download(args.overwrite) dataset.preprocess( @@ -140,6 +171,7 @@ def main(): sequential_train_nodes=args.sequential_train_nodes, partitioned_eval=args.partitioned_eval, ) + else: print("Preprocess custom dataset") @@ -157,7 +189,10 @@ def main(): splits=args.dataset_split, partitioned_eval=args.partitioned_eval, sequential_train_nodes=args.sequential_train_nodes, - columns=args.columns, + src_column=args.src_column, + dst_column=args.dst_column, + edge_type_column=args.edge_type_column, + edge_weight_column=args.edge_weight_column, ) diff --git a/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py b/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py index b96a4633..2b61abc7 100644 --- a/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py +++ b/src/python/tools/preprocess/converters/partitioners/torch_partitioner.py @@ -5,12 +5,11 @@ import torch # isort:skip -def dataframe_to_tensor(input_dataframe): - np_array = input_dataframe.to_dask_array().compute() - return torch.from_numpy(np_array) +def dataframe_to_tensor(df): + return torch.tensor(df.to_numpy()) -def partition_edges(edges, num_nodes, num_partitions): +def partition_edges(edges, num_nodes, num_partitions, edge_weights=None): partition_size = int(np.ceil(num_nodes / num_partitions)) src_partitions = torch.div(edges[:, 0], partition_size, rounding_mode="trunc") @@ -18,12 +17,14 @@ def partition_edges(edges, num_nodes, num_partitions): _, dst_args = torch.sort(dst_partitions, stable=True) _, src_args = torch.sort(src_partitions[dst_args], stable=True) + sort_order = dst_args[src_args] - edges = edges[dst_args[src_args]] - edge_bucket_ids = torch.div(edges, partition_size, rounding_mode="trunc") + edges = edges[sort_order] + if edge_weights is not None: + edge_weights = edge_weights[sort_order] + edge_bucket_ids = torch.div(edges, partition_size, rounding_mode="trunc") offsets = np.zeros([num_partitions, num_partitions], dtype=int) - unique_src, num_source = torch.unique_consecutive(edge_bucket_ids[:, 0], return_counts=True) num_source_offsets = torch.cumsum(num_source, 0) - num_source @@ -42,7 +43,7 @@ def partition_edges(edges, num_nodes, num_partitions): offsets = list(offsets.flatten()) - return edges, offsets + return edges, offsets, edge_weights class TorchPartitioner(Partitioner): @@ -51,19 +52,42 @@ def __init__(self, partitioned_evaluation): self.partitioned_evaluation = partitioned_evaluation - def partition_edges(self, train_edges_tens, valid_edges_tens, test_edges_tens, num_nodes, num_partitions): - """ """ - - train_edges_tens, train_offsets = partition_edges(train_edges_tens, num_nodes, num_partitions) + def partition_edges( + self, train_edges_tens, valid_edges_tens, test_edges_tens, num_nodes, num_partitions, edge_weights=None + ): + # Extract the edge weights + train_edge_weights, valid_edge_weights, test_edge_weights = None, None, None + if edge_weights is not None: + train_edge_weights, valid_edge_weights, test_edge_weights = ( + edge_weights[0], + edge_weights[1], + edge_weights[2], + ) + + train_edges_tens, train_offsets, train_edge_weights = partition_edges( + 
train_edges_tens, num_nodes, num_partitions, edge_weights=train_edge_weights
+ )
valid_offsets = None
test_offsets = None
if self.partitioned_evaluation:
if valid_edges_tens is not None:
- valid_edges_tens, valid_offsets = partition_edges(valid_edges_tens, num_nodes, num_partitions)
+ valid_edges_tens, valid_offsets, valid_edge_weights = partition_edges(
+ valid_edges_tens, num_nodes, num_partitions, edge_weights=valid_edge_weights
+ )
if test_edges_tens is not None:
- test_edges_tens, test_offsets = partition_edges(test_edges_tens, num_nodes, num_partitions)
-
- return train_edges_tens, train_offsets, valid_edges_tens, valid_offsets, test_edges_tens, test_offsets
+ test_edges_tens, test_offsets, test_edge_weights = partition_edges(
+ test_edges_tens, num_nodes, num_partitions, edge_weights=test_edge_weights
+ )
+
+ return (
+ train_edges_tens,
+ train_offsets,
+ valid_edges_tens,
+ valid_offsets,
+ test_edges_tens,
+ test_offsets,
+ [train_edge_weights, valid_edge_weights, test_edge_weights],
+ )
diff --git a/src/python/tools/preprocess/converters/readers/pandas_readers.py b/src/python/tools/preprocess/converters/readers/pandas_readers.py
index 6c5265fc..1599a604 100644
--- a/src/python/tools/preprocess/converters/readers/pandas_readers.py
+++ b/src/python/tools/preprocess/converters/readers/pandas_readers.py
@@ -3,6 +3,7 @@
import pandas as pd
from marius.tools.preprocess.converters.readers.reader import Reader
+from marius.tools.preprocess.converters.torch_constants import TorchConverterColumnKeys as ColNames
class PandasDelimitedFileReader(Reader):
@@ -11,7 +12,7 @@ def __init__(
train_edges: Path,
valid_edges: Path = None,
test_edges: Path = None,
- columns: list = [0, 1, 2],
+ columns: dict = {},
header_length: int = 0,
delim: str = "\t",
):
@@ -22,49 +23,60 @@ def __init__(
:param valid_edges: The path to the raw validation edge list
:param test_edges: The path to the raw test edge list it is the train/valid/test split. The sum of this list must be 1.
- :param columns: Denotes the columns to extract for the edges. The default is [0, 1, 2],
- where the first index is the column id of the src nodes, the second the
- relations (edge-types), and the third the dst nodes. For graphs without
- edge types, only two ids should be provided.
+ :param columns: A dict containing the columns we want to extract and the names we want
+ to assign them. The key should be the name we want to assign to the column
+ and the value is the column id.
+ Any columns with a None id are ignored.
:param header_length: The length of the header of the input edge lists :param delim: The delimiter used between columns of the input edge lists """ super().__init__() + assert train_edges is not None self.train_edges = train_edges self.valid_edges = valid_edges self.test_edges = test_edges - self.columns = columns self.header_length = header_length - + self.columns = columns self.delim = delim - if len(self.columns) == 2: - self.has_rels = False - elif len(self.columns) == 3: - self.has_rels = True - else: - raise RuntimeError( - "Incorrect number of columns specified, expected length 2 or 3, received {}".format(len(self.columns)) - ) + def read_single_file(self, file_path): + if file_path is None: + return None - def read(self): - train_edges_df: pd.DataFrame = None - valid_edges_df: pd.DataFrame = None - test_edges_df: pd.DataFrame = None + # Determine the columns to read + cols_to_keeps = [] + id_to_name_mapping = {} + for col_name, col_id in self.columns.items(): + if col_id is not None: + cols_to_keeps.append(col_id) + id_to_name_mapping[col_id] = col_name.value - assert self.train_edges is not None - train_edges_df = pd.read_csv(self.train_edges, delimiter=self.delim, skiprows=self.header_length, header=None) - train_edges_df = train_edges_df[train_edges_df.columns[self.columns]] + # Read the file and extracted the columns we need + file_data = pd.read_csv(file_path, delimiter=self.delim, skiprows=self.header_length, header=None) + file_data = file_data[cols_to_keeps] + file_data = file_data.rename(columns=id_to_name_mapping) - if self.valid_edges is not None: - valid_edges_df = pd.read_csv( - self.valid_edges, delimiter=self.delim, skiprows=self.header_length, header=None - ) - valid_edges_df = valid_edges_df[valid_edges_df.columns[self.columns]] - if self.test_edges is not None: - test_edges_df = pd.read_csv(self.test_edges, delimiter=self.delim, skiprows=self.header_length, header=None) - test_edges_df = test_edges_df[test_edges_df.columns[self.columns]] + # Make sure we got the src and dst columns + columns_read = list(file_data.columns) + assert "src_column" in columns_read + assert "dst_column" in columns_read - return train_edges_df, valid_edges_df, test_edges_df + # Ensure that data is in the proper order + cols_order = [ColNames.SRC_COL.value, ColNames.DST_COL.value] + if "edge_type_column" in columns_read: + cols_order.insert(len(cols_order) - 1, ColNames.EDGE_TYPE_COL.value) + + if "edge_weight_column" in columns_read: + cols_order.insert(len(cols_order), ColNames.EDGE_WEIGHT_COL.value) + + file_data = file_data[cols_order] + return file_data + + def read(self): + return ( + self.read_single_file(self.train_edges), + self.read_single_file(self.valid_edges), + self.read_single_file(self.test_edges), + ) diff --git a/src/python/tools/preprocess/converters/torch_constants.py b/src/python/tools/preprocess/converters/torch_constants.py new file mode 100644 index 00000000..92887050 --- /dev/null +++ b/src/python/tools/preprocess/converters/torch_constants.py @@ -0,0 +1,12 @@ +from enum import Enum, unique + + +@unique +class TorchConverterColumnKeys(Enum): + SRC_COL = "src_column" + DST_COL = "dst_column" + EDGE_TYPE_COL = "edge_type_column" + EDGE_WEIGHT_COL = "edge_weight_column" + + def __hash__(self) -> int: + return hash(self.name) diff --git a/src/python/tools/preprocess/converters/torch_converter.py b/src/python/tools/preprocess/converters/torch_converter.py index 5cdf5c81..0dde1f93 100644 --- a/src/python/tools/preprocess/converters/torch_converter.py +++ 
b/src/python/tools/preprocess/converters/torch_converter.py @@ -7,6 +7,7 @@ from marius.tools.configuration.constants import PathConstants from marius.tools.preprocess.converters.partitioners.torch_partitioner import TorchPartitioner from marius.tools.preprocess.converters.readers.pandas_readers import PandasDelimitedFileReader +from marius.tools.preprocess.converters.torch_constants import TorchConverterColumnKeys as ColNames from marius.tools.preprocess.converters.writers.torch_writer import TorchWriter import torch # isort:skip @@ -76,93 +77,165 @@ def apply_mapping1d(input_ids, mapping_df): raise RuntimeError("Unsupported datatype for input. Must be a pandas.Series or a 1D torch.Tensor") -def map_edge_list_dfs(edge_lists: list, known_node_ids=None, sequential_train_nodes=False, sequential_deg_nodes=0): - if sequential_train_nodes or sequential_deg_nodes > 0: - raise RuntimeError("sequential_train_nodes not yet supported for map_edge_list_dfs") +def extract_tensors_from_df(df, column_mappings): + if df is None: + return None, None - all_edges_df = pd.concat(edge_lists) + edge_weight_tensor = None + edge_weight_column_num = column_mappings[ColNames.EDGE_WEIGHT_COL] + edge_weight_column_name = ColNames.EDGE_WEIGHT_COL.value - unique_src = all_edges_df.iloc[:, 0].unique() - unique_dst = all_edges_df.iloc[:, -1].unique() + if edge_weight_column_num is not None: + assert edge_weight_column_name in list(df.columns) + edge_weight_tensor = torch.tensor(df[edge_weight_column_name].values) + df = df.drop(columns=[edge_weight_column_name]) + + edges_tensor = dataframe_to_tensor(df) + return edges_tensor, edge_weight_tensor - if known_node_ids is None: - unique_nodes = np.unique(np.concatenate([unique_src.astype(str), unique_dst.astype(str)])) - else: - node_ids = [unique_src.astype(str), unique_dst.astype(str)] - for n in known_node_ids: - node_ids.append(n.numpy().astype(str)) - unique_nodes = np.unique(np.concatenate(node_ids)) +def map_edge_list_dfs( + edge_lists: list, + known_node_ids=None, + sequential_train_nodes=False, + sequential_deg_nodes=0, + column_mappings: dict = {}, +): + if sequential_train_nodes or sequential_deg_nodes > 0: + raise RuntimeError("sequential_train_nodes not yet supported for map_edge_list_dfs") + + # Combine all the non null dfs + combined_dfs = [] + has_rels = column_mappings[ColNames.EDGE_TYPE_COL] is not None + for edge_df in edge_lists: + if edge_df is not None: + # Convert all columns to str + edge_df[ColNames.SRC_COL.value] = edge_df[ColNames.SRC_COL.value].astype(str) + edge_df[ColNames.DST_COL.value] = edge_df[ColNames.DST_COL.value].astype(str) + if has_rels: + edge_df[ColNames.EDGE_TYPE_COL.value] = edge_df[ColNames.EDGE_TYPE_COL.value].astype(str) + combined_dfs.append(edge_df) + + # Get the unique nodes + all_edges_df = pd.concat(combined_dfs) + unique_src = all_edges_df[ColNames.SRC_COL.value].unique().astype(str) + unique_dst = all_edges_df[ColNames.DST_COL.value].unique().astype(str) + + unique_list = [unique_src, unique_dst] + if known_node_ids is not None: + for n in known_node_ids: + unique_list.append(n.numpy().astype(str)) + unique_nodes = np.unique(np.concatenate(unique_list, axis=None)) num_nodes = unique_nodes.shape[0] mapped_node_ids = np.random.permutation(num_nodes) nodes_dict = dict(zip(list(unique_nodes), list(mapped_node_ids))) - has_rels = False unique_rels = torch.empty([0]) mapped_rel_ids = torch.empty([0]) rels_dict = None - if len(all_edges_df.columns) == 3: - has_rels = True if has_rels: - unique_rels = all_edges_df.iloc[:, 
1].unique() + unique_rels = all_edges_df[ColNames.EDGE_TYPE_COL.value].unique() num_rels = unique_rels.shape[0] mapped_rel_ids = np.random.permutation(num_rels) rels_dict = dict(zip(list(unique_rels), list(mapped_rel_ids))) - all_edges_df = None # can safely free this df - - output_edge_lists = [] + output_edge_lists, output_edge_weights = [], [] for edge_list in edge_lists: - node_columns = edge_list.columns[[0, -1]] - edge_list[node_columns] = edge_list[node_columns].applymap(nodes_dict.get) + if edge_list is None: + output_edge_lists.append(None) + output_edge_weights.append(None) + continue + + # Map the src and dst values + edge_list[ColNames.SRC_COL.value] = edge_list[ColNames.SRC_COL.value].map(nodes_dict) + assert edge_list[ColNames.SRC_COL.value].isna().sum() == 0 + + edge_list[ColNames.DST_COL.value] = edge_list[ColNames.DST_COL.value].map(nodes_dict) + assert edge_list[ColNames.DST_COL.value].isna().sum() == 0 if has_rels: - rel_columns = edge_list.columns[1] - edge_list[rel_columns] = edge_list[rel_columns].map(rels_dict.get) + edge_list[ColNames.EDGE_TYPE_COL.value] = edge_list[ColNames.EDGE_TYPE_COL.value].map(rels_dict) + assert edge_list[ColNames.EDGE_TYPE_COL.value].isna().sum() == 0 - output_edge_lists.append(dataframe_to_tensor(edge_list)) + edge_tensor, edge_weights = extract_tensors_from_df(edge_list, column_mappings) + output_edge_lists.append(edge_tensor) + output_edge_weights.append(edge_weights) node_mapping = np.stack([unique_nodes, mapped_node_ids], axis=1) rel_mapping = None if has_rels: rel_mapping = np.stack([unique_rels, mapped_rel_ids], axis=1) - return output_edge_lists, node_mapping, rel_mapping + return output_edge_lists, node_mapping, rel_mapping, output_edge_weights -def map_edge_lists( - edge_lists: list, perform_unique=True, known_node_ids=None, sequential_train_nodes=False, sequential_deg_nodes=0 -): - print("Remapping Edges") - defined_edges = [] - for edge_list in edge_lists: - if edge_list is not None: - defined_edges.append(edge_list) +def extract_tensor_from_tens(edges_tensor, column_mappings): + if edges_tensor is None: + return None, None - edge_lists = defined_edges + edge_weights_column = column_mappings[ColNames.EDGE_WEIGHT_COL] + cols_to_keep = [column_mappings[ColNames.SRC_COL], column_mappings[ColNames.DST_COL]] + if column_mappings[ColNames.EDGE_TYPE_COL] is not None: + cols_to_keep.insert(len(cols_to_keep) - 1, column_mappings[ColNames.EDGE_TYPE_COL]) - if isinstance(edge_lists[0], pd.DataFrame): - if isinstance(edge_lists[0].iloc[0][0], str): - # need to take uniques using pandas for string datatypes, since torch doesn't support strings - return map_edge_list_dfs(edge_lists, known_node_ids, sequential_train_nodes, sequential_deg_nodes) + converted_tensor = edges_tensor[:, cols_to_keep] + converted_weights = None + if edge_weights_column is not None: + converted_weights = edges_tensor[:, edge_weights_column] - new_edge_lists = [] - for edge_list in edge_lists: - new_edge_lists.append(dataframe_to_tensor(edge_list)) + return converted_tensor, converted_weights - edge_lists = new_edge_lists - all_edges = torch.cat(edge_lists) +def map_edge_lists( + edge_lists: list, + perform_unique=True, + known_node_ids=None, + sequential_train_nodes=False, + sequential_deg_nodes=0, + column_mappings: dict = {}, +): + print("Remapping node ids") + + # Ensure that we extract the edge weights as well that edge_lists are in [src, dst] or in [src, type, dst] order + edge_weights_list = [None] * len(edge_lists) + has_rels = 
column_mappings[ColNames.EDGE_TYPE_COL] is not None + all_edges = [] + if isinstance(edge_lists[0], pd.DataFrame): + first_df = edge_lists[0] + if any(col_dtype != np.number for col_dtype in first_df.dtypes): + # need to take uniques using pandas for string datatypes, since torch doesn't support strings + return map_edge_list_dfs( + edge_lists, + known_node_ids, + sequential_train_nodes, + sequential_deg_nodes, + column_mappings=column_mappings, + ) - has_rels = False + for idx in range(len(edge_lists)): + edge_tensors, edge_weights = extract_tensors_from_df(edge_lists[idx], column_mappings) + edge_lists[idx] = edge_tensors + edge_weights_list[idx] = edge_weights + if edge_tensors is not None: + all_edges.append(edge_tensors) + else: + # Determine the order of tensors to keep + for idx in range(len(edge_lists)): + curr_edges = edge_lists[idx] + if curr_edges is None: + continue + + converted_edges, converted_weights = extract_tensor_from_tens(curr_edges, column_mappings) + edge_lists[idx] = converted_edges + all_edges.append(converted_edges) + edge_weights_list[idx] = converted_weights + + all_edges = torch.cat(all_edges) num_rels = 1 unique_rels = torch.empty([0]) mapped_rel_ids = torch.empty([0]) - if all_edges.size(1) == 3: - has_rels = True - output_dtype = torch.int32 if perform_unique: @@ -174,10 +247,10 @@ def map_edge_lists( unique_nodes = torch.unique(torch.cat([unique_src, unique_dst] + known_node_ids), sorted=True) num_nodes = unique_nodes.size(0) - if has_rels: unique_rels = torch.unique(all_edges[:, 1], sorted=True) num_rels = unique_rels.size(0) + else: num_nodes = torch.max(all_edges[:, 0])[0] unique_nodes = torch.arange(num_nodes).to(output_dtype) @@ -186,7 +259,11 @@ def map_edge_lists( num_rels = torch.max(all_edges[:, 1])[0] unique_rels = torch.arange(num_rels).to(output_dtype) + if has_rels: + min_rel_val = unique_rels[0].to(torch.int64) + if sequential_train_nodes or sequential_deg_nodes > 0: + print("inside sequential mode because", sequential_train_nodes, sequential_deg_nodes) seq_nodes = None if sequential_train_nodes and sequential_deg_nodes <= 0: @@ -272,31 +349,34 @@ def map_edge_lists( all_edges = None # can safely free this tensor output_edge_lists = [] - for edge_list in edge_lists: + for idx, edge_list in enumerate(edge_lists): + if edge_list is None: + output_edge_lists.append(None) + continue + new_src = extended_map[edge_list[:, 0].to(torch.int64)] new_dst = extended_map[edge_list[:, -1].to(torch.int64)] + curr_row = [new_src, new_dst] if has_rels: - new_rel = mapped_rel_ids[edge_list[:, 1].to(torch.int64)] - output_edge_lists.append(torch.stack([new_src, new_rel, new_dst], dim=1)) - else: - output_edge_lists.append(torch.stack([new_src, new_dst], dim=1)) + new_rel = mapped_rel_ids[edge_list[:, 1].to(torch.int64) - min_rel_val] + curr_row.insert(len(curr_row) - 1, new_rel) + output_edge_lists.append(torch.stack(curr_row, dim=1)) node_mapping = np.stack([unique_nodes.numpy(), mapped_node_ids.numpy()], axis=1) rel_mapping = None if has_rels: rel_mapping = np.stack([unique_rels.numpy(), mapped_rel_ids.numpy()], axis=1) - return output_edge_lists, node_mapping, rel_mapping + return output_edge_lists, node_mapping, rel_mapping, edge_weights_list -def split_edges(edges, splits): - train_edges_tens = None - valid_edges_tens = None - test_edges_tens = None +def split_edges(edges, edges_weights, splits): + train_edges_tens, train_edges_weights = None, None + valid_edges_tens, valid_edges_weights = None, None + test_edges_tens, test_edges_weights = None, None 
total_split_edges = int(sum(splits) * edges.shape[0])
-
num_total_edges = edges.shape[0]
rand_perm = torch.randperm(num_total_edges)
@@ -304,7 +384,6 @@
train_split = splits[0]
valid_split = splits[1]
test_split = splits[2]
- print("Splitting into: {}/{}/{} fractions".format(train_split, valid_split, test_split))
num_train = int(num_total_edges * train_split)
@@ -313,20 +392,37 @@
train_edges_tens = edges[rand_perm[:num_train]]
valid_edges_tens = edges[rand_perm[num_train : num_train + num_valid]]
test_edges_tens = edges[rand_perm[num_train + num_valid : total_split_edges]]
+
+ if edges_weights is not None:
+ train_edges_weights = edges_weights[rand_perm[:num_train]]
+ valid_edges_weights = edges_weights[rand_perm[num_train : num_train + num_valid]]
+ test_edges_weights = edges_weights[rand_perm[num_train + num_valid : total_split_edges]]
+
elif len(splits) == 2:
train_split = splits[0]
test_split = splits[1]
- print("Splitting into: {}/{} fractions".format(train_split, test_split))
num_train = int(num_total_edges * train_split)
train_edges_tens = edges[rand_perm[:num_train]]
test_edges_tens = edges[rand_perm[num_train:total_split_edges]]
+
+ if edges_weights is not None:
+ train_edges_weights = edges_weights[rand_perm[:num_train]]
+ test_edges_weights = edges_weights[rand_perm[num_train:total_split_edges]]
+
else:
raise RuntimeError("Splits must be length 2 or 3")
- return train_edges_tens, valid_edges_tens, test_edges_tens
+ return (
+ train_edges_tens,
+ train_edges_weights,
+ valid_edges_tens,
+ valid_edges_weights,
+ test_edges_tens,
+ test_edges_weights,
+ )
class TorchEdgeListConverter(object):
@@ -338,12 +434,15 @@ def __init__(
test_edges: Path = None,
splits: list = None,
format: str = "csv",
- columns: list = [0, 1, 2],
header_length: int = 0,
delim: str = "\t",
dtype: str = "int32",
num_partitions: int = 1,
partitioned_evaluation: bool = False,
+ src_column: int = None,
+ dst_column: int = None,
+ edge_type_column: int = None,
+ edge_weight_column: int = None,
remap_ids: bool = True,
sequential_train_nodes: bool = False,
sequential_deg_nodes: int = 0,
@@ -369,7 +468,12 @@ def __init__(
edges/
train_edges.bin Binary file of size num_train * 2 * sizeof(dtype) or num_train * 3 * sizeof(dtype)
train_partition_offsets.txt (optional) List of training edge bucket sizes in sequential order (0, 0), (0, 1) ... (1, 0), ... (n-1, n-1)
- valid_edges.bin (optional) Binary file of size num_valid * 2 * sizeof(dtype) or num_valid * 3 * sizeof(dtype)
+ valid_edges.bin (optional) Binary file of size num_valid * 2 * sizeof(dtype) or num_valid * 3 * sizeof(dtype)
+ The ordering of the data is as follows, based on which columns are present:
+ Both edge weights and edge types present: [src, type, weight, dst]
+ Neither edge weight nor edge type present: [src, dst]
+ Only edge weight present: [src, weight, dst]
+ Only edge type present: [src, type, dst]
valid_partition_offsets.txt (optional) List of validation edge bucket sizes in sequential order (0, 0), (0, 1) ... (1, 0), ... (n-1, n-1)
test_edges.bin (optional) Binary file of size num_test * 2 * sizeof(dtype) or num_test * 3 * sizeof(dtype)
test_partition_offsets.txt (optional) List of test edge bucket sizes in sequential order (0, 0), (0, 1) ... (1, 0), ...
(n-1, n-1) @@ -384,10 +488,10 @@ def __init__( :param test_edges: Raw input test edges, can be a delimited file, a numpy array, or pytorch tensor (optional) :param splits: Train/valid/test split to use for the input :param format: Format of the input dataset, can be a delimited file (CSV, TSV, TXT) or a numpy array or a pytorch tensor. - :param columns: List of column ids of input delimited files which denote the src node, edge-type, and dst node of edges. - E.g. columns=[0, 2, 1] means that the source nodes are found in the first column, the edge-types are found in - in the third column, and the destination nodes are found in the second column. - For datasets without edge-types, only two column ids should be specified. E.g. columns=[0, 2] + :param src_column: The column storing the src nodes. + :param dst_column: The column storing the dst nodes. + :param edge_type_column: The column storing the edge type. + :param edge_weight_column: The column storing the edge weights. :param header_length: Length of the header for input delimited files :param delim: Delimiter of the input delimited files :param dtype: Datatype of the node ids in the output preprocessed datasets. Unless you have over 2 billion nodes, this should @@ -409,18 +513,34 @@ def __init__( :param known_node_ids: List of node id arrays or tensors which contain known node ids for the dataset. Used for generating node id mappings when some nodes may not be present in the edge list. """ # noqa: E501 + + # Read in the src and dst column + if src_column is None: + raise ValueError("Src column must be specified with a non None value") + + if dst_column is None: + raise ValueError("Dst column must be specified with a non None value") + + # Save these variables self.output_dir = output_dir self.num_nodes = num_nodes self.num_rels = num_rels + self.column_mappings = { + ColNames.SRC_COL: src_column, + ColNames.DST_COL: dst_column, + ColNames.EDGE_TYPE_COL: edge_type_column, + ColNames.EDGE_WEIGHT_COL: edge_weight_column, + } if format.upper() in SUPPORTED_DELIM_FORMATS: assert isinstance(train_edges, str) or isinstance(train_edges, Path) - - self.reader = PandasDelimitedFileReader(train_edges, valid_edges, test_edges, columns, header_length, delim) + self.reader = PandasDelimitedFileReader( + train_edges, valid_edges, test_edges, self.column_mappings, header_length, delim + ) elif format.upper() in SUPPORTED_IN_MEMORY_FORMATS: self.reader = None - if format.upper() == "NUMPY": + if format.upper() == "NUMPY" or format.upper() == "NP": assert isinstance(train_edges, np.ndarray) self.train_edges_tens = torch.from_numpy(train_edges) self.valid_edges_tens = None @@ -433,7 +553,8 @@ def __init__( if test_edges is not None: assert isinstance(test_edges, np.ndarray) self.test_edges_tens = torch.from_numpy(test_edges) - elif format.upper() == "PYTORCH": + + elif format.upper() == "PYTORCH" or format.upper() == "TORCH": assert isinstance(train_edges, torch.Tensor) self.train_edges_tens = train_edges self.valid_edges_tens = valid_edges @@ -454,17 +575,16 @@ def __init__( self.partitioner = None self.writer = TorchWriter(self.output_dir, partitioned_evaluation) - self.splits = splits - self.has_rels = False - if len(columns) == 3: - self.has_rels = True - + # Determine if this has edge types + self.has_rels = self.column_mappings[ColNames.EDGE_TYPE_COL] is not None if dtype.upper() == "INT32" or dtype.upper() == "INT": self.dtype = torch.int32 + self.weight_dtype = torch.float32 elif dtype.upper() == "INT64" or dtype.upper() == "LONG": self.dtype = 
torch.int64 + self.weight_dtype = torch.float64 else: raise RuntimeError("Unrecognized datatype") @@ -502,24 +622,26 @@ def __init__( else: self.known_node_ids = None + # flake8: noqa: C901 def convert(self): - train_edges_tens = None - valid_edges_tens = None - test_edges_tens = None + train_edges_tens, train_edge_weights = None, None + valid_edges_tens, valid_edge_weights = None, None + test_edges_tens, test_edge_weights = None, None os.makedirs(self.output_dir / Path("nodes"), exist_ok=True) os.makedirs(self.output_dir / Path("edges"), exist_ok=True) - print("Reading edges") if self.reader is not None: + print("Reading edges") train_edges_df, valid_edges_df, test_edges_df = self.reader.read() if self.remap_ids: - edge_lists, node_mapping, rel_mapping = map_edge_lists( + all_edge_lists, node_mapping, rel_mapping, all_edge_weights = map_edge_lists( [train_edges_df, valid_edges_df, test_edges_df], known_node_ids=self.known_node_ids, sequential_train_nodes=self.sequential_train_nodes, sequential_deg_nodes=self.sequential_deg_nodes, + column_mappings=self.column_mappings, ) self.num_nodes = node_mapping.shape[0] @@ -529,12 +651,16 @@ def convert(self): else: self.num_rels = rel_mapping.shape[0] - train_edges_tens = edge_lists[0] - if len(edge_lists) == 2: - test_edges_tens = edge_lists[1] - elif len(edge_lists) == 3: - valid_edges_tens = edge_lists[1] - test_edges_tens = edge_lists[2] + train_edges_tens = all_edge_lists[0] + if len(all_edge_lists) == 2: + test_edges_tens = all_edge_lists[1] + elif len(all_edge_lists) == 3: + valid_edges_tens = all_edge_lists[1] + test_edges_tens = all_edge_lists[2] + + train_edge_weights = all_edge_weights[0] + valid_edge_weights = all_edge_weights[1] + test_edge_weights = all_edge_weights[2] print( "Node mapping written to: {}".format( @@ -561,39 +687,45 @@ def convert(self): delimiter=",", ) else: - train_edges_tens = dataframe_to_tensor(train_edges_df) + # Determine which columns to keep + print("Not remapping node ids") - if valid_edges_df is not None: - valid_edges_tens = dataframe_to_tensor(valid_edges_df) + # Extract all the tensors and weights + train_edges_tens, train_edge_weights = extract_tensors_from_df(train_edges_df, self.column_mappings) + valid_edges_tens, valid_edge_weights = extract_tensors_from_df(valid_edges_df, self.column_mappings) + test_edges_tens, test_edge_weights = extract_tensors_from_df(test_edges_df, self.column_mappings) - if test_edges_df is not None: - test_edges_tens = dataframe_to_tensor(test_edges_df) else: + print("Using in memory data") train_edges_tens = self.train_edges_tens valid_edges_tens = self.valid_edges_tens test_edges_tens = self.test_edges_tens if self.remap_ids: - edge_lists, node_mapping, rel_mapping = map_edge_lists( + all_edges_list, node_mapping, rel_mapping, all_edge_weights = map_edge_lists( [train_edges_tens, valid_edges_tens, test_edges_tens], known_node_ids=self.known_node_ids, sequential_train_nodes=self.sequential_train_nodes, sequential_deg_nodes=self.sequential_deg_nodes, + column_mappings=self.column_mappings, ) self.num_nodes = node_mapping.shape[0] - if rel_mapping is None: self.num_rels = 1 else: self.num_rels = rel_mapping.shape[0] - train_edges_tens = edge_lists[0] - if len(edge_lists) == 2: - test_edges_tens = edge_lists[1] - elif len(edge_lists) == 3: - valid_edges_tens = edge_lists[1] - test_edges_tens = edge_lists[2] + train_edges_tens = all_edges_list[0] + if len(all_edges_list) == 2: + test_edges_tens = all_edges_list[1] + elif len(all_edges_list) == 3: + valid_edges_tens = 
all_edges_list[1] + test_edges_tens = all_edges_list[2] + + train_edge_weights = all_edge_weights[0] + valid_edge_weights = all_edge_weights[1] + test_edge_weights = all_edge_weights[2] print( "Node mapping written to: {}".format( @@ -620,15 +752,56 @@ def convert(self): delimiter=",", ) - train_edges_tens = train_edges_tens.to(self.dtype) - if valid_edges_tens is not None: - valid_edges_tens = valid_edges_tens.to(self.dtype) - if test_edges_tens is not None: - test_edges_tens = test_edges_tens.to(self.dtype) + else: + train_edges_tens, train_edge_weights = extract_tensor_from_tens(train_edges_tens, self.column_mappings) + test_edges_tens, test_edge_weights = extract_tensor_from_tens(test_edges_tens, self.column_mappings) + valid_edges_tens, valid_edge_weights = extract_tensor_from_tens(valid_edges_tens, self.column_mappings) + # Split the edges if self.splits is not None: - train_edges_tens, valid_edges_tens, test_edges_tens = split_edges(train_edges_tens, self.splits) + ( + train_edges_tens, + train_edge_weights, + valid_edges_tens, + valid_edge_weights, + test_edges_tens, + test_edge_weights, + ) = split_edges(train_edges_tens, train_edge_weights, self.splits) + + # Cast to the correct dtype + def perform_cast(edge_tensor, weights_tensor, edge_dtype, weights_dtype): + if edge_tensor is None: + return edge_tensor, weights_tensor + if weights_tensor is not None: + weights_tensor = weights_tensor.to(weights_dtype) + return edge_tensor.to(edge_dtype), weights_tensor + + train_edges_tens, train_edge_weights = perform_cast( + train_edges_tens, train_edge_weights, self.dtype, self.weight_dtype + ) + valid_edges_tens, valid_edge_weights = perform_cast( + valid_edges_tens, valid_edge_weights, self.dtype, self.weight_dtype + ) + test_edges_tens, test_edge_weights = perform_cast( + test_edges_tens, test_edge_weights, self.dtype, self.weight_dtype + ) + + # Resolve all the null counts + if self.num_nodes is None: + combined_nodes = [train_edges_tens[:, [0, -1]]] + if test_edges_tens is not None: + combined_nodes.append(test_edges_tens[:, [0, -1]]) + if valid_edges_tens is not None: + combined_nodes.append(valid_edges_tens[:, [0, -1]]) + + combined_tensor = torch.unique(combined_nodes, sorted=False) + self.num_nodes = torch.numel(combined_tensor) + + if self.num_rels is None: + self.num_rels = 1 + + all_edge_weights = [train_edge_weights, valid_edge_weights, test_edge_weights] if self.partitioner is not None: print("Partition nodes into {} partitions".format(self.num_partitions)) ( @@ -638,8 +811,14 @@ def convert(self): valid_edges_offsets, test_edges_tens, test_edges_offsets, + all_edge_weights, ) = self.partitioner.partition_edges( - train_edges_tens, valid_edges_tens, test_edges_tens, self.num_nodes, self.num_partitions + train_edges_tens, + valid_edges_tens, + test_edges_tens, + self.num_nodes, + self.num_partitions, + edge_weights=all_edge_weights, ) return self.writer.write_to_binary( @@ -652,8 +831,15 @@ def convert(self): train_edges_offsets, valid_edges_offsets, test_edges_offsets, + edge_weights=all_edge_weights, ) else: return self.writer.write_to_binary( - train_edges_tens, valid_edges_tens, test_edges_tens, self.num_nodes, self.num_rels, self.num_partitions + train_edges_tens, + valid_edges_tens, + test_edges_tens, + self.num_nodes, + self.num_rels, + self.num_partitions, + edge_weights=all_edge_weights, ) diff --git a/src/python/tools/preprocess/converters/writers/torch_writer.py b/src/python/tools/preprocess/converters/writers/torch_writer.py index 59ca7261..f61b8eff 100644 --- 
a/src/python/tools/preprocess/converters/writers/torch_writer.py +++ b/src/python/tools/preprocess/converters/writers/torch_writer.py @@ -1,5 +1,6 @@ from pathlib import Path +import numpy as np from omegaconf import OmegaConf from marius.tools.configuration.constants import PathConstants @@ -24,6 +25,7 @@ def write_to_binary( train_edges_offsets=None, valid_edges_offsets=None, test_edges_offsets=None, + edge_weights=None, ): dataset_stats = DatasetConfig() dataset_stats.dataset_dir = Path(self.output_dir).absolute().__str__() + "/" @@ -44,26 +46,59 @@ def write_to_binary( yaml_file = OmegaConf.to_yaml(dataset_stats) f.writelines(yaml_file) + # Read the edge weights + train_edges_weights, valid_edges_weights, test_edges_weights = None, None, None + if edge_weights is not None: + train_edges_weights, valid_edges_weights, test_edges_weights = ( + edge_weights[0], + edge_weights[1], + edge_weights[2], + ) + with open(self.output_dir / Path(PathConstants.train_edges_path), "wb") as f: + print("Train edges written to:", PathConstants.train_edges_path) f.write(bytes(train_edges_tens.numpy())) + if train_edges_weights is not None: + train_weights_save_path = self.output_dir / Path(PathConstants.train_edges_weights_path) + print("Train edges weights written to:", train_weights_save_path) + train_weights_arr = train_edges_weights.numpy().flatten().astype(np.float32) + train_weights_arr.tofile(train_weights_save_path) + if valid_edges_tens is not None: + print("Valid edges written to:", PathConstants.valid_edges_path) with open(self.output_dir / Path(PathConstants.valid_edges_path), "wb") as f: f.write(bytes(valid_edges_tens.numpy())) + if valid_edges_weights is not None: + valid_weights_save_path = self.output_dir / Path(PathConstants.valid_edges_weights_path) + print("Valid edges weights written to:", PathConstants.valid_edges_weights_path) + valid_weights_arr = valid_edges_weights.numpy().flatten().astype(np.float32) + valid_weights_arr.tofile(valid_weights_save_path) + if test_edges_tens is not None: + print("Test edges written to:", PathConstants.test_edges_path) with open(self.output_dir / Path(PathConstants.test_edges_path), "wb") as f: f.write(bytes(test_edges_tens.numpy())) + if test_edges_weights is not None: + test_weights_save_path = self.output_dir / Path(PathConstants.test_edges_weights_path) + print("Test edge weights written to:", PathConstants.test_edges_weights_path) + test_weights_arr = test_edges_weights.numpy().flatten().astype(np.float32) + test_weights_arr.tofile(test_weights_save_path) + if num_partitions > 1: with open(self.output_dir / Path(PathConstants.train_edge_buckets_path), "w") as f: + print("Train partition offsets written to:", PathConstants.train_edge_buckets_path) f.writelines([str(o) + "\n" for o in train_edges_offsets]) if valid_edges_offsets is not None: + print("Valid partition offsets written to:", PathConstants.valid_edge_buckets_path) with open(self.output_dir / Path(PathConstants.valid_edge_buckets_path), "w") as f: f.writelines([str(o) + "\n" for o in valid_edges_offsets]) if test_edges_offsets is not None: + print("Test partition offsets written to:", PathConstants.test_edge_buckets_path) with open(self.output_dir / Path(PathConstants.test_edge_buckets_path), "w") as f: f.writelines([str(o) + "\n" for o in test_edges_offsets]) diff --git a/src/python/tools/preprocess/custom.py b/src/python/tools/preprocess/custom.py index 9f032c0a..78bf6be8 100644 --- a/src/python/tools/preprocess/custom.py +++ b/src/python/tools/preprocess/custom.py @@ -43,7 +43,10 @@ 
def preprocess( splits=[0.9, 0.05, 0.05], partitioned_eval=False, sequential_train_nodes=False, - columns=[0, 1, 2], + src_column=None, + dst_column=None, + edge_type_column=None, + edge_weight_column=None, ): if self.spark and pyspark_found: converter_class = SparkEdgeListConverter @@ -56,11 +59,15 @@ def preprocess( valid_edges=self.valid_edges_file, test_edges=self.test_edges_file, delim=self.delim, - columns=columns, + src_column=src_column, + dst_column=dst_column, + edge_type_column=edge_type_column, + edge_weight_column=edge_weight_column, num_partitions=num_partitions, + sequential_train_nodes=sequential_train_nodes, splits=splits, remap_ids=remap_ids, partitioned_evaluation=partitioned_eval, ) - converter.convert() + return converter.convert() diff --git a/src/python/tools/preprocess/datasets/__init__.py b/src/python/tools/preprocess/datasets/__init__.py index abce66ec..a4bba5ce 100644 --- a/src/python/tools/preprocess/datasets/__init__.py +++ b/src/python/tools/preprocess/datasets/__init__.py @@ -12,4 +12,5 @@ "fb15k_237", "ogb_wikikg90mv2", "ogb_mag240m", + "ogbl_collab", ] diff --git a/src/python/tools/preprocess/datasets/fb15k.py b/src/python/tools/preprocess/datasets/fb15k.py index 4c2483ca..25d62b64 100644 --- a/src/python/tools/preprocess/datasets/fb15k.py +++ b/src/python/tools/preprocess/datasets/fb15k.py @@ -52,6 +52,9 @@ def preprocess( test_edges=self.input_test_edges_file, num_partitions=num_partitions, remap_ids=remap_ids, + src_column=0, + dst_column=2, + edge_type_column=1, partitioned_evaluation=partitioned_eval, ) diff --git a/src/python/tools/preprocess/datasets/fb15k_237.py b/src/python/tools/preprocess/datasets/fb15k_237.py index 0161c269..348d0fed 100644 --- a/src/python/tools/preprocess/datasets/fb15k_237.py +++ b/src/python/tools/preprocess/datasets/fb15k_237.py @@ -55,6 +55,9 @@ def preprocess( num_partitions=num_partitions, remap_ids=remap_ids, partitioned_evaluation=partitioned_eval, + src_column=0, + dst_column=2, + edge_type_column=1, ) return converter.convert() diff --git a/src/python/tools/preprocess/datasets/freebase86m.py b/src/python/tools/preprocess/datasets/freebase86m.py index 14d51ba3..42daea04 100644 --- a/src/python/tools/preprocess/datasets/freebase86m.py +++ b/src/python/tools/preprocess/datasets/freebase86m.py @@ -49,7 +49,9 @@ def preprocess( valid_edges=self.input_valid_edges_file, test_edges=self.input_test_edges_file, num_partitions=num_partitions, - columns=[0, 2, 1], + src_column=0, + dst_column=1, + edge_type_column=2, remap_ids=remap_ids, partitioned_evaluation=partitioned_eval, ) diff --git a/src/python/tools/preprocess/datasets/friendster.py b/src/python/tools/preprocess/datasets/friendster.py index 561ff4a8..027cc175 100644 --- a/src/python/tools/preprocess/datasets/friendster.py +++ b/src/python/tools/preprocess/datasets/friendster.py @@ -48,7 +48,8 @@ def preprocess( output_dir=self.output_directory, train_edges=self.input_edges, delim="\t", - columns=[0, 1], + src_column=0, + dst_column=1, header_length=0, num_partitions=num_partitions, splits=splits, diff --git a/src/python/tools/preprocess/datasets/livejournal.py b/src/python/tools/preprocess/datasets/livejournal.py index 70b70e4d..ca9747fd 100644 --- a/src/python/tools/preprocess/datasets/livejournal.py +++ b/src/python/tools/preprocess/datasets/livejournal.py @@ -43,7 +43,8 @@ def preprocess( output_dir=self.output_directory, train_edges=self.input_edges, delim="\t", - columns=[0, 1], + src_column=0, + dst_column=1, header_length=0, num_partitions=num_partitions, 
splits=splits, diff --git a/src/python/tools/preprocess/datasets/ogb_mag240m.py b/src/python/tools/preprocess/datasets/ogb_mag240m.py index d4658ec8..5f7b7a2a 100644 --- a/src/python/tools/preprocess/datasets/ogb_mag240m.py +++ b/src/python/tools/preprocess/datasets/ogb_mag240m.py @@ -82,6 +82,9 @@ def preprocess( remap_ids=remap_ids, sequential_train_nodes=sequential_train_nodes, format="numpy", + src_column=0, + dst_column=2, + edge_type_column=1, known_node_ids=[ train_nodes, valid_nodes, diff --git a/src/python/tools/preprocess/datasets/ogb_wikikg90mv2.py b/src/python/tools/preprocess/datasets/ogb_wikikg90mv2.py index 195b676c..2e7ff2a0 100644 --- a/src/python/tools/preprocess/datasets/ogb_wikikg90mv2.py +++ b/src/python/tools/preprocess/datasets/ogb_wikikg90mv2.py @@ -65,7 +65,9 @@ def preprocess( valid_edges=valid_edges, test_edges=valid_edges, num_partitions=num_partitions, - columns=[0, 1, 2], + src_column=0, + dst_column=2, + edge_type_column=1, remap_ids=remap_ids, sequential_train_nodes=sequential_train_nodes, format="numpy", diff --git a/src/python/tools/preprocess/datasets/ogbl_citation2.py b/src/python/tools/preprocess/datasets/ogbl_citation2.py index fc3683dc..320a6456 100644 --- a/src/python/tools/preprocess/datasets/ogbl_citation2.py +++ b/src/python/tools/preprocess/datasets/ogbl_citation2.py @@ -63,6 +63,9 @@ def preprocess( valid_edges=valid_list, test_edges=test_list, num_partitions=num_partitions, + src_column=0, + dst_column=2, + edge_type_column=1, remap_ids=remap_ids, known_node_ids=[ torch.arange(2927963) diff --git a/src/python/tools/preprocess/datasets/ogbl_collab.py b/src/python/tools/preprocess/datasets/ogbl_collab.py new file mode 100644 index 00000000..0eff5fdb --- /dev/null +++ b/src/python/tools/preprocess/datasets/ogbl_collab.py @@ -0,0 +1,127 @@ +from pathlib import Path + +import pandas as pd + +from marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter +from marius.tools.preprocess.dataset import LinkPredictionDataset +from marius.tools.preprocess.utils import download_url, extract_file + +import torch # isort:skip + + +class OGBLCollab(LinkPredictionDataset): + """ + Open Graph Benchmark: collab + + The ogbl-collab dataset is a weighted directed graph, representing a subset of the collaboration network + between authors indexed by MAG. Each node represents an author and edges indicate the collaboration between + authors. All nodes come with 128-dimensional features, obtained by averaging the word embeddings of papers + that are published by the authors. All edges are associated with two meta-information: the year and the + edge weight, representing the number of co-authored papers published in that year. The graph can be viewed + as a dynamic multi-graph since there can be multiple edges between two nodes if they collaborate in more + than one year. 
+ """
+
+ def __init__(self, output_directory: Path, spark=False, include_edge_type=True, include_edge_weight=True):
+ super().__init__(output_directory, spark)
+
+ self.dataset_name = "ogbl_collab"
+ self.dataset_url = "http://snap.stanford.edu/ogb/data/linkproppred/collab.zip"
+ self.node_ids = None
+ self.include_edge_type = include_edge_type
+ self.include_edge_weight = include_edge_weight
+
+ def download(self, overwrite=False):
+ self.input_train_edges_file = self.output_directory / Path("train.pt")
+ self.input_valid_edges_file = self.output_directory / Path("valid.pt")
+ self.input_test_edges_file = self.output_directory / Path("test.pt")
+
+ download = False
+ if overwrite:
+ download = True
+ elif not self.input_train_edges_file.exists():
+ download = True
+ elif not self.input_valid_edges_file.exists():
+ download = True
+ elif not self.input_test_edges_file.exists():
+ download = True
+
+ if download:
+ archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
+ extract_file(archive_path, remove_input=False)
+
+ for file in (self.output_directory / Path("collab/split/time")).iterdir():
+ file.rename(self.output_directory / Path(file.name))
+
+ # Read in the nodes
+ nodes_path = Path(self.output_directory).joinpath("collab", "raw", "num-node-list.csv.gz")
+ df = pd.read_csv(nodes_path, compression="gzip", header=None)
+ self.num_nodes = df.iloc[0][0]
+
+ def preprocess(
+ self,
+ num_partitions=1,
+ remap_ids=True,
+ splits=None,
+ sequential_train_nodes=False,
+ partitioned_eval=False,
+ ):
+ # Read in the training data
+ train_idx = torch.load(self.input_train_edges_file)
+ train_edges = torch.from_numpy(train_idx.get("edge"))
+
+ # Read in the valid data
+ valid_idx = torch.load(self.input_valid_edges_file)
+ valid_edges = torch.from_numpy(valid_idx.get("edge"))
+
+ # Read in the test data
+ test_idx = torch.load(self.input_test_edges_file)
+ test_edges = torch.from_numpy(test_idx.get("edge"))
+
+ edge_type_column, edge_weight_column = None, None
+ if self.include_edge_type:
+ # Add in the year information (used as the edge type)
+ train_year = torch.from_numpy(train_idx.get("year").reshape(-1, 1))
+ train_edges = torch.cat((train_edges, train_year), dim=1)
+
+ valid_year = torch.from_numpy(valid_idx.get("year").reshape(-1, 1))
+ valid_edges = torch.cat((valid_edges, valid_year), dim=1)
+
+ test_year = torch.from_numpy(test_idx.get("year").reshape(-1, 1))
+ test_edges = torch.cat((test_edges, test_year), dim=1)
+
+ edge_type_column = 2
+
+ if self.include_edge_weight:
+ # Add in the weights
+ train_weight = torch.from_numpy(train_idx.get("weight").reshape(-1, 1))
+ train_edges = torch.cat((train_edges, train_weight), dim=1)
+
+ valid_weight = torch.from_numpy(valid_idx.get("weight").reshape(-1, 1))
+ valid_edges = torch.cat((valid_edges, valid_weight), dim=1)
+
+ test_weight = torch.from_numpy(test_idx.get("weight").reshape(-1, 1))
+ test_edges = torch.cat((test_edges, test_weight), dim=1)
+
+ # The weight is always the last appended column: 3 if the year column was added, otherwise 2
+ edge_weight_column = 3 if self.include_edge_type else 2
+
+ # Create the converter using the configured edge type and weight columns
+ converter = TorchEdgeListConverter(
+ output_dir=self.output_directory,
+ train_edges=train_edges,
+ valid_edges=valid_edges,
+ test_edges=test_edges,
+ num_partitions=num_partitions,
+ remap_ids=remap_ids,
+ known_node_ids=[torch.arange(self.num_nodes)],
+ format="pytorch",
+ splits=splits,
+ sequential_train_nodes=sequential_train_nodes,
+ src_column=0,
+ dst_column=1,
+ edge_type_column=edge_type_column,
+ edge_weight_column=edge_weight_column,
+ partitioned_evaluation=partitioned_eval,
+ )
+
+ converter.convert()
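
For reference, here is a minimal usage sketch of the per-column arguments this patch adds to TorchEdgeListConverter. It is illustrative only: the CSV path, output directory, and column layout below are assumptions, not part of the change.

    from pathlib import Path

    from marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter

    # Hypothetical CSV whose rows look like: src,edge_type,dst,weight
    converter = TorchEdgeListConverter(
        output_dir=Path("weighted_graph_output"),  # illustrative output directory
        train_edges=Path("weighted_edges.csv"),    # illustrative input file
        delim=",",
        src_column=0,
        edge_type_column=1,
        dst_column=2,
        edge_weight_column=3,
        splits=[0.9, 0.05, 0.05],  # split the single edge list into train/valid/test
        remap_ids=True,
    )
    converter.convert()

With edge_weight_column set, the converter stores the weights for each split in separate edges/*_edges_weights.bin files (float32), next to the usual edge binaries.
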
diff --git a/src/python/tools/preprocess/datasets/ogbl_ppa.py b/src/python/tools/preprocess/datasets/ogbl_ppa.py index 53238ead..0d09c8f3 100644 --- a/src/python/tools/preprocess/datasets/ogbl_ppa.py +++ b/src/python/tools/preprocess/datasets/ogbl_ppa.py @@ -60,6 +60,9 @@ def preprocess( remap_ids=remap_ids, format="numpy", partitioned_evaluation=partitioned_eval, + src_column=0, + dst_column=2, + edge_type_column=1, ) return converter.convert() diff --git a/src/python/tools/preprocess/datasets/ogbl_wikikg2.py b/src/python/tools/preprocess/datasets/ogbl_wikikg2.py index 6c2bf460..9ccb8c28 100644 --- a/src/python/tools/preprocess/datasets/ogbl_wikikg2.py +++ b/src/python/tools/preprocess/datasets/ogbl_wikikg2.py @@ -66,6 +66,9 @@ def preprocess( format="numpy", remap_ids=remap_ids, partitioned_evaluation=partitioned_eval, + src_column=0, + dst_column=2, + edge_type_column=1, ) return converter.convert() diff --git a/src/python/tools/preprocess/datasets/ogbn_arxiv.py b/src/python/tools/preprocess/datasets/ogbn_arxiv.py index 29c57c76..a9489b22 100644 --- a/src/python/tools/preprocess/datasets/ogbn_arxiv.py +++ b/src/python/tools/preprocess/datasets/ogbn_arxiv.py @@ -82,6 +82,8 @@ def preprocess( train_edges=self.input_edge_list_file, num_partitions=num_partitions, columns=[0, 1], + src_column=0, + dst_column=1, remap_ids=remap_ids, sequential_train_nodes=sequential_train_nodes, delim=",", diff --git a/src/python/tools/preprocess/datasets/ogbn_papers100m.py b/src/python/tools/preprocess/datasets/ogbn_papers100m.py index 2dbb9ea6..eeb7ad31 100644 --- a/src/python/tools/preprocess/datasets/ogbn_papers100m.py +++ b/src/python/tools/preprocess/datasets/ogbn_papers100m.py @@ -82,6 +82,9 @@ def preprocess( format="pytorch", known_node_ids=[train_nodes, valid_nodes, test_nodes], partitioned_evaluation=partitioned_eval, + src_column=0, + dst_column=2, + edge_type_column=1, ) dataset_stats = converter.convert() diff --git a/src/python/tools/preprocess/datasets/ogbn_products.py b/src/python/tools/preprocess/datasets/ogbn_products.py index 135b0b8e..bade27c6 100644 --- a/src/python/tools/preprocess/datasets/ogbn_products.py +++ b/src/python/tools/preprocess/datasets/ogbn_products.py @@ -81,6 +81,8 @@ def preprocess( train_edges=self.input_edge_list_file, num_partitions=num_partitions, columns=[0, 1], + src_column=0, + dst_column=1, remap_ids=remap_ids, sequential_train_nodes=sequential_train_nodes, delim=",", diff --git a/src/python/tools/preprocess/datasets/twitter.py b/src/python/tools/preprocess/datasets/twitter.py index a66accc4..ec3fb0c1 100644 --- a/src/python/tools/preprocess/datasets/twitter.py +++ b/src/python/tools/preprocess/datasets/twitter.py @@ -33,7 +33,8 @@ def preprocess(self, num_partitions=1, remap_ids=True, splits=[0.9, 0.05, 0.05], output_dir=self.output_directory, train_edges=self.input_edges, delim=" ", - columns=[0, 1], + src_column=0, + dst_column=1, num_partitions=num_partitions, splits=splits, remap_ids=remap_ids, diff --git a/test/python/preprocessing/test_torch_converter.py b/test/python/preprocessing/test_torch_converter.py index e5e0cb35..3aaf6a6e 100644 --- a/test/python/preprocessing/test_torch_converter.py +++ b/test/python/preprocessing/test_torch_converter.py @@ -14,12 +14,19 @@ import torch # isort:skip -test_files = ["train_edges.txt", "valid_edges.txt", "test_edges.txt"] +test_files = ["train_edges.txt", "train_edges_weights.txt", "valid_edges.txt", "test_edges.txt"] def validate_partitioned_output_dir( - output_dir: Path, expected_stats: DatasetConfig, 
num_partitions, dtype=np.int32, partitioned_eval=False + output_dir: Path, + expected_stats: DatasetConfig, + num_partitions, + dtype=np.int32, + weight_dtype=np.float32, + partitioned_eval=False, + has_weights=False, ): + print("Validate partioned called with value", has_weights) validate_output_dir(output_dir, expected_stats, dtype, remap_ids=True) train_edges_path = output_dir / Path(PathConstants.train_edges_path) @@ -57,8 +64,24 @@ def validate_partitioned_output_dir( assert offset == expected_stats.num_train - -def validate_output_dir(output_dir: Path, expected_stats: DatasetConfig, dtype=np.int32, remap_ids=True): + print("Checking with has_weight of", has_weights) + if has_weights: + weights_file_path = output_dir / Path(PathConstants.train_edges_weights_path) + assert weights_file_path.exists() + values = np.fromfile(weights_file_path, dtype=weight_dtype) + values = np.sort(values) + for i in range(len(values)): + assert values[i] == float(i) + + +def validate_output_dir( + output_dir: Path, + expected_stats: DatasetConfig, + dtype=np.int32, + remap_ids=True, + has_weights=False, + weight_dtype=np.float32, +): assert output_dir.exists() assert (output_dir / Path("edges")).exists() assert (output_dir / Path("nodes")).exists() @@ -120,6 +143,14 @@ def validate_output_dir(output_dir: Path, expected_stats: DatasetConfig, dtype=n assert not node_mapping_path.exists() assert not relation_mapping_path.exists() + print("Checking with has_weight of", has_weights) + if has_weights: + weights_file_path = output_dir / Path(PathConstants.train_edges_weights_path) + assert weights_file_path.exists() + values = np.fromfile(weights_file_path, dtype=weight_dtype) + for i in range(len(values)): + assert values[i] == float(i) + class TestTorchConverter(unittest.TestCase): """ @@ -145,7 +176,12 @@ def test_delimited_defaults(self): output_dir.mkdir() converter = TorchEdgeListConverter( - output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), delim=" " + output_dir=output_dir, + train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), + delim=" ", + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -172,7 +208,12 @@ def test_delimited_str_ids(self): tmp.to_csv(Path(TMP_TEST_DIR) / Path("str_train_edges.txt"), header=None, sep=" ", index=False) converter = TorchEdgeListConverter( - output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path("str_train_edges.txt"), delim=" " + output_dir=output_dir, + train_edges=Path(TMP_TEST_DIR) / Path("str_train_edges.txt"), + delim=" ", + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -194,7 +235,14 @@ def test_numpy_defaults(self): train_edges = train_edges_df.to_numpy() - converter = TorchEdgeListConverter(output_dir=output_dir, train_edges=train_edges, format="numpy") + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + format="numpy", + src_column=0, + dst_column=2, + edge_type_column=1, + ) converter.convert() @@ -215,7 +263,14 @@ def test_pytorch_defaults(self): train_edges = torch.tensor(train_edges_df.to_numpy()) - converter = TorchEdgeListConverter(output_dir=output_dir, train_edges=train_edges, format="pytorch") + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + format="pytorch", + src_column=0, + dst_column=2, + edge_type_column=1, + ) converter.convert() @@ -237,6 +292,9 @@ def test_splits(self): train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), delim=" ", splits=[0.9, 0.05, 0.05], + 
src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -257,7 +315,11 @@ def test_columns(self): output_dir.mkdir() converter = TorchEdgeListConverter( - output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), delim=" ", columns=[0, 2] + output_dir=output_dir, + train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), + delim=" ", + src_column=0, + dst_column=2, ) converter.convert() @@ -285,6 +347,9 @@ def test_header(self): train_edges=Path(TMP_TEST_DIR) / Path("header_train_edges.txt"), delim=" ", header_length=1, + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -306,7 +371,12 @@ def test_delim(self): tmp.to_csv(Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"), header=None, sep=",", index=False) converter = TorchEdgeListConverter( - output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"), delim="," + output_dir=output_dir, + train_edges=Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"), + delim=",", + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -325,7 +395,13 @@ def test_dtype(self): output_dir.mkdir() converter = TorchEdgeListConverter( - output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), delim=" ", dtype="int64" + output_dir=output_dir, + train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), + delim=" ", + dtype="int64", + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -337,7 +413,13 @@ def test_dtype(self): expected_stats.num_relations = 10 expected_stats.num_train = 1000 - validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int64, remap_ids=True) + validate_output_dir( + output_dir=output_dir, + expected_stats=expected_stats, + dtype=np.int64, + weight_dtype=np.float64, + remap_ids=True, + ) def test_partitions(self): output_dir = Path(TMP_TEST_DIR) / Path("test_partitions") @@ -348,6 +430,9 @@ def test_partitions(self): train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), delim=" ", num_partitions=10, + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -368,6 +453,9 @@ def test_partitions(self): train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"), delim=" ", num_partitions=100, + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -387,6 +475,9 @@ def test_no_remap(self): remap_ids=False, num_nodes=100, num_rels=10, + src_column=0, + dst_column=2, + edge_type_column=1, ) converter.convert() @@ -399,3 +490,453 @@ def test_no_remap(self): expected_stats.num_train = 1000 validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=False) + + def test_torch_no_relation_no_remap(self): + remap_val = False + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ") + train_edges = torch.tensor(train_edges_df.to_numpy()) + + num_rows = train_edges.size(0) + train_edges = torch.column_stack((train_edges, torch.arange(num_rows))) + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + remap_ids=remap_val, + src_column=0, + dst_column=2, + num_nodes=100, + format="pytorch", + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + 
expected_stats.num_train = 1000 + + validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val) + + def test_pandas_no_relation_no_remap(self): + remap_val = False + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_file = Path(TMP_TEST_DIR) / Path("train_edges_weights.txt") + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges_file, + delim=" ", + remap_ids=remap_val, + src_column=0, + dst_column=2, + num_nodes=100, + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + expected_stats.num_train = 1000 + + validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val) + + def test_torch_no_relation_remap(self): + remap_val = True + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ") + train_edges = torch.tensor(train_edges_df.to_numpy()) + + num_rows = train_edges.size(0) + train_edges = torch.column_stack((train_edges, torch.arange(num_rows))) + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + remap_ids=remap_val, + src_column=0, + dst_column=2, + num_nodes=100, + format="pytorch", + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + expected_stats.num_train = 1000 + + validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val) + + def test_pandas_no_relation_remap(self): + remap_val = True + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_file = Path(TMP_TEST_DIR) / Path("train_edges_weights.txt") + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges_file, + delim=" ", + remap_ids=remap_val, + src_column=0, + dst_column=2, + num_nodes=100, + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + expected_stats.num_train = 1000 + + validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val) + + def test_torch_only_weights_no_remap(self): + remap_val = False + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ") + train_edges = torch.tensor(train_edges_df.to_numpy()) + + num_rows = train_edges.size(0) + train_edges = torch.column_stack((train_edges, torch.arange(num_rows))) + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_weight_column=3, + num_nodes=100, + format="pytorch", + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + expected_stats.num_train = 1000 + + 
validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_pandas_only_weights_no_remap(self): + remap_val = False + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_file = Path(TMP_TEST_DIR) / Path("train_edges_weights.txt") + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges_file, + delim=" ", + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_weight_column=3, + num_nodes=100, + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + expected_stats.num_train = 1000 + + validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_torch_only_weights_remap(self): + remap_val = True + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ") + train_edges = torch.tensor(train_edges_df.to_numpy()) + + num_rows = train_edges.size(0) + train_edges = torch.column_stack((train_edges, torch.arange(num_rows))) + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_weight_column=3, + num_nodes=100, + format="pytorch", + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + expected_stats.num_train = 1000 + + validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_pandas_only_weights_remap(self): + remap_val = True + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_file = Path(TMP_TEST_DIR) / Path("train_edges_weights.txt") + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges_file, + delim=" ", + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_weight_column=3, + num_nodes=100, + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 1 + expected_stats.num_train = 1000 + + validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_torch_relationship_weights_no_remap(self): + remap_val = False + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ") + train_edges = torch.tensor(train_edges_df.to_numpy()) + + num_rows = train_edges.size(0) + train_edges = torch.column_stack((train_edges, torch.arange(num_rows))) + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_type_column=1, + edge_weight_column=3, + num_nodes=100, + num_rels=10, + format="pytorch", + ) + converter.convert() + + expected_stats = DatasetConfig() + 
expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 10 + expected_stats.num_train = 1000 + + validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_pandas_relationship_weights_no_remap(self): + remap_val = False + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_file = Path(TMP_TEST_DIR) / Path("train_edges_weights.txt") + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges_file, + delim=" ", + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_type_column=1, + edge_weight_column=3, + num_nodes=100, + num_rels=10, + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 10 + expected_stats.num_train = 1000 + + validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_torch_relationship_weights_remap(self): + remap_val = True + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ") + train_edges = torch.tensor(train_edges_df.to_numpy()) + + num_rows = train_edges.size(0) + train_edges = torch.column_stack((train_edges, torch.arange(num_rows))) + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_type_column=1, + edge_weight_column=3, + num_nodes=100, + format="pytorch", + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 10 + expected_stats.num_train = 1000 + + validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_pandas_relationship_weights_remap(self): + remap_val = True + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_file = Path(TMP_TEST_DIR) / Path("train_edges_weights.txt") + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges_file, + delim=" ", + remap_ids=remap_val, + src_column=0, + dst_column=2, + edge_type_column=1, + edge_weight_column=3, + num_nodes=100, + num_rels=10, + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 10 + expected_stats.num_train = 1000 + + validate_output_dir( + output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True + ) + + def test_torch_relationship_weights_remap_partioned(self): + num_paritions = 10 + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ") + train_edges = torch.tensor(train_edges_df.to_numpy()) + + num_rows = train_edges.size(0) + train_edges = torch.column_stack((train_edges, torch.arange(num_rows))) + + 
converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges, + src_column=0, + dst_column=2, + edge_type_column=1, + edge_weight_column=3, + num_partitions=num_paritions, + format="pytorch", + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 10 + expected_stats.num_train = 1000 + + validate_partitioned_output_dir( + output_dir=output_dir, + expected_stats=expected_stats, + dtype=np.int32, + num_partitions=num_paritions, + has_weights=True, + ) + + def test_pandas_relationship_weights_remap_partioned(self): + num_paritions = 10 + output_dir = Path(TMP_TEST_DIR) / Path("test_torch_defaults") + output_dir.mkdir() + + train_edges_file = Path(TMP_TEST_DIR) / Path("train_edges_weights.txt") + + converter = TorchEdgeListConverter( + output_dir=output_dir, + train_edges=train_edges_file, + delim=" ", + src_column=0, + dst_column=2, + edge_type_column=1, + edge_weight_column=3, + num_partitions=num_paritions, + ) + converter.convert() + + expected_stats = DatasetConfig() + expected_stats.dataset_dir = output_dir.__str__() + expected_stats.num_edges = 1000 + expected_stats.num_nodes = 100 + expected_stats.num_relations = 10 + expected_stats.num_train = 1000 + + validate_partitioned_output_dir( + output_dir=output_dir, + expected_stats=expected_stats, + dtype=np.int32, + num_partitions=num_paritions, + has_weights=True, + ) diff --git a/test/test_data/generate.py b/test/test_data/generate.py index 96706ede..d93bcefa 100644 --- a/test/test_data/generate.py +++ b/test/test_data/generate.py @@ -86,10 +86,11 @@ def generate_random_dataset_nc( edges = get_random_graph(num_nodes, num_edges, num_rels) edges_df = pd.DataFrame(data=edges) + src_col, dst_col, edge_type_col = None, None, None if edges.shape[1] == 3: - columns = [0, 1, 2] + src_col, dst_col, edge_type_col = 0, 2, 1 else: - columns = [0, 1] + src_col, dst_col = 0, 1 raw_edges_filename = output_dir / Path("raw_edges.csv") edges_df.to_csv(raw_edges_filename, ",", header=False, index=False) @@ -100,7 +101,9 @@ def generate_random_dataset_nc( valid_nodes = None test_nodes = None if splits is not None: - train_nodes, valid_nodes, test_nodes = split_edges(all_nodes, splits) + train_nodes, train_weights, valid_nodes, valid_weights, test_nodes, test_weights = split_edges( + all_nodes, None, splits + ) converter = TorchEdgeListConverter( output_dir, @@ -108,11 +111,13 @@ def generate_random_dataset_nc( delim=",", remap_ids=remap_ids, num_partitions=num_partitions, - columns=columns, partitioned_evaluation=partitioned_eval, sequential_train_nodes=sequential_train_nodes, known_node_ids=[train_nodes, valid_nodes, test_nodes], format="CSV", + src_column=src_col, + dst_column=dst_col, + edge_type_column=edge_type_col, ) dataset_stats = converter.convert() @@ -193,10 +198,11 @@ def generate_random_dataset_lp( edges = get_random_graph(num_nodes, num_edges, num_rels) edges_df = pd.DataFrame(data=edges) + src_col, dst_col, edge_type_col = None, None, None if edges.shape[1] == 3: - columns = [0, 1, 2] + src_col, dst_col, edge_type_col = 0, 2, 1 else: - columns = [0, 1] + src_col, dst_col = 0, 1 raw_edges_filename = output_dir / Path("raw_edges.csv") @@ -209,10 +215,12 @@ def generate_random_dataset_lp( splits=splits, num_partitions=num_partitions, remap_ids=remap_ids, - columns=columns, partitioned_evaluation=partitioned_eval, sequential_train_nodes=sequential_train_nodes, 
format="CSV", + src_column=src_col, + dst_column=dst_col, + edge_type_column=edge_type_col, ) dataset_stats = converter.convert() diff --git a/test/test_data/train_edges_weights.txt b/test/test_data/train_edges_weights.txt new file mode 100644 index 00000000..5baa0644 --- /dev/null +++ b/test/test_data/train_edges_weights.txt @@ -0,0 +1,1000 @@ +80 6 73 0 +83 8 2 1 +50 8 66 2 +64 5 42 3 +31 5 91 4 +40 8 92 5 +18 2 32 6 +21 5 64 7 +47 8 19 8 +71 2 71 9 +12 5 11 10 +76 6 58 11 +12 6 24 12 +69 9 11 13 +12 3 55 14 +77 4 14 15 +12 8 8 16 +29 5 14 17 +46 8 8 18 +30 0 60 19 +46 6 7 20 +51 6 69 21 +0 2 52 22 +81 9 26 23 +50 0 78 24 +59 9 93 25 +62 5 12 26 +93 0 14 27 +72 7 31 28 +46 2 12 29 +44 3 67 30 +45 1 46 31 +0 2 56 32 +68 7 49 33 +51 2 21 34 +66 9 99 35 +93 2 74 36 +59 9 9 37 +12 6 3 38 +26 6 11 39 +8 7 13 40 +46 8 70 41 +50 8 2 42 +10 8 5 43 +20 1 3 44 +43 3 46 45 +51 5 70 46 +73 4 74 47 +95 7 50 48 +59 8 12 49 +46 4 99 50 +20 3 55 51 +39 3 24 52 +28 8 8 53 +31 5 22 54 +84 3 95 55 +48 3 50 56 +81 4 10 57 +66 7 4 58 +15 2 78 59 +68 6 23 60 +55 0 0 61 +58 0 48 62 +75 4 50 63 +9 4 20 64 +48 9 87 65 +97 3 94 66 +44 9 83 67 +87 8 37 68 +74 6 33 69 +10 9 8 70 +81 3 18 71 +42 0 7 72 +74 3 37 73 +37 7 33 74 +35 7 47 75 +19 4 8 76 +19 4 78 77 +87 4 2 78 +39 2 21 79 +79 8 74 80 +21 1 24 81 +25 5 33 82 +24 2 33 83 +12 6 98 84 +47 8 6 85 +30 2 94 86 +61 1 78 87 +80 0 83 88 +91 1 97 89 +24 6 5 90 +32 8 82 91 +40 7 34 92 +68 6 98 93 +76 8 19 94 +90 3 40 95 +90 1 77 96 +11 4 49 97 +10 3 82 98 +39 9 2 99 +15 0 85 100 +85 3 81 101 +67 2 80 102 +0 8 58 103 +77 9 48 104 +93 6 20 105 +67 4 62 106 +51 9 36 107 +74 3 76 108 +7 4 94 109 +23 9 46 110 +2 5 32 111 +48 7 49 112 +35 6 19 113 +52 2 33 114 +31 1 2 115 +54 3 26 116 +63 3 85 117 +40 1 43 118 +57 7 51 119 +74 3 59 120 +11 1 82 121 +13 9 23 122 +70 5 83 123 +6 2 25 124 +86 7 59 125 +71 0 62 126 +77 0 82 127 +63 8 88 128 +4 7 10 129 +36 6 73 130 +77 5 58 131 +6 2 4 132 +89 8 84 133 +8 8 80 134 +27 3 32 135 +22 1 96 136 +58 0 45 137 +62 8 19 138 +10 4 67 139 +5 7 21 140 +18 7 3 141 +59 0 96 142 +17 6 49 143 +82 3 39 144 +41 2 24 145 +43 0 22 146 +4 5 79 147 +76 7 29 148 +13 7 3 149 +2 9 52 150 +65 9 37 151 +46 1 65 152 +72 0 67 153 +42 8 83 154 +92 3 72 155 +46 4 97 156 +7 1 35 157 +10 5 23 158 +39 4 28 159 +78 3 7 160 +23 0 94 161 +86 1 22 162 +13 6 47 163 +15 4 8 164 +63 4 73 165 +63 7 54 166 +51 8 22 167 +74 7 90 168 +55 9 68 169 +55 8 89 170 +95 4 86 171 +70 8 34 172 +11 1 42 173 +74 8 32 174 +90 9 33 175 +25 8 65 176 +60 1 59 177 +34 9 45 178 +59 8 53 179 +1 2 75 180 +8 5 63 181 +79 9 30 182 +21 9 32 183 +4 5 2 184 +40 1 94 185 +3 2 20 186 +20 5 11 187 +52 9 77 188 +60 4 38 189 +22 8 68 190 +64 4 26 191 +44 7 32 192 +82 7 62 193 +58 8 55 194 +7 3 18 195 +15 6 53 196 +21 6 62 197 +99 0 22 198 +37 1 51 199 +1 6 46 200 +68 5 78 201 +34 0 92 202 +9 4 41 203 +8 5 46 204 +43 1 87 205 +96 5 78 206 +84 7 43 207 +72 3 60 208 +59 7 57 209 +28 0 83 210 +93 5 34 211 +78 2 36 212 +15 2 89 213 +68 3 71 214 +51 1 26 215 +67 2 67 216 +68 2 79 217 +85 3 66 218 +68 3 74 219 +21 3 28 220 +25 8 87 221 +82 3 67 222 +36 5 2 223 +38 9 12 224 +30 1 25 225 +89 7 45 226 +31 1 7 227 +22 8 72 228 +30 4 56 229 +14 7 60 230 +26 4 74 231 +74 0 1 232 +42 3 70 233 +91 0 85 234 +74 5 87 235 +83 0 0 236 +14 0 33 237 +48 4 18 238 +47 7 3 239 +34 8 74 240 +91 7 3 241 +13 6 56 242 +5 6 19 243 +43 5 80 244 +45 5 68 245 +41 2 29 246 +88 3 83 247 +39 4 42 248 +31 1 4 249 +51 6 13 250 +49 0 59 251 +0 0 37 252 +28 6 41 253 +58 0 94 254 +86 1 86 255 +96 0 22 256 +11 7 91 257 +61 2 5 258 +93 6 55 
259 +17 5 63 260 +47 2 17 261 +93 6 42 262 +96 5 4 263 +73 1 35 264 +41 6 46 265 +8 3 69 266 +5 7 9 267 +38 3 27 268 +7 9 61 269 +10 9 75 270 +55 9 37 271 +53 1 18 272 +9 8 19 273 +58 1 56 274 +10 7 90 275 +15 2 13 276 +47 3 45 277 +74 6 60 278 +38 5 40 279 +32 4 30 280 +9 2 74 281 +85 5 37 282 +74 9 13 283 +4 5 37 284 +17 5 20 285 +88 8 11 286 +5 5 70 287 +71 2 74 288 +88 7 4 289 +71 4 89 290 +50 7 50 291 +3 2 77 292 +8 6 83 293 +30 9 74 294 +87 7 3 295 +58 3 32 296 +48 4 1 297 +93 5 99 298 +15 4 48 299 +59 6 18 300 +13 5 14 301 +42 0 4 302 +97 0 55 303 +41 7 7 304 +45 1 70 305 +47 1 49 306 +72 9 73 307 +73 6 18 308 +12 4 57 309 +65 6 2 310 +7 9 52 311 +76 3 78 312 +60 4 70 313 +69 2 17 314 +65 9 25 315 +44 7 7 316 +59 9 15 317 +39 4 7 318 +91 9 26 319 +82 9 51 320 +70 2 28 321 +29 3 38 322 +52 9 35 323 +22 5 83 324 +5 7 5 325 +61 7 98 326 +12 9 65 327 +44 7 89 328 +62 9 6 329 +87 4 26 330 +66 4 10 331 +84 9 49 332 +68 9 39 333 +56 0 52 334 +26 6 22 335 +42 6 64 336 +61 9 90 337 +78 5 39 338 +71 7 19 339 +1 0 89 340 +87 4 23 341 +23 0 52 342 +94 4 57 343 +7 0 85 344 +98 4 89 345 +87 7 39 346 +94 8 4 347 +45 1 93 348 +99 8 45 349 +21 1 79 350 +65 9 97 351 +85 5 14 352 +45 0 65 353 +41 5 12 354 +58 3 27 355 +88 4 86 356 +13 1 8 357 +71 4 39 358 +66 2 22 359 +89 6 53 360 +13 7 66 361 +61 5 91 362 +99 0 73 363 +76 3 3 364 +7 3 51 365 +61 6 93 366 +63 0 13 367 +33 7 96 368 +6 2 69 369 +68 2 65 370 +76 4 9 371 +66 0 37 372 +4 4 63 373 +76 2 26 374 +28 5 63 375 +92 9 82 376 +1 1 49 377 +43 5 20 378 +34 0 18 379 +38 7 2 380 +74 3 72 381 +71 5 76 382 +53 8 58 383 +61 7 45 384 +57 9 55 385 +79 7 87 386 +55 5 95 387 +10 1 54 388 +83 1 32 389 +74 6 61 390 +50 1 1 391 +89 5 87 392 +54 7 40 393 +83 7 48 394 +20 1 76 395 +57 2 80 396 +18 7 54 397 +56 2 13 398 +9 4 15 399 +76 7 48 400 +20 8 29 401 +34 3 95 402 +80 0 85 403 +79 4 17 404 +94 2 23 405 +46 2 94 406 +13 8 70 407 +31 2 28 408 +63 8 49 409 +83 2 97 410 +51 6 28 411 +64 0 5 412 +19 9 52 413 +69 0 27 414 +80 7 4 415 +39 9 81 416 +98 9 82 417 +28 9 81 418 +73 9 58 419 +68 7 40 420 +72 4 48 421 +9 2 65 422 +34 3 35 423 +62 0 3 424 +73 8 54 425 +13 2 38 426 +50 0 29 427 +81 2 96 428 +48 3 4 429 +58 5 97 430 +22 1 91 431 +41 7 14 432 +47 1 0 433 +44 8 58 434 +77 6 92 435 +65 6 73 436 +8 8 61 437 +74 0 2 438 +21 0 83 439 +80 9 92 440 +53 0 34 441 +85 8 55 442 +53 3 83 443 +32 6 33 444 +52 3 14 445 +34 1 14 446 +45 0 55 447 +93 5 79 448 +33 9 65 449 +79 7 27 450 +5 9 4 451 +99 7 26 452 +26 2 78 453 +36 4 9 454 +56 6 92 455 +82 7 21 456 +82 9 46 457 +99 2 90 458 +57 6 25 459 +97 4 4 460 +66 7 53 461 +79 3 23 462 +56 5 16 463 +23 8 88 464 +61 9 36 465 +27 1 51 466 +7 1 93 467 +27 7 38 468 +15 1 60 469 +83 1 5 470 +58 2 6 471 +14 4 95 472 +33 3 90 473 +45 8 88 474 +96 5 24 475 +42 5 94 476 +46 6 80 477 +31 2 65 478 +59 6 4 479 +16 4 13 480 +10 2 41 481 +81 3 73 482 +83 0 68 483 +11 0 26 484 +52 2 11 485 +75 3 81 486 +89 5 29 487 +75 9 66 488 +87 4 15 489 +73 3 10 490 +4 9 67 491 +76 2 35 492 +15 0 43 493 +37 5 93 494 +37 2 55 495 +61 4 12 496 +2 2 81 497 +4 0 69 498 +1 8 95 499 +7 4 72 500 +9 1 16 501 +25 8 88 502 +8 2 74 503 +65 3 30 504 +83 3 67 505 +42 4 1 506 +36 3 30 507 +19 1 23 508 +76 5 90 509 +83 8 13 510 +31 6 79 511 +87 6 36 512 +7 1 74 513 +0 6 69 514 +30 1 52 515 +57 0 89 516 +0 2 62 517 +55 8 25 518 +28 8 13 519 +50 9 20 520 +44 1 33 521 +48 2 77 522 +93 5 56 523 +29 6 97 524 +93 3 21 525 +4 2 94 526 +26 7 43 527 +20 0 28 528 +76 6 63 529 +15 5 66 530 +59 1 60 531 +29 4 7 532 +41 7 27 533 +40 4 97 534 +10 2 43 535 +44 6 76 536 +73 9 38 537 
+88 4 89 538 +44 9 21 539 +73 9 17 540 +8 5 21 541 +9 0 85 542 +84 0 48 543 +36 3 89 544 +58 2 25 545 +27 5 5 546 +13 1 90 547 +50 3 51 548 +3 8 41 549 +79 3 69 550 +73 5 75 551 +71 6 32 552 +95 4 65 553 +65 0 98 554 +12 1 46 555 +93 8 60 556 +81 7 95 557 +48 5 30 558 +8 8 14 559 +83 1 47 560 +38 8 37 561 +58 7 12 562 +52 1 89 563 +86 0 0 564 +36 1 69 565 +20 0 56 566 +71 3 2 567 +94 6 92 568 +20 7 14 569 +53 2 1 570 +50 2 77 571 +91 6 57 572 +28 1 15 573 +26 9 97 574 +52 5 73 575 +19 7 32 576 +5 7 63 577 +27 7 73 578 +5 7 13 579 +48 9 89 580 +13 5 84 581 +48 8 11 582 +12 5 66 583 +13 8 39 584 +10 5 35 585 +30 0 79 586 +41 8 79 587 +72 9 70 588 +82 2 93 589 +49 9 5 590 +85 7 48 591 +95 4 22 592 +58 6 7 593 +45 5 87 594 +81 8 46 595 +69 7 99 596 +34 0 29 597 +57 3 57 598 +65 0 84 599 +29 3 78 600 +12 4 10 601 +93 7 5 602 +74 9 99 603 +53 0 77 604 +26 3 87 605 +62 0 99 606 +12 3 73 607 +58 3 92 608 +42 7 46 609 +98 7 15 610 +33 5 82 611 +51 3 66 612 +39 0 18 613 +23 0 14 614 +64 8 22 615 +31 9 42 616 +96 0 91 617 +73 0 21 618 +69 5 15 619 +46 7 47 620 +82 6 87 621 +96 3 79 622 +1 8 69 623 +31 7 5 624 +16 3 90 625 +45 7 94 626 +58 2 82 627 +51 0 44 628 +43 7 34 629 +2 3 26 630 +99 1 48 631 +17 8 45 632 +37 1 38 633 +12 5 81 634 +79 9 35 635 +69 3 76 636 +13 8 21 637 +8 5 67 638 +41 5 30 639 +74 2 53 640 +56 9 70 641 +86 6 8 642 +47 8 44 643 +46 9 82 644 +0 4 14 645 +80 1 47 646 +20 8 18 647 +83 2 22 648 +75 9 82 649 +71 8 55 650 +0 5 46 651 +93 7 11 652 +65 3 22 653 +26 8 88 654 +4 8 18 655 +23 5 6 656 +32 6 22 657 +26 3 94 658 +40 2 16 659 +4 0 77 660 +82 2 71 661 +2 8 74 662 +90 0 9 663 +92 4 98 664 +48 8 44 665 +47 2 53 666 +58 9 2 667 +97 9 12 668 +5 5 67 669 +24 9 56 670 +99 2 85 671 +19 1 14 672 +88 2 47 673 +95 2 49 674 +14 6 57 675 +56 7 94 676 +84 5 31 677 +5 6 96 678 +94 0 0 679 +33 0 38 680 +24 0 83 681 +77 5 62 682 +73 2 28 683 +53 4 21 684 +4 0 46 685 +30 5 34 686 +9 6 4 687 +11 3 31 688 +1 1 3 689 +86 5 42 690 +31 1 13 691 +73 4 13 692 +36 9 13 693 +27 4 2 694 +5 2 48 695 +60 9 19 696 +96 2 52 697 +69 9 96 698 +17 2 2 699 +73 8 67 700 +71 9 58 701 +31 1 54 702 +38 5 82 703 +3 0 67 704 +69 3 25 705 +50 6 98 706 +93 9 4 707 +48 7 47 708 +19 3 13 709 +40 5 77 710 +21 2 42 711 +42 1 23 712 +14 3 29 713 +42 4 38 714 +76 0 34 715 +85 6 0 716 +91 1 79 717 +75 8 58 718 +60 1 44 719 +29 2 4 720 +88 0 37 721 +53 8 28 722 +88 8 10 723 +54 6 24 724 +25 6 56 725 +26 8 79 726 +76 2 87 727 +36 9 84 728 +38 3 68 729 +84 7 50 730 +60 6 84 731 +60 3 24 732 +86 3 49 733 +52 7 56 734 +59 1 77 735 +26 4 19 736 +92 8 94 737 +18 3 6 738 +40 2 56 739 +38 2 49 740 +60 6 11 741 +35 9 30 742 +4 9 17 743 +24 5 51 744 +33 5 2 745 +3 7 82 746 +99 8 57 747 +61 9 28 748 +11 7 28 749 +31 6 73 750 +67 4 68 751 +43 5 56 752 +49 6 57 753 +78 2 87 754 +94 6 93 755 +85 2 47 756 +65 1 99 757 +98 1 63 758 +47 3 2 759 +50 8 4 760 +42 5 30 761 +77 0 85 762 +67 9 65 763 +26 3 65 764 +59 1 24 765 +36 0 76 766 +68 3 95 767 +34 6 96 768 +61 5 7 769 +44 0 59 770 +30 7 15 771 +81 2 14 772 +78 4 30 773 +20 3 65 774 +85 6 42 775 +41 7 43 776 +51 2 6 777 +26 7 25 778 +92 5 49 779 +90 0 61 780 +11 8 15 781 +77 2 31 782 +30 9 48 783 +88 9 93 784 +90 5 70 785 +57 5 17 786 +18 9 23 787 +56 2 82 788 +25 7 34 789 +26 1 9 790 +91 9 30 791 +49 8 99 792 +96 8 88 793 +93 2 65 794 +36 8 67 795 +40 5 76 796 +8 2 31 797 +92 4 66 798 +92 4 28 799 +13 2 73 800 +4 1 30 801 +83 4 6 802 +96 0 3 803 +12 9 45 804 +85 5 29 805 +34 0 39 806 +51 7 97 807 +3 9 85 808 +19 5 73 809 +92 2 38 810 +51 5 83 811 +71 9 79 812 +83 4 60 813 +62 8 77 814 +0 9 32 
815 +70 7 95 816 +72 6 0 817 +69 4 95 818 +3 1 43 819 +62 9 20 820 +76 9 85 821 +84 4 79 822 +21 1 3 823 +20 5 83 824 +91 2 22 825 +83 3 21 826 +75 6 25 827 +56 1 74 828 +31 2 30 829 +66 8 3 830 +19 9 37 831 +19 5 11 832 +81 5 93 833 +68 4 38 834 +37 2 39 835 +56 8 97 836 +82 5 58 837 +81 2 65 838 +98 5 40 839 +78 6 53 840 +18 5 45 841 +42 9 29 842 +75 9 93 843 +99 8 14 844 +97 7 35 845 +33 4 41 846 +36 8 85 847 +42 3 54 848 +58 7 50 849 +3 7 53 850 +64 3 80 851 +0 7 23 852 +98 5 30 853 +71 8 86 854 +37 8 11 855 +90 2 12 856 +5 9 41 857 +54 9 58 858 +14 4 96 859 +16 5 97 860 +1 9 15 861 +41 4 9 862 +32 5 17 863 +96 7 71 864 +83 4 61 865 +21 3 81 866 +28 9 31 867 +96 8 39 868 +90 5 46 869 +65 6 63 870 +50 4 7 871 +43 7 21 872 +23 9 76 873 +54 0 47 874 +39 8 11 875 +71 4 90 876 +47 8 99 877 +46 5 71 878 +90 4 57 879 +81 4 89 880 +43 1 90 881 +32 1 72 882 +0 4 70 883 +47 5 34 884 +43 1 28 885 +13 1 69 886 +49 4 9 887 +36 7 38 888 +94 0 24 889 +64 4 11 890 +53 7 12 891 +17 5 12 892 +96 2 69 893 +99 7 75 894 +70 4 85 895 +93 6 64 896 +61 7 2 897 +47 2 50 898 +50 1 58 899 +3 4 18 900 +41 2 31 901 +45 2 49 902 +98 2 83 903 +88 2 40 904 +34 2 59 905 +86 2 99 906 +49 4 28 907 +20 0 24 908 +98 0 0 909 +51 4 78 910 +66 8 50 911 +37 2 77 912 +62 5 53 913 +97 1 20 914 +84 2 15 915 +48 3 95 916 +18 4 17 917 +20 5 9 918 +56 0 24 919 +90 8 64 920 +13 7 5 921 +80 5 19 922 +49 1 33 923 +20 2 12 924 +92 8 4 925 +25 7 28 926 +47 7 24 927 +84 4 61 928 +2 7 84 929 +0 0 25 930 +13 9 62 931 +17 4 4 932 +1 0 96 933 +59 6 6 934 +50 5 76 935 +69 1 60 936 +64 0 82 937 +37 0 96 938 +57 0 77 939 +60 5 89 940 +83 3 1 941 +23 5 86 942 +54 5 87 943 +83 8 76 944 +12 4 15 945 +13 6 86 946 +89 7 97 947 +12 8 2 948 +26 0 13 949 +64 4 48 950 +3 1 12 951 +86 1 68 952 +78 8 4 953 +96 3 14 954 +64 7 71 955 +51 7 72 956 +66 5 73 957 +86 4 17 958 +1 1 82 959 +91 9 71 960 +50 5 88 961 +60 6 81 962 +57 5 45 963 +30 7 6 964 +50 1 11 965 +84 7 30 966 +66 6 86 967 +39 4 47 968 +29 8 1 969 +82 7 30 970 +82 2 54 971 +35 3 74 972 +38 9 9 973 +64 8 88 974 +74 6 51 975 +58 4 30 976 +8 4 6 977 +72 3 63 978 +81 4 44 979 +90 4 1 980 +91 3 62 981 +19 4 53 982 +2 9 78 983 +70 0 84 984 +89 1 74 985 +66 3 0 986 +95 5 73 987 +44 9 94 988 +18 3 87 989 +6 5 90 990 +42 9 45 991 +17 1 41 992 +81 6 70 993 +72 0 42 994 +45 8 43 995 +16 5 31 996 +61 5 69 997 +87 3 6 998 +80 7 33 999 diff --git a/tox.ini b/tox.ini index 9745965f..14c65988 100644 --- a/tox.ini +++ b/tox.ini @@ -87,3 +87,4 @@ allowlist_externals = /usr/bin/clang-format /usr/local/bin/clang-format /usr/local/bin/bash + \ No newline at end of file