diff --git a/docs/user_guide/command_line_interface.rst b/docs/user_guide/command_line_interface.rst index 0f8d9068..06ece971 100644 --- a/docs/user_guide/command_line_interface.rst +++ b/docs/user_guide/command_line_interface.rst @@ -31,45 +31,49 @@ The available options: :: - usage: preprocess [-h] [--files files [files ...]] [--dataset dataset] [--num_partitions num_partitions] [--overwrite] - [--generate_config [generate_config]] [--format format] [--delim delim] [--dtype dtype] [--not_remap_ids] - [--dataset_split dataset_split dataset_split] [--start_col start_col] [--num_line_skip num_line_skip] - output_directory - - Preprocess Datasets - - positional arguments: - output_directory Directory to put graph data - - optional arguments: - -h, --help show this help message and exit - --files files [files ...] - Files containing custom dataset - --dataset dataset Supported dataset to preprocess - --num_partitions num_partitions - Number of partitions to split the edges into - --overwrite Overwrites the output_directory if this issetOtherwise, files with same the names will be treated as the data for current dataset. - --generate_config [generate_config], -gc [generate_config] - Generates a single-GPU/multi-CPU/multi-GPU training configuration file by default. - Valid options (default to GPU): [GPU, CPU, multi-GPU] - --format format Format of data, eg. srd - --delim delim, -d delim - Specifies the delimiter - --dtype dtype Indicates the numpy.dtype - --not_remap_ids If set, will not remap ids - --dataset_split dataset_split dataset_split, -ds dataset_split dataset_split - Split dataset into specified fractions - --start_col start_col, -sc start_col - Indicates the column index to start from - --num_line_skip num_line_skip, -nls num_line_skip - Indicates number of lines to skip from the beginning - - Specify certain config (optional): [--
.=] + usage: preprocess [-h] [--download_directory download_directory] [--input_files input_files [input_files ...] | + --dataset dataset] [--num_partitions num_partitions] + [--generate_template_config [generate_template_config]] [--format format] [--delim delim] + [--remap_id_dtype remap_id_dtype] [--not_remap_ids] [--dataset_split dataset_split dataset_split] + [--start_col start_col] [--num_line_skip num_line_skip] + output_directory + + Preprocess Datasets + + positional arguments: + output_directory Directory to put preprocessed graph data. + + optional arguments: + -h, --help show this help message and exit + --download_directory download_directory + Directory to put downloaded data files for supported datasets. + --input_files input_files [input_files ...] + Input files of custom dataset + --dataset dataset Name of supported dataset to preprocess + --num_partitions num_partitions + Number of partitions to split the edges into + --generate_template_config [generate_template_config], -gtc [generate_template_config] + Generates a single-GPU training configuration file which contains parameters with default values. + Valid options (default to GPU): [GPU, CPU, multi-GPU] + --format format Specifies the sequence of source, destination (and relation) in input data files, eg. srd + --delim delim, -d delim + Specifies the delimiter between source, (relation,) destination strings in input data files. + --remap_id_dtype remap_id_dtype + Indicates the data format to store the remapped IDs. + --not_remap_ids If set, will not remap ids + --dataset_split dataset_split dataset_split, -ds dataset_split dataset_split + Split dataset into specified fractions + --start_col start_col, -sc start_col + Indicates the column index to start parsing source/destination nodes( or relation). + --num_line_skip num_line_skip, -nls num_line_skip + Indicates number of lines/rows to skip from the beginning of the file. + + Specify certain config (optional): [--
<section>.<key>=<value>] output_directory ++++++++++++++++ ``<output_directory>`` is a **required** argument for ``marius_preprocess``. -It is the directory where all the files created by ``marius_preprocess`` wil be stored. +It is the directory where all the preprocessed files created by ``marius_preprocess`` will be stored. ``marius_preprocess`` will create this file if it does not exist. ``marius_preprocess`` outputs the following files to ``<output_directory>``. For the preprocessing of supported datasets, ``<output_directory>`` also includes @@ -105,24 +109,32 @@ The source, relation and destination of edge ``i`` can be retrieved from files by reading 3 4-byte integers (or 8-byte integers if using int64 data type for storage) at the offset in the file ``i * 3 * 4`` (or ``i * 3 * 8`` when using int64). -\-\-files +\-\-download_directory ++++++++++++++++++++++++++++++++++++++++++++ +``--download_directory`` is an **optional** argument for ``marius_preprocess``. +It is the directory where ``marius_preprocess`` puts all downloaded files for +:ref:`built-in datasets`. The default value of this argument is ``download_dir``. + +\-\-input_files +++++++++++++++++++++ -``--files`` is an **optional** argument for ``marius_preprocess``. +``--input_files`` is an **optional** argument for ``marius_preprocess``. It should be a list of files containing custom dataset. It should not be used -at the same time when ``--dataset`` is used. +at the same time when ``--dataset`` is used. The input dataset files should +have a columnar format where each edge occupies its own row and is composed of +a source node, a destination node (and a relation) separated by a delimiter. For example, the following command preprocesses the custom dataset composed of ``custom_train.csv``, ``custom_valid.csv`` and ``custom_test.csv`` and stores them into directory ``output_dir``. :: - marius_preprocess output_dir --files custom_train.csv custom_valid.csv custom_test.csv + marius_preprocess output_dir --input_files custom_train.csv custom_valid.csv custom_test.csv \-\-dataset +++++++++++++++++++++ ``--dataset`` is an **optional** argument for ``marius_preprocess``. It can be one of the names of a Marius supported dataset. -It should not be used at the same time when ``--files`` is used. +It should not be used at the same time when ``--input_files`` is used. To see which datasets are supported by Marius, check out :ref:`dataset` table. @@ -135,40 +147,38 @@ The default value for ``<num_partitions>`` is one. \-\-overwrite +++++++++++++ ``--overwrite`` is an **optional** argument for ``marius_preprocess``. If this option is set, then -the ``<output_directory>`` will be overwritten. Otherwise, ``marius_preprocess`` will treat the files -in ``<output_directory>`` with the same file names as the latest files for current run. When switching -from one dataset to another one, the converted data files of the previous dataset in same ``<output_directory>`` -may be treated as the already-preprocessed data files for the current dataset if this option is not set. +the ``<output_directory>`` and ``<download_directory>`` will be removed before preprocessing starts +to prevent files left over from previous runs from interfering with files from the current run.
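As a concrete illustration of the edge file layout described under ``output_directory`` above, the following sketch (assumptions: the default ``int32`` remapped IDs, a dataset with relations, and the example paths used in this guide; use ``np.int64`` if ``--remap_id_dtype int64`` was passed) reads edge ``i`` back from ``train_edges.pt`` with NumPy::

    import numpy as np

    # Each edge is stored as 3 consecutive integers: source, relation, destination.
    edges = np.fromfile("output_dir/train_edges.pt", dtype=np.int32).reshape(-1, 3)

    i = 0
    src, rel, dst = edges[i]   # same values as reading 3 int32s at byte offset i * 3 * 4
    print(src, rel, dst)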
-\-\-generate_config <device>, \-gc <device> -+++++++++++++++++++++++++++++++++++++++++++ -``--generate_config <device>, -gc <device>`` is an **optional** argument for ``marius_preprocess``. +\-\-generate_template_config <device>, \-gtc <device> +++++++++++++++++++++++++++++++++++++++++++++++++++++ +``--generate_template_config <device>, -gtc <device>`` is an **optional** argument for ``marius_preprocess``. If this option is set, ``marius_preprocess`` will generate a Marius configuration file in the ``<output_directory>`` with all configuration parameters set to the recommended defaults if not explicitly defined. The generated Marius configuration is for single-GPU setting by default if ``<device>`` is not set. If other device, such as ``CPU`` or ``multi-GPU``, is required, users can just append the option after -``--generate_config``, e.g. ``--generate_config CPU``. +``--generate_template_config``, e.g. ``--generate_template_config CPU``. For example, the following example will set ``general.device=CPU`` in the Marius configuration file generated for dataset WordNet18 (``wn18_cpu.ini``). :: - marius_preprocess ./output_dir --dataset wn18 --generate_config CPU + marius_preprocess ./output_dir --dataset wn18 --generate_template_config CPU
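The generated file (``wn18_cpu.ini`` in the example above) can also be inspected or adjusted after generation. A minimal sketch, assuming the file is standard INI syntax readable by Python's ``configparser`` and that the ``general``/``training`` section and key names shown in this guide are present::

    from configparser import ConfigParser

    config = ConfigParser()
    config.read("output_dir/wn18_cpu.ini")

    print(config.get("general", "device"))       # expected to be "CPU" for this example
    config.set("training", "num_epochs", "100")  # tweak a default after generation

    with open("output_dir/wn18_cpu.ini", "w") as f:
        config.write(f)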
\-\-<section>.<key>=<value> +++++++++++++++++++++++++++ ``--<section>.<key>=<value>`` is an **optional** argument for ``marius_preprocess``. -When ``--generate_config <device>`` is set, ``--<section>.<key>=<value>`` can be used +When ``--generate_template_config <device>`` is set, ``--<section>.<key>=<value>`` can be used to change the value of certain option in the Marius configuration file generated.
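Conceptually, each such flag is just a dotted section/key pair with a value. A minimal sketch (illustrative only, not the Marius implementation) of how a token like ``--training.num_epochs=100`` can be split into its parts::

    # Hypothetical helper: splits a "--<section>.<key>=<value>" token.
    def parse_override(token):
        body = token.lstrip("-")              # "training.num_epochs=100"
        dotted, value = body.split("=", 1)    # "training.num_epochs", "100"
        section, key = dotted.split(".", 1)   # "training", "num_epochs"
        return section, key, value

    print(parse_override("--training.num_epochs=100"))  # ('training', 'num_epochs', '100')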
For example, the following example will set ``model.embedding_size=256`` and ``training.num_epochs=100`` in the Marius configuration file generated for custom dataset composed of ``custom_dataset.csv`` (``custom_gpu.ini``). :: - marius_preprocess ./output_dir --files custom_dataset.csv --generate_config --model.embedding_sze=256 --training.num_epochs=100 + marius_preprocess ./output_dir --input_files custom_dataset.csv --generate_template_config --model.embedding_size=256 --training.num_epochs=100 \-\-format <format> +++++++++++++++++++ @@ -182,7 +192,7 @@ storing edges in the sequence of source node, relation and destination node. :: - marius_preprocess ./output_dir --files custom_dataset.csv --format src + marius_preprocess ./output_dir --input_files custom_dataset.csv --format srd \-\-delim <delim>, \-d <delim> +++++++++++++++++++++++++++++ @@ -191,9 +201,9 @@ storing edges in the sequence of source node, relation and destination node. If ``<delim>`` is not set, ``marius_preprocess`` will use Python Sniffer to detect a delimiter. The delimiter is printed to the terminal so users can verify it. -\-\-dtype <dtype> -+++++++++++++++++ -``--dtype <dtype>`` is an **optional** argument for ``marius_preprocess``. +\-\-remap_id_dtype <dtype> ++++++++++++++++++++++++++ +``--remap_id_dtype <dtype>`` is an **optional** argument for ``marius_preprocess``. It defines the format for storing each node remapped ID and relation remapped ID. The current supported format is ``int32`` and ``int64``. When storing in ``int32``, each remapped ID will be a 4-byte integer. @@ -207,6 +217,7 @@ The default ``<dtype>`` is set to ``int32``. \-\-not_remap_ids +++++++++++++++++ ``--not_remap_ids`` is an **optional** argument for ``marius_preprocess``. +During preprocessing, nodes and relations are all mapped to numerical IDs. If this option is set, the remapped IDs of nodes and relations will be the same as the read-in order of the nodes and relations from original dataset. @@ -224,7 +235,7 @@ validation, and test sets with a corresponding proportion of 0.99, 0.05, and 0.0 :: - marius_preprocess ./output_dir --files custom_dataset.csv --dataset_split 0.05 0.05 + marius_preprocess ./output_dir --input_files custom_dataset.csv --dataset_split 0.05 0.05 \-\-start_col +++++++++++++++++++++++++ @@ -258,7 +269,7 @@ The available options: :: usage: config_generator [-h] [--data_directory data_directory] [--dataset dataset | --stats num_nodes num_edge_types num_train num_valid num_test] - [--device [generate_config]] + [--device [device]] output_directory Generate configs @@ -276,7 +287,7 @@ The available options: --stats num_nodes num_edge_types num_train num_valid num_test, -s num_nodes num_edge_types num_train num_valid num_test Dataset statistics Enter in order of num_nodes, num_edge_types, num_train num_valid, num_test - --device [generate_config], -dev [generate_config] + --device [device], -dev [device] Generates configs for a single-GPU/multi-CPU/multi-GPU training configuration file by default. Valid options (default to GPU): [GPU, CPU, multi-GPU] diff --git a/docs/user_guide/preprocess.rst b/docs/user_guide/preprocess.rst index 7c0187f4..23ea6e58 100644 --- a/docs/user_guide/preprocess.rst +++ b/docs/user_guide/preprocess.rst @@ -160,6 +160,8 @@ The second approach can be done in the following steps: The names of the output files can be anything, as long as the path options are set in the configuration file.
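For a rough picture of what the ``--dataset_split`` option described above does to a single custom edge file, the following sketch (illustrative only; the file name and fractions are placeholders, and the real tool additionally remaps IDs and writes binary output) shuffles the edges and carves off validation and test fractions::

    import numpy as np

    # Hypothetical input: one edge per row, e.g. "src,rel,dst".
    edges = np.loadtxt("custom_dataset.csv", dtype=str, delimiter=",")
    np.random.shuffle(edges)

    n = len(edges)
    n_valid, n_test = int(0.05 * n), int(0.05 * n)   # --dataset_split 0.05 0.05
    valid = edges[:n_valid]
    test = edges[n_valid:n_valid + n_test]
    train = edges[n_valid + n_test:]                 # remaining edges are the training set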
+.. _built-in datasets: + Built-in datasets ---------------------------------------------------------- @@ -202,14 +204,15 @@ For example, preprocessing the wn18 dataset produces the following output :: user@ubuntu: marius_preprocess output_dir/ --dataset wn18 - Downloading fetch.phpmedia=en:wordnet-mlj12.tar.gz to output_dir/fetch.phpmedia=en:wordnet-mlj12.tar.gz + wn18 + Downloading fetch.phpmedia=en:wordnet-mlj12.tar.gz to download_dir/fetch.phpmedia=en:wordnet-mlj12.tar.gz Extracting Extraction completed Detected delimiter: ~ ~ - Reading in output_dir/wordnet-mlj12-train.txt 1/3 - Reading in output_dir/wordnet-mlj12-valid.txt 2/3 - Reading in output_dir/wordnet-mlj12-test.txt 3/3 - Number of instance per file: [141442, 5000, 5000] + Reading in download_dir/wordnet-mlj12-train.txt 1/3 + Reading in download_dir/wordnet-mlj12-valid.txt 2/3 + Reading in download_dir/wordnet-mlj12-test.txt 3/3 + Number of instance per file:[141442, 5000, 5000] Number of nodes: 40943 Number of edges: 151442 Number of relations: 18 @@ -218,13 +221,13 @@ For example, preprocessing the wn18 dataset produces the following output Generating configuration files ------------------------------ -The ``marius_preprocess`` tool can generate a training configuration file for the input dataset using the argument ``--generate_config <device>``, where the <device> is CPU for cpu-based processing, and GPU for gpu-based processing. +The ``marius_preprocess`` tool can generate a training configuration file for the input dataset using the argument ``--generate_template_config <device>``, where the <device> is CPU for cpu-based processing, and GPU for gpu-based processing. Specific configuration options can be set by passing ``--
.=`` to the command for each option. E.g. :: - marius_preprocess output_dir/ --dataset wn18 --generate_config CPU --model.embedding_size=256 --training.num_epochs=100 + marius_preprocess output_dir/ --dataset wn18 --generate_template_config CPU --model.embedding_size=256 --training.num_epochs=100 This will preprocess the wn18 dataset and will generate a configuration file with following options set: diff --git a/src/python/tools/config_generator.py b/src/python/tools/config_generator.py index 527a0761..d8dba020 100644 --- a/src/python/tools/config_generator.py +++ b/src/python/tools/config_generator.py @@ -143,7 +143,7 @@ def set_args(): nargs=5, help='Dataset statistics\n' + 'Enter in order of num_nodes, num_relations, num_train' + ' num_valid, num_test') - parser.add_argument('--device', '-dev', metavar='generate_config', + parser.add_argument('--device', '-dev', metavar='device', choices=["GPU", "CPU", "multi-GPU"], nargs='?', default='GPU', help=('Generates configs for a single-GPU/multi-CPU' + diff --git a/src/python/tools/csv_converter.py b/src/python/tools/csv_converter.py index 3a52e2d0..13ac347a 100644 --- a/src/python/tools/csv_converter.py +++ b/src/python/tools/csv_converter.py @@ -1,3 +1,9 @@ +"""Converter for CSV, TSV and TXT dataset files. + +This module contains the functions for converting CSV, TSV and TXT dataset +files into Marius input formats. +""" + import argparse import re from pathlib import Path @@ -9,7 +15,43 @@ def split_dataset(input_dataset, validation_fraction, test_fraction, entry_regex, num_line_skip, data_cols, - delim, dtype=np.int32): + delim): + """Splits dataset into training, validation and testing sets. + + Splits one input dataset file into training, validation and testing sets + according to the given fractions. During the splitting process, all edges + in the input dataset are randomly sampled into training set, validation + set and testing set according to validation_fraction and test_fraction. + Then only these edges are written to splitted_train_edges.txt, + splitted_valid_edges.txt and splitted_test_edges.txt files in the same + directory of the input dataset file. If either of validation_fraction or + test_fraction is set to zero, the corresponding file will not be created. + The following files are created by this function: + splitted_train_edges.txt: File containing training set edges. + splitted_valid_edges.txt: File containing validation set edges. + splitted_test_edges.txt: File containing testing set edges. + + Args: + input_dataset: The path to the original data file to be splitted. + validation_fraction: The proportion of the input dataset that will be + put into the validation set. + test_fraction: The proportion of the input dataset that will be put + into the testing set. + entry_regex: The regular expression of the representation of an edge in + the dataset. + num_line_skip: Number of lines to skip as the header of the dataset + file. + data_cols: A list of index indicating which columns in the dataset file + compose the edges. + delim: The delimiter between two columns in the dataset file. + + Returns: + The list of file path to splitted_train_edges.txt, + splitted_valid_edges.txt and splitted_test_edges.txt are returned. In + the meantime, the num_line_skip is set to 0 and data_cols is set to + the first two or three columns based on whether there is relation in + the dataset for the downstream preprocessing operations. 
+ """ train_fraction = 1 - validation_fraction - test_fraction assert(train_fraction > 0) @@ -84,6 +126,26 @@ def split_dataset(input_dataset, validation_fraction, test_fraction, def get_header_length(input_file, entry_regex): + """Detects the number of rows to skip as the file header. + + This function counts the number of rows do not contain the substring that + matches the edge regular expression from the start of the file to detects + the number of rows to skip as the file header. + + Args: + input_file: The object file for detecting number of header rows. + entry_regex: The regular expression of the representation of an edge in + the dataset. + + Returns: + The number of rows to skip as the file header. + + Raises: + RuntimeError: An error occurred when the process of detecting file + header length fails. A common failure case is that the file header + also contains the regular expression for edges. In this case, + number of rows to skip as file header should be manually set. + """ num_line_skip = 0 with open(input_file, 'r') as f: n = 0 @@ -99,15 +161,33 @@ def get_header_length(input_file, entry_regex): raise RuntimeError("Please give number of rows to skip " + "at file header.") - if a == n: - raise RuntimeWarning("No nodes detected, dataset format may " + - "be incorrect.") - return num_line_skip def check_given_num_line_skip_start_col(input_file, num_line_skip, data_cols, delim, start_col): + """Check if the given combination of num_line_skip and start_col is valid. + + This function splits the first row after the file header with the given + delimiter and check if start_col index is within the valid range (less than + the number of tokens splitted). + + Args: + input_file: A dataset file used to check the validity of the given + combination of num_line_skip and start_col. + num_line_skip: Number of lines to skip as the header of the dataset + file. + data_cols: A list of index indicating which columns in the dataset file + compose the edges. + delim: The delimiter between two columns in the dataset file. + start_col: The index of the first column of the edge representations in + the dataset file. + + Returns: + True if the given combination of num_line_skip and start_col is valid. + False if the given combination of num_line_skip and start_col is not + valid. + """ with open(input_file, 'r') as f: for i in range(num_line_skip): line = next(f) @@ -121,6 +201,23 @@ def check_given_num_line_skip_start_col(input_file, num_line_skip, data_cols, def partition_edges(edges, num_partitions, num_nodes): + """Split the nodes into num_partitions partitions. + + In the case of large scale graphs that have an embedding table which + exceeds CPU memory capacity, this function can partition the graph nodes + uniformly into num_partitions partitions and group the edges into edge + buckets. This partitioning method assumes that all edges fit in memory. + See partition_scheme for more details. + + Args: + edges: All edges of original dataset. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + num_nodes: The total number of nodes. + + Returns: + Reordered edges and a list of offsets indicating node partitions. 
+ """ partition_size = int(np.ceil(num_nodes / num_partitions)) src_partitions = edges[:, 0] // partition_size dst_partitions = edges[:, 2] // partition_size @@ -135,6 +232,30 @@ def partition_edges(edges, num_partitions, num_nodes): def join_files(files, regex, num_line_skip, data_cols, delim): + """Joins multiple dataset files into one dataset file + + Joins edges from multiple dataset files into one dataset file. This + function should only be called when there are more than one file. During + the process of joining, only edges from each dataset file is extracted and + then written to joined_file.txt. + The following files are created by this function: + joined_file.txt: The file contains all edges from the current dataset. + + Args: + files: A list of dataset files to be joined. + regex: The regular expression of the representation of an edge in the + dataset. + num_line_skip: Number of lines to skip as the header of the dataset + file. + data_cols: A list of index indicating which columns in the dataset file + compose the edges. + delim: The delimiter between two columns in the dataset file. + + Returns: + The joint file is returned as a list of one file. Meaning while, + num_line_skip is set to zero and data_cols is set to the first two or + three columns depends on if the edges in the dataset has relations. + """ assert(len(files) > 1) base_path = "/".join(files[0].split("/")[:-1]) joined_file = base_path + "/joined_file.txt" @@ -160,6 +281,74 @@ def join_files(files, regex, num_line_skip, data_cols, delim): def general_parser(files, format, output_dir, delim="", num_partitions=1, dtype=np.int32, remap_ids=True, dataset_split=(-1, -1), start_col=0, num_line_skip=None): + """Parses dataset in the format of CSV, TSV and TXT to marius input format. + + This function retrieves all edges from given dataset file. Each node and + edge_type is randomly assigned an integer id. The mappings from these + integer ids to the original ids are stored in node_mapping.txt and + rel_mapping.txt. + The original edges list is converted to an [|E|, 3] int32 tensor, shuffled and + then the contents of the tensor are written to the train_edges.pt file + and/or valid_edges.pt and test_edges.pt depend on dataset_split. + The following files are created by this function: + train_edges.pt: Dump of tensor memory for edges in the training set. + valid_edges.pt: Dump of tensor memroy for edges in the validation set. + test_edges.pt: Dump of tensor memroy for edges in the testing set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + If num_partitions is set to a value greater than one, then the + following file is also created: + train_edges_partitions.txt: text file with num_partitions^2 lines, + where each line denotes the size of an edge bucket + + Args: + files: The list of original dataset files. If there are three files, + they are treated as training, validation and testing set based on + their order by default (if dataset_split is not set). + format: A string denotes the order of edge components. The value of + this string can only be "s" for source nodes, "r" for relation, + "d" for destination nodes. The length of this string can be two or + three depends on if the edges have relations. + output_dir: The directory where all the files created will be stored. + delim: The delimiter between two columns in the dataset file. 
+ num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + dtype: The data type of the edge list. The common values for this + argument is np.int32 or np.int64. If there are less then 2 billion + nodes (which is almost every dataset), int32 should be used. If the + value is set to np.int32, then each edge takes 3*4/2*4 bytes of + space to store. In the case of np.int64, each edge takes 3*8/2*8 + bytes of space to store. + remap_ids: Whether to assign node and relations random ids or + sequential ids based on their order in original dataset file. + dataset_split: The proportion of the input data that will be used for + validation and testing during training. The argument takes a tuple + of length two where the first value is the proportion of validation + set and the second value is the proportion of testing set. + start_col: The index of the first column of the edge representations in + the dataset file. + num_line_skip: Number of lines to skip as the header of the dataset + file. + + Returns: + The created files described above will be stored into the output_dir + directory. Statistics of the preprocessed dataset are put into a list + and returned. These statistics are placed in the following order: + number of edges in train_edges.pt, number of edges in valid_edges.pt, + number of edges in test_edgs.pt, number of relations, and number of + nodes. These statistics are also printed to the terminal. + + Raises: + RuntimeError: An error occurred when the denotation of source node "s" + or destination node "d" is not found in the value of argument + format. + This error also occurred if the delimiter given or the delimiter + detected is not correct. In this case, a new delimiter should be + assigned manually. + Detailed helper messages indicating the possible causes are printed + when this error is raised. + """ assert(len(files) != 0), "Number of data files cannot be 0" assert(len(format) == 1), "Format is specified incorrectly" assert((start_col == 0) or @@ -175,6 +364,8 @@ def general_parser(files, format, output_dir, delim="", num_partitions=1, (len(format[0]) == 2 and dst_idx != -1 and src_idx != -1)), "Format is specified incorrectly" + if not Path(output_dir).exists(): + Path(output_dir).mkdir(parents=True) assert(Path(output_dir).exists()), "Output directory not found" output_dir = output_dir.strip("/") output_dir = output_dir + "/" @@ -382,23 +573,31 @@ def general_parser(files, format, output_dir, delim="", num_partitions=1, def set_args(): + """Sets command line arguments for this csv_converter modules. + + Returns: + A dict containing all command line arguments and their values. + """ parser = argparse.ArgumentParser( description='csv converter', prog='csv_converter', formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('files', metavar='files', nargs='+', type=str, - help='Data files') + parser.add_argument('input_files', metavar='input_files', nargs='+', type=str, + help='Input files of custom dataset') parser.add_argument('format', metavar='format', nargs=1, type=str, help='Format of data, eg. srd') parser.add_argument('output_dir', metavar='output_dir', type=str, help='Output directory for preprocessed data') parser.add_argument('--delim', '-d', metavar='delim', type=str, default="", - help='Specifies the delimiter') + help='Specifies the delimiter between source, ' + + 'destination(and relation) in input data ' + + 'files, eg. 
srd') parser.add_argument('--num_partitions', '-np', metavar='num_partitions', type=int, default=1, help='number of partitions') - parser.add_argument('--dtype', metavar='dtype', type=np.dtype, + parser.add_argument('--remap_id_dtype', metavar='remap_id_dtype', type=np.dtype, default=np.int32, - help='Indicates the numpy.dtype') + help='Indicates the data format to store the ' + + 'remapped IDs.') parser.add_argument('--not_remap_ids', action='store_false', help='If set, will not remap ids') parser.add_argument('--dataset_split', '-ds', metavar='dataset_split', @@ -406,11 +605,12 @@ def set_args(): help='Split dataset into specified fractions') parser.add_argument('--start_col', '-sc', metavar='start_col', type=int, default=0, - help='Indicates the column index to start from') + help='Indicates the column index to start parsing ' + + 'source/destination nodes( or relation).') parser.add_argument('--num_line_skip', '-nls', metavar='num_line_skip', type=int, default=None, help='Indicates number of lines to ' + - 'skip from the beginning') + 'skip from the beginning of the file.') args = parser.parse_args() arg_dict = vars(args) @@ -421,9 +621,9 @@ def set_args(): def main(): arg_dict = set_args() - general_parser(arg_dict.get("files"), arg_dict.get("format"), + general_parser(arg_dict.get("input_files"), arg_dict.get("format"), arg_dict.get("output_dir"), arg_dict.get("delim"), - arg_dict.get("num_partitions"), arg_dict.get("dtype"), + arg_dict.get("num_partitions"),arg_dict.get("remap_id_dtype"), arg_dict.get("not_remap_ids"), arg_dict.get("dataset_split"), arg_dict.get("start_col"), arg_dict.get("num_line_skip")) diff --git a/src/python/tools/preprocess.py b/src/python/tools/preprocess.py index 049699db..a0e7bba0 100644 --- a/src/python/tools/preprocess.py +++ b/src/python/tools/preprocess.py @@ -1,3 +1,9 @@ +"""Preprocess module of Marius. + +This module contains the functions for preprocessing both custom datasets and +supported datasets. +""" + import argparse import gzip import re @@ -22,172 +28,528 @@ from marius.tools.csv_converter import general_parser -def live_journal(output_dir, num_partitions=1, split=(.05, .05)): +def live_journal(download_dir, output_dir, num_partitions=1, + split=(.05, .05)): + """Preprocesses the dataset live_journal. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt, valid_edges.pt and test_edges.pt files. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + valid_edges.pt: Dump of tensor memroy for edges in the validation set. + test_edges.pt: Dump of tensor memroy for edges in the testing set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + split: The proportion of the input data that will be used for + validation and testing during training. 
The argument takes a tuple + of length two where the first value is the proportion of validation + set and the second value is the proportion of testing set. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ LIVE_JOURNAL_URL = "https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz" - download_path = download_file(LIVE_JOURNAL_URL, output_dir) + download_path = download_file(LIVE_JOURNAL_URL, download_dir) extract_file(download_path) - return general_parser([str(Path(output_dir) / + return general_parser([str(Path(download_dir) / Path("soc-LiveJournal1.txt"))], ["sd"], output_dir, num_partitions=num_partitions, dataset_split=split) -def fb15k(output_dir, num_partitions=1): +def fb15k(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset fb15k. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ FB15K_URL = "https://dl.fbaipublicfiles.com/starspace/fb15k.tgz" - download_path = download_file(FB15K_URL, output_dir) + download_path = download_file(FB15K_URL, download_dir) extract_file(download_path) - for file in (output_dir / Path("FB15k")).iterdir(): - file.rename(output_dir / Path(file.name)) - (output_dir / Path("FB15k")).rmdir() + for file in (download_dir / Path("FB15k")).iterdir(): + file.rename(download_dir / Path(file.name)) + (download_dir / Path("FB15k")).rmdir() return general_parser( - [str(Path(output_dir) / + [str(Path(download_dir) / Path("freebase_mtr100_mte100-train.txt")), - str(Path(output_dir) / Path("freebase_mtr100_mte100-valid.txt")), - str(Path(output_dir) / Path("freebase_mtr100_mte100-test.txt"))], + str(Path(download_dir) / Path("freebase_mtr100_mte100-valid.txt")), + str(Path(download_dir) / Path("freebase_mtr100_mte100-test.txt"))], ["srd"], output_dir, num_partitions=num_partitions) -def twitter(output_dir, num_partitions=1, split=(.05, .05)): +def twitter(download_dir, output_dir, num_partitions=1, split=(.05, .05)): + """Preprocesses the dataset twitter. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. 
+ The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt, valid_edges.pt and test_edges.pt files. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + valid_edges.pt: Dump of tensor memroy for edges in the validation set. + test_edges.pt: Dump of tensor memroy for edges in the testing set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + split: The proportion of the input data that will be used for + validation and testing during training. The argument takes a tuple + of length two where the first value is the proportion of validation + set and the second value is the proportion of testing set. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ TWITTER_URL = "https://snap.stanford.edu/data/twitter-2010.txt.gz" - download_path = download_file(TWITTER_URL, output_dir) + download_path = download_file(TWITTER_URL, download_dir) extract_file(download_path) - return general_parser([str(Path(output_dir) / Path("twitter-2010.txt"))], + return general_parser([str(Path(download_dir) / Path("twitter-2010.txt"))], ["srd"], output_dir, num_partitions=num_partitions, dataset_split=split, num_line_skip=1) -def freebase86m(output_dir, num_partitions=1): +def freebase86m(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset freebase86m. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. 
+ """ FREEBASE86M_URL = "https://data.dgl.ai/dataset/Freebase.zip" - download_path = download_file(FREEBASE86M_URL, output_dir) + download_path = download_file(FREEBASE86M_URL, download_dir) extract_file(download_path) - for file in (output_dir / Path("Freebase")).iterdir(): - file.rename(output_dir / Path(file.name)) - (output_dir / Path("Freebase")).rmdir() + for file in (download_dir / Path("Freebase")).iterdir(): + file.rename(download_dir / Path(file.name)) + (download_dir / Path("Freebase")).rmdir() return general_parser( - [str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], + [str(Path(download_dir) / Path("train.txt")), + str(Path(download_dir) / Path("valid.txt")), + str(Path(download_dir) / Path("test.txt"))], ["sdr"], output_dir, num_partitions=num_partitions) -def wn18(output_dir, num_partitions=1): +def wn18(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset wn18. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ WN18_URL = "https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:wordnet-mlj12.tar.gz" - download_path = download_file(WN18_URL, output_dir) + download_path = download_file(WN18_URL, download_dir) extract_file(download_path) - for file in (output_dir / Path("wordnet-mlj12")).iterdir(): - file.rename(output_dir / Path(file.name)) - (output_dir / Path("wordnet-mlj12")).rmdir() + for file in (download_dir / Path("wordnet-mlj12")).iterdir(): + file.rename(download_dir / Path(file.name)) + (download_dir / Path("wordnet-mlj12")).rmdir() return general_parser( - [str(Path(output_dir) / Path("wordnet-mlj12-train.txt")), - str(Path(output_dir) / Path("wordnet-mlj12-valid.txt")), - str(Path(output_dir) / Path("wordnet-mlj12-test.txt"))], ["srd"], - output_dir, num_partitions=num_partitions) + [str(Path(download_dir) / Path("wordnet-mlj12-train.txt")), + str(Path(download_dir) / Path("wordnet-mlj12-valid.txt")), + str(Path(download_dir) / Path("wordnet-mlj12-test.txt"))], + ["srd"], output_dir, num_partitions=num_partitions) -def fb15k_237(output_dir, num_partitions=1): +def fb15k_237(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset fb15k_237. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. 
+ The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ FB15K_237 = "https://data.deepai.org/FB15K-237.2.zip" - download_path = download_file(FB15K_237, output_dir) + download_path = download_file(FB15K_237, download_dir) extract_file(download_path) - for file in (output_dir / Path("Release")).iterdir(): - file.rename(output_dir / Path(file.name)) - (output_dir / Path("Release")).rmdir() + for file in (download_dir / Path("Release")).iterdir(): + file.rename(download_dir / Path(file.name)) + (download_dir / Path("Release")).rmdir() return general_parser( - [str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], + [str(Path(download_dir) / Path("train.txt")), + str(Path(download_dir) / Path("valid.txt")), + str(Path(download_dir) / Path("test.txt"))], ["srd"], output_dir, num_partitions=num_partitions) -def wn18rr(output_dir, num_partitions=1): +def wn18rr(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset wn18rr. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. 
+ """ WN18RR_URL = "https://data.dgl.ai/dataset/wn18rr.zip" - download_path = download_file(WN18RR_URL, output_dir) + download_path = download_file(WN18RR_URL, download_dir) extract_file(download_path) - for file in (output_dir / Path("wn18rr")).iterdir(): - file.rename(output_dir / Path(file.name)) - (output_dir / Path("wn18rr")).rmdir() + for file in (download_dir / Path("wn18rr")).iterdir(): + file.rename(download_dir / Path(file.name)) + (download_dir / Path("wn18rr")).rmdir() return general_parser( - [str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], + [str(Path(download_dir) / Path("train.txt")), + str(Path(download_dir) / Path("valid.txt")), + str(Path(download_dir) / Path("test.txt"))], ["srd"], output_dir, num_partitions=num_partitions) -def codex_s(output_dir, num_partitions=1): +def codex_s(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset codex_s. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ CODEX_S_TRAIN_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-s/train.txt" CODEX_S_VALID_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-s/valid.txt" CODEX_S_TEST_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-s/test.txt" - download_path = download_file(CODEX_S_TRAIN_URL, output_dir) - download_path = download_file(CODEX_S_VALID_URL, output_dir) - download_path = download_file(CODEX_S_TEST_URL, output_dir) + download_path = download_file(CODEX_S_TRAIN_URL, download_dir) + download_path = download_file(CODEX_S_VALID_URL, download_dir) + download_path = download_file(CODEX_S_TEST_URL, download_dir) - return general_parser([str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], + return general_parser([str(Path(download_dir) / Path("train.txt")), + str(Path(download_dir) / Path("valid.txt")), + str(Path(download_dir) / Path("test.txt"))], ["srd"], output_dir, num_partitions=num_partitions) -def codex_m(output_dir, num_partitions=1): +def codex_m(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset codex_m. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. 
+ The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ CODEX_M_TRAIN_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-m/train.txt" CODEX_M_VALID_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-m/valid.txt" CODEX_M_TEST_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-m/test.txt" - download_path = download_file(CODEX_M_TRAIN_URL, output_dir) - download_path = download_file(CODEX_M_VALID_URL, output_dir) - download_path = download_file(CODEX_M_TEST_URL, output_dir) + download_path = download_file(CODEX_M_TRAIN_URL, download_dir) + download_path = download_file(CODEX_M_VALID_URL, download_dir) + download_path = download_file(CODEX_M_TEST_URL, download_dir) - return general_parser([str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], + return general_parser([str(Path(download_dir) / Path("train.txt")), + str(Path(download_dir) / Path("valid.txt")), + str(Path(download_dir) / Path("test.txt"))], ["srd"], output_dir, num_partitions=num_partitions) -def codex_l(output_dir, num_partitions=1): +def codex_l(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset codex_l. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. 
+ """ CODEX_L_TRAIN_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/train.txt" CODEX_L_VALID_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/valid.txt" CODEX_L_TEST_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/test.txt" - download_path = download_file(CODEX_L_TRAIN_URL, output_dir) - download_path = download_file(CODEX_L_VALID_URL, output_dir) - download_path = download_file(CODEX_L_TEST_URL, output_dir) + download_path = download_file(CODEX_L_TRAIN_URL, download_dir) + download_path = download_file(CODEX_L_VALID_URL, download_dir) + download_path = download_file(CODEX_L_TEST_URL, download_dir) - return general_parser([str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], + return general_parser([str(Path(download_dir) / Path("train.txt")), + str(Path(download_dir) / Path("valid.txt")), + str(Path(download_dir) / Path("test.txt"))], ["srd"], output_dir, num_partitions=num_partitions) -def drkg(output_dir, num_partitions=1, split=(.05, .05)): +def drkg(download_dir, output_dir, num_partitions=1, split=(.05, .05)): + """Preprocesses the dataset drkg. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt, valid_edges.pt and test_edges.pt files. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + valid_edges.pt: Dump of tensor memroy for edges in the validation set. + test_edges.pt: Dump of tensor memroy for edges in the testing set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + split: The proportion of the input data that will be used for + validation and testing during training. The argument takes a tuple + of length two where the first value is the proportion of validation + set and the second value is the proportion of testing set. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ DRKG_URL = "https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz" - download_path = download_file(DRKG_URL, output_dir) + download_path = download_file(DRKG_URL, download_dir) extract_file(download_path) - return general_parser([str(Path(output_dir) / + return general_parser([str(Path(download_dir) / Path("drkg.tsv"))], ["srd"], output_dir, num_partitions=num_partitions, dataset_split=split) -def hetionet(output_dir, num_partitions=1, split=(.05, .05)): +def hetionet(download_dir, output_dir, num_partitions=1, split=(.05, .05)): + """Preprocesses the dataset hetionet. 
+ + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt, valid_edges.pt and test_edges.pt files. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + valid_edges.pt: Dump of tensor memroy for edges in the validation set. + test_edges.pt: Dump of tensor memroy for edges in the testing set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + split: The proportion of the input data that will be used for + validation and testing during training. The argument takes a tuple + of length two where the first value is the proportion of validation + set and the second value is the proportion of testing set. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ HETIONET_URL = "https://github.com/hetio/hetionet/raw/master/hetnet/tsv/hetionet-v1.0-edges.sif.gz" - download_path = download_file(HETIONET_URL, output_dir) + download_path = download_file(HETIONET_URL, download_dir) extract_file(download_path) - return general_parser([str(Path(output_dir) / + return general_parser([str(Path(download_dir) / Path("hetionet-v1.0-edges.sif"))], ["srd"], output_dir, num_partitions=num_partitions, dataset_split=split) -def kinships(output_dir, num_partitions=1, split=(.05, .05)): +def kinships(download_dir, output_dir, num_partitions=1, split=(.05, .05)): + """Preprocesses the dataset kinships. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt, valid_edges.pt and test_edges.pt files. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + valid_edges.pt: Dump of tensor memroy for edges in the validation set. + test_edges.pt: Dump of tensor memroy for edges in the testing set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + split: The proportion of the input data that will be used for + validation and testing during training. 
The argument takes a tuple + of length two where the first value is the proportion of validation + set and the second value is the proportion of testing set. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ KINSHIPS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/kinship/kinship.data" - download_path = download_file(KINSHIPS_URL, output_dir) + download_path = download_file(KINSHIPS_URL, download_dir) edges = [] pattern = re.compile("^(?P[a-z]+)" + r"\((?P[A-Za-z]+).{2}(?P[A-Za-z]+)\)\n$") @@ -203,133 +565,379 @@ def kinships(output_dir, num_partitions=1, split=(.05, .05)): node_2 = m.group("n2") edges.append([node_1, rel, node_2]) - if (Path(output_dir) / Path("sample_edges.txt")).exists(): - (Path(output_dir) / Path("sample_edges.txt")).unlink() + if (Path(download_dir) / Path("sample_edges.txt")).exists(): + (Path(download_dir) / Path("sample_edges.txt")).unlink() np.random.shuffle(edges) - np.savetxt((Path(output_dir) / Path("sample_edges.txt")), edges, fmt="%s", + np.savetxt((Path(download_dir) / Path("sample_edges.txt")), edges, fmt="%s", delimiter="\t", newline="\n") - return general_parser([str(Path(output_dir) / Path("sample_edges.txt"))], + return general_parser([str(Path(download_dir) / Path("sample_edges.txt"))], ["srd"], output_dir, dataset_split=split) -def openbiolink_hq(output_dir, num_partitions=1): +def openbiolink_hq(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset openbiolink_hq. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. 
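The kinships parser above extracts [node_1, rel, node_2] triples from lines shaped like rel(Name, Name) before writing sample_edges.txt. A toy version of that matching step follows; the group names mirror the m.group(...) calls in the parser, but the full pattern written here is a reconstruction and should be treated as an assumption::

    import re

    pattern = re.compile(r"^(?P<rel>[a-z]+)\((?P<n1>[A-Za-z]+).{2}(?P<n2>[A-Za-z]+)\)\n$")
    m = pattern.match("father(Arthur, Christopher)\n")
    if m:
        # Reorder the captured groups into the (source, relation, destination) layout.
        print([m.group("n1"), m.group("rel"), m.group("n2")])
        # ['Arthur', 'father', 'Christopher']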
+ """ OPENBIOLINK_HQ_URL = "https://zenodo.org/record/3834052/files/HQ_DIR.zip?download=1" - download_path = download_file(OPENBIOLINK_HQ_URL, output_dir) + download_path = download_file(OPENBIOLINK_HQ_URL, download_dir) extract_file(download_path) return general_parser( - [str(Path(output_dir) / + [str(Path(download_dir) / Path("HQ_DIR/train_test_data/train_sample.csv")), - str(Path(output_dir) / + str(Path(download_dir) / Path("HQ_DIR/train_test_data/val_sample.csv")), - str(Path(output_dir) / + str(Path(download_dir) / Path("HQ_DIR/train_test_data/test_sample.csv"))], ["srd"], output_dir, num_partitions=num_partitions, num_line_skip=0) -def openbiolink_lq(output_dir, num_partitions=1): +def openbiolink_lq(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset openbiolink_lq. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ OPENBIOLINK_LQ_URL = "https://samwald.info/res/OpenBioLink_2020_final/ALL_DIR.zip" - download_path = download_file(OPENBIOLINK_LQ_URL, output_dir) + download_path = download_file(OPENBIOLINK_LQ_URL, download_dir) extract_file(download_path) return general_parser( - [str(Path(output_dir) / + [str(Path(download_dir) / Path("ALL_DIR/train_test_data/train_sample.csv")), - str(Path(output_dir) / + str(Path(download_dir) / Path("ALL_DIR/train_test_data/val_sample.csv")), - str(Path(output_dir) / + str(Path(download_dir) / Path("ALL_DIR/train_test_data/test_sample.csv"))], ["srd"], output_dir, num_partitions=num_partitions, num_line_skip=0) -def ogbl_biokg(output_dir, num_partitions=1): +def ogbl_biokg(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset ogbl_biokg. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. 
+ output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ OGBL_BIOKG_URL = "https://snap.stanford.edu/ogb/data/linkproppred/biokg.zip" - download_path = download_file(OGBL_BIOKG_URL, output_dir) + download_path = download_file(OGBL_BIOKG_URL, download_dir) extract_file(download_path) - files = [str(Path(output_dir) / Path("biokg/split/random/train.pt")), - str(Path(output_dir) / Path("biokg/split/random/valid.pt")), - str(Path(output_dir) / Path("biokg/split/random/test.pt"))] - - return parse_ogbl(files, True, output_dir, num_partitions=num_partitions) - - -def ogbl_ppa(output_dir, num_partitions=1): + files = [str(Path(download_dir) / Path("biokg/split/random/train.pt")), + str(Path(download_dir) / Path("biokg/split/random/valid.pt")), + str(Path(download_dir) / Path("biokg/split/random/test.pt"))] + + return parse_ogbl(files, True, download_dir, output_dir, + num_partitions=num_partitions) + + +def ogbl_ppa(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset ogbl_ppa. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ OGBL_PPA_URL = "https://snap.stanford.edu/ogb/data/linkproppred/ppassoc.zip" - download_path = download_file(OGBL_PPA_URL, output_dir) + download_path = download_file(OGBL_PPA_URL, download_dir) extract_file(download_path) - files = [str(Path(output_dir) / Path("ppassoc/split/throughput/train.pt")), - str(Path(output_dir) / Path("ppassoc/split/throughput/valid.pt")), - str(Path(output_dir) / Path("ppassoc/split/throughput/test.pt"))] - - return parse_ogbl(files, False, output_dir, num_partitions=num_partitions) - - -def ogbl_ddi(output_dir, num_partitions=1): + files = [str(Path(download_dir) / Path("ppassoc/split/throughput/train.pt")), + str(Path(download_dir) / Path("ppassoc/split/throughput/valid.pt")), + str(Path(download_dir) / Path("ppassoc/split/throughput/test.pt"))] + + return parse_ogbl(files, False, download_dir, output_dir, + num_partitions=num_partitions) + + +def ogbl_ddi(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset ogbl_ddi. 
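With the new signatures, every dataset helper takes the download location and the preprocessing output location separately. Called from Python, in the same way the updated tests call wn18 and fb15k, that looks roughly like the sketch below (the marius.tools.preprocess import path is taken from the tests and assumed to apply here)::

    from marius.tools.preprocess import ogbl_biokg

    # Raw archives land in download_dir/, preprocessed tensors and mapping
    # files in output_dir/; the return value is the dataset statistics.
    stats = ogbl_biokg("download_dir/", "output_dir/", num_partitions=1)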
+ + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ OGBL_DDI_URL = "https://snap.stanford.edu/ogb/data/linkproppred/ddi.zip" - download_path = download_file(OGBL_DDI_URL, output_dir) + download_path = download_file(OGBL_DDI_URL, download_dir) extract_file(download_path) - files = [str(Path(output_dir) / Path("ddi/split/target/train.pt")), - str(Path(output_dir) / Path("ddi/split/target/valid.pt")), - str(Path(output_dir) / Path("ddi/split/target/test.pt"))] - - return parse_ogbl(files, False, output_dir, num_partitions=num_partitions) - - -def ogbl_collab(output_dir, num_partitions=1): + files = [str(Path(download_dir) / Path("ddi/split/target/train.pt")), + str(Path(download_dir) / Path("ddi/split/target/valid.pt")), + str(Path(download_dir) / Path("ddi/split/target/test.pt"))] + + return parse_ogbl(files, False, download_dir, output_dir, + num_partitions=num_partitions) + + +def ogbl_collab(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset ogbl_collab. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. 
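The same separation is exposed on the command line through the new --download_directory option, which defaults to "download_dir" in set_args further down. A hedged sketch of a supported-dataset run, driven the way the test suite invokes the script::

    import subprocess

    # Preprocess a supported dataset, keep downloads and outputs apart, and
    # also emit a CPU training template config.
    subprocess.run(["python3", "./src/python/tools/preprocess.py",
                    "./output_dir",
                    "--dataset", "wn18",
                    "--download_directory", "./download_dir",
                    "-gtc", "CPU"])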
+ """ OGBL_COLLAB_URL = "https://snap.stanford.edu/ogb/data/linkproppred/collab.zip" - download_path = download_file(OGBL_COLLAB_URL, output_dir) + download_path = download_file(OGBL_COLLAB_URL, download_dir) extract_file(download_path) - files = [str(Path(output_dir) / Path("collab/split/time/train.pt")), - str(Path(output_dir) / Path("collab/split/time/valid.pt")), - str(Path(output_dir) / Path("collab/split/time/test.pt"))] - - return parse_ogbl(files, False, output_dir, num_partitions=num_partitions) - - -def ogbn_arxiv(output_dir, num_partitions=1): + files = [str(Path(download_dir) / Path("collab/split/time/train.pt")), + str(Path(download_dir) / Path("collab/split/time/valid.pt")), + str(Path(download_dir) / Path("collab/split/time/test.pt"))] + + return parse_ogbl(files, False, download_dir, output_dir, + num_partitions=num_partitions) + + +def ogbn_arxiv(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset ogbn_arxiv. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ OGBN_ARXIV_URL = "http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip" - download_path = download_file(OGBN_ARXIV_URL, output_dir) + download_path = download_file(OGBN_ARXIV_URL, download_dir) extract_file(download_path) - files = [str(Path(output_dir) / Path("arxiv/split/time/train.csv.gz")), - str(Path(output_dir) / Path("arxiv/split/time/valid.csv.gz")), - str(Path(output_dir) / Path("arxiv/split/time/test.csv.gz")), - str(Path(output_dir) / Path("arxiv/raw/edge.csv.gz"))] - - return parse_ogbn(files, output_dir, num_partitions=num_partitions) - - -def ogbn_proteins(output_dir, num_partitions=1): + files = [str(Path(download_dir) / Path("arxiv/split/time/train.csv.gz")), + str(Path(download_dir) / Path("arxiv/split/time/valid.csv.gz")), + str(Path(download_dir) / Path("arxiv/split/time/test.csv.gz")), + str(Path(download_dir) / Path("arxiv/raw/edge.csv.gz"))] + + return parse_ogbn(files, download_dir, output_dir, + num_partitions=num_partitions) + + +def ogbn_proteins(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset ogbn_proteins. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. 
+ The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. + """ OGBN_PROTEINS_URL = "http://snap.stanford.edu/ogb/data/nodeproppred/proteins.zip" - download_path = download_file(OGBN_PROTEINS_URL, output_dir) + download_path = download_file(OGBN_PROTEINS_URL, download_dir) extract_file(download_path) - files = [str(Path(output_dir) / + files = [str(Path(download_dir) / Path("proteins/split/species/train.csv.gz")), - str(Path(output_dir) / + str(Path(download_dir) / Path("proteins/split/species/valid.csv.gz")), - str(Path(output_dir) / + str(Path(download_dir) / Path("proteins/split/species/test.csv.gz")), - str(Path(output_dir) / Path("proteins/raw/edge.csv.gz"))] - - return parse_ogbn(files, output_dir, num_partitions=num_partitions) - - -def ogbn_products(output_dir, num_partitions=1): + str(Path(download_dir) / Path("proteins/raw/edge.csv.gz"))] + + return parse_ogbn(files, download_dir, output_dir, + num_partitions=num_partitions) + + +def ogbn_products(download_dir, output_dir, num_partitions=1): + """Preprocesses the dataset ogbn_products. + + During preprocessing, Marius has randomly assigned integer ids to each node + and edge_type, where the mappings to the original ids are stored in + node_mapping.txt and rel_mapping.txt. + The edge list in original dataset files is then converted to an [|E|, 3] + int32 tensor, shuffled and then the contents of the tensor are written to + the train_edges.pt file. + After the preprocess, the following files will be created in the designated + directory: + train_edges.pt: Dump of tensor memory for edges in the training set. + node_mapping.txt: Mapping of original node ids to unique int32 ids. + rel_mapping.txt: Mapping of original edge_type ids to unique int32 ids. + + Args: + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. In the mean time, the original + dataset files are downloaded to download_dir and the preprocessed data + files described above are created and stored in output_dir. 
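node_mapping.txt and rel_mapping.txt are what downstream tooling (for example the predict and postprocess tests changed below) uses to translate between original identifiers and the remapped integers. A minimal lookup sketch, assuming a two-column tab-separated layout, which the docstrings imply but this patch does not pin down::

    # Build an original-id -> remapped-id lookup from node_mapping.txt.
    node_map = {}
    with open("output_dir/node_mapping.txt") as f:
        for line in f:
            original_id, remapped_id = line.rstrip("\n").split("\t")
            node_map[original_id] = int(remapped_id)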
+ """ OGBN_PRODUCTS_URL = "http://snap.stanford.edu/ogb/data/nodeproppred/products.zip" - download_path = download_file(OGBN_PRODUCTS_URL, output_dir) + download_path = download_file(OGBN_PRODUCTS_URL, download_dir) extract_file(download_path) - files = [str(Path(output_dir) / + files = [str(Path(download_dir) / Path("products/split/sales_ranking/train.csv.gz")), - str(Path(output_dir) / + str(Path(download_dir) / Path("products/split/sales_ranking/valid.csv.gz")), - str(Path(output_dir) / + str(Path(download_dir) / Path("products/split/sales_ranking/test.csv.gz")), - str(Path(output_dir) / Path("products/raw/edge.csv.gz"))] + str(Path(download_dir) / Path("products/raw/edge.csv.gz"))] + + return parse_ogbn(files, download_dir, output_dir, + num_partitions=num_partitions) + - return parse_ogbn(files, output_dir, num_partitions=num_partitions) +def parse_ogbn(files, download_dir, output_dir, num_partitions=1): + """Parse ogbn datasets. + Retrieves the graph data from downloaded ogbn dataset files. -def parse_ogbn(files, output_dir, num_partitions=1): + Args: + files: The original ogbn dataset files. + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. + """ splits = [] for file in files[0:-1]: nodes = pd.read_csv(file, compression='gzip', header=None) @@ -341,23 +949,38 @@ def parse_ogbn(files, output_dir, num_partitions=1): valid_edges = edges.loc[np.in1d(edges[0], splits[1])] test_edges = edges.loc[np.in1d(edges[0], splits[2])] - train_edges.to_csv(str(Path(output_dir) / + train_edges.to_csv(str(Path(download_dir) / Path("train.txt")), sep="\t", header=False, index=False) - valid_edges.to_csv(str(Path(output_dir) / + valid_edges.to_csv(str(Path(download_dir) / Path("valid.txt")), sep="\t", header=False, index=False) - test_edges.to_csv(str(Path(output_dir) / + test_edges.to_csv(str(Path(download_dir) / Path("test.txt")), sep="\t", header=False, index=False) stats = general_parser( - [str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], + [str(Path(download_dir) / Path("train.txt")), + str(Path(download_dir) / Path("valid.txt")), + str(Path(download_dir) / Path("test.txt"))], ["sd"], output_dir, num_partitions=num_partitions) return stats -def parse_ogbl(files, has_rel, output_dir, num_partitions=1): +def parse_ogbl(files, has_rel, download_dir, output_dir, num_partitions=1): + """Parse ogbl datasets. + + Retrieves the graph from downloaded ogbl dataset files. + + Args: + files: The original obgl dataset files. + has_rel: Indicates whether the current dataset has relation edges. + download_dir: The directory where downloaded dataset files are stored. + output_dir: The directory where preprocessed files will be stored. + num_partitions: The number of graph partitions that the graph nodes are + uniformly partitioned into. + + Returns: + The statistics of current dataset. 
+ """ if has_rel is True: train_idx = torch.load(str(files[0])) valid_idx = torch.load(str(files[1])) @@ -376,37 +999,51 @@ def parse_ogbl(files, has_rel, output_dir, num_partitions=1): valid_list = torch.load(files[1]).get("edge") test_list = torch.load(files[2]).get("edge") - np.savetxt(str(Path(output_dir) / Path("train.txt")), + np.savetxt(str(Path(data_dir) / Path("train.txt")), train_list, fmt="%s", delimiter="\t", newline="\n") - np.savetxt(str(Path(output_dir) / Path("valid.txt")), + np.savetxt(str(Path(data_dir) / Path("valid.txt")), valid_list, fmt="%s", delimiter="\t", newline="\n") - np.savetxt(str(Path(output_dir) / Path("test.txt")), + np.savetxt(str(Path(data_dir) / Path("test.txt")), test_list, fmt="%s", delimiter="\t", newline="\n") print("Conversion completed.") if has_rel is True: stats = general_parser( - [str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], ["srd"], - output_dir, num_partitions=num_partitions) + [str(Path(data_dir) / Path("train.txt")), + str(Path(data_dir) / Path("valid.txt")), + str(Path(data_dir) / Path("test.txt"))], ["srd"], + data_dir, num_partitions=num_partitions) else: stats = general_parser( - [str(Path(output_dir) / Path("train.txt")), - str(Path(output_dir) / Path("valid.txt")), - str(Path(output_dir) / Path("test.txt"))], ["sd"], - output_dir, num_partitions=num_partitions) + [str(Path(data_dir) / Path("train.txt")), + str(Path(data_dir) / Path("valid.txt")), + str(Path(data_dir) / Path("test.txt"))], ["sd"], + data_dir, num_partitions=num_partitions) return stats -def download_file(url, output_dir): - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir() +def download_file(url, data_dir): + """Downloads files. + + Downloads the files from the input url to the designated data directory. + + Args: + url: The url to the files to be downloaded. + data_dir: The location to save all downloaded files. + + Returns: + The path to the downloaded files. + + Raises: + RuntimeError: An error occurred when downloading is failed. + """ + data_dir = Path(data_dir) + if not data_dir.exists(): + data_dir.mkdir() url_components = urlparse(url) filename = Path(url_components.path + url_components.query).name - filepath = output_dir / filename + filepath = data_dir / filename if filepath.is_file(): print(f"File already exists: {filepath} May be outdated!") @@ -422,6 +1059,23 @@ def download_file(url, output_dir): def extract_file(filepath): + """Extracts files from a compressed file. + + Extracts the files pointed by filepath. The supported file formats include + gzip, gz, tar.gz, tgz, tar, bz2, zip. + + Args: + filepath: The path to the files needed to be extracted. + + Returns: + The directory contains all extracted files. + + Raises: + RuntimeError: An error occurred when the file format cannot be + recognized or the file to be extracted is not + complete. Detailed information is given if the exception is raised. + + """ print("Extracting") try: if tarfile.is_tarfile(str(filepath)): @@ -470,25 +1124,43 @@ def extract_file(filepath): def update_param(config_dict, arg_dict): - if arg_dict.get("generate_config") is None: + """Updates parametars. + + Updates parameters for the configuration files to be generated according to + command line arguments. + + Args: + config_dict: The dict containing all configuration parameters and their + default values. + arg_dict: The dict containing all command line arguments. + + Returns: + The updated configuration dict. 
+ + Raises: + RuntimeError: An error occurred if users specify a certain + configuration parameter while the command line argument + generate_template_config is not set. + """ + if arg_dict.get("generate_template_config") is None: for key in config_dict: if arg_dict.get(key) is not None: raise RuntimeError( - "Please specify --generate_config when " + + "Please specify --generate_template_config when " + "specifying generating options" ) else: - if arg_dict.get("generate_config") is None: + if arg_dict.get("generate_template_config") is None: config_dict.update({"device": "GPU"}) config_dict.update({"general.device": "GPU"}) - elif arg_dict.get("generate_config") == "multi-GPU": + elif arg_dict.get("generate_template_config") == "multi-GPU": config_dict.update({"device": "multi_GPU"}) config_dict.update({"general.device": "multi-GPU"}) else: config_dict.update({"general.device": - arg_dict.get("generate_config")}) + arg_dict.get("generate_template_config")}) config_dict.update({"device": - arg_dict.get("generate_config")}) + arg_dict.get("generate_template_config")}) for key in config_dict.keys(): if arg_dict.get(key) is not None: @@ -501,6 +1173,12 @@ def update_param(config_dict, arg_dict): def set_args(): + """Sets command line arguments for this preprocess module. + + Returns: + The parser containing all command line arguments and the configuration + dict containing all parameters and their default values. + """ parser = argparse.ArgumentParser( description='Preprocess Datasets', prog='preprocess', formatter_class=argparse.RawTextHelpFormatter, @@ -508,36 +1186,44 @@ def set_args(): '[--
.=]'))) mode = parser.add_mutually_exclusive_group() parser.add_argument('output_directory', metavar='output_directory', - type=str, help='Directory to put graph data') - mode.add_argument('--files', metavar='files', nargs='+', type=str, - help='Files containing custom dataset') + type=str, help='Directory to put preprocessed graph ' + + 'data.') + parser.add_argument('--download_directory', metavar='download_directory', + type=str, default="download_dir", + help='Directory to put downloaded data ' + + 'files for supported datasets.') + mode.add_argument('--input_files', metavar='input_files', nargs='+', + type=str, + help='Input files of custom dataset') mode.add_argument('--dataset', metavar='dataset', - type=str, help='Supported dataset to preprocess') + type=str, + help='Name of supported dataset to preprocess') parser.add_argument('--num_partitions', metavar='num_partitions', required=False, type=int, default=1, help='Number of partitions to split the edges into') - parser.add_argument('--overwrite', action='store_true', - required=False, - help=('Overwrites the output_directory if this is ' + - 'set. ' - 'Otherwise, files with same the names will be ' + - 'treated as the data for current dataset.')) - parser.add_argument('--generate_config', '-gc', metavar='generate_config', + parser.add_argument('--generate_template_config', '-gtc', + metavar='generate_template_config', choices=["GPU", "CPU", "multi-GPU"], nargs='?', const="GPU", help=('Generates a single-GPU ' + - 'training configuration file by default. ' + + 'training configuration file which contains ' + + 'parameters with default values. ' + '\nValid options (default to GPU): ' + '[GPU, CPU, multi-GPU]')) parser.add_argument('--format', metavar='format', nargs=1, type=str, default=['srd'], - help='Format of data, eg. srd') + help='Specifies the sequence of source, destination ' + + '(and relation) in input data files, eg. srd') parser.add_argument('--delim', '-d', metavar='delim', type=str, default="", - help='Specifies the delimiter') - parser.add_argument('--dtype', metavar='dtype', type=np.dtype, + help='Specifies the delimiter between source, ' + + '(relation,) destination strings in input ' + + 'data files.') + parser.add_argument('--remap_id_dtype', metavar='remap_id_dtype', + type=np.dtype, default=np.int32, - help='Indicates the numpy.dtype') + help='Indicates the data format to store the ' + + 'remapped IDs.') parser.add_argument('--not_remap_ids', action='store_false', help='If set, will not remap ids') parser.add_argument('--dataset_split', '-ds', metavar='dataset_split', @@ -545,11 +1231,12 @@ def set_args(): help='Split dataset into specified fractions') parser.add_argument('--start_col', '-sc', metavar='start_col', type=int, default=0, - help='Indicates the column index to start from') + help='Indicates the column index to start parsing ' + + 'source/destination nodes( or relation).') parser.add_argument('--num_line_skip', '-nls', metavar='num_line_skip', type=int, default=None, - help='Indicates number of lines to ' + - 'skip from the beginning') + help='Indicates number of lines/rows to ' + + 'skip from the beginning of the file.') config_dict, valid_dict = read_template(DEFAULT_CONFIG_FILE) @@ -566,9 +1253,24 @@ def set_args(): def parse_args(config_dict, args): + """Parse command line arguments. + + Identifies the dataset to be preprocess and update configuration parameters + if they are set by command line arguments. 
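set_args and parse_args are also exercised directly by the unit tests; the sketch below mirrors their first test case and shows the default device when --generate_template_config is passed without a value (note that parse_args creates the output and download directories as a side effect)::

    from marius.tools.preprocess import set_args, parse_args

    parser, config_dict = set_args()
    args = parser.parse_args(["./output_dir", "--dataset", "wn18",
                              "--generate_template_config",
                              "--num_partitions", "5"])
    config_dict, arg_dict = parse_args(config_dict, args)
    print(config_dict.get("device"))   # "GPU", the documented default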
+ + Args: + config_dict: The dict containing all configuration parameters and their + default values. + args: All command line arguments. + + Returns: + The dict containing updated configuration parameters and the dict + containing parsed command line arguments. + """ arg_dict = vars(args) config_dict = update_param(config_dict, arg_dict) set_up_files(args.output_directory) + set_up_files(args.download_directory) if arg_dict.get("dataset") is None: config_dict.update({"dataset": "custom"}) @@ -608,25 +1310,23 @@ def main(): "ogbn_products": ogbn_products, } - if args.overwrite and Path(args.output_directory).exists(): - shutil.rmtree(args.output_directory) - if dataset_dict.get(args.dataset) is not None: print(args.dataset) stats = dataset_dict.get(args.dataset)( - args.output_directory, args.num_partitions) + args.download_directory, + args.output_directory, + args.num_partitions) else: print("Preprocess custom dataset") - stats = general_parser(args.files, args.format, + stats = general_parser(args.input_files, args.format, args.output_directory, args.delim, args.num_partitions, - args.dtype, args.not_remap_ids, + args.remap_id_dtype, args.not_remap_ids, args.dataset_split, args.start_col, args.num_line_skip) - - if args.generate_config is not None: + if args.generate_template_config is not None: dir = args.output_directory config_dict = update_stats(stats, config_dict) config_dict = update_data_path(dir, config_dict) diff --git a/test/python/bindings/test_fb15k.py b/test/python/bindings/test_fb15k.py index c99fa54c..ef27ea41 100644 --- a/test/python/bindings/test_fb15k.py +++ b/test/python/bindings/test_fb15k.py @@ -19,7 +19,7 @@ def tearDown(self): @pytest.mark.skipif(os.environ.get("MARIUS_ONLY_PYTHON", None) == "TRUE", reason="Requires building the bindings") def test_one_epoch(self): - preprocess.fb15k(output_dir="output_dir/") + preprocess.fb15k("output_dir/", "output_dir/") config_path = "examples/training/configs/fb15k_cpu.ini" config = m.parseConfig(config_path) diff --git a/test/python/postprocess/test_postprocess.py b/test/python/postprocess/test_postprocess.py index 0a934fe2..b44af8d0 100644 --- a/test/python/postprocess/test_postprocess.py +++ b/test/python/postprocess/test_postprocess.py @@ -8,10 +8,12 @@ import numpy as np import torch + class TestPostprocess(unittest.TestCase): """ Tests for postprocess """ + download_dir = Path("./download_dir") dataset_dir = Path("./output_dir") data_dir = Path("./data/") node_mapping_file = Path(dataset_dir) / Path("node_mapping.txt") @@ -22,7 +24,7 @@ class TestPostprocess(unittest.TestCase): @classmethod def setUpClass(self): - wn18(str(self.dataset_dir)) + wn18(str(self.download_dir), str(self.dataset_dir)) if not Path("./data/marius/embeddings").exists(): Path("./data/marius/embeddings").mkdir(parents=True) @@ -42,6 +44,9 @@ def tearDownClass(self): if Path("./output_dir").exists(): shutil.rmtree(Path("./output_dir")) + if Path("./download_dir").exists(): + shutil.rmtree(Path("./download_dir")) + def test_get_embs(self): """ Check if embeddings are returned in correct size diff --git a/test/python/predict/test_predict.py b/test/python/predict/test_predict.py index 0f6febd0..91e7fa49 100644 --- a/test/python/predict/test_predict.py +++ b/test/python/predict/test_predict.py @@ -13,6 +13,7 @@ class TestPredict(unittest.TestCase): """ Tests for predict """ + download_dir = Path("./download_dir") dataset_dir = Path("./output_dir") data_dir = Path("./data/") node_mapping_file = Path(dataset_dir) / Path("node_mapping.txt") @@ -23,7 +24,7 
@@ class TestPredict(unittest.TestCase): @classmethod def setUpClass(self): - wn18(str(self.dataset_dir)) + wn18(str(self.download_dir), str(self.dataset_dir)) if not Path("./data/marius/embeddings").exists(): Path("./data/marius/embeddings").mkdir(parents=True) @@ -43,6 +44,9 @@ def tearDownClass(self): if Path("./output_dir").exists(): shutil.rmtree(Path("./output_dir")) + if Path("./download_dir").exists(): + shutil.rmtree(Path("./download_dir")) + def test_cmd_line_infer_list(self): """ Check if inference can be extracted from command line correctly diff --git a/test/python/preprocessing/test_config_generator_cmd_opt_parsing.py b/test/python/preprocessing/test_config_generator_cmd_opt_parsing.py index 665bfd99..d91099e6 100644 --- a/test/python/preprocessing/test_config_generator_cmd_opt_parsing.py +++ b/test/python/preprocessing/test_config_generator_cmd_opt_parsing.py @@ -40,6 +40,9 @@ def setUp(self): def tearDown(self): if Path("./output_dir").exists(): shutil.rmtree(Path("./output_dir")) + + if Path("./download_dir").exists(): + shutil.rmtree(Path("./download_dir")) def test_device_default(self): """ diff --git a/test/python/preprocessing/test_csv_preprocessor.py b/test/python/preprocessing/test_csv_preprocessor.py index e7070950..79b1ca00 100644 --- a/test/python/preprocessing/test_csv_preprocessor.py +++ b/test/python/preprocessing/test_csv_preprocessor.py @@ -6,6 +6,7 @@ import numpy as np from marius.tools.csv_converter import general_parser from test.python.helpers import dataset_generator +from marius.tools.preprocess import wn18 TEST_DIR = "./output_dir" test_data_dir = "./test/test_data/" @@ -17,6 +18,7 @@ train_path = str(Path(input_dir) / Path(train_file)) valid_path = str(Path(input_dir) / Path(valid_file)) test_path = str(Path(input_dir) / Path(test_file)) +download_dir = Path("./download_dir") class TestGeneralParser(unittest.TestCase): @@ -31,6 +33,9 @@ def setUp(self): if Path(input_dir).exists(): shutil.rmtree(Path(input_dir)) + if Path(download_dir).exists(): + shutil.rmtree(Path(download_dir)) + Path(input_dir).mkdir() shutil.copy(str(Path(test_data_dir) / Path(train_file)), str(Path(input_dir) / Path(train_file))) @@ -48,6 +53,9 @@ def tearDown(self): if Path(input_dir).exists(): shutil.rmtree(Path(input_dir)) + if Path(download_dir).exists(): + shutil.rmtree(Path(download_dir)) + def test_basic(self): """ Check the preprocessor executes on the test data without error @@ -403,7 +411,7 @@ def test_dtype_cmd_opt(self): str(Path(input_dir) / Path(train_file)), str(Path(input_dir) / Path(valid_file)), str(Path(input_dir) / Path(test_file)), - "srd", output_dir, "--dtype=int32"], capture_output=True) + "srd", output_dir, "--remap_id_dtype=int32"], capture_output=True) self.assertEqual((Path(output_dir) / Path("train_edges.pt")).stat().st_size, 1000*3*4) @@ -411,3 +419,17 @@ def test_dtype_cmd_opt(self): Path("valid_edges.pt")).stat().st_size, 100*3*4) self.assertEqual((Path(output_dir) / Path("test_edges.pt")).stat().st_size, 100*3*4) + + def test_download_dir(self): + """ + Check if download_dir is created correctly and if dataset files are + stored in download_dir + """ + wn18(download_dir, output_dir) + self.assertTrue(Path(download_dir).exists()) + self.assertTrue((Path(download_dir) / + Path("wordnet-mlj12-train.txt")).exists()) + self.assertTrue((Path(download_dir) / + Path("wordnet-mlj12-valid.txt")).exists()) + self.assertTrue((Path(download_dir) / + Path("wordnet-mlj12-test.txt")).exists()) \ No newline at end of file diff --git 
a/test/python/preprocessing/test_preprocess_cmd_opt_parsing.py b/test/python/preprocessing/test_preprocess_cmd_opt_parsing.py index 9f825408..35272f59 100644 --- a/test/python/preprocessing/test_preprocess_cmd_opt_parsing.py +++ b/test/python/preprocessing/test_preprocess_cmd_opt_parsing.py @@ -12,24 +12,24 @@ class TestPreprocessCmdOptParser(unittest.TestCase): Tests for functions parsing command line arguments """ cmd_args = [ - ["./output_dir", "--dataset", "wn18", "--generate_config", + ["./output_dir", "--dataset", "wn18", "--generate_template_config", "--num_partitions", "5"], - ["./output_dir", "--dataset", "wn18", "-gc", "GPU"], - ["./output_dir", "--dataset", "wn18", "-gc", "CPU", + ["./output_dir", "--dataset", "wn18", "-gtc", "GPU"], + ["./output_dir", "--dataset", "wn18", "-gtc", "CPU", "--model.embedding_size=400", "--training.batch_size=51200", "--training.num_epochs=23"], - ["./output_dir", "--dataset", "wn18", "-gc", "GPU", + ["./output_dir", "--dataset", "wn18", "-gtc", "GPU", "--general.embedding_size=400"], ["./output_dir", "--dataset", "wn18", "--general.embedding_size=200"], ["--dataset", "wn18", "./output_dir"], ["--dataset", "wn18"], ["./output_dir", "--dataset", "wn18", "CPU"], - ["./output_dir", "--dataset", "wn18", "--gc", "--model.decoder"], + ["./output_dir", "--dataset", "wn18", "--gtc", "--model.decoder"], [], ["./output_dir", "--dataset", "wn18", "multi_cpu"], - ["./output_dir", "--dataset", "wn18", "--gc", + ["./output_dir", "--dataset", "wn18", "--gtc", "--storage.edge_bucket_ordering=EliminationCus"], - ["marius_preprocess", "./output_dir", "--dataset", "wn18", "-gc"] + ["marius_preprocess", "./output_dir", "--dataset", "wn18", "-gtc"] ] @classmethod @@ -37,14 +37,17 @@ def setUp(self): if not Path("./output_dir").exists(): Path("./output_dir").mkdir() + if Path("./download_dir").exists(): + shutil.rmtree(Path("./download_dir")) + @classmethod def tearDown(self): if Path("./output_dir").exists(): shutil.rmtree(Path("./output_dir")) - def test_generate_config_default(self): + def test_generate_template_config_default(self): """ - Check if default value of --generate_config is assigned correctly + Check if default value of --generate_template_config is assigned correctly """ parser, config_dict = set_args() args = parser.parse_args(self.cmd_args[0]) @@ -55,7 +58,7 @@ def test_generate_config_default(self): def test_gpu(self): """ - Check if --gc can parse device choice correctly + Check if --gtc can parse device choice correctly """ parser, config_dict = set_args() args = parser.parse_args(self.cmd_args[1]) @@ -90,7 +93,7 @@ def test_unmatching_training_config(self): def test_inconsistent_training_config(self): """ Check if excpetion is thrown if trainig config is specified without - --generate_config being specified + --generate_template_config being specified """ parser, config_dict = set_args() with self.assertRaises(SystemExit): @@ -99,13 +102,13 @@ def test_inconsistent_training_config(self): def test_required_args(self): """ - Check if args.generate_config is set correctly if --generate_config + Check if args.generate_template_config is set correctly if --generate_template_config is not specified """ parser, config_dict = set_args() args = parser.parse_args(self.cmd_args[5]) config_dict, arg_dict = parse_args(config_dict, args) - self.assertTrue(arg_dict.get("generate_config") is None) + self.assertTrue(arg_dict.get("generate_template_config") is None) def test_required_arg_omitted(self): """ @@ -169,9 +172,9 @@ def test_custom_dataset(self): """ 
subprocess.run(["python3", "./src/python/tools/preprocess.py", "./output_dir", - "--files", + "--input_files", "./test/test_data/train_edges.txt", "./test/test_data/valid_edges.txt", "./test/test_data/test_edges.txt", - "-gc", "CPU"]) + "-gtc", "CPU"]) self.assertTrue(Path("./output_dir/custom_cpu.ini").exists()) \ No newline at end of file