
Clarify config_generator usage on custom and supported datasets #75

Closed
wants to merge 3 commits
117 changes: 63 additions & 54 deletions docs/user_guide/command_line_interface.rst
@@ -31,48 +31,48 @@ The available options:

::

usage: preprocess [-h] [--files files [files ...]] [--dataset dataset] [--num_partitions num_partitions] [--overwrite]
[--generate_config [generate_config]] [--format format] [--delim delim] [--dtype dtype] [--not_remap_ids]
[--dataset_split dataset_split dataset_split] [--start_col start_col] [--num_line_skip num_line_skip]
output_directory

Preprocess Datasets

positional arguments:
output_directory Directory to put graph data

optional arguments:
-h, --help show this help message and exit
--files files [files ...]
Files containing custom dataset
--dataset dataset Supported dataset to preprocess
--num_partitions num_partitions
Number of partitions to split the edges into
--overwrite Overwrites the output_directory if this issetOtherwise, files with same the names will be treated as the data for current dataset.
--generate_config [generate_config], -gc [generate_config]
Generates a single-GPU/multi-CPU/multi-GPU training configuration file by default.
Valid options (default to GPU): [GPU, CPU, multi-GPU]
--format format Format of data, eg. srd
--delim delim, -d delim
Specifies the delimiter
--dtype dtype Indicates the numpy.dtype
--not_remap_ids If set, will not remap ids
--dataset_split dataset_split dataset_split, -ds dataset_split dataset_split
Split dataset into specified fractions
--start_col start_col, -sc start_col
Indicates the column index to start from
--num_line_skip num_line_skip, -nls num_line_skip
Indicates number of lines to skip from the beginning

Specify certain config (optional): [--<section>.<key>=<value>]
usage: preprocess [-h] [--files files [files ...] | --dataset dataset] [--num_partitions num_partitions] [--overwrite]
[--generate_config [generate_config]] [--format format] [--delim delim] [--dtype dtype] [--not_remap_ids]
[--dataset_split dataset_split dataset_split] [--start_col start_col] [--num_line_skip num_line_skip]
output_directory

Preprocess Datasets

positional arguments:
output_directory Directory to put graph data

optional arguments:
-h, --help show this help message and exit
--files files [files ...]
Files containing custom dataset
--dataset dataset Built-in dataset to preprocess
--num_partitions num_partitions
Number of partitions to split the edges into
--overwrite           Overwrites the output_directory if this is set. Otherwise, files with the same names will be treated as the data for the current dataset.
--generate_config [generate_config], -gc [generate_config]
Generates a single-GPU training configuration file by default.
Valid options (default to GPU): [GPU, CPU, multi-GPU]
--format format Format of data, eg. srd
--delim delim, -d delim
Specifies the delimiter
--dtype dtype Indicates the numpy.dtype
--not_remap_ids If set, will not remap ids
--dataset_split dataset_split dataset_split, -ds dataset_split dataset_split
Split dataset into specified fractions
--start_col start_col, -sc start_col
Indicates the column index to start from
--num_line_skip num_line_skip, -nls num_line_skip
Indicates number of lines to skip from the beginning

Specify certain config (optional): [--<section>.<key>=<value>]

output_directory
++++++++++++++++
``<output_directory>`` is a **required** argument for ``marius_preprocess``.
It is the directory where all the files created by ``marius_preprocess`` will be stored.
``marius_preprocess`` will create this directory if it does not exist.
``marius_preprocess`` outputs the following files to ``<output_directory>``.
For the preprocessing of supported datasets, ``<output_directory>`` also includes
For the preprocessing of built-in datasets, ``<output_directory>`` also includes
the downloaded raw dataset.

================== ============
@@ -121,7 +121,7 @@ For example, the following command preprocesses the custom dataset composed of `
\-\-dataset <dataset>
+++++++++++++++++++++
``--dataset`` is an **optional** argument for ``marius_preprocess``.
It can be one of the names of a Marius supported dataset.
It can be one of the names of a Marius built-in dataset.
It should not be used at the same time as ``--files``.
To see which datasets are supported by Marius, check out
the :ref:`dataset` table.
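Because ``--files`` and ``--dataset`` are mutually exclusive, passing both makes ``marius_preprocess`` exit with a usage error. The following is a minimal sketch of that behavior, reconstructed only from the argparse calls visible in the ``preprocess.py`` diff in this PR (the real parser has many more options; the file names and the ``fb15k`` dataset name here are purely illustrative):

```python
import argparse

# Pared-down sketch of marius_preprocess's mutually exclusive
# --files / --dataset options, mirroring the preprocess.py diff.
parser = argparse.ArgumentParser(prog='preprocess')
parser.add_argument('output_directory', type=str,
                    help='Directory to put graph data')
mode = parser.add_mutually_exclusive_group()
mode.add_argument('--files', metavar='files', nargs='+', type=str,
                  help='Files containing custom dataset')
mode.add_argument('--dataset', metavar='dataset', type=str,
                  help='Built-in dataset to preprocess')

# Custom dataset: pass the raw edge files with --files
args = parser.parse_args(['out_dir', '--files', 'train.txt', 'valid.txt'])
print(args.files)

# Passing --files and --dataset together is rejected by argparse
try:
    parser.parse_args(['out_dir', '--files', 'a.txt', '--dataset', 'fb15k'])
except SystemExit:
    print('--files and --dataset are mutually exclusive')
```

Note that with a mutually exclusive group, argparse itself enforces the "should not be used at the same time" rule, so the documentation constraint matches the implementation.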
@@ -246,7 +246,8 @@ marius_config_generator
^^^^^^^^^^^^^^^^^^^^^^^

This command lets users create a Marius configuration file from the command line with
some parameters specified according to their needs. Options are provided for generating
Marius configuration files for both custom (``--stats``) and built-in (``--dataset``) datasets.
This command can be called with:

::
@@ -257,28 +258,29 @@ The available options:

::

usage: config_generator [-h] [--data_directory data_directory] [--dataset dataset | --stats num_nodes num_edge_types num_train num_valid num_test]
[--device [generate_config]]
output_directory
usage: config_generator [-h] [--data_directory data_directory]
[--dataset dataset | --stats num_nodes num_relations num_train num_valid num_test]
[--device [generate_config]]
output_directory

Generate configs

positional arguments:
output_directory Directory to put configs
Also assumed to be the default directory of preprocessed data if --data_directory is not specified
output_directory Directory to save Marius configuration files
Also assumed to be the default directory of preprocessed data if --data_directory is not specified

optional arguments:
-h, --help show this help message and exit
--data_directory data_directory
Directory of the preprocessed data
Directory of the preprocessed data
--dataset dataset, -d dataset
Dataset to preprocess
--stats num_nodes num_edge_types num_train num_valid num_test, -s num_nodes num_edge_types num_train num_valid num_test
Dataset statistics
Enter in order of num_nodes, num_edge_types, num_train num_valid, num_test
Name of the built-in dataset for generating Marius configuration file
--stats num_nodes num_relations num_train num_valid num_test, -s num_nodes num_relations num_train num_valid num_test
Custom Dataset statistics
Enter in order of num_nodes, num_relations, num_train num_valid, num_test
--device [generate_config], -dev [generate_config]
Generates configs for a single-GPU/multi-CPU/multi-GPU training configuration file by default.
Valid options (default to GPU): [GPU, CPU, multi-GPU]
Generates configs for a single-GPU/multi-CPU/multi-GPU Marius configuration file.
Valid options (default to GPU): [GPU, CPU, multi-GPU]

Specify certain config (optional): [--<section>.<key>=<value>]

@@ -289,18 +291,25 @@
\-\-data_directory <data_directory>
+++++++++++++++++++++++++++++++++++
``--data_directory`` is an **optional** argument. It specifies the directory where ``marius_preprocess`` stores
preprocessed data.
preprocessed data.

\-\-dataset <dataset>, \-d <dataset>
++++++++++++++++++++++++++++++++++++
``--dataset`` is an **optional** argument. It specifies the name of the supported dataset. It should not be
used when ``--stats`` is in use.
``--dataset`` is an **optional** argument. This argument is used when users want to
generate a Marius configuration file for a built-in dataset.
It specifies the name of the built-in dataset for which a configuration file will be generated.
It should not be used when ``--stats`` is in use. To see which datasets are built into Marius, check out
the :ref:`dataset` table.

\-\-stats <num_nodes> <num_relations> <num_train> <num_valid> <num_test>, \-s <num_nodes> <num_relations> <num_train> <num_valid> <num_test>
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
``--stats <num_nodes> <num_relations> <num_train> <num_valid> <num_test>, -s <num_nodes> <num_relations> <num_train> <num_valid> <num_test>``
is an **optional** argument. It specifies the stats of the dataset to be trained over. It should not be used at the same
time with option ``--dataset``.
is an **optional** argument.
This argument is used when users want to generate a Marius configuration file for
a custom dataset. Users need to pass the statistics of the dataset manually, in the order:
number of nodes, number of relations, number of edges in the training set, number of edges
in the validation set, and number of edges in the testing set.
It should not be used at the same time as ``--dataset``.
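The five ``--stats`` values arrive through an ``nargs=5`` argparse option, so they are collected as a list of five strings in exactly the order given above. This sketch reconstructs just that option from the ``config_generator.py`` diff in this PR (the positional ``output_directory`` and other options are omitted, and the statistics shown are made-up values for a hypothetical custom dataset):

```python
import argparse

# Pared-down sketch of config_generator's --dataset / --stats options,
# based on the argparse calls in the config_generator.py diff.
parser = argparse.ArgumentParser(prog='config_generator')
mode = parser.add_mutually_exclusive_group()
mode.add_argument('--dataset', '-d', metavar='dataset', type=str)
mode.add_argument('--stats', '-s', nargs=5,
                  metavar=('num_nodes', 'num_relations', 'num_train',
                           'num_valid', 'num_test'))

# Hypothetical custom-dataset statistics, in the documented order
args = parser.parse_args(['--stats', '10000', '12',
                          '80000', '10000', '10000'])

# argparse collects the five values as a list of strings
num_nodes, num_relations, num_train, num_valid, num_test = args.stats
print(num_nodes, num_relations)
```

Because ``nargs=5`` yields strings, any downstream code has to convert the values to integers itself; the order of the five positions is the only structure argparse enforces.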

\-\-device <device>, \-dev <device>
++++++++++++++++++++++++++++++++++++
4 changes: 3 additions & 1 deletion docs/user_guide/getting_started.rst
@@ -90,7 +90,9 @@ Training embeddings on such a graph requires three steps:

Marius also offers ``marius_config_generator`` to generate a configuration file
for the users given the basic information of dataset statistics and where to store
the created configuration file.
the created configuration file. ``marius_config_generator`` can be used to generate
configuration files for both custom and built-in datasets by passing different
options.
All other configuration parameters will be set to their default values.
Users are given the option to specify the values of certain parameters.
The following command shows how to use ``marius_config_generator`` to generate
12 changes: 7 additions & 5 deletions src/python/tools/config_generator.py
@@ -130,25 +130,27 @@ def set_args():
'[--<section>.<key>=<value>]'))
mode = parser.add_mutually_exclusive_group()
parser.add_argument('output_directory', metavar='output_directory',
type=str, help='Directory to put configs \nAlso ' +
type=str, help='Directory to save Marius ' +
'configuration files \nAlso ' +
'assumed to be the default directory of preprocessed' +
' data if --data_directory is not specified')
parser.add_argument('--data_directory', metavar='data_directory',
type=str, help='Directory of the preprocessed data')
mode.add_argument('--dataset', '-d', metavar='dataset', type=str,
help='Dataset to preprocess')
help='Name of the built-in dataset for generating ' +
'Marius configuration file')
mode.add_argument('--stats', '-s',
metavar=('num_nodes', 'num_relations', 'num_train',
'num_valid', 'num_test'),
nargs=5, help='Dataset statistics\n' +
nargs=5, help='Custom Dataset statistics\n' +
'Enter in order of num_nodes, num_relations, num_train' +
' num_valid, num_test')
parser.add_argument('--device', '-dev', metavar='generate_config',
choices=["GPU", "CPU", "multi-GPU"],
nargs='?', default='GPU',
help=('Generates configs for a single-GPU/multi-CPU' +
'/multi-GPU training configuration file by ' +
'default. \nValid options (default to GPU): ' +
'/multi-GPU Marius configuration file' +
'. \nValid options (default to GPU): ' +
'[GPU, CPU, multi-GPU]'))

config_dict, valid_dict = read_template(DEFAULT_CONFIG_FILE)
2 changes: 1 addition & 1 deletion src/python/tools/preprocess.py
@@ -512,7 +512,7 @@ def set_args():
mode.add_argument('--files', metavar='files', nargs='+', type=str,
help='Files containing custom dataset')
mode.add_argument('--dataset', metavar='dataset',
type=str, help='Supported dataset to preprocess')
type=str, help='Built-in dataset to preprocess')
parser.add_argument('--num_partitions', metavar='num_partitions',
required=False, type=int, default=1,
help='Number of partitions to split the edges into')