From 3fb93c06b1127677cff5f580cb0c42e62d3b7da6 Mon Sep 17 00:00:00 2001
From: sardev
Date: Sat, 31 Aug 2024 01:17:02 +0000
Subject: [PATCH] Fixed training config

---
 wikipedia_analysis/README.md             | 16 ++++--
 wikipedia_analysis/initial_training.yaml | 62 +++---------------------
 2 files changed, 17 insertions(+), 61 deletions(-)

diff --git a/wikipedia_analysis/README.md b/wikipedia_analysis/README.md
index 99768d8e..a10220a1 100644
--- a/wikipedia_analysis/README.md
+++ b/wikipedia_analysis/README.md
@@ -3,6 +3,8 @@
 The following README contains the steps to perform the benchmarking on the wikipedia datasets. Before we run anything, run the commands:
 ```
 $ sudo apt update -y && sudo apt upgrade -y
+$ sudo apt-get install -y xfsprogs
+$ sudo modprobe -v xfs
 ```
 
 ## Mounting the data directory
@@ -45,11 +47,6 @@ $ sudo mount -a
 $ sudo chmod ugo+rw -R all_data
 ```
 
-Verify by running `df -h` inside of `all_data` and ensure it produces this output:
-```
-
-```
-
 ## Setting up docker
 
 First install the nvidia driver using the command:
@@ -88,6 +85,8 @@ $ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --de
 $ sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
 $ sudo apt-get update
 $ sudo apt-get install -y nvidia-container-toolkit
+$ sudo nvidia-ctk runtime configure --runtime=docker
+$ sudo systemctl restart docker
 ```
 
 Finally run `sudo reboot`. Verify the install by running the command `nvidia-smi`.
@@ -121,6 +120,7 @@ $ python3 -m pip install boto3
 
 Then setup aws using `aws configure`. Then run the preprocessing using the commands:
 ```
+$ apt install -y lbzip2
 $ cd wikipedia_analysis
 $ python3 -u preprocess_runner.py &> preprocess.log
 ```
@@ -136,4 +136,10 @@ $ cmake ../ -DUSE_CUDA=TRUE -DUSE_OMP=TRUE
 and then:
 ```
 $ rm -rf /root/all_data/graph_snapshots/initial_snapshot/marius_formatted/model_* && make marius_train -j && ./marius_train ../wikipedia_analysis/initial_training.yaml
+```
+
+Once the training is done, upload the results to AWS using the commands:
+```
+$ tar -I lbzip2 -cvpf ~/all_data/graph_snapshots/trained_initial_snapshot.tar.gz ~/all_data/graph_snapshots/initial_snapshot
+$ aws s3 mv ~/all_data/graph_snapshots/trained_initial_snapshot.tar.gz s3://wikidata-update-history
 ```
\ No newline at end of file
diff --git a/wikipedia_analysis/initial_training.yaml b/wikipedia_analysis/initial_training.yaml
index 1476ff7c..c8f5dc05 100644
--- a/wikipedia_analysis/initial_training.yaml
+++ b/wikipedia_analysis/initial_training.yaml
@@ -1,59 +1,9 @@
 model:
   learning_task: LINK_PREDICTION
   encoder:
-    train_neighbor_sampling:
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-    eval_neighbor_sampling:
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
     layers:
       - - type: EMBEDDING
-          output_dim: 32
-          bias: true
-          init:
-            type: GLOROT_NORMAL
-
-      - - type: GNN
-          options:
-            type: GRAPH_SAGE
-            aggregator: MEAN
-          input_dim: 32
-          output_dim: 32
-          bias: true
-          init:
-            type: GLOROT_NORMAL
-
-      - - type: GNN
-          options:
-            type: GRAPH_SAGE
-            aggregator: MEAN
-          input_dim: 32
-          output_dim: 32
-          bias: true
-          init:
-            type: GLOROT_NORMAL
-
-      - - type: GNN
-          options:
-            type: GRAPH_SAGE
-            aggregator: MEAN
-          input_dim: 32
-          output_dim: 32
+          output_dim: 128
           bias: true
           init:
             type: GLOROT_NORMAL
@@ -70,7 +20,7 @@ model:
   sparse_optimizer:
     type: ADAGRAD
     options:
-      learning_rate: 0.01
+      learning_rate: 0.001
 storage:
   device_type: cuda
   dataset:
@@ -81,15 +31,15 @@ storage:
     type: DEVICE_MEMORY
   save_model: true
 training:
-  batch_size: 16
+  batch_size: 1024
   negative_sampling:
     num_chunks: 10
     negatives_per_positive: 750
-    degree_fraction: 0.0
+    degree_fraction: 0.1
     filtered: false
   num_epochs: 50
   epochs_per_shuffle: 1
 evaluation:
-  batch_size: 16
+  batch_size: 1024
   negative_sampling:
-    filtered: true
\ No newline at end of file
+    filtered: false
\ No newline at end of file
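A minimal sketch of how this commit might be applied and the retraining rerun, following the README steps changed above. The patch filename, the repository checkout directory, and the `build` directory name are assumptions, not taken from the patch itself; only the rebuild/retrain command is copied from the README:
```
$ # Assumed export name from `git format-patch` for the commit above.
$ git am 0001-Fixed-training-config.patch
$ # Rebuild and retrain with the updated config, per the README (assumes an existing cmake build dir).
$ cd build
$ rm -rf /root/all_data/graph_snapshots/initial_snapshot/marius_formatted/model_* && make marius_train -j && ./marius_train ../wikipedia_analysis/initial_training.yaml
```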