From 3fb93c06b1127677cff5f580cb0c42e62d3b7da6 Mon Sep 17 00:00:00 2001
From: sardev
Date: Sat, 31 Aug 2024 01:17:02 +0000
Subject: [PATCH] Fixed training config

---
 wikipedia_analysis/README.md             | 16 ++++--
 wikipedia_analysis/initial_training.yaml | 62 +++---------------------
 2 files changed, 17 insertions(+), 61 deletions(-)

diff --git a/wikipedia_analysis/README.md b/wikipedia_analysis/README.md
index 99768d8e..a10220a1 100644
--- a/wikipedia_analysis/README.md
+++ b/wikipedia_analysis/README.md
@@ -3,6 +3,8 @@
 The following README contains the steps to perform the benchmarking on the wikipedia datasets. Before we run anything, run the commands:
 ```
 $ sudo apt update -y && sudo apt upgrade -y
+$ sudo apt-get install -y xfsprogs
+$ sudo modprobe -v xfs
 ```
 
 ## Mounting the data directory
@@ -45,11 +47,6 @@ $ sudo mount -a
 $ sudo chmod ugo+rw -R all_data
 ```
 
-Verify by running `df -h` inside of `all_data` and ensure it produces this output:
-```
-
-```
-
 ## Setting up docker
 
 First install the nvidia driver using the command:
@@ -88,6 +85,8 @@ $ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --de
 $ sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
 $ sudo apt-get update
 $ sudo apt-get install -y nvidia-container-toolkit
+$ sudo nvidia-ctk runtime configure --runtime=docker
+$ sudo systemctl restart docker
 ```
 
 Finally run `sudo reboot`. Verify the install by running the command `nvidia-smi`.
@@ -121,6 +120,7 @@ $ python3 -m pip install boto3
 
 Then setup aws using `aws configure`. Then run the preprocessing using the commands:
 ```
+$ apt install -y lbzip2
 $ cd wikipedia_analysis
 $ python3 -u preprocess_runner.py &> preprocess.log
 ```
@@ -136,4 +136,10 @@ $ cmake ../ -DUSE_CUDA=TRUE -DUSE_OMP=TRUE
 and then:
 ```
 $ rm -rf /root/all_data/graph_snapshots/initial_snapshot/marius_formatted/model_* && make marius_train -j && ./marius_train ../wikipedia_analysis/initial_training.yaml
+```
+
+Once the training is done, upload the results to AWS using the commands:
+```
+$ tar -I lbzip2 -cvpf ~/all_data/graph_snapshots/trained_initial_snapshot.tar.gz ~/all_data/graph_snapshots/initial_snapshot
+$ aws s3 mv ~/all_data/graph_snapshots/trained_initial_snapshot.tar.gz s3://wikidata-update-history
 ```
\ No newline at end of file
diff --git a/wikipedia_analysis/initial_training.yaml b/wikipedia_analysis/initial_training.yaml
index 1476ff7c..c8f5dc05 100644
--- a/wikipedia_analysis/initial_training.yaml
+++ b/wikipedia_analysis/initial_training.yaml
@@ -1,59 +1,9 @@
 model:
   learning_task: LINK_PREDICTION
   encoder:
-    train_neighbor_sampling:
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-    eval_neighbor_sampling:
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
-      - type: UNIFORM
-        options:
-          max_neighbors: 32
     layers:
       - - type: EMBEDDING
-          output_dim: 32
-          bias: true
-          init:
-            type: GLOROT_NORMAL
-
-      - - type: GNN
-          options:
-            type: GRAPH_SAGE
-            aggregator: MEAN
-          input_dim: 32
-          output_dim: 32
-          bias: true
-          init:
-            type: GLOROT_NORMAL
-
-      - - type: GNN
-          options:
-            type: GRAPH_SAGE
-            aggregator: MEAN
-          input_dim: 32
-          output_dim: 32
-          bias: true
-          init:
-            type: GLOROT_NORMAL
-
-      - - type: GNN
-          options:
-            type: GRAPH_SAGE
-            aggregator: MEAN
-          input_dim: 32
-          output_dim: 32
+          output_dim: 128
           bias: true
           init:
             type: GLOROT_NORMAL
@@ -70,7 +20,7 @@ model:
   sparse_optimizer:
     type: ADAGRAD
     options:
-      learning_rate: 0.01
+      learning_rate: 0.001
 storage:
   device_type: cuda
   dataset:
@@ -81,15 +31,15 @@ storage:
     type: DEVICE_MEMORY
   save_model: true
 training:
-  batch_size: 16
+  batch_size: 1024
   negative_sampling:
     num_chunks: 10
     negatives_per_positive: 750
-    degree_fraction: 0.0
+    degree_fraction: 0.1
     filtered: false
   num_epochs: 50
   epochs_per_shuffle: 1
 evaluation:
-  batch_size: 16
+  batch_size: 1024
   negative_sampling:
-    filtered: true
\ No newline at end of file
+    filtered: false
\ No newline at end of file
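A minimal sketch of how this commit might be applied and the retraining rerun, following the README steps changed above. The patch filename, the repository checkout directory, and the `build` directory name are assumptions, not taken from the patch itself; only the rebuild/retrain command is copied from the README:
```
$ # Assumed export name from `git format-patch` for the commit above.
$ git am 0001-Fixed-training-config.patch
$ # Rebuild and retrain with the updated config, per the README (assumes an existing cmake build dir).
$ cd build
$ rm -rf /root/all_data/graph_snapshots/initial_snapshot/marius_formatted/model_* && make marius_train -j && ./marius_train ../wikipedia_analysis/initial_training.yaml
```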