Add in initial wikipedia code
sarda-devesh committed Aug 21, 2024
1 parent 2f27ffe commit 59c9cbc
Showing 5 changed files with 310 additions and 10 deletions.
18 changes: 11 additions & 7 deletions examples/docker/gpu_ubuntu/dockerfile
@@ -1,4 +1,7 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV TZ=Asia/Dubai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt update

RUN apt install -y g++ \
@@ -10,22 +13,23 @@ RUN apt install -y g++ \
    dstat \
    python3-pip

# install gcc-9
# install gcc-11
RUN apt install -y software-properties-common
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
RUN apt update
RUN apt install -y gcc-9 g++-9
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
RUN apt install -y gcc-11 g++-11
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11

# install cmake 3.20
RUN wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.sh
# install cmake 3.28
RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.3/cmake-3.28.3-linux-x86_64.sh
RUN mkdir /opt/cmake
RUN sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/
RUN sh cmake-3.28.3-linux-x86_64.sh --skip-license --prefix=/opt/cmake/
RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake

# install pytorch
RUN python3 -m pip install torch==2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html
RUN git config --global --add safe.directory "*"

RUN mkdir /working_dir
WORKDIR /working_dir
6 changes: 3 additions & 3 deletions setup.py
@@ -16,7 +16,7 @@ def __init__(self, name, sourcedir=""):
class CMakeBuild(build_ext):
    def run(self):
        try:
            _ = subprocess.check_output(["cmake", "--version"])
            _ = subprocess.run(["cmake", "--version"])
        except OSError:
            raise RuntimeError(
                "CMake must be installed to build the following extensions: "
@@ -69,8 +69,8 @@ def build_extension(self, ext):

        print(cmake_args)

        subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
        subprocess.check_call(["cmake", "--build", ".", "--target", "bindings"] + build_args, cwd=self.build_temp)
        subprocess.run(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
        subprocess.run(["cmake", "--build", ".", "--target", "bindings"] + build_args, cwd=self.build_temp)
        print()  # Add an empty line for cleaner output


139 changes: 139 additions & 0 deletions wikipedia_analysis/README.md
@@ -0,0 +1,139 @@
# Wikipedia Analysis

This README contains the steps to perform the benchmarking on the Wikipedia datasets. Before running anything, update the system packages with:
```
$ sudo apt update -y && sudo apt upgrade -y
```

## Mounting the data directory

First, run `lsblk`, which produces output like this:
```
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS
sda 8:0 0 447.1G 0 disk
├─sda1 8:1 0 256M 0 part /boot/efi
├─sda2 8:2 0 1M 0 part
├─sda3 8:3 0 64G 0 part /
└─sda99 259:2 0 8G 0 part [SWAP]
sdb 8:16 0 447.1G 0 disk
sdc 8:32 0 745.2G 0 disk
sdd 8:48 0 745.2G 0 disk
sde 8:64 0 745.2G 0 disk
sdf 8:80 0 745.2G 0 disk
sdg 8:96 0 745.2G 0 disk
sdh 8:112 0 745.2G 0 disk
sdi 8:128 0 745.2G 0 disk
sdj 8:144 0 745.2G 0 disk
nvme0n1 259:1 0 1.5T 0 disk
└─vg1-lv1 253:0 0 1.5T 0 lvm
```
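
If you prefer to locate the logical volume programmatically rather than reading the table by hand, a minimal Python sketch along these lines (assuming a recent `util-linux` whose `lsblk` supports JSON output via `-J`) walks the device tree and lists every LVM volume:
```
import json
import subprocess

def find_lvm_devices():
    """Return the names of all block devices that lsblk reports as type 'lvm'."""
    out = subprocess.run(["lsblk", "-J"], capture_output=True, text=True, check=True)
    tree = json.loads(out.stdout)

    found = []

    def walk(devices):
        for dev in devices:
            if dev.get("type") == "lvm":
                found.append(dev["name"])
            walk(dev.get("children", []))

    walk(tree.get("blockdevices", []))
    return found

if __name__ == "__main__":
    # On the machine above this should print something like ['vg1-lv1'].
    print(find_lvm_devices())
```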

Then run the command:
```
$ mkdir -p all_data
```

Then update the `/etc/fstab` file to include the following line:
```
/dev/vg1/lv1 /users/sardev/all_data xfs defaults 0 0
```
Note that your path to the `all_data` directory might be different.

Then mount the directory using the commands:
```
$ sudo mount -a
$ sudo chmod ugo+rw -R all_data
```

Verify by running `df -h` inside `all_data` and ensuring that the new mount shows up in the output:
```
```
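
As an optional scripted check, a small Python sketch like the one below confirms that `all_data` is a separate mount and is writable; the mount-point path and the rough capacity are assumptions taken from the `/etc/fstab` line and `lsblk` output above:
```
import os
import shutil

# Adjust this if your all_data directory lives somewhere else.
mount_point = os.path.expanduser("~/all_data")

# The directory should be its own mount, not part of the root filesystem.
assert os.path.ismount(mount_point), f"{mount_point} is not a mount point"

# Rough capacity check: the LVM volume above is ~1.5T.
total_gib = shutil.disk_usage(mount_point).total / 2**30
print(f"all_data capacity: {total_gib:.0f} GiB")

# Confirm the chmod step made the mount writable.
probe = os.path.join(mount_point, ".write_test")
open(probe, "w").close()
os.remove(probe)
print("all_data is mounted and writable")
```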

## Setting up docker

First, install the NVIDIA driver using the command:
```
$ sudo apt install -y nvidia-driver-550
```

Then install docker using the commands:
```
$ # Add Docker's official GPG key:
sudo apt-get update
sudo apt-get install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
# Add the repository to Apt sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
$ sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
$ sudo docker run hello-world
$ sudo groupadd docker
$ sudo usermod -aG docker $USER
$ newgrp docker
```
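
As a quick sanity check that the group change took effect, a minimal Python sketch along these lines asks the Docker daemon for its info without `sudo`:
```
import subprocess

def docker_usable_without_sudo() -> bool:
    """Return True if the current user can reach the Docker daemon directly."""
    result = subprocess.run(["docker", "info"], capture_output=True, text=True)
    return result.returncode == 0

if __name__ == "__main__":
    if docker_usable_without_sudo():
        print("Docker is installed and reachable without sudo")
    else:
        print("Docker daemon not reachable; re-check the docker group / newgrp step")
```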

Then install the NVIDIA Container Toolkit, which lets containers access the GPU:
```
$ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
$ sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
$ sudo apt-get update
$ sudo apt-get install -y nvidia-container-toolkit
```

Finally, run `sudo reboot`. After the reboot, verify the installation by running `nvidia-smi`.
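
To also confirm that containers (not just the host) can see the GPU, a hedged sketch like the following runs `nvidia-smi` on the host and inside a throwaway container; the CUDA base image tag is only an example and can be swapped for any image compatible with the installed driver:
```
import subprocess

# Host-side check: the driver install should make nvidia-smi available.
subprocess.run(["nvidia-smi"], check=True)

# Container-side check: with the container toolkit installed, --gpus all
# should expose the GPU inside a container.
subprocess.run(
    ["docker", "run", "--rm", "--gpus", "all",
     "nvidia/cuda:11.8.0-base-ubuntu22.04", "nvidia-smi"],
    check=True,
)
```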

## Installing marius

Then install marius using the commands:
```
$ export CURRENT_DIR=`pwd`
$ git clone https://github.com/marius-team/marius.git
$ cd marius
$ git checkout -b dsarda/wikipedia
$ cd examples/docker/
$ docker build -t marius:latest gpu_ubuntu/.
$ docker kill marius
$ docker rm marius
$ docker run --gpus all -d -v $CURRENT_DIR:/root/ --name=marius marius:latest sleep infinity
$ docker exec -it marius bash
$ cd marius
$ python3 -m pip install "numpy<2" pybind11
$ pip3 install . --no-build-isolation
```
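
To sanity-check the install from inside the container, a minimal sketch along these lines can be run with `python3`; it assumes that the pip install places `marius_preprocess` on the `PATH` (the preprocessing step below relies on this) and that the Python bindings are importable as `marius`:
```
import importlib.util
import shutil
import subprocess

import torch

# The GPU build is only useful if PyTorch can actually see the device.
print("CUDA available:", torch.cuda.is_available())

# The preprocessing CLI should be on the PATH after `pip3 install .`.
assert shutil.which("marius_preprocess") is not None, "marius_preprocess not found on PATH"
subprocess.run(["marius_preprocess", "--help"], check=True)

# The Python bindings are assumed to be importable as `marius`.
print("marius bindings found:", importlib.util.find_spec("marius") is not None)
```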

## Preprocessing the graph snapshots

First, set up the AWS tooling using:
```
$ apt install -y python3-pip awscli
$ python3 -m pip install boto3
```

Then configure the AWS credentials using `aws configure`, and run the preprocessing using the commands:
```
$ cd wikipedia_analysis
$ python3 -u preprocess_runner.py &> preprocess.log
```
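
The preprocess runner expects the raw snapshots to already sit under `~/all_data/graph_snapshots`. If they still need to be fetched, a hedged sketch like the one below downloads and unpacks the un-preprocessed tarball; the bucket and object names mirror the constants in `preprocess_runner.py`:
```
import os
import subprocess

import boto3

# Bucket and object names taken from preprocess_runner.py.
BUCKET_NAME = "wikidata-update-history"
FILE_NAME = "all_non_preprocess_snapshots.tar.bz2"
DATA_DIR = os.path.join(os.path.expanduser("~"), "all_data")

# Download the raw snapshots into the data directory ...
local_path = os.path.join(DATA_DIR, FILE_NAME)
boto3.client("s3").download_file(BUCKET_NAME, FILE_NAME, local_path)

# ... and unpack them so that graph_snapshots/<snapshot>/graph.csv exists.
subprocess.run(["tar", "-xvjf", local_path, "-C", DATA_DIR], check=True)
```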

## Training the initial snapshot

Then train the initial snapshot using the config file `initial_training.yaml`. Note that you might need to update the dataset paths in that file. To run the training, first build the trainer from inside the `marius` directory:
```
$ mkdir -p build && cd build
$ cmake ../ -DUSE_CUDA=TRUE -DUSE_OMP=TRUE
```

and then:
```
$ rm -rf /root/all_data/graph_snapshots/initial_snapshot/marius_formatted/model_* && make marius_train -j && ./marius_train ../wikipedia_analysis/initial_training.yaml
```
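
Since the dataset location in `initial_training.yaml` may need to point at your own `marius_formatted` directory, a small sketch like the following rewrites `storage.dataset.dataset_dir` in place; it assumes PyYAML is available (`python3 -m pip install pyyaml` otherwise) and that the key path matches the config in this commit:
```
import yaml  # PyYAML

CONFIG_PATH = "wikipedia_analysis/initial_training.yaml"
NEW_DATASET_DIR = "/root/all_data/graph_snapshots/initial_snapshot/marius_formatted"

with open(CONFIG_PATH) as f:
    config = yaml.safe_load(f)

# The dataset location lives under storage.dataset.dataset_dir in this config.
config["storage"]["dataset"]["dataset_dir"] = NEW_DATASET_DIR

with open(CONFIG_PATH, "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)

print("dataset_dir set to", NEW_DATASET_DIR)
```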
95 changes: 95 additions & 0 deletions wikipedia_analysis/initial_training.yaml
@@ -0,0 +1,95 @@
model:
  learning_task: LINK_PREDICTION
  encoder:
    train_neighbor_sampling:
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
    eval_neighbor_sampling:
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
    layers:
      - - type: EMBEDDING
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL

      - - type: GNN
          options:
            type: GRAPH_SAGE
            aggregator: MEAN
          input_dim: 32
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL

      - - type: GNN
          options:
            type: GRAPH_SAGE
            aggregator: MEAN
          input_dim: 32
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL

      - - type: GNN
          options:
            type: GRAPH_SAGE
            aggregator: MEAN
          input_dim: 32
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL
  decoder:
    type: DISTMULT
  loss:
    type: SOFTMAX_CE
    options:
      reduction: SUM
  dense_optimizer:
    type: ADAM
    options:
      learning_rate: 0.001
  sparse_optimizer:
    type: ADAGRAD
    options:
      learning_rate: 0.01
storage:
  device_type: cuda
  dataset:
    dataset_dir: /root/all_data/graph_snapshots/initial_snapshot/marius_formatted
  edges:
    type: DEVICE_MEMORY
  embeddings:
    type: DEVICE_MEMORY
  save_model: true
training:
  batch_size: 16
  negative_sampling:
    num_chunks: 10
    negatives_per_positive: 750
    degree_fraction: 0.0
    filtered: false
  num_epochs: 50
  epochs_per_shuffle: 1
evaluation:
  batch_size: 16
  negative_sampling:
    filtered: true
62 changes: 62 additions & 0 deletions wikipedia_analysis/preprocess_runner.py
@@ -0,0 +1,62 @@
import os
import boto3
import subprocess
import numpy as np
import multiprocessing

BUCKET_NAME = "wikidata-update-history"
FILE_NAME = "all_non_preprocess_snapshots.tar.bz2"
USER_DIR = os.path.expanduser("~")
DATA_DIR = os.path.join(USER_DIR, "all_data")
SNAPSHOTS_DIR_NAME = "graph_snapshots"
SNAPSHOTS_DIR = os.path.join(DATA_DIR, SNAPSHOTS_DIR_NAME)
OUTPUT_DIR_NAME = "marius_formatted"
COMPRESSED_TAR_NAME = "all_preprocessed_snapshots.tar.bz2"
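
# Each worker shells out to marius_preprocess for every snapshot directory it is assigned.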
def preprocess_worker(dirs_to_preprocess):
    for dir_path in list(dirs_to_preprocess):
        # Determine the command to run
        dir_path = str(dir_path)
        output_path = os.path.join(dir_path, OUTPUT_DIR_NAME)
        edges_path = os.path.join(dir_path, "graph.csv")

        split_txt = "--dataset_split 0.8 0.1 0.1" if "initial_snapshot" in dir_path else ""
        preprocess_command = f'marius_preprocess --edges {edges_path} --output_directory {output_path} --delim "," --src_column 0 --edge_type_column 1 --dst_column 2 {split_txt} --overwrite'
        print(preprocess_command)
        subprocess.run(preprocess_command, shell = True, capture_output = True)

NUM_WORKERS = max(1, int(0.1 * os.cpu_count()))  # roughly 10% of the cores, but always at least one worker

def main():
    '''
    # Determine all the dirs to preprocess
    all_snapshots_dir = []
    for dir_name in os.listdir(SNAPSHOTS_DIR):
        if dir_name[0] == '.':
            continue
        dir_path = os.path.join(SNAPSHOTS_DIR, dir_name)
        all_snapshots_dir.append(dir_path)
    # Determine the dirs for each worker
    dirs_per_worker = np.array_split(np.array(all_snapshots_dir), NUM_WORKERS)
    all_workers = []
    for curr_worker_dirs in dirs_per_worker:
        curr_worker = multiprocessing.Process(target = preprocess_worker, args = (curr_worker_dirs, ))
        curr_worker.start()
        all_workers.append(curr_worker)
    [worker.join() for worker in all_workers]
    '''

    # Create the compressed zip
    os.chdir(DATA_DIR)
    compress_command = f'tar -cvjf {COMPRESSED_TAR_NAME} {SNAPSHOTS_DIR_NAME}'
    print("Running command", compress_command, "in dir", DATA_DIR)
    subprocess.run(compress_command, shell = True, capture_output = True)

    # Then upload the data to S3
    print("Uploading file", COMPRESSED_TAR_NAME, "to S3")
    s3_client = boto3.client('s3')
    s3_client.upload_file(COMPRESSED_TAR_NAME, BUCKET_NAME, COMPRESSED_TAR_NAME)

if __name__ == "__main__":
    main()
