Add in initial wikipedia code
sarda-devesh committed Aug 21, 2024
1 parent 2f27ffe commit 59c9cbc
Showing 5 changed files with 310 additions and 10 deletions.
18 changes: 11 additions & 7 deletions examples/docker/gpu_ubuntu/dockerfile
@@ -1,4 +1,7 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV TZ=Asia/Dubai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt update

RUN apt install -y g++ \
@@ -10,22 +13,23 @@ RUN apt install -y g++ \
    dstat \
    python3-pip

# install gcc-9
# install gcc-11
RUN apt install -y software-properties-common
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
RUN apt update
RUN apt install -y gcc-9 g++-9
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
RUN apt install -y gcc-11 g++-11
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11

# install cmake 3.20
RUN wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.sh
# install cmake 3.28
RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.3/cmake-3.28.3-linux-x86_64.sh
RUN mkdir /opt/cmake
RUN sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/
RUN sh cmake-3.28.3-linux-x86_64.sh --skip-license --prefix=/opt/cmake/
RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake

# install pytorch
RUN python3 -m pip install torch==2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html
RUN git config --global --add safe.directory "*"

RUN mkdir /working_dir
WORKDIR /working_dir
6 changes: 3 additions & 3 deletions setup.py
@@ -16,7 +16,7 @@ def __init__(self, name, sourcedir=""):
class CMakeBuild(build_ext):
    def run(self):
        try:
            _ = subprocess.check_output(["cmake", "--version"])
            _ = subprocess.run(["cmake", "--version"])
        except OSError:
            raise RuntimeError(
                "CMake must be installed to build the following extensions: "
@@ -69,8 +69,8 @@ def build_extension(self, ext):

        print(cmake_args)

        subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
        subprocess.check_call(["cmake", "--build", ".", "--target", "bindings"] + build_args, cwd=self.build_temp)
        subprocess.run(["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
        subprocess.run(["cmake", "--build", ".", "--target", "bindings"] + build_args, cwd=self.build_temp)
        print()  # Add an empty line for cleaner output


139 changes: 139 additions & 0 deletions wikipedia_analysis/README.md
@@ -0,0 +1,139 @@
# Wikipedia Analysis

This README contains the steps to perform the benchmarking on the Wikipedia datasets. Before running anything, update the system packages with:
```
$ sudo apt update -y && sudo apt upgrade -y
```

## Mounting the data directory

First, run `lsblk`, which produces output like this:
```
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS
sda 8:0 0 447.1G 0 disk
├─sda1 8:1 0 256M 0 part /boot/efi
├─sda2 8:2 0 1M 0 part
├─sda3 8:3 0 64G 0 part /
└─sda99 259:2 0 8G 0 part [SWAP]
sdb 8:16 0 447.1G 0 disk
sdc 8:32 0 745.2G 0 disk
sdd 8:48 0 745.2G 0 disk
sde 8:64 0 745.2G 0 disk
sdf 8:80 0 745.2G 0 disk
sdg 8:96 0 745.2G 0 disk
sdh 8:112 0 745.2G 0 disk
sdi 8:128 0 745.2G 0 disk
sdj 8:144 0 745.2G 0 disk
nvme0n1 259:1 0 1.5T 0 disk
└─vg1-lv1 253:0 0 1.5T 0 lvm
```
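
If you prefer to locate the logical volume programmatically rather than reading the table by hand, a minimal Python sketch along these lines (assuming a recent `util-linux` whose `lsblk` supports JSON output via `-J`) walks the device tree and lists every LVM volume:
```
import json
import subprocess

def find_lvm_devices():
    """Return the names of all block devices that lsblk reports as type 'lvm'."""
    out = subprocess.run(["lsblk", "-J"], capture_output=True, text=True, check=True)
    tree = json.loads(out.stdout)

    found = []

    def walk(devices):
        for dev in devices:
            if dev.get("type") == "lvm":
                found.append(dev["name"])
            walk(dev.get("children", []))

    walk(tree.get("blockdevices", []))
    return found

if __name__ == "__main__":
    # On the machine above this should print something like ['vg1-lv1'].
    print(find_lvm_devices())
```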

Then run the command:
```
$ mkdir -p all_data
```

Then update the `/etc/fstab` file to include the following line:
```
/dev/vg1/lv1 /users/sardev/all_data xfs defaults 0 0
```
Note that your path to the `all_data` directory might be different.

Then mount the directory using the commands:
```
$ sudo mount -a
$ sudo chmod ugo+rw -R all_data
```

Verify by running `df -h` inside `all_data` and ensuring that the new mount shows up in the output:
```
```
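
As an optional scripted check, a small Python sketch like the one below confirms that `all_data` is a separate mount and is writable; the mount-point path and the rough capacity are assumptions taken from the `/etc/fstab` line and `lsblk` output above:
```
import os
import shutil

# Adjust this if your all_data directory lives somewhere else.
mount_point = os.path.expanduser("~/all_data")

# The directory should be its own mount, not part of the root filesystem.
assert os.path.ismount(mount_point), f"{mount_point} is not a mount point"

# Rough capacity check: the LVM volume above is ~1.5T.
total_gib = shutil.disk_usage(mount_point).total / 2**30
print(f"all_data capacity: {total_gib:.0f} GiB")

# Confirm the chmod step made the mount writable.
probe = os.path.join(mount_point, ".write_test")
open(probe, "w").close()
os.remove(probe)
print("all_data is mounted and writable")
```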

## Setting up docker

First, install the NVIDIA driver using the command:
```
$ sudo apt install -y nvidia-driver-550
```

Then install docker using the commands:
```
$ # Add Docker's official GPG key:
sudo apt-get update
sudo apt-get install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
# Add the repository to Apt sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
$ sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
$ sudo docker run hello-world
$ sudo groupadd docker
$ sudo usermod -aG docker $USER
$ newgrp docker
```
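
As a quick sanity check that the group change took effect, a minimal Python sketch along these lines asks the Docker daemon for its info without `sudo`:
```
import subprocess

def docker_usable_without_sudo() -> bool:
    """Return True if the current user can reach the Docker daemon directly."""
    result = subprocess.run(["docker", "info"], capture_output=True, text=True)
    return result.returncode == 0

if __name__ == "__main__":
    if docker_usable_without_sudo():
        print("Docker is installed and reachable without sudo")
    else:
        print("Docker daemon not reachable; re-check the docker group / newgrp step")
```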

Then install the NVIDIA Container Toolkit, which lets containers access the GPU:
```
$ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
$ sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
$ sudo apt-get update
$ sudo apt-get install -y nvidia-container-toolkit
```

Finally, run `sudo reboot`. After the reboot, verify the installation by running `nvidia-smi`.
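
To also confirm that containers (not just the host) can see the GPU, a hedged sketch like the following runs `nvidia-smi` on the host and inside a throwaway container; the CUDA base image tag is only an example and can be swapped for any image compatible with the installed driver:
```
import subprocess

# Host-side check: the driver install should make nvidia-smi available.
subprocess.run(["nvidia-smi"], check=True)

# Container-side check: with the container toolkit installed, --gpus all
# should expose the GPU inside a container.
subprocess.run(
    ["docker", "run", "--rm", "--gpus", "all",
     "nvidia/cuda:11.8.0-base-ubuntu22.04", "nvidia-smi"],
    check=True,
)
```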

## Installing marius

Then install marius using the commands:
```
$ export CURRENT_DIR=`pwd`
$ git clone https://github.com/marius-team/marius.git
$ cd marius
$ git checkout -b dsarda/wikipedia
$ cd examples/docker/
$ docker build -t marius:latest gpu_ubuntu/.
$ docker kill marius
$ docker rm marius
$ docker run --gpus all -d -v $CURRENT_DIR:/root/ --name=marius marius:latest sleep infinity
$ docker exec -it marius bash
$ cd marius
$ python3 -m pip install "numpy<2" pybind11
$ pip3 install . --no-build-isolation
```
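
To sanity-check the install from inside the container, a minimal sketch along these lines can be run with `python3`; it assumes that the pip install places `marius_preprocess` on the `PATH` (the preprocessing step below relies on this) and that the Python bindings are importable as `marius`:
```
import importlib.util
import shutil
import subprocess

import torch

# The GPU build is only useful if PyTorch can actually see the device.
print("CUDA available:", torch.cuda.is_available())

# The preprocessing CLI should be on the PATH after `pip3 install .`.
assert shutil.which("marius_preprocess") is not None, "marius_preprocess not found on PATH"
subprocess.run(["marius_preprocess", "--help"], check=True)

# The Python bindings are assumed to be importable as `marius`.
print("marius bindings found:", importlib.util.find_spec("marius") is not None)
```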

## Preprocessing the graph snapshots

First, set up the AWS tooling using:
```
$ apt install -y python3-pip awscli
$ python3 -m pip install boto3
```

Then configure the AWS credentials using `aws configure`, and run the preprocessing using the commands:
```
$ cd wikipedia_analysis
$ python3 -u preprocess_runner.py &> preprocess.log
```
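
The preprocess runner expects the raw snapshots to already sit under `~/all_data/graph_snapshots`. If they still need to be fetched, a hedged sketch like the one below downloads and unpacks the un-preprocessed tarball; the bucket and object names mirror the constants in `preprocess_runner.py`:
```
import os
import subprocess

import boto3

# Bucket and object names taken from preprocess_runner.py.
BUCKET_NAME = "wikidata-update-history"
FILE_NAME = "all_non_preprocess_snapshots.tar.bz2"
DATA_DIR = os.path.join(os.path.expanduser("~"), "all_data")

# Download the raw snapshots into the data directory ...
local_path = os.path.join(DATA_DIR, FILE_NAME)
boto3.client("s3").download_file(BUCKET_NAME, FILE_NAME, local_path)

# ... and unpack them so that graph_snapshots/<snapshot>/graph.csv exists.
subprocess.run(["tar", "-xvjf", local_path, "-C", DATA_DIR], check=True)
```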

## Training the initial snapshot

Then train the initial snapshot using the config file `initial_training.yaml`. Note that you might need to update the dataset paths in that file. To run the training, first build the trainer from inside the `marius` directory:
```
$ mkdir -p build && cd build
$ cmake ../ -DUSE_CUDA=TRUE -DUSE_OMP=TRUE
```

and then:
```
$ rm -rf /root/all_data/graph_snapshots/initial_snapshot/marius_formatted/model_* && make marius_train -j && ./marius_train ../wikipedia_analysis/initial_training.yaml
```
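
Since the dataset location in `initial_training.yaml` may need to point at your own `marius_formatted` directory, a small sketch like the following rewrites `storage.dataset.dataset_dir` in place; it assumes PyYAML is available (`python3 -m pip install pyyaml` otherwise) and that the key path matches the config in this commit:
```
import yaml  # PyYAML

CONFIG_PATH = "wikipedia_analysis/initial_training.yaml"
NEW_DATASET_DIR = "/root/all_data/graph_snapshots/initial_snapshot/marius_formatted"

with open(CONFIG_PATH) as f:
    config = yaml.safe_load(f)

# The dataset location lives under storage.dataset.dataset_dir in this config.
config["storage"]["dataset"]["dataset_dir"] = NEW_DATASET_DIR

with open(CONFIG_PATH, "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)

print("dataset_dir set to", NEW_DATASET_DIR)
```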
95 changes: 95 additions & 0 deletions wikipedia_analysis/initial_training.yaml
@@ -0,0 +1,95 @@
model:
  learning_task: LINK_PREDICTION
  encoder:
    train_neighbor_sampling:
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
    eval_neighbor_sampling:
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
      - type: UNIFORM
        options:
          max_neighbors: 32
    layers:
      - - type: EMBEDDING
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL

      - - type: GNN
          options:
            type: GRAPH_SAGE
            aggregator: MEAN
          input_dim: 32
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL

      - - type: GNN
          options:
            type: GRAPH_SAGE
            aggregator: MEAN
          input_dim: 32
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL

      - - type: GNN
          options:
            type: GRAPH_SAGE
            aggregator: MEAN
          input_dim: 32
          output_dim: 32
          bias: true
          init:
            type: GLOROT_NORMAL
  decoder:
    type: DISTMULT
  loss:
    type: SOFTMAX_CE
    options:
      reduction: SUM
  dense_optimizer:
    type: ADAM
    options:
      learning_rate: 0.001
  sparse_optimizer:
    type: ADAGRAD
    options:
      learning_rate: 0.01
storage:
  device_type: cuda
  dataset:
    dataset_dir: /root/all_data/graph_snapshots/initial_snapshot/marius_formatted
  edges:
    type: DEVICE_MEMORY
  embeddings:
    type: DEVICE_MEMORY
  save_model: true
training:
  batch_size: 16
  negative_sampling:
    num_chunks: 10
    negatives_per_positive: 750
    degree_fraction: 0.0
    filtered: false
  num_epochs: 50
  epochs_per_shuffle: 1
evaluation:
  batch_size: 16
  negative_sampling:
    filtered: true
62 changes: 62 additions & 0 deletions wikipedia_analysis/preprocess_runner.py
@@ -0,0 +1,62 @@
import os
import boto3
import subprocess
import numpy as np
import multiprocessing

BUCKET_NAME = "wikidata-update-history"
FILE_NAME = "all_non_preprocess_snapshots.tar.bz2"
USER_DIR = os.path.expanduser("~")
DATA_DIR = os.path.join(USER_DIR, "all_data")
SNAPSHOTS_DIR_NAME = "graph_snapshots"
SNAPSHOTS_DIR = os.path.join(DATA_DIR, SNAPSHOTS_DIR_NAME)
OUTPUT_DIR_NAME = "marius_formatted"
COMPRESSED_TAR_NAME = "all_preprocessed_snapshots.tar.bz2"
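
# Each worker shells out to marius_preprocess for every snapshot directory it is assigned.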
def preprocess_worker(dirs_to_preprocess):
    for dir_path in list(dirs_to_preprocess):
        # Determine the command to run
        dir_path = str(dir_path)
        output_path = os.path.join(dir_path, OUTPUT_DIR_NAME)
        edges_path = os.path.join(dir_path, "graph.csv")

        split_txt = "--dataset_split 0.8 0.1 0.1" if "initial_snapshot" in dir_path else ""
        preprocess_command = f'marius_preprocess --edges {edges_path} --output_directory {output_path} --delim "," --src_column 0 --edge_type_column 1 --dst_column 2 {split_txt} --overwrite'
        print(preprocess_command)
        subprocess.run(preprocess_command, shell = True, capture_output = True)

NUM_WORKERS = max(1, int(0.1 * os.cpu_count()))  # roughly 10% of the cores, but always at least one worker

def main():
    '''
    # Determine all the dirs to preprocess
    all_snapshots_dir = []
    for dir_name in os.listdir(SNAPSHOTS_DIR):
        if dir_name[0] == '.':
            continue
        dir_path = os.path.join(SNAPSHOTS_DIR, dir_name)
        all_snapshots_dir.append(dir_path)
    # Determine the dirs for each worker
    dirs_per_worker = np.array_split(np.array(all_snapshots_dir), NUM_WORKERS)
    all_workers = []
    for curr_worker_dirs in dirs_per_worker:
        curr_worker = multiprocessing.Process(target = preprocess_worker, args = (curr_worker_dirs, ))
        curr_worker.start()
        all_workers.append(curr_worker)
    [worker.join() for worker in all_workers]
    '''

    # Create the compressed zip
    os.chdir(DATA_DIR)
    compress_command = f'tar -cvjf {COMPRESSED_TAR_NAME} {SNAPSHOTS_DIR_NAME}'
    print("Running command", compress_command, "in dir", DATA_DIR)
    subprocess.run(compress_command, shell = True, capture_output = True)

    # Then upload the data to S3
    print("Uploading file", COMPRESSED_TAR_NAME, "to S3")
    s3_client = boto3.client('s3')
    s3_client.upload_file(COMPRESSED_TAR_NAME, BUCKET_NAME, COMPRESSED_TAR_NAME)

if __name__ == "__main__":
    main()
