Skip to content
This repository has been archived by the owner on Jan 16, 2022. It is now read-only.

Commit

Permalink
tpu
Browse files Browse the repository at this point in the history
  • Loading branch information
mgrankin committed Oct 16, 2019
1 parent 126a439 commit 55ac6a9
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 29 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ venv.bak/
config.json

data/
upload/
tmp/
bpe.ipynb
sentencepiece/
Expand All @@ -120,5 +121,4 @@ gpt2/
Untitled.ipynb
tpu/.gcp_credentials.json
.terraform
tpu/terraform.tfstate
tpu/terraform.tfstate.backup
terraform.tfstate
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,23 @@ export TRAIN_FILE=./data/full

export CUDA_VISIBLE_DEVICES=1
export MODEL_SIZE=gpt2
export OUTPUT=output_s
export OUTPUT=output/s
export BS=8
export LR=5e-5

# GPT-2 355M, final perplexity 18.99?

export CUDA_VISIBLE_DEVICES=2
export MODEL_SIZE=gpt2-medium
export OUTPUT=output_m
export OUTPUT=output/m
export BS=3
export LR=3e-5

# GPT-2 774M, final perplexity 21.09?

export CUDA_VISIBLE_DEVICES=3
export MODEL_SIZE=gpt2-large
export OUTPUT=output_l
export OUTPUT=output/l
export BS=1
export LR=1e-5

Expand Down
50 changes: 50 additions & 0 deletions corpus/corpus.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,56 @@
" pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare cached dataset for upload (for GCloud)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"files = get_files('upload', '.txt', True); len(files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"files = [item for item in files if '/full' in str(item) and '/cached' not in str(item)]; len(files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for item in files:\n",
" with open(item, 'w'):\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies:
- uvicorn
- terraform
- google-cloud-sdk
- tar
- zstd
- pip:
- transformers
- sentencepiece
Expand Down
57 changes: 35 additions & 22 deletions tpu/main.tf → tpu/00_prepare/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,34 +8,47 @@ resource "google_compute_address" "ip_address" {
name = "my-address"
}

// Standalone SSD persistent disk named "data-disk" in us-west1-a.
// It is referenced by the train instance's attached_disk block via
// google_compute_disk.data-disk.name.
resource "google_compute_disk" "data-disk" {
name = "data-disk"
type = "pd-ssd"
size = 200
zone = "us-west1-a"
}

// A single Google Cloud Engine instance
resource "google_compute_instance" "default" {
name = "train-instance"
machine_type = "n1-standard-1"
zone = "us-west1-a"

boot_disk {
initialize_params {
image = "ubuntu-1804-lts"
size = 20
}
}

metadata_startup_script = "sudo apt update; sudo apt upgrade -y"

tags = ["train"]

metadata = {
name = "train-instance"
machine_type = "n1-standard-1"
zone = "us-west1-a"

boot_disk {
device_name = "basic-disk"
initialize_params {
image = "ubuntu-1804-lts"
type = "pd-ssd"
size = 200
}
}

attached_disk {
source = "${google_compute_disk.data-disk.name}"
}

metadata_startup_script = "sudo apt update; sudo apt upgrade -y"

tags = ["train"]

metadata = {
ssh-keys = "ubuntu:${file("~/.ssh/id_rsa.pub")}"
}

network_interface {
network = "open-network"
network_interface {
network = "open-network"

access_config {
nat_ip = "${google_compute_address.ip_address.address}"
}
}
access_config {
nat_ip = "${google_compute_address.ip_address.address}"
}
}
}

resource "google_compute_firewall" "default" {
Expand Down
20 changes: 19 additions & 1 deletion tpu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ gcloud iam service-accounts get-iam-policy \
### Create instance with Terraform

```
cd 00_prepare/
terraform init
terraform plan
terraform apply
Expand All @@ -40,6 +41,23 @@ terraform apply
```
IP=35.185.201.94 # your node IP
scp train_setup.sh ubuntu@$IP:
ssh ubuntu@$IP bash ./train_setup.sh
# dataset packed with 'tar -caf data.zst.tar data/'
rsync -vP data.zst.tar ubuntu@$IP:
# go there
ssh ubuntu@$IP
sudo mkfs.ext4 -m 0 -F -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb
bash ./train_setup.sh
sudo -s
crontab -l | { cat; echo "@reboot mount /dev/sdb /home/ubuntu/ru_transformers/output"; } | crontab -
exit
```

### Create an image for a preemptible instance

```
gcloud compute images create train-image --source-disk train-instance --source-disk-zone us-west1-a --force
#gcloud compute images delete train-image
```
5 changes: 4 additions & 1 deletion tpu/train_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ source .bashrc
# Fetch the project and build its conda environment.
git clone https://github.com/mgrankin/ru_transformers
cd ru_transformers
conda env create -f environment.yml
# We are already inside the checkout, so the output directory must be created
# relative to it. The previous 'mkdir ru_transformers/output' pointed at a
# non-existent nested ru_transformers/ru_transformers/ path and failed.
# -p keeps the step harmless if the directory already exists.
mkdir -p output

# Back to the home directory, where the setup script and dataset archive live.
cd
rm train_setup.sh
# Unpack the dataset archive, drop the archive, and move the data into the checkout.
tar -xaf data.zst.tar
rm data.zst.tar
mv data ru_transformers/

0 comments on commit 55ac6a9

Please sign in to comment.