Skip to content
This repository has been archived by the owner on Jan 16, 2022. It is now read-only.

Commit

Permalink
tpu
Browse files Browse the repository at this point in the history
  • Loading branch information
mgrankin committed Oct 17, 2019
1 parent 0022e17 commit 2e141e9
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ gpt2/
Untitled.ipynb
tpu/.gcp_credentials.json
.terraform
terraform.tfstate
terraform.tfstate*
9 changes: 9 additions & 0 deletions run_lm_finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,15 @@ def main():
parser.add_argument('--seed', type=int, default=42,
help="random seed for initialization")

parser.add_argument('--tpu', action='store_true',
help="Whether to run on the TPU defined in the environment variables")
parser.add_argument('--tpu_ip_address', type=str, default='',
help="TPU IP address if none are set in the environment variables")
parser.add_argument('--tpu_name', type=str, default='',
help="TPU name if none are set in the environment variables")
parser.add_argument('--xrt_tpu_config', type=str, default='',
help="XRT TPU config if none are set in the environment variables")

parser.add_argument('--fp16', action='store_true',
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
parser.add_argument('--fp16_opt_level', type=str, default='O1',
Expand Down
7 changes: 3 additions & 4 deletions tpu/00_prepare/main.tf
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
provider "google" {
credentials = "${file(".gcp_credentials.json")}"
credentials = "${file("../.gcp_credentials.json")}"
project = "gpt2train"
region = "us-west1"
region = "us-central1"
zone = "us-central1-b"
}

resource "google_compute_address" "ip_address" {
Expand All @@ -12,14 +13,12 @@ resource "google_compute_disk" "data-disk" {
name = "data-disk"
type = "pd-ssd"
size = 200
zone = "us-west1-a"
}

// A single Google Cloud Engine instance
resource "google_compute_instance" "default" {
name = "train-instance"
machine_type = "n1-standard-1"
zone = "us-west1-a"

boot_disk {
device_name = "basic-disk"
Expand Down
99 changes: 99 additions & 0 deletions tpu/01_train/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Google Cloud provider settings used by the resources in this file.
provider "google" {
// The service-account key is read through a relative path that points at the
// parent directory.
credentials = "${file("../.gcp_credentials.json")}"
project = "gpt2train"
// NOTE(review): the TPU node in this file is pinned to zone us-central1-b
// while the default region here is us-west1 — confirm this is intentional.
region = "us-west1"
}

// Reserved address: used as the training instance's nat_ip and exported
// through the "instance_ips" output.
resource "google_compute_address" "ip_address" {
name = "my-address"
}

// Separate pd-ssd data disk that the training instance attaches through its
// attached_disk block.
resource "google_compute_disk" "data-disk" {
name = "data-disk"
type = "pd-ssd"
size = 200
// Same zone as the instance that attaches this disk.
zone = "us-west1-a"
}

// A single Google Compute Engine instance used for training.
resource "google_compute_instance" "default" {
  name         = "train-instance"
  machine_type = "n1-standard-32"
  zone         = "us-west1-a"

  // Boot disk is initialised from the "train-image" image.
  boot_disk {
    device_name = "basic-disk"
    initialize_params {
      image = "train-image"
      type  = "pd-ssd"
      size  = 200
    }
  }

  // Attach the separately managed data disk.
  attached_disk {
    source = "${google_compute_disk.data-disk.name}"
  }

  metadata_startup_script = "sudo apt update; sudo apt upgrade -y"

  // Network tag matched by the firewall rule's target_tags.
  tags = ["train"]

  // Installs the local public key for the "ubuntu" user.
  metadata = {
    ssh-keys = "ubuntu:${file("~/.ssh/id_rsa.pub")}"
  }

  network_interface {
    // Reference the network resource rather than repeating its name as a
    // literal: the value is the same, but the reference gives Terraform an
    // implicit dependency so the network is created before this instance.
    network = "${google_compute_network.default.name}"

    // Bind the reserved address as the external IP.
    access_config {
      nat_ip = "${google_compute_address.ip_address.address}"
    }
  }
}

// Data source whose `versions` list supplies tensorflow_version for the TPU
// node (the first entry is used).
data "google_tpu_tensorflow_versions" "available" { }

// Preemptible v2-8 TPU node.
resource "google_tpu_node" "tpu" {
name = "test-tpu"
// NOTE(review): the training instance in this file is in us-west1-a — confirm
// that the TPU is meant to live in a different zone.
zone = "us-central1-b"

accelerator_type = "v2-8"

cidr_block = "10.3.0.0/29"
// Uses the first entry returned by the tensorflow versions data source.
tensorflow_version = "${data.google_tpu_tensorflow_versions.available.versions[0]}"

description = "Terraform Google Provider test TPU"
// NOTE(review): the training instance is attached to "open-network", not
// "default" — verify the instance can actually reach this TPU.
network = "default"

labels = {
foo = "bar"
}

scheduling_config {
preemptible = true
}
}

// Firewall rule for instances tagged "train": allows ICMP plus TCP on ports
// 22, 80, 8080 and the range 1000-2000.
resource "google_compute_firewall" "default" {
  name = "test-firewall"

  // Reference the network resource rather than repeating its name as a
  // literal: the value is the same, but the reference gives Terraform an
  // implicit dependency so the network is created before this rule.
  network = "${google_compute_network.default.name}"

  allow {
    protocol = "icmp"
  }

  allow {
    protocol = "tcp"
    ports    = ["22", "80", "8080", "1000-2000"]
  }

  // NOTE(review): no source_ranges/source_tags are set — confirm the intended
  // exposure of these ports.
  target_tags = ["train"]
}

// The VPC network that the training instance and the firewall rule attach to.
resource "google_compute_network" "default" {
name = "open-network"
}

// Exposes the reserved address (wrapped in a one-element list) after apply.
output "instance_ips" {
value = ["${google_compute_address.ip_address.address}"]
}
19 changes: 19 additions & 0 deletions tpu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ sudo -s
crontab -l | { cat; echo "@reboot mount /dev/sdb /home/ubuntu/ru_transformers/output"; } | crontab -
exit
# docker
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu bionic stable"
sudo apt update
sudo apt install docker-ce -y
sudo groupadd docker
sudo gpasswd -a $USER docker
sudo reboot
docker pull gcr.io/tpu-pytorch/xla:r0.5
```

### Create an image for a preemptible instance
Expand All @@ -61,3 +70,13 @@ exit
gcloud compute images create train-image --source-disk train-instance --source-disk-zone us-west1-a --force
#gcloud compute images delete train-image
```

### Replace instance with Terraform

```
cp 00_prepare/terraform.tfstate 01_train/
cd 01_train/
terraform plan
terraform apply
```
1 change: 1 addition & 0 deletions tpu/train_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ cd
tar -xaf data.zst.tar
rm data.zst.tar
mv data ru_transformers/

0 comments on commit 2e141e9

Please sign in to comment.