diff --git a/.gitignore b/.gitignore
index 1d4678f..aaf468c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -118,3 +118,7 @@ dataset/
 corpus/Untitled.ipynb
 gpt2/
 Untitled.ipynb
+tpu/.gcp_credentials.json
+.terraform
+tpu/terraform.tfstate
+tpu/terraform.tfstate.backup
diff --git a/environment.yml b/environment.yml
index 6babc69..cb1af7b 100644
--- a/environment.yml
+++ b/environment.yml
@@ -19,6 +19,7 @@ dependencies:
   - tensorflow>=1.13
   - uvicorn
   - terraform
+  - google-cloud-sdk
   - pip:
     - transformers
     - sentencepiece
diff --git a/tpu/README.md b/tpu/README.md
new file mode 100644
index 0000000..59d3783
--- /dev/null
+++ b/tpu/README.md
@@ -0,0 +1,45 @@
+# WIP
+### Initialize credentials
+```gcloud init```
+### Create a project dedicated to training a NN
+```
+TF_VAR_billing_account=
+gcloud projects create gpt2train
+gcloud config set project gpt2train
+```
+
+Go to the web interface and link a billing account to your project. I don't have a script for that.
+
+### Attach gcloud to Terraform
+```
+gcloud iam service-accounts create terraform
+gcloud iam service-accounts keys create ./.gcp_credentials.json \
+  --iam-account terraform@gpt2train.iam.gserviceaccount.com
+gcloud config set project gpt2train
+gcloud services enable cloudbilling.googleapis.com
+gcloud services enable compute.googleapis.com
+
+gcloud projects add-iam-policy-binding gpt2train \
+  --member serviceAccount:terraform@gpt2train.iam.gserviceaccount.com \
+  --role roles/editor
+
+gcloud iam service-accounts get-iam-policy \
+  terraform@gpt2train.iam.gserviceaccount.com
+
+```
+### Create instance with Terraform
+
+```
+terraform init
+terraform plan
+terraform apply
+```
+
+### Set up an instance
+
+```
+IP=35.185.201.94 # your node IP
+scp train_setup.sh ubuntu@$IP:
+ssh ubuntu@$IP bash ./train_setup.sh
+```
+
diff --git a/tpu/main.tf b/tpu/main.tf
new file mode 100644
index 0000000..8c0b76d
--- /dev/null
+++ b/tpu/main.tf
@@ -0,0 +1,70 @@
+// Google provider, authenticated with the service-account key that README.md creates.
+provider "google" {
+  credentials = "${file(".gcp_credentials.json")}"
+  project     = "gpt2train"
+  region      = "us-west1"
+}
+
+// Reserved external IP address; attached to the instance below as its NAT IP.
+resource "google_compute_address" "ip_address" {
+  name = "my-address"
+}
+
+// A single Google Cloud Engine instance
+resource "google_compute_instance" "default" {
+  name         = "train-instance"
+  machine_type = "n1-standard-1"
+  zone         = "us-west1-a"
+
+  boot_disk {
+    initialize_params {
+      image = "ubuntu-1804-lts"
+      size  = 20
+    }
+  }
+
+  metadata_startup_script = "sudo apt update; sudo apt upgrade -y"
+
+  tags = ["train"]
+
+  metadata = {
+    ssh-keys = "ubuntu:${file("~/.ssh/id_rsa.pub")}"
+  }
+
+  network_interface {
+    // Reference the network resource instead of repeating its literal name so
+    // that Terraform creates the network before this instance.
+    network = "${google_compute_network.default.name}"
+
+    access_config {
+      nat_ip = "${google_compute_address.ip_address.address}"
+    }
+  }
+}
+
+// Allows ICMP and the listed TCP ports to instances tagged "train".
+resource "google_compute_firewall" "default" {
+  name = "test-firewall"
+
+  // Resource reference (not a literal) so the firewall depends on the network.
+  network = "${google_compute_network.default.name}"
+
+  allow {
+    protocol = "icmp"
+  }
+
+  allow {
+    protocol = "tcp"
+    ports    = ["22", "80", "8080", "1000-2000"]
+  }
+
+  target_tags = ["train"]
+}
+
+resource "google_compute_network" "default" {
+  name = "open-network"
+}
+
+output "instance_ips" {
+  value = ["${google_compute_address.ip_address.address}"]
+}
diff --git a/tpu/train_setup.sh b/tpu/train_setup.sh
new file mode 100644
index 0000000..9eee18d
--- /dev/null
+++ b/tpu/train_setup.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Provision a fresh training node: install Anaconda, clone ru_transformers and
+# create its conda environment. Expected to be run from the home directory.
+
+mkdir -p .install .conda
+cd .install || exit 1
+
+# Scrape the 64-bit Linux installer URL from the Anaconda download page and
+# stop early if nothing matched (e.g. the page layout changed).
+installer_url=$(wget -O - https://www.anaconda.com/distribution/ 2>/dev/null | sed -ne 's@.*\(https:\/\/repo\.anaconda\.com\/archive\/Anaconda3-.*-Linux-x86_64\.sh\)\">64-Bit (x86) Installer.*@\1@p' | head -n 1)
+if [ -z "$installer_url" ]; then
+  echo "train_setup.sh: could not find the Anaconda installer URL" >&2
+  exit 1
+fi
+wget "$installer_url" -O anaconda.sh || exit 1
+chmod +x anaconda.sh
+./anaconda.sh -b -p "$HOME/.anaconda" || exit 1
+cd
+# Prepend the conda shell hook to .bashrc, then load it into this shell.
+sed -i '1ieval "$($HOME/.anaconda/bin/conda shell.bash hook)"' .bashrc
+source .bashrc
+
+git clone https://github.com/mgrankin/ru_transformers || exit 1
+cd ru_transformers || exit 1
+conda env create -f environment.yml || exit 1
+
+# Only reached when the steps above succeeded; remove the uploaded script.
+cd
+rm train_setup.sh