Oneflow-Inc · ShawnXuan · Jun 6, 2024 · Jun 6, 2024 · Jun 6, 2024 · Jul 31, 2024
diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md
@@ -0,0 +1,83 @@
+# 使用 Ansible 将 SSH 公钥分发到多个目标主机
+
+## 1. 创建变量文件并加密
+
+创建一个包含密码的变量文件 `vars.yml`：
+
+```yaml
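+---
+# vars_files 加载的是普通变量文件，变量必须写在顶层；
+# 若写成 inventory 结构（all: hosts: ...），只会定义一个
+# 名为 all 的变量，密码不会被用于连接目标主机。
+# 登录用户名在 inventory.ini 中通过 ansible_user 设置。
+# 如各主机密码不同，可改用 host_vars/<主机名>.yml 分别设置。
+# 存入真实密码前，请先用 ansible-vault 加密本文件。
+ansible_password: mypassword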
+```
+
+然后使用Ansible Vault加密这个文件：
+
+```bash
+ansible-vault encrypt vars.yml
+```
+
+注意：
+
+1. 执行 `ansible-vault` 的过程中需要设定一个密码，请记住或保存好这个密码
+2. `vars.yml`将被替换为加密后的文件
+
+## 2. 创建主机清单文件
+
+创建一个主机清单文件`inventory.ini`：
+
+```ini
+[all]
+node1 ansible_host=192.168.1.27 ansible_user=myuser
+node2 ansible_host=192.168.1.28 ansible_user=myuser
+```
+
+注：需要根据情况修改 `ansible_user` 的值
+
+## 3. 创建Playbook
+
+如果文件存在，这一步可以忽略。
+
+创建一个 Playbook `distribute_ssh_key.yml`：
+
+```yaml
+---
+- name: Distribute SSH key
+  hosts: all
+  vars_files:
+    - vars.yml
+  tasks:
+    - name: Create .ssh directory if it doesn't exist
+      file:
+        path: /home/{{ ansible_user }}/.ssh
+        state: directory
+        mode: '0700'
+        owner: "{{ ansible_user }}"
+        group: "{{ ansible_user }}"
+
+    - name: Copy the SSH key to the authorized_keys file
+      authorized_key:
+        user: "{{ ansible_user }}"
+        state: present
+        key: "{{ lookup('file', '/path/to/id_rsa.pub') }}"
+```
+
+注：`vars_files` 配置为 `vars.yml`
+
+## 4. 运行Playbook
+
+使用以下命令运行 Playbook（`--ask-vault-pass` 会提示输入 Vault 密码，用于解密 `vars.yml`）：
+
+```bash
+ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass
+```
+或者运行
+
+```bash
+./dist_ssh_key.sh
+```
+
diff --git a/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh b/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Distribute the local SSH public key to every host in inventory.ini.
+# Prompts for the Ansible Vault password needed to decrypt vars.yml.
+# Any extra arguments are forwarded to ansible-playbook unchanged.
+
+set -euo pipefail
+
+# Run from the script's own directory so the relative paths below resolve
+# no matter where the script is invoked from.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+exec ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass "$@"
diff --git a/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml
@@ -0,0 +1,30 @@
+---
+# Install the control node's SSH public key into authorized_keys for
+# ansible_user on every inventory host, enabling key-based logins afterwards.
+# vars.yml (intended to be encrypted with ansible-vault) is loaded as play
+# variables.
+- name: Distribute SSH key
+  hosts: all
+  vars_files:
+    - vars.yml
+  vars:
+    # Public key to distribute. Defaults to the invoking user's id_rsa.pub
+    # rather than one specific developer's home directory; override with
+    # `-e ssh_public_key_file=/path/to/key.pub`.
+    ssh_public_key_file: "{{ lookup('env', 'HOME') }}/.ssh/id_rsa.pub"
+  tasks:
+    - name: Create .ssh directory if it doesn't exist
+      file:
+        # `~user` resolves the account's real home directory on the target,
+        # so accounts whose home is not under /home (e.g. root) also work.
+        path: "~{{ ansible_user }}/.ssh"
+        state: directory
+        mode: '0700'
+        owner: "{{ ansible_user }}"
+        group: "{{ ansible_user }}"
+
+    - name: Copy the SSH key to the authorized_keys file
+      authorized_key:
+        user: "{{ ansible_user }}"
+        state: present
+        key: "{{ lookup('file', ssh_public_key_file) }}"
diff --git a/Classification/resnet50/0_dist_ssh_key/inventory.ini b/Classification/resnet50/0_dist_ssh_key/inventory.ini
@@ -0,0 +1,3 @@
+[all]
+of27 ansible_host=192.168.1.27 ansible_user=myuser
+of28 ansible_host=192.168.1.28 ansible_user=myuser
diff --git a/Classification/resnet50/0_dist_ssh_key/vars.yml b/Classification/resnet50/0_dist_ssh_key/vars.yml
@@ -0,0 +1,8 @@
+---
+# Play variables loaded by distribute_ssh_key.yml through `vars_files`.
+# A vars file must define variables at the top level; an inventory-style
+# `all: hosts: ...` tree would only define a single unused variable named
+# `all`, so the password would never reach the connection.
+# The login user is set per host in inventory.ini (ansible_user).
+# Encrypt with `ansible-vault encrypt vars.yml` before storing a real password.
+ansible_password: mypassword
diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md
@@ -0,0 +1,61 @@
+# 拉取或导入镜像
+
+## 拉取镜像
+
+适用于直接从 dockerhub 拉取镜像。
+
+用法: `./pull.sh [镜像标签]`
+
+参数说明:
+
+- 镜像标签 (可选)  : 要拉取的Docker镜像标签，例如 alpine:latest。如果未提供，则使用playbook中的默认值。
+
+示例:
+
+- 默认使用:
+
+```bash
+./pull.sh
+```
+
+- 指定镜像标签:
+
+```bash
+./pull.sh alpine:latest
+```
+
+## 导入镜像
+
+适用于本地共享目录有已经保存镜像的tar文件，使用 `docker load` 导入。
+
+用法: `./load.sh [镜像文件路径] [镜像标签] [强制导入]`
+
+参数说明:
+
+- 镜像文件路径 (可选)  : 要导入的Docker镜像tar文件路径，默认为 `/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar`
+- 镜像标签 (可选)      : 导入后设置的Docker镜像标签，默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`
+- 强制导入 (可选)      : 是否强制导入镜像（true 或 false），默认为 false
+
+示例:
+
+- 默认使用:
+
+```bash
+./load.sh
+```
+
+- 指定镜像文件路径和标签:
+
+```bash
+./load.sh /path/to/shared/abc.tar myrepo/myimage:latest
+```
+
+- 强制导入镜像:
+
+```bash
+./load.sh /path/to/shared/abc.tar myrepo/myimage:latest true
+```
+
+
+
+
diff --git a/Classification/resnet50/1_get_docker_image/load.sh b/Classification/resnet50/1_get_docker_image/load.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Load a Docker image tar file on every inventory host and tag it.
+#
+# Usage: ./load.sh [image_tar_path] [image_tag] [force_load]
+# Each positional argument is optional; an omitted or empty argument falls
+# back to the default given below.
+
+docker_image_path="${1:-/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar}"
+docker_image_tag="${2:-oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8}"
+force_load="${3:-false}"
+
+# Forward the resolved values to the playbook as extra vars.
+playbook_args=(
+    -i ../inventory.ini
+    load_and_tag_docker_image.yml
+    -e "docker_image_path=$docker_image_path"
+    -e "docker_image_tag=$docker_image_tag"
+    -e "force_load=$force_load"
+)
+ansible-playbook "${playbook_args[@]}"
diff --git a/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml b/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml
@@ -0,0 +1,39 @@
+---
+# Load a Docker image from a tar file on every host and tag it.
+# Variables (all overridable with `-e`):
+#   docker_image_path: tar file to load; read on each target host.
+#   docker_image_tag:  tag applied to the loaded image.
+#   force_load:        when true, load even if the tag already exists.
+- name: Load and tag Docker image
+  hosts: all
+  vars:
+    docker_image_path: "/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar"
+    docker_image_tag: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"
+    force_load: false
+
+  tasks:
+    - name: Check if Docker image with the specified tag already exists
+      command: "docker images -q {{ docker_image_tag }}"
+      register: image_id
+      changed_when: false
+      # `-e force_load=false` arrives as the string "false", which is truthy
+      # in Jinja2; `| bool` converts it to a real boolean.
+      when: not (force_load | bool)
+
+    - name: Load Docker image from tar file
+      command: "docker load -i {{ docker_image_path }}"
+      # image_id has no stdout when the check above was skipped, hence default.
+      when: (force_load | bool) or (image_id.stdout | default('')) == ""
+      register: load_output
+
+    - name: Get image ID from load output
+      set_fact:
+        # docker load prints "Loaded image: <repo:tag>" for tagged archives and
+        # "Loaded image ID: sha256:<digest>" for untagged ones; strip the
+        # prefix so either form yields a reference `docker tag` accepts.
+        loaded_image_id: "{{ load_output.stdout_lines[-1] | regex_replace('^Loaded image( ID)?: *', '') }}"
+      when: load_output is not skipped
+
+    - name: Tag the loaded Docker image
+      command: "docker tag {{ loaded_image_id }} {{ docker_image_tag }}"
+      when: load_output is not skipped
diff --git a/Classification/resnet50/1_get_docker_image/pull.sh b/Classification/resnet50/1_get_docker_image/pull.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Pull a Docker image on every inventory host.
+# Usage: ./pull.sh [image_tag]
+# Without an argument the playbook's own default image is used.
+
+extra_vars=()
+if [ -n "$1" ]; then
+  extra_vars=(-e "docker_image=$1")
+fi
+
+ansible-playbook -i ../inventory.ini pull_docker_image.yml "${extra_vars[@]}"
diff --git a/Classification/resnet50/1_get_docker_image/pull_docker_image.yml b/Classification/resnet50/1_get_docker_image/pull_docker_image.yml
@@ -0,0 +1,20 @@
+---
+# Make sure the requested Docker image is available on every host, pulling it
+# only when `docker images -q` reports no local copy.
+- hosts: all
+  name: Pull specified Docker image
+  vars:
+    docker_image: 'oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8'
+
+  tasks:
+    - name: Check if the Docker image is already present
+      command:
+        cmd: 'docker images -q {{ docker_image }}'
+      changed_when: false
+      register: docker_image_id
+
+    - name: Pull Docker image if not present
+      when: docker_image_id.stdout | length == 0
+      docker_image:
+        source: pull
+        name: '{{ docker_image }}'
diff --git a/Classification/resnet50/2_distributed_training/README.md b/Classification/resnet50/2_distributed_training/README.md
@@ -0,0 +1,39 @@
+# run_dist_training.sh 使用说明
+
+`run_dist_training.sh` 是一个 Bash 脚本，用于运行 `ansible-playbook` 命令来启动分布式训练。此脚本支持通过参数指定 Docker 镜像和源目录。
+
+## 用法
+
+```bash
+./run_dist_training.sh [docker_image] [src]
+```
+
+## 参数
+
+- `docker_image` (可选): 要使用的 Docker 镜像名称。默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8`。
+- `src` (可选): 要挂载到 Docker 容器的源目录。默认为 `/share_nfs/k85/models/Vision/classification/image/resnet50`。
+
+## 示例
+
+1. 使用默认值运行：
+
+```bash
+./run_dist_training.sh
+```
+
+2. 指定 Docker 镜像运行：
+
+```bash
+./run_dist_training.sh "my_custom_image:latest"
+```
+
+3. 指定 Docker 镜像和源目录运行：
+
+```bash
+./run_dist_training.sh "my_custom_image:latest" "/my/custom/src"
+```
+
+## 注意
+
+如果不提供参数，脚本将使用默认的 Docker 镜像和源目录。
+
diff --git a/Classification/resnet50/2_distributed_training/dist_training.yml b/Classification/resnet50/2_distributed_training/dist_training.yml
@@ -0,0 +1,51 @@
+---
+- name: Distributed Training Setup  # runs one training launcher container per inventory host
+  hosts: all
+  vars:
+    device_num_per_node: 8  # passed as --nproc_per_node and --num-devices-per-node
+    num_nodes: "{{ groups['all'] | length }}"  # one node per host in the inventory
+    master_addr: "{{ hostvars[groups['all'][0]].ansible_host }}"  # first inventory host acts as master
+    docker_image: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8"  # image the training runs in
+    src: "/share_nfs/k85/models/Vision/classification/image/resnet50"  # host directory mounted at /workspace
+
+  tasks:
+    - name: Set node rank  # rank = position of this host within groups['all']
+      set_fact:
+        node_rank: "{{ groups['all'].index(inventory_hostname) }}"
+
+    - name: distributed training in Docker container  # NOTE(review): presumably all hosts must run this concurrently, so host count should stay within Ansible's forks setting - confirm
+      command: >
+        docker run --rm --gpus all 
+        --runtime=nvidia --privileged
+        --network host --ipc=host 
+        -v {{ src }}:/workspace
+        -w /workspace
+        {{ docker_image }} /bin/bash -c "
+          python3 -m oneflow.distributed.launch \
+            --nproc_per_node {{ device_num_per_node }} \
+            --nnodes {{ num_nodes }} \
+            --node_rank {{ node_rank }} \
+            --master_addr {{ master_addr }} \
+            /workspace/train.py \
+            --synthetic-data \
+            --batches-per-epoch 1000 \
+            --num-devices-per-node {{ device_num_per_node }} \
+            --lr 1.536 \
+            --num-epochs 1 \
+            --train-batch-size 32 \
+            --graph \
+            --use-fp16 \
+            --metric-local False \
+            --metric-train-acc True \
+            --fuse-bn-relu \
+            --fuse-bn-add-relu \
+            --use-gpu-decode \
+            --channel-last \
+            --skip-eval
+        "
+      register: output  # captured so the next task can print the container's stdout
+
+    - name: Display output  # prints the captured stdout for each host
+      debug:
+        var: output.stdout
+
diff --git a/Classification/resnet50/2_distributed_training/run_dist_training.sh b/Classification/resnet50/2_distributed_training/run_dist_training.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Launch distributed training on every inventory host via dist_training.yml.
+# Usage: ./run_dist_training.sh [docker_image] [src]
+# An omitted or empty argument falls back to the default below.
+
+image="${1:-oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8}"
+src_dir="${2:-/share_nfs/k85/models/Vision/classification/image/resnet50}"
+
+# Hand both values to the playbook as extra vars.
+ansible-playbook \
+    -i ../inventory.ini \
+    dist_training.yml \
+    -e "docker_image=${image}" \
+    -e "src=${src_dir}"