diff --git a/language_model/tensorflow/.gitignore b/language_model/tensorflow/.gitignore
new file mode 100644
index 000000000..05e212d40
--- /dev/null
+++ b/language_model/tensorflow/.gitignore
@@ -0,0 +1 @@
+workspace/data/
\ No newline at end of file
diff --git a/language_model/tensorflow/README.md b/language_model/tensorflow/README.md
new file mode 100644
index 000000000..c2b5a7321
--- /dev/null
+++ b/language_model/tensorflow/README.md
@@ -0,0 +1,39 @@
+# Bert benchmark
+
+## MLCube execution
+
+### Project setup
+```bash
+# Create Python environment and install MLCube Docker runner
+virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
+```
+
+### Clone the Training repo and go to the Bert directory
+```bash
+git clone https://github.com/mlcommons/training.git && cd ./training
+git fetch origin pull/503/head:feature/bert_mlcube && git checkout feature/bert_mlcube
+cd ./language_model/tensorflow
+```
+
+### Run the Bert MLCube on a local machine with the Docker runner
+
+```bash
+# Run the Bert tasks: download, extract, preprocess, generate_tfrecords and train
+mlcube run --task download
+mlcube run --task extract
+mlcube run --task preprocess
+mlcube run --task generate_tfrecords
+mlcube run --task train
+```
+
+We are targeting pull-type installation, so the MLCube images should be available on Docker Hub. If they are not, build them locally with:
+
+```bash
+mlcube run ... -Pdocker.build_strategy=auto
+```
+
+Users can also override the workspace directory:
+
+```bash
+mlcube run --task=download --workspace=absolute_path_to_custom_dir
+```
\ No newline at end of file
diff --git a/language_model/tensorflow/bert/Dockerfile b/language_model/tensorflow/bert/Dockerfile
new file mode 100644
index 000000000..97e7a45e5
--- /dev/null
+++ b/language_model/tensorflow/bert/Dockerfile
@@ -0,0 +1,14 @@
+FROM tensorflow/tensorflow:1.15.2-gpu
+
+RUN apt-get update && apt-get install -y --no-install-recommends time \
+    ca-certificates \
+    build-essential \
+    git \
+    bzip2
+
+COPY requirements.txt /requirements.txt
+RUN pip install --no-cache-dir -r /requirements.txt
+
+COPY . /workspace
+WORKDIR /workspace
+ENTRYPOINT ["python", "mlcube.py"]
\ No newline at end of file
diff --git a/language_model/tensorflow/bert/cleanup_scripts/download_and_uncompress.sh b/language_model/tensorflow/bert/cleanup_scripts/download_and_uncompress.sh
index 9d858737b..19ac6afeb 100755
--- a/language_model/tensorflow/bert/cleanup_scripts/download_and_uncompress.sh
+++ b/language_model/tensorflow/bert/cleanup_scripts/download_and_uncompress.sh
@@ -2,9 +2,10 @@
 
 pip install --user gdown
 
-mkdir -p wiki
+data_dir=${DATA_DIR:-./}
+mkdir -p $data_dir/wiki
 
-cd wiki
+cd $data_dir/wiki
 
 # Downloading files from Google Drive location: https://drive.google.com/drive/folders/1oQF4diVHNPCclykwdvQJw8n_VIWwV0PT
 
@@ -23,6 +24,10 @@ gdown https://drive.google.com/uc?id=14_A6gQ0NJ7Pay1X0xFq9rCKUuFJcKLF-
 # enwiki-20200101-pages-articles-multistream.xml.bz2
 gdown https://drive.google.com/uc?id=18K1rrNJ_0lSR9bsLaoP3PkQeSFO-9LE7
 
+echo "Uncompressing enwiki-20200101-pages-articles-multistream.xml.bz2"
+echo "This may take a while..."
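+# NOTE: the compressed dump is roughly 16 GB and expands to about 80 GB, so
+# make sure the download location has enough free disk space (rough figures).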
 bzip2 -d enwiki-20200101-pages-articles-multistream.xml.bz2
 
 # Download TF-1 checkpoints
@@ -51,6 +54,3 @@ gdown https://drive.google.com/uc?id=1oVBgtSxkXC9rH2SXJv85RXR9-WrMPy-Q
 
 # Back to bert/cleanup_scripts
 cd ../..
-
-
-
diff --git a/language_model/tensorflow/bert/cleanup_scripts/generate_tfrecords.sh b/language_model/tensorflow/bert/cleanup_scripts/generate_tfrecords.sh
new file mode 100644
index 000000000..bd368525f
--- /dev/null
+++ b/language_model/tensorflow/bert/cleanup_scripts/generate_tfrecords.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+data_dir=${DATA_DIR:-./}
+wiki_dir=$data_dir/wiki/
+results_dir=$data_dir/results/
+tfrecord_dir=$data_dir/tfrecord/
+
+mkdir -p $tfrecord_dir
+
+echo "Processing train data"
+# Generate one TFRecord for each results_dir/part-00XXX-of-00500 file.
+for file in $results_dir/*
+do
+  if [[ $file == *"part"* ]]; then
+    echo "Processing file: $file"
+    python3 create_pretraining_data.py \
+      --input_file=$file \
+      --output_file=$tfrecord_dir/${file##*/} \
+      --vocab_file=$wiki_dir/vocab.txt \
+      --do_lower_case=True \
+      --max_seq_length=512 \
+      --max_predictions_per_seq=76 \
+      --masked_lm_prob=0.15 \
+      --random_seed=12345 \
+      --dupe_factor=10
+  fi
+done
+
+echo "Processing eval data"
+python3 create_pretraining_data.py \
+  --input_file=$results_dir/eval.txt \
+  --output_file=$tfrecord_dir/eval_intermediate \
+  --vocab_file=$wiki_dir/vocab.txt \
+  --do_lower_case=True \
+  --max_seq_length=512 \
+  --max_predictions_per_seq=76 \
+  --masked_lm_prob=0.15 \
+  --random_seed=12345 \
+  --dupe_factor=10
+
+python3 pick_eval_samples.py \
+  --input_tfrecord=$tfrecord_dir/eval_intermediate \
+  --output_tfrecord=$tfrecord_dir/eval_10k \
+  --num_examples_to_pick=10000
\ No newline at end of file
diff --git a/language_model/tensorflow/bert/cleanup_scripts/process_wiki.sh b/language_model/tensorflow/bert/cleanup_scripts/process_wiki.sh
index caae53a0d..3a49f2986 100755
--- a/language_model/tensorflow/bert/cleanup_scripts/process_wiki.sh
+++ b/language_model/tensorflow/bert/cleanup_scripts/process_wiki.sh
@@ -5,29 +5,41 @@
-# example: ./process_wiki.sh 'sample_data/wiki_??'
-# The resulted files will be placed in ./results
+# example: DATA_DIR=/path/to/data ./process_wiki.sh
+# The resulting files will be placed in $DATA_DIR/results
 
-inputs=$1
+data_dir=${DATA_DIR:-./}
+text_dir=$data_dir/text/
+inputs="${text_dir}*/wiki_??"
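+# Match all shards produced by the extraction step; the `ls` below gives an
+# early error if run_wiki_extractor.sh has not been run yet.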
+
+ls $inputs
 
 pip install nltk==3.4.5
 
 # Remove doc tag and title
+echo "RUNNING SCRIPT #1: cleanup_file.py"
 python ./cleanup_file.py --data=$inputs --output_suffix='.1'
 
 # Further clean up files
+echo "RUNNING SCRIPT #2: clean.sh"
 for f in ${inputs}; do
   ./clean.sh ${f}.1 ${f}.2
 done
 
 # Sentence segmentation
+echo "RUNNING SCRIPT #3: do_sentence_segmentation.py"
 python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3'
 
-mkdir -p ./results
+result_dir=$data_dir/results
+mkdir -p $result_dir
 
 # Train/Eval seperation
-python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output='./results/eval'
+echo "RUNNING SCRIPT #4: seperate_test_set.py"
+python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output="${result_dir}/eval"
 
 ## Choose file size method or number of packages by uncommenting only one of the following do_gather options
 # Gather into fixed size packages
-python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir='./results'
+echo "RUNNING SCRIPT #5: do_gather.py"
+python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir=$result_dir
 
 # Gather into fixed number of packages
 #NUM_PACKAGES=512
diff --git a/language_model/tensorflow/bert/cleanup_scripts/run_wiki_extractor.sh b/language_model/tensorflow/bert/cleanup_scripts/run_wiki_extractor.sh
new file mode 100644
index 000000000..a9c5965a0
--- /dev/null
+++ b/language_model/tensorflow/bert/cleanup_scripts/run_wiki_extractor.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+git clone https://github.com/attardi/wikiextractor.git
+
+cd wikiextractor
+
+git checkout 3162bb6c3c9ebd2d15be507aa11d6fa818a454ac
+
+# Back to /cleanup_scripts
+cd ..
+
+# Run `WikiExtractor.py` to extract data from XML.
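+# The extractor writes plain-text shards under $data_dir/text
+# (AA/wiki_00, AA/wiki_01, ...), the layout process_wiki.sh globs over.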
+data_dir=${DATA_DIR:-./}
+wiki_dir=$data_dir/wiki
+python wikiextractor/WikiExtractor.py $wiki_dir/enwiki-20200101-pages-articles-multistream.xml -o $data_dir/text
\ No newline at end of file
diff --git a/language_model/tensorflow/bert/mlcube.py b/language_model/tensorflow/bert/mlcube.py
new file mode 100644
index 000000000..c0891e3cb
--- /dev/null
+++ b/language_model/tensorflow/bert/mlcube.py
@@ -0,0 +1,125 @@
+"""MLCube handler file"""
+import os
+import subprocess
+
+import typer
+
+app = typer.Typer()
+
+
+class DownloadTask(object):
+    """Download task Class
+    It defines the environment variables:
+        DATA_DIR: Directory path where the dataset will be downloaded
+    Then executes the download script"""
+    @staticmethod
+    def run(data_dir: str) -> None:
+
+        env = os.environ.copy()
+        env.update({
+            'DATA_DIR': data_dir,
+        })
+
+        process = subprocess.Popen(
+            "./cleanup_scripts/download_and_uncompress.sh", cwd=".", env=env)
+        process.wait()
+
+
+class ExtractTask(object):
+    """Extract task Class
+    It defines the environment variables:
+        DATA_DIR: Directory path that contains the downloaded dataset
+    Then executes the wiki extractor script"""
+    @staticmethod
+    def run(data_dir: str) -> None:
+
+        env = os.environ.copy()
+        env.update({
+            'DATA_DIR': data_dir,
+        })
+
+        process = subprocess.Popen(
+            "./cleanup_scripts/run_wiki_extractor.sh", cwd=".", env=env)
+        process.wait()
+
+
+class PreprocessTask(object):
+    """Preprocess task Class
+    It defines the environment variables:
+        DATA_DIR: Directory path that contains the extracted dataset
+    Then executes the preprocessing script"""
+    @staticmethod
+    def run(data_dir: str) -> None:
+
+        env = os.environ.copy()
+        env.update({
+            'DATA_DIR': data_dir,
+        })
+        process = subprocess.Popen(
+            "./process_wiki.sh", cwd="./cleanup_scripts", env=env)
+        process.wait()
+
+
+class GenerateTfrecordsTask(object):
+    """Generate TFRecords task Class
+    It defines the environment variables:
+        DATA_DIR: Directory path that contains the preprocessed dataset
+    Then executes the TFRecord generation script"""
+    @staticmethod
+    def run(data_dir: str) -> None:
+
+        env = os.environ.copy()
+        env.update({
+            'DATA_DIR': data_dir,
+        })
+        process = subprocess.Popen(
+            "./generate_tfrecords.sh", cwd="./cleanup_scripts", env=env)
+        process.wait()
+
+
+class TrainTask(object):
+    """Train task Class
+    It defines the environment variables:
+        DATA_DIR: Dataset directory path
+        OUTPUT_DIR: Directory path where training outputs are stored
+    Then executes the benchmark script"""
+    @staticmethod
+    def run(data_dir: str, output_dir: str) -> None:
+        env = os.environ.copy()
+        env.update({
+            'DATA_DIR': data_dir,
+            'OUTPUT_DIR': output_dir
+        })
+        process = subprocess.Popen(
+            "./run_and_time.sh", cwd=".", env=env)
+        process.wait()
+
+
+@app.command("download")
+def download(data_dir: str = typer.Option(..., '--data_dir')):
+    DownloadTask.run(data_dir)
+
+
+@app.command("extract")
+def extract(data_dir: str = typer.Option(..., '--data_dir')):
+    ExtractTask.run(data_dir)
+
+
+@app.command("preprocess")
+def preprocess(data_dir: str = typer.Option(..., '--data_dir')):
+    PreprocessTask.run(data_dir)
+
+
+@app.command("generate_tfrecords")
+def generate_tfrecords(data_dir: str = typer.Option(..., '--data_dir')):
+    GenerateTfrecordsTask.run(data_dir)
+
+
+@app.command("train")
+def train(data_dir: str = typer.Option(..., '--data_dir'),
+          output_dir: str = typer.Option(..., '--output_dir')):
+    TrainTask.run(data_dir, output_dir)
+
+
+if __name__ == '__main__':
+    app()
diff --git a/language_model/tensorflow/bert/requirements.txt b/language_model/tensorflow/bert/requirements.txt
new file mode 100644
index 000000000..d84e58f41
--- /dev/null
+++ b/language_model/tensorflow/bert/requirements.txt
@@ -0,0 +1,5 @@
+PyYAML==5.4.1
+typer==0.3.2
+gdown==3.3.1
+wheel==0.37.0
+git+https://github.com/mlperf/logging.git@9aa718d525d1e8e64d32b12fe1b22133973d7063
\ No newline at end of file
diff --git a/language_model/tensorflow/bert/run_and_time.sh b/language_model/tensorflow/bert/run_and_time.sh
new file mode 100644
index 000000000..6a14cdb58
--- /dev/null
+++ b/language_model/tensorflow/bert/run_and_time.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+data_dir=${DATA_DIR:-./cleanup_scripts}
+output_dir=${OUTPUT_DIR:-/tmp/output/}
+wiki_dir=$data_dir/wiki/
+results_dir=$data_dir/results/
+tfrecord_dir=$data_dir/tfrecord/
+
+TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
+time python3 run_pretraining.py \
+  --bert_config_file=$wiki_dir/bert_config.json \
+  --output_dir=$output_dir \
+  --input_file="${tfrecord_dir}/part*" \
+  --do_train \
+  --do_eval \
+  --eval_batch_size=8 \
+  --init_checkpoint=./checkpoint/model.ckpt-28252 \
+  --iterations_per_loop=1000 \
+  --learning_rate=0.0001 \
+  --max_eval_steps=1250 \
+  --max_predictions_per_seq=76 \
+  --max_seq_length=512 \
+  --num_gpus=1 \
+  --num_train_steps=107538 \
+  --num_warmup_steps=1562 \
+  --optimizer=lamb \
+  --save_checkpoints_steps=1562 \
+  --start_warmup_step=0 \
+  --train_batch_size=24 \
+  --nouse_tpu
\ No newline at end of file
diff --git a/language_model/tensorflow/bert/run_pretraining.py b/language_model/tensorflow/bert/run_pretraining.py
index 7de5514e4..52df57abc 100644
--- a/language_model/tensorflow/bert/run_pretraining.py
+++ b/language_model/tensorflow/bert/run_pretraining.py
@@ -537,7 +537,7 @@ def main(_):
 
   # Creates session config. allow_soft_placement = True, is required for
   # multi-GPU and is not harmful for other modes.
-  session_config = tf.compat.v1.ConfigProto(
+  session_config = tf.ConfigProto(
       inter_op_parallelism_threads=8,
       allow_soft_placement=True)
 
diff --git a/language_model/tensorflow/mlcube.yaml b/language_model/tensorflow/mlcube.yaml
new file mode 100644
index 000000000..e1f94f5fc
--- /dev/null
+++ b/language_model/tensorflow/mlcube.yaml
@@ -0,0 +1,40 @@
+name: MLCommons Bert Benchmark
+authors:
+  - {name: "MLCommons Best Practices Working Group"}
+
+platform:
+  accelerator_count: 1
+
+docker:
+  # Image name.
+  image: mlcommons/bert:0.0.1
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "bert"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  download:
+    # Download dataset
+    parameters:
+      outputs: {data_dir: data/}
+  extract:
+    # Extract dataset
+    parameters:
+      inputs: {data_dir: data/}
+  preprocess:
+    # Preprocess dataset
+    parameters:
+      inputs: {data_dir: data/}
+  generate_tfrecords:
+    # Convert the training data to TFRecord format
+    parameters:
+      inputs: {data_dir: data/}
+  train:
+    # Train the model
+    parameters:
+      inputs: {data_dir: data/}
+      outputs: {output_dir: output/}
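+
+# Note: relative paths such as data/ and output/ resolve against the MLCube
+# workspace directory (./workspace by default; override with --workspace=<dir>).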