From 2f2185607d142dd34971e0a3a9fde24756984ea2 Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 13:24:19 -0300 Subject: [PATCH 01/10] - Added support for WAV files - Added docker-compose for easy usage - Tidy up of bash scripts --- .gitignore | 9 ++++++-- Dockerfile | 24 ++++++++++++------- autosub/logger.py | 2 +- autosub/main.py | 39 ++++++++++++++++++++++--------- autosub/segmentAudio.py | 4 +++- autosub/utils.py | 4 ++-- docker-compose.yml | 10 ++++++++ getmodels.sh | 51 +++++++++++++++++++++++++++++++++-------- requirements.txt | 2 +- run.sh | 32 ++++++++++++++++++++++++++ 10 files changed, 141 insertions(+), 36 deletions(-) create mode 100644 docker-compose.yml create mode 100644 run.sh diff --git a/.gitignore b/.gitignore index 6b32b87..21c3038 100644 --- a/.gitignore +++ b/.gitignore @@ -49,9 +49,14 @@ Thumbs.db # Venv sub/* -__pycache__ +**/__pycache__/ # Folders audio/ sub/ -output/ \ No newline at end of file +output/ +model/ + +.gitgitignore + +data/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 0020ca2..648f40c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,29 +1,37 @@ -ARG BASEIMAGE=ubuntu:18.04 -#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 +#ARG BASEIMAGE=ubuntu:18.04 +ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 FROM ${BASEIMAGE} -ARG DEPSLIST=requirements.txt -#ARG DEPSLIST=requirements-gpu.txt +#ARG DEPSLIST=requirements.txt +ARG DEPSLIST=requirements-gpu.txt ENV PYTHONUNBUFFERED 1 COPY *.pbmm ./ COPY *.scorer ./ COPY setup.py ./ -COPY autosub ./autosub RUN DEBIAN_FRONTEND=noninteractive apt update && \ - apt -y install ffmpeg libsm6 libxext6 python3 python3-pip && \ + apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \ apt -y clean && \ rm -rf /var/lib/apt/lists/* COPY $DEPSLIST ./requirements.txt +COPY run.sh ./run.sh +RUN chmod +x ./run.sh # make sure pip is up-to-date RUN python3 -m pip install --upgrade pip RUN pip3 install --no-cache-dir -r 
requirements.txt -RUN mkdir audio output +# Mount data +VOLUME /input +VOLUME /output +VOLUME /deepspeech +RUN mkdir /audio + +# Copying autosub for last for faster debugging +COPY autosub ./autosub -ENTRYPOINT ["python3", "autosub/main.py"] +ENTRYPOINT ["./run.sh"] diff --git a/autosub/logger.py b/autosub/logger.py index d6c0739..cc7a5fa 100644 --- a/autosub/logger.py +++ b/autosub/logger.py @@ -5,7 +5,7 @@ def setup_applevel_logger(logger_name = APP_NAME, file_name=None): logger = logging.getLogger(logger_name) - logger.setLevel(logging.INFO) + logger.setLevel(logging.DEBUG) formatter = logging.Formatter("[%(levelname)s] %(message)s") #%(name)s | sh = logging.StreamHandler(sys.stdout) sh.setFormatter(formatter) diff --git a/autosub/main.py b/autosub/main.py index 6960fd0..2bc7a15 100644 --- a/autosub/main.py +++ b/autosub/main.py @@ -5,11 +5,11 @@ import re import sys import wave -from autosub import logger +from . import logger import argparse import numpy as np -from tqdm import tqdm +# from tqdm import tqdm from autosub.utils import * from autosub.writeToFile import write_to_file @@ -95,6 +95,7 @@ def main(): parser.add_argument("--engine", choices=supported_engines, nargs="?", default="stt", help="Select either DeepSpeech or Coqui STT for inference. 
Latter is default") parser.add_argument("--file", required=False, help="Input video file") + parser.add_argument("--wav", required=False, help="Input wav file") parser.add_argument("--model", required=False, help="Input *.pbmm model file") parser.add_argument("--scorer", required=False, help="Input *.scorer file") @@ -120,22 +121,35 @@ def main(): else: _logger.error(f"Invalid file: {args.file}") sys.exit(1) + elif args.wav is not None: + if os.path.isfile(args.wav): + input_file = args.wav + _logger.info(f"Input file: {args.wav}") + else: + _logger.error(f"Invalid file: {args.wav}") + sys.exit(1) else: - _logger.error("One or more of --file or --dry-run are required") + _logger.error("One or more of --file or --dry-run or --wav are required") sys.exit(1) + # File names base_directory = os.getcwd() output_directory = os.path.join(base_directory, "output") + file_prefix = os.path.splitext(os.path.basename(input_file))[0] audio_directory = os.path.join(base_directory, "audio") - video_prefix = os.path.splitext(os.path.basename(input_file))[0] - audio_file_name = os.path.join(audio_directory, video_prefix + ".wav") + _logger.debug(f"Audio directory: {audio_directory}") + if args.wav is not None: + audio_file_name = input_file + else: + audio_file_name = os.path.join(audio_directory, file_prefix + ".wav") + os.makedirs(audio_directory, exist_ok=True) os.makedirs(output_directory, exist_ok=True) os.makedirs(audio_directory, exist_ok=True) output_file_handle_dict = {} for format in args.format: - output_filename = os.path.join(output_directory, video_prefix + "." + format) + output_filename = os.path.join(output_directory, file_prefix + "." 
+ format) # print("Creating file: " + output_filename) output_file_handle_dict[format] = open(output_filename, "w") # For VTT format, write header @@ -144,19 +158,22 @@ def main(): output_file_handle_dict[format].write("Kind: captions\n\n") clean_folder(audio_directory) - extract_audio(input_file, audio_file_name) + if args.wav is None: + extract_audio(input_file, audio_file_name) _logger.info("Splitting on silent parts in audio file") - remove_silent_segments(audio_file_name) + remove_silent_segments(audio_file_name, output_dir = audio_directory) + + _logger.debug(f"audio_directory: {os.listdir(audio_directory)[:5]}") - audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(video_prefix)] + audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(file_prefix)] audiofiles = sort_alphanumeric(audiofiles) - audiofiles.remove(os.path.basename(audio_file_name)) + audiofiles.remove(os.path.basename(audio_file_name)) if args.wav is None else None _logger.info("Running inference...") ds = create_model(args.engine, ds_model, ds_scorer) - for filename in tqdm(audiofiles): + for filename in audiofiles: audio_segment_path = os.path.join(audio_directory, filename) ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration) diff --git a/autosub/segmentAudio.py b/autosub/segmentAudio.py index 7a9575d..fe5f8e4 100644 --- a/autosub/segmentAudio.py +++ b/autosub/segmentAudio.py @@ -185,7 +185,7 @@ def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5, return seg_limits -def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2): +def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2, output_dir=None): """Remove silent segments from an audio file and split on those segments Args: @@ -202,5 +202,7 @@ def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2): for i, s in enumerate(segmentLimits): strOut = 
"{0:s}_{1:.3f}-{2:.3f}.wav".format(input_file[0:-4], s[0], s[1]) + if output_dir is not None: + strOut = os.path.join(output_dir, os.path.basename(strOut)) wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])]) diff --git a/autosub/utils.py b/autosub/utils.py index 7f6f67a..c3867aa 100644 --- a/autosub/utils.py +++ b/autosub/utils.py @@ -62,10 +62,10 @@ def download_model(engine, fname): fname : either of "model" or "scorer" """ - _logger.info(f"{fname.capitalize()} not found locally. Downloading") try: _file = _models[engine][fname] - command = ["wget", _file, "-q", "--show-progress"] + _logger.warning(f"{fname.capitalize()} not found locally. Downloading: {_file}") + command = ["wget", _file, "-q"] ret = subprocess.run(command).returncode except Exception as e: _logger.error(str(e)) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..5c4dc32 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,10 @@ +version: '3' + +services: + autosub: + build: . + volumes: + - ./data/input:/input + - ./data/output:/output + - ./deepspeech:/deepspeech + command: ./run.sh \ No newline at end of file diff --git a/getmodels.sh b/getmodels.sh index 927256a..a1bb3fb 100755 --- a/getmodels.sh +++ b/getmodels.sh @@ -1,14 +1,45 @@ #!/bin/bash +#Downloaded models will be saved on ./model -if [ -z $1 ]; then - echo "Please provide as argument the model number you wish to download. E.G. 0.9.3" - exit 1; -else - model=$1 -fi +#Flags: +# -m or --model: Model version. Default: 0.9.3 +# -t or --type: Model type. Default: stt +# -h or --help: Show help +while [ "$1" != "" ]; do + case $1 in + -m | --model ) shift + model=$1 + ;; + -t | --type ) shift + type=$1 + ;; + -h | --help ) echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" + echo "Flags:" + echo "-m or --model: Model version. Default: 0.9.3" + echo "-t or --type: Model type. 
Default: stt" + echo "-h or --help: Show help" + exit + ;; + * ) echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" + echo "Flags:" + echo "-m or --model: Model version. Default: 0.9.3" + echo "-t or --type: Model type. Default: stt" + echo "-h or --help: Show help" + exit 1 + esac + shift +done -model_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm -scorer_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer - -wget ${model_url} && wget ${scorer_url} +#If type is ds, download DeepSpeech model, otherwise download Mozilla TTS model +mkdir -p ./deepspeech +if [ "$type" = "ds" ]; then + model_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm + scorer_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer + wget -O ./deepspeech/deepspeech-$model-models.pbmm $model_url + wget -O ./deepspeech/deepspeech-$model-models.scorer $scorer_url +elif [ "$type" = "tts" ]; then + model_url="https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v$model/model.tflite" + scorer_url="https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer" + wget -O ./deepspeech/model.tflite $model_url + wget -O ./deepspeech/huge-vocabulary.scorer $scorer_url diff --git a/requirements.txt b/requirements.txt index 87d1e57..5d5117c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,6 @@ pydub==0.23.1 pyparsing==2.4.7 python-dateutil==2.8.1 scikit-learn -scipy==1.9.3 +scipy six==1.15.0 tqdm==4.44.1 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..8e02909 --- /dev/null +++ b/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Meant to be used inside Docker. It scans the directory /input +# for WAV or MP4 files and executes the python script for each of them. 
+ +input_path=/input + +shopt -s nullglob # prevent errors if no files are found + +#Get model and scorer names from the model directory. +#Ignore if doesn't exist +model=$(find /deepspeech -name '*.pbmm' -print -quit) +scorer=$(find /deepspeech -name '*.scorer' -print -quit) + +for file in "$input_path"/*.wav; do + echo "Processing WAV $file" + #If model and scorer are not empty, use them + if [ -n "$model" ] && [ -n "$scorer" ]; then + python3 -m autosub.main --wav "$file" --model "$model" --scorer "$scorer" + else + python3 -m autosub.main --wav "$file" + fi +done + +for file in "$input_path"/*.mp4; do + echo "Processing MP4 $file" + #If model and scorer are not empty, use them + if [ -n "$model" ] && [ -n "$scorer" ]; then + python3 -m autosub.main --file "$file" --model "$model" --scorer "$scorer" + else + python3 -m autosub.main --file "$file" + fi +done From aef82e3dd2b296e324352d9fb2909aa1eaebd695 Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 13:37:16 -0300 Subject: [PATCH 02/10] Added details about docker-compose on readme --- README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b7b9627..afc0190 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,9 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr * If you don't have the model files, get them ```bash - $ ./getmodels.sh 0.9.3 + $ ./getmodels.sh -m 0.9.3 -t stt ``` + More options are available with `./getmodels.sh -h` * For a CPU build ```bash $ docker build -t autosub . @@ -69,6 +70,22 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr $ docker cp autosub:/output/ . ``` +## Docker-Compose + +The docker compose file allows for easy processing of multiple MP4 or WAV files at once. Create a folder `data/input` in the root directory and paste all of the media files to be transcribed. 
Then, run: + +``` bash +docker-compose up +``` + +If you haven't built it already, append the flag `--build` at the end of the command: + +``` bash +docker-compose up --build +``` + +Once the process thas finished, the output SRTs will be located at `data/output`. + ## How-to example * The model files should be in the repo root directory and will be loaded/downloaded automatically. Incase you have multiple versions, use the `--model` and `--scorer` args while executing From 94a59986d412d3062a044173e0cabf7cfac79b78 Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 13:46:13 -0300 Subject: [PATCH 03/10] tidy up getmodels.sh and put tqdm back on main.py --- autosub/main.py | 4 +-- getmodels.sh | 78 +++++++++++++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/autosub/main.py b/autosub/main.py index 2bc7a15..ff04d95 100644 --- a/autosub/main.py +++ b/autosub/main.py @@ -9,7 +9,7 @@ import argparse import numpy as np -# from tqdm import tqdm +from tqdm import tqdm from autosub.utils import * from autosub.writeToFile import write_to_file @@ -173,7 +173,7 @@ def main(): _logger.info("Running inference...") ds = create_model(args.engine, ds_model, ds_scorer) - for filename in audiofiles: + for filename in tqdm(audiofiles, desc="Inference"): audio_segment_path = os.path.join(audio_directory, filename) ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration) diff --git a/getmodels.sh b/getmodels.sh index a1bb3fb..d034a4a 100755 --- a/getmodels.sh +++ b/getmodels.sh @@ -1,45 +1,53 @@ #!/bin/bash -#Downloaded models will be saved on ./model +# Downloaded models will be saved on ./model -#Flags: +# Flags: # -m or --model: Model version. Default: 0.9.3 # -t or --type: Model type. 
Default: stt # -h or --help: Show help + while [ "$1" != "" ]; do - case $1 in - -m | --model ) shift - model=$1 - ;; - -t | --type ) shift - type=$1 - ;; - -h | --help ) echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" - echo "Flags:" - echo "-m or --model: Model version. Default: 0.9.3" - echo "-t or --type: Model type. Default: stt" - echo "-h or --help: Show help" - exit - ;; - * ) echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" - echo "Flags:" - echo "-m or --model: Model version. Default: 0.9.3" - echo "-t or --type: Model type. Default: stt" - echo "-h or --help: Show help" - exit 1 - esac - shift + case $1 in + -m | --model ) + shift + model=$1 + ;; + -t | --type ) + shift + type=$1 + ;; + -h | --help ) + echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" + echo "Flags:" + echo "-m or --model: Model version. Default: 0.9.3" + echo "-t or --type: Model type. Default: stt" + echo "-h or --help: Show help" + exit + ;; + * ) + echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" + echo "Flags:" + echo "-m or --model: Model version. Default: 0.9.3" + echo "-t or --type: Model type. 
Default: stt" + echo "-h or --help: Show help" + exit 1 + esac + shift done -#If type is ds, download DeepSpeech model, otherwise download Mozilla TTS model +# If type is ds, download the DeepSpeech model; if type is stt, download the Coqui STT model mkdir -p ./deepspeech if [ "$type" = "ds" ]; then - model_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm - scorer_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer - wget -O ./deepspeech/deepspeech-$model-models.pbmm $model_url - wget -O ./deepspeech/deepspeech-$model-models.scorer $scorer_url -elif [ "$type" = "tts" ]; then - model_url="https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v$model/model.tflite" - scorer_url="https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer" - wget -O ./deepspeech/model.tflite $model_url - wget -O ./deepspeech/huge-vocabulary.scorer $scorer_url - + model_url="https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm" + scorer_url="https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer" + wget -O ./deepspeech/deepspeech-$model-models.pbmm "$model_url" + wget -O ./deepspeech/deepspeech-$model-models.scorer "$scorer_url" +elif [ "$type" = "stt" ]; then + model_url="https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v$model/model.tflite" + scorer_url="https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer" + wget -O ./deepspeech/model.tflite "$model_url" + wget -O ./deepspeech/huge-vocabulary.scorer "$scorer_url" +else + echo "Invalid model type. Use -t or --type to specify model type." 
+ exit 1 +fi From 1221c59827eeb353f82f6efdb3b2457c63764fcf Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 13:50:18 -0300 Subject: [PATCH 04/10] run.sh wasn't working for .tflite --- run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.sh b/run.sh index 8e02909..f99102a 100644 --- a/run.sh +++ b/run.sh @@ -8,7 +8,7 @@ shopt -s nullglob # prevent errors if no files are found #Get model and scorer names from the model directory. #Ignore if doesn't exist -model=$(find /deepspeech -name '*.pbmm' -print -quit) +model=$(find /deepspeech \( -name '*.pbmm' -o -name '*.tflite' \) -print -quit) scorer=$(find /deepspeech -name '*.scorer' -print -quit) for file in "$input_path"/*.wav; do From d11fc61219ef9e423c3f0dabe0d5a25b040a2477 Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 14:36:36 -0300 Subject: [PATCH 05/10] Changed tqdm to print every once in a while instead of its default progress bar --- Dockerfile | 8 ++++---- autosub/main.py | 5 ++++- autosub/utils.py | 21 +++++++++++++-------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 648f40c..28af997 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,10 @@ -#ARG BASEIMAGE=ubuntu:18.04 -ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 +ARG BASEIMAGE=ubuntu:18.04 +#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 FROM ${BASEIMAGE} -#ARG DEPSLIST=requirements.txt -ARG DEPSLIST=requirements-gpu.txt +ARG DEPSLIST=requirements.txt +#ARG DEPSLIST=requirements-gpu.txt ENV PYTHONUNBUFFERED 1 diff --git a/autosub/main.py b/autosub/main.py index ff04d95..69a973f 100644 --- a/autosub/main.py +++ b/autosub/main.py @@ -173,9 +173,12 @@ def main(): _logger.info("Running inference...") ds = create_model(args.engine, ds_model, ds_scorer) - for filename in tqdm(audiofiles, desc="Inference"): + progress = tqdm(total=len(audiofiles), desc="Inference", position=0) + for filename in audiofiles: 
audio_segment_path = os.path.join(audio_directory, filename) ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration) + progress.update(1) + progress.close() for format in output_file_handle_dict: file_handle = output_file_handle_dict[format] diff --git a/autosub/utils.py b/autosub/utils.py index c3867aa..06fded9 100644 --- a/autosub/utils.py +++ b/autosub/utils.py @@ -123,21 +123,26 @@ def create_model(engine, model, scorer): Args: engine : "ds" for DeepSpeech and "stt" for Coqui STT - model : .pbmm model file + model : .pbmm or .tflite model file scorer : .scorer file """ - try: - if engine == "ds": - ds = DModel(model) - else: - ds = SModel(model) - except: - _logger.error("Invalid model file") + if engine == "ds": + _logger.debug("Loading DeepSpeech model") + ds = DModel(model) + _logger.debug("Completed loading DeepSpeech model") + elif engine == "stt": + _logger.debug("Loading Coqui STT model") + ds = SModel(model) + _logger.debug("Completed loading Coqui STT model") + else: + _logger.error("Invalid engine") sys.exit(1) try: + _logger.debug("Loading scorer") ds.enableExternalScorer(scorer) + _logger.debug("Completed loading scorer") except: _logger.warn("Invalid scorer file. 
Running inference using only model file") return(ds) From 986d79c557b911761af657ec7644b46e39b673d6 Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 14:42:28 -0300 Subject: [PATCH 06/10] Created one separate Dockerfile for CPU and GPU --- Dockerfile => Dockerfile.cpu | 2 -- Dockerfile.gpu | 35 +++++++++++++++++++++++++++++++++++ autosub/main.py | 1 + docker-compose.yml | 1 + 4 files changed, 37 insertions(+), 2 deletions(-) rename Dockerfile => Dockerfile.cpu (88%) create mode 100644 Dockerfile.gpu diff --git a/Dockerfile b/Dockerfile.cpu similarity index 88% rename from Dockerfile rename to Dockerfile.cpu index 28af997..f1be9e9 100644 --- a/Dockerfile +++ b/Dockerfile.cpu @@ -1,10 +1,8 @@ ARG BASEIMAGE=ubuntu:18.04 -#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 FROM ${BASEIMAGE} ARG DEPSLIST=requirements.txt -#ARG DEPSLIST=requirements-gpu.txt ENV PYTHONUNBUFFERED 1 diff --git a/Dockerfile.gpu b/Dockerfile.gpu new file mode 100644 index 0000000..f4ae2a8 --- /dev/null +++ b/Dockerfile.gpu @@ -0,0 +1,35 @@ +ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 + +FROM ${BASEIMAGE} + +ARG DEPSLIST=requirements-gpu.txt + +ENV PYTHONUNBUFFERED 1 + +COPY *.pbmm ./ +COPY *.scorer ./ +COPY setup.py ./ + +RUN DEBIAN_FRONTEND=noninteractive apt update && \ + apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \ + apt -y clean && \ + rm -rf /var/lib/apt/lists/* + +COPY $DEPSLIST ./requirements.txt +COPY run.sh ./run.sh +RUN chmod +x ./run.sh + +# make sure pip is up-to-date +RUN python3 -m pip install --upgrade pip +RUN pip3 install --no-cache-dir -r requirements.txt + +# Mount data +VOLUME /input +VOLUME /output +VOLUME /deepspeech +RUN mkdir /audio + +# Copying autosub for last for faster debugging +COPY autosub ./autosub + +ENTRYPOINT ["./run.sh"] diff --git a/autosub/main.py b/autosub/main.py index 69a973f..affe8dd 100644 --- a/autosub/main.py +++ b/autosub/main.py @@ -173,6 +173,7 @@ def main(): 
_logger.info("Running inference...") ds = create_model(args.engine, ds_model, ds_scorer) + _logger.info("Starting transcription") progress = tqdm(total=len(audiofiles), desc="Inference", position=0) for filename in audiofiles: audio_segment_path = os.path.join(audio_directory, filename) diff --git a/docker-compose.yml b/docker-compose.yml index 5c4dc32..08db2bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,6 +3,7 @@ version: '3' services: autosub: build: . + dockerfile: Dockerfile.gpu volumes: - ./data/input:/input - ./data/output:/output From 7cba57e9331878515589a07a2a900f67d1657852 Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 14:47:01 -0300 Subject: [PATCH 07/10] Docker compose can choose dockerfile dynamically --- README.md | 12 ++++++++++++ docker-compose.yml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index afc0190..712fab9 100644 --- a/README.md +++ b/README.md @@ -78,12 +78,24 @@ The docker compose file allows for easy processing of multiple MP4 or WAV files docker-compose up ``` +To use the CPU implementation instead of GPU, change the dockefile with the environment variable `DOCKERFILE`: + +``` bash +docker-compose up -e DOCKERFILE=Dockerfile.cpu +``` + If you haven't built it already, append the flag `--build` at the end of the command: ``` bash docker-compose up --build ``` +or + +``` bash +docker-compose up -e DOCKERFILE=Dockerfile.cpu --build +``` + Once the process thas finished, the output SRTs will be located at `data/output`. ## How-to example diff --git a/docker-compose.yml b/docker-compose.yml index 08db2bc..f1d77a7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ version: '3' services: autosub: build: . 
- dockerfile: Dockerfile.gpu + dockerfile: ${DOCKERFILE:-Dockerfile.gpu} volumes: - ./data/input:/input - ./data/output:/output From 100b13178191925a64a1d85c6eac7b49d4fc05ac Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 15:20:03 -0300 Subject: [PATCH 08/10] Made input and output as env variables on docker-compose --- autosub/main.py | 2 +- docker-compose.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autosub/main.py b/autosub/main.py index affe8dd..a66b81e 100644 --- a/autosub/main.py +++ b/autosub/main.py @@ -174,7 +174,7 @@ def main(): ds = create_model(args.engine, ds_model, ds_scorer) _logger.info("Starting transcription") - progress = tqdm(total=len(audiofiles), desc="Inference", position=0) + progress = tqdm(total=len(audiofiles), desc="Inference", position=0, dynamic_ncols=False) for filename in audiofiles: audio_segment_path = os.path.join(audio_directory, filename) ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration) diff --git a/docker-compose.yml b/docker-compose.yml index f1d77a7..5dc5d04 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ services: build: . 
dockerfile: ${DOCKERFILE:-Dockerfile.gpu} volumes: - - ./data/input:/input - - ./data/output:/output + - ${INPUT:-./data/input}:/input + - ${OUTPUT:-./data/output}:/output - ./deepspeech:/deepspeech command: ./run.sh \ No newline at end of file From 51b8fc082cfc8eae9dbf1ad188a9e657a3a6875a Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Sun, 26 Mar 2023 23:02:20 -0300 Subject: [PATCH 09/10] Minor change to docker-compose file --- README.md | 10 ++++++++-- docker-compose.yml | 7 ++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 712fab9..669d833 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ docker-compose up To use the CPU implementation instead of GPU, change the dockefile with the environment variable `DOCKERFILE`: ``` bash -docker-compose up -e DOCKERFILE=Dockerfile.cpu +DOCKERFILE=Dockerfile.cpu docker-compose up ``` If you haven't built it already, append the flag `--build` at the end of the command: @@ -93,11 +93,17 @@ docker-compose up --build or ``` bash -docker-compose up -e DOCKERFILE=Dockerfile.cpu --build +DOCKERFILE=Dockerfile.cpu docker-compose up --build ``` Once the process thas finished, the output SRTs will be located at `data/output`. +To specify custom paths, use the `INPUT` and `OUTPUT` variables: + +``` bash +INPUT=/folder/to/input OUTPUT=/folder/to/output docker-compose up +``` + ## How-to example * The model files should be in the repo root directory and will be loaded/downloaded automatically. Incase you have multiple versions, use the `--model` and `--scorer` args while executing diff --git a/docker-compose.yml b/docker-compose.yml index 5dc5d04..44bda2f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,10 +2,11 @@ version: '3' services: autosub: - build: . - dockerfile: ${DOCKERFILE:-Dockerfile.gpu} + build: + context: . 
+ dockerfile: ${DOCKERFILE:-Dockerfile.gpu} volumes: - ${INPUT:-./data/input}:/input - ${OUTPUT:-./data/output}:/output - ./deepspeech:/deepspeech - command: ./run.sh \ No newline at end of file + command: ./run.sh From 057e7495991e68e65faa3a2b3c222db4e4cb0236 Mon Sep 17 00:00:00 2001 From: differentiablepizza <> Date: Mon, 27 Mar 2023 01:43:19 -0300 Subject: [PATCH 10/10] Made adaptations to run GPU on container --- README.md | 1 + docker-compose.yml | 9 ++++++++- run.sh | 12 ++++++++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 669d833..21db9b7 100644 --- a/README.md +++ b/README.md @@ -149,3 +149,4 @@ I would love to follow up on any suggestions/issues you find :) 1. https://github.com/mozilla/DeepSpeech/ 2. https://github.com/tyiannak/pyAudioAnalysis 3. https://deepspeech.readthedocs.io/ +4. [Quick guide to install nvidia container toolkit](https://www.server-world.info/en/note?os=Ubuntu_20.04&p=nvidia&f=2) \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 44bda2f..ffedcc0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,4 +9,11 @@ services: - ${INPUT:-./data/input}:/input - ${OUTPUT:-./data/output}:/output - ./deepspeech:/deepspeech - command: ./run.sh + command: ./run.sh ${ENGINE:-stt} + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] \ No newline at end of file diff --git a/run.sh b/run.sh index f99102a..bb0426a 100644 --- a/run.sh +++ b/run.sh @@ -2,6 +2,10 @@ # Meant to be used inside Docker. It scans the directory /input # for WAV or MP4 files and executes the python script for each of them. 
+# Get engine parameter +engine=$2 +echo "Engine: $engine" + input_path=/input shopt -s nullglob # prevent errors if no files are found @@ -15,9 +19,9 @@ for file in "$input_path"/*.wav; do echo "Processing WAV $file" #If model and scorer are not empty, use them if [ -n "$model" ] && [ -n "$scorer" ]; then - python3 -m autosub.main --wav "$file" --model "$model" --scorer "$scorer" + python3 -m autosub.main --wav "$file" --model "$model" --scorer "$scorer" --engine $engine else - python3 -m autosub.main --wav "$file" + python3 -m autosub.main --wav "$file" --engine $engine fi done @@ -25,8 +29,8 @@ for file in "$input_path"/*.mp4; do echo "Processing MP4 $file" #If model and scorer are not empty, use them if [ -n "$model" ] && [ -n "$scorer" ]; then - python3 -m autosub.main --file "$file" --model "$model" --scorer "$scorer" + python3 -m autosub.main --file "$file" --model "$model" --scorer "$scorer" --engine $engine else - python3 -m autosub.main --file "$file" + python3 -m autosub.main --file "$file" --engine $engine fi done