diff --git a/.gitignore b/.gitignore index 6b32b87..21c3038 100644 --- a/.gitignore +++ b/.gitignore @@ -49,9 +49,14 @@ Thumbs.db # Venv sub/* -__pycache__ +**/__pycache__/ # Folders audio/ sub/ -output/ \ No newline at end of file +output/ +model/ + +.gitgitignore + +data/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile.cpu similarity index 62% rename from Dockerfile rename to Dockerfile.cpu index 0020ca2..f1be9e9 100644 --- a/Dockerfile +++ b/Dockerfile.cpu @@ -1,29 +1,35 @@ ARG BASEIMAGE=ubuntu:18.04 -#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 FROM ${BASEIMAGE} ARG DEPSLIST=requirements.txt -#ARG DEPSLIST=requirements-gpu.txt ENV PYTHONUNBUFFERED 1 COPY *.pbmm ./ COPY *.scorer ./ COPY setup.py ./ -COPY autosub ./autosub RUN DEBIAN_FRONTEND=noninteractive apt update && \ - apt -y install ffmpeg libsm6 libxext6 python3 python3-pip && \ + apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \ apt -y clean && \ rm -rf /var/lib/apt/lists/* COPY $DEPSLIST ./requirements.txt +COPY run.sh ./run.sh +RUN chmod +x ./run.sh # make sure pip is up-to-date RUN python3 -m pip install --upgrade pip RUN pip3 install --no-cache-dir -r requirements.txt -RUN mkdir audio output +# Mount data +VOLUME /input +VOLUME /output +VOLUME /deepspeech +RUN mkdir /audio + +# Copying autosub for last for faster debugging +COPY autosub ./autosub -ENTRYPOINT ["python3", "autosub/main.py"] +ENTRYPOINT ["./run.sh"] diff --git a/Dockerfile.gpu b/Dockerfile.gpu new file mode 100644 index 0000000..f4ae2a8 --- /dev/null +++ b/Dockerfile.gpu @@ -0,0 +1,35 @@ +ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 + +FROM ${BASEIMAGE} + +ARG DEPSLIST=requirements-gpu.txt + +ENV PYTHONUNBUFFERED 1 + +COPY *.pbmm ./ +COPY *.scorer ./ +COPY setup.py ./ + +RUN DEBIAN_FRONTEND=noninteractive apt update && \ + apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \ + apt -y clean && \ + rm -rf /var/lib/apt/lists/* + +COPY $DEPSLIST ./requirements.txt 
+COPY run.sh ./run.sh +RUN chmod +x ./run.sh + +# make sure pip is up-to-date +RUN python3 -m pip install --upgrade pip +RUN pip3 install --no-cache-dir -r requirements.txt + +# Mount data +VOLUME /input +VOLUME /output +VOLUME /deepspeech +RUN mkdir /audio + +# Copying autosub for last for faster debugging +COPY autosub ./autosub + +ENTRYPOINT ["./run.sh"] diff --git a/README.md b/README.md index b7b9627..21db9b7 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,9 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr * If you don't have the model files, get them ```bash - $ ./getmodels.sh 0.9.3 + $ ./getmodels.sh -m 0.9.3 -t stt ``` + More options are available with `./getmodels.sh -h` * For a CPU build ```bash $ docker build -t autosub . @@ -69,6 +70,40 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr $ docker cp autosub:/output/ . ``` +## Docker-Compose + +The docker compose file allows for easy processing of multiple MP4 or WAV files at once. Create a folder `data/input` in the root directory and paste all of the media files to be transcribed. Then, run: + +``` bash +docker-compose up +``` + +To use the CPU implementation instead of GPU, change the Dockerfile with the environment variable `DOCKERFILE`: + +``` bash +DOCKERFILE=Dockerfile.cpu docker-compose up +``` + +If you haven't built it already, append the flag `--build` at the end of the command: + +``` bash +docker-compose up --build +``` + +or + +``` bash +DOCKERFILE=Dockerfile.cpu docker-compose up --build +``` + +Once the process has finished, the output SRTs will be located at `data/output`. + +To specify custom paths, use the `INPUT` and `OUTPUT` variables: + +``` bash +INPUT=/folder/to/input OUTPUT=/folder/to/output docker-compose up +``` + +## How-to example + * The model files should be in the repo root directory and will be loaded/downloaded automatically.
Incase you have multiple versions, use the `--model` and `--scorer` args while executing @@ -114,3 +149,4 @@ I would love to follow up on any suggestions/issues you find :) 1. https://github.com/mozilla/DeepSpeech/ 2. https://github.com/tyiannak/pyAudioAnalysis 3. https://deepspeech.readthedocs.io/ +4. [Quick guide to install nvidia container toolkit](https://www.server-world.info/en/note?os=Ubuntu_20.04&p=nvidia&f=2) \ No newline at end of file diff --git a/autosub/logger.py b/autosub/logger.py index d6c0739..cc7a5fa 100644 --- a/autosub/logger.py +++ b/autosub/logger.py @@ -5,7 +5,7 @@ def setup_applevel_logger(logger_name = APP_NAME, file_name=None): logger = logging.getLogger(logger_name) - logger.setLevel(logging.INFO) + logger.setLevel(logging.DEBUG) formatter = logging.Formatter("[%(levelname)s] %(message)s") #%(name)s | sh = logging.StreamHandler(sys.stdout) sh.setFormatter(formatter) diff --git a/autosub/main.py b/autosub/main.py index 6960fd0..a66b81e 100644 --- a/autosub/main.py +++ b/autosub/main.py @@ -5,7 +5,7 @@ import re import sys import wave -from autosub import logger +from . import logger import argparse import numpy as np @@ -95,6 +95,7 @@ def main(): parser.add_argument("--engine", choices=supported_engines, nargs="?", default="stt", help="Select either DeepSpeech or Coqui STT for inference. 
Latter is default") parser.add_argument("--file", required=False, help="Input video file") + parser.add_argument("--wav", required=False, help="Input wav file") parser.add_argument("--model", required=False, help="Input *.pbmm model file") parser.add_argument("--scorer", required=False, help="Input *.scorer file") @@ -120,22 +121,35 @@ def main(): else: _logger.error(f"Invalid file: {args.file}") sys.exit(1) + elif args.wav is not None: + if os.path.isfile(args.wav): + input_file = args.wav + _logger.info(f"Input file: {args.wav}") + else: + _logger.error(f"Invalid file: {args.wav}") + sys.exit(1) else: - _logger.error("One or more of --file or --dry-run are required") + _logger.error("One or more of --file or --dry-run or --wav are required") sys.exit(1) + # File names base_directory = os.getcwd() output_directory = os.path.join(base_directory, "output") + file_prefix = os.path.splitext(os.path.basename(input_file))[0] audio_directory = os.path.join(base_directory, "audio") - video_prefix = os.path.splitext(os.path.basename(input_file))[0] - audio_file_name = os.path.join(audio_directory, video_prefix + ".wav") + _logger.debug(f"Audio directory: {audio_directory}") + if args.wav is not None: + audio_file_name = input_file + else: + audio_file_name = os.path.join(audio_directory, file_prefix + ".wav") + os.makedirs(audio_directory, exist_ok=True) os.makedirs(output_directory, exist_ok=True) os.makedirs(audio_directory, exist_ok=True) output_file_handle_dict = {} for format in args.format: - output_filename = os.path.join(output_directory, video_prefix + "." + format) + output_filename = os.path.join(output_directory, file_prefix + "." 
+ format) # print("Creating file: " + output_filename) output_file_handle_dict[format] = open(output_filename, "w") # For VTT format, write header @@ -144,21 +158,28 @@ def main(): output_file_handle_dict[format].write("Kind: captions\n\n") clean_folder(audio_directory) - extract_audio(input_file, audio_file_name) + if args.wav is None: + extract_audio(input_file, audio_file_name) _logger.info("Splitting on silent parts in audio file") - remove_silent_segments(audio_file_name) + remove_silent_segments(audio_file_name, output_dir = audio_directory) + + _logger.debug(f"audio_directory: {os.listdir(audio_directory)[:5]}") - audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(video_prefix)] + audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(file_prefix)] audiofiles = sort_alphanumeric(audiofiles) - audiofiles.remove(os.path.basename(audio_file_name)) + audiofiles.remove(os.path.basename(audio_file_name)) if args.wav is None else None _logger.info("Running inference...") ds = create_model(args.engine, ds_model, ds_scorer) - for filename in tqdm(audiofiles): + _logger.info("Starting transcription") + progress = tqdm(total=len(audiofiles), desc="Inference", position=0, dynamic_ncols=False) + for filename in audiofiles: audio_segment_path = os.path.join(audio_directory, filename) ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration) + progress.update(1) + progress.close() for format in output_file_handle_dict: file_handle = output_file_handle_dict[format] diff --git a/autosub/segmentAudio.py b/autosub/segmentAudio.py index 7a9575d..fe5f8e4 100644 --- a/autosub/segmentAudio.py +++ b/autosub/segmentAudio.py @@ -185,7 +185,7 @@ def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5, return seg_limits -def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2): +def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2, 
output_dir=None): """Remove silent segments from an audio file and split on those segments Args: @@ -202,5 +202,7 @@ def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2): for i, s in enumerate(segmentLimits): strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(input_file[0:-4], s[0], s[1]) + if output_dir is not None: + strOut = os.path.join(output_dir, os.path.basename(strOut)) wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])]) diff --git a/autosub/utils.py b/autosub/utils.py index 7f6f67a..06fded9 100644 --- a/autosub/utils.py +++ b/autosub/utils.py @@ -62,10 +62,10 @@ def download_model(engine, fname): fname : either of "model" or "scorer" """ - _logger.info(f"{fname.capitalize()} not found locally. Downloading") try: _file = _models[engine][fname] - command = ["wget", _file, "-q", "--show-progress"] + _logger.warning(f"{fname.capitalize()} not found locally. Downloading: {_file}") + command = ["wget", _file, "-q"] ret = subprocess.run(command).returncode except Exception as e: _logger.error(str(e)) @@ -123,21 +123,26 @@ def create_model(engine, model, scorer): Args: engine : "ds" for DeepSpeech and "stt" for Coqui STT - model : .pbmm model file + model : .pbmm or .tflite model file scorer : .scorer file """ - try: - if engine == "ds": - ds = DModel(model) - else: - ds = SModel(model) - except: - _logger.error("Invalid model file") + if engine == "ds": + _logger.debug("Loading DeepSpeech model") + ds = DModel(model) + _logger.debug("Completed loading DeepSpeech model") + elif engine == "stt": + _logger.debug("Loading Coqui STT model") + ds = SModel(model) + _logger.debug("Completed loading Coqui STT model") + else: + _logger.error("Invalid engine") sys.exit(1) try: + _logger.debug("Loading scorer") ds.enableExternalScorer(scorer) + _logger.debug("Completed loading scorer") except: _logger.warn("Invalid scorer file. 
Running inference using only model file") return(ds) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..ffedcc0 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,19 @@ +version: '3' + +services: + autosub: + build: + context: . + dockerfile: ${DOCKERFILE:-Dockerfile.gpu} + volumes: + - ${INPUT:-./data/input}:/input + - ${OUTPUT:-./data/output}:/output + - ./deepspeech:/deepspeech + command: ./run.sh ${ENGINE:-stt} + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] \ No newline at end of file diff --git a/getmodels.sh b/getmodels.sh index 927256a..d034a4a 100755 --- a/getmodels.sh +++ b/getmodels.sh @@ -1,14 +1,53 @@ #!/bin/bash +# Downloaded models will be saved on ./model -if [ -z $1 ]; then - echo "Please provide as argument the model number you wish to download. E.G. 0.9.3" - exit 1; -else - model=$1 -fi - -model_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm -scorer_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer +# Flags: +# -m or --model: Model version. Default: 0.9.3 +# -t or --type: Model type. Default: stt +# -h or --help: Show help -wget ${model_url} && wget ${scorer_url} +while [ "$1" != "" ]; do + case $1 in + -m | --model ) + shift + model=$1 + ;; + -t | --type ) + shift + type=$1 + ;; + -h | --help ) + echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" + echo "Flags:" + echo "-m or --model: Model version. Default: 0.9.3" + echo "-t or --type: Model type. Default: stt" + echo "-h or --help: Show help" + exit + ;; + * ) + echo "Usage: getmodels.sh [-m model] [-t type] [-h help]" + echo "Flags:" + echo "-m or --model: Model version. Default: 0.9.3" + echo "-t or --type: Model type. 
Default: stt" + echo "-h or --help: Show help" + exit 1 + esac + shift +done +# If type is ds, download DeepSpeech model, otherwise download Coqui STT model +mkdir -p ./deepspeech +if [ "$type" = "ds" ]; then + model_url="https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm" + scorer_url="https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer" + wget -O ./deepspeech/deepspeech-$model-models.pbmm "$model_url" + wget -O ./deepspeech/deepspeech-$model-models.scorer "$scorer_url" +elif [ "$type" = "stt" ]; then + model_url="https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v$model/model.tflite" + scorer_url="https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer" + wget -O ./deepspeech/model.tflite "$model_url" + wget -O ./deepspeech/huge-vocabulary.scorer "$scorer_url" +else + echo "Invalid model type. Use -t or --type to specify model type." + exit 1 +fi diff --git a/requirements.txt b/requirements.txt index 87d1e57..5d5117c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,6 @@ pydub==0.23.1 pyparsing==2.4.7 python-dateutil==2.8.1 scikit-learn -scipy==1.9.3 +scipy six==1.15.0 tqdm==4.44.1 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..bb0426a --- /dev/null +++ b/run.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Meant to be used inside Docker. It scans the directory /input +# for WAV or MP4 files and executes the python script for each of them. + +# Get engine parameter +engine=$2 +echo "Engine: $engine" + +input_path=/input + +shopt -s nullglob # prevent errors if no files are found + +#Get model and scorer names from the model directory. 
+#Ignore if doesn't exist +model=$(find /deepspeech \( -name '*.pbmm' -o -name '*.tflite' \) -print -quit) +scorer=$(find /deepspeech -name '*.scorer' -print -quit) + +for file in "$input_path"/*.wav; do + echo "Processing WAV $file" + #If model and scorer are not empty, use them + if [ -n "$model" ] && [ -n "$scorer" ]; then + python3 -m autosub.main --wav "$file" --model "$model" --scorer "$scorer" --engine $engine + else + python3 -m autosub.main --wav "$file" --engine $engine + fi +done + +for file in "$input_path"/*.mp4; do + echo "Processing MP4 $file" + #If model and scorer are not empty, use them + if [ -n "$model" ] && [ -n "$scorer" ]; then + python3 -m autosub.main --file "$file" --model "$model" --scorer "$scorer" --engine $engine + else + python3 -m autosub.main --file "$file" --engine $engine + fi +done