From 2f2185607d142dd34971e0a3a9fde24756984ea2 Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 13:24:19 -0300
Subject: [PATCH 01/10] - Added support for WAV files - Added docker-compose
 for easy usage - Tidy up of bash scripts

---
 .gitignore              |  9 ++++++--
 Dockerfile              | 24 ++++++++++++-------
 autosub/logger.py       |  2 +-
 autosub/main.py         | 39 ++++++++++++++++++++++---------
 autosub/segmentAudio.py |  4 +++-
 autosub/utils.py        |  4 ++--
 docker-compose.yml      | 10 ++++++++
 getmodels.sh            | 51 +++++++++++++++++++++++++++++++++--------
 requirements.txt        |  2 +-
 run.sh                  | 32 ++++++++++++++++++++++++++
 10 files changed, 141 insertions(+), 36 deletions(-)
 create mode 100644 docker-compose.yml
 create mode 100644 run.sh

diff --git a/.gitignore b/.gitignore
index 6b32b87..21c3038 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,9 +49,14 @@ Thumbs.db
 
 # Venv
 sub/*
-__pycache__
+**/__pycache__/
 
 # Folders
 audio/
 sub/
-output/
\ No newline at end of file
+output/
+model/
+
+.gitgitignore
+
+data/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 0020ca2..648f40c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,29 +1,37 @@
-ARG BASEIMAGE=ubuntu:18.04
-#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+#ARG BASEIMAGE=ubuntu:18.04
+ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
 
 FROM ${BASEIMAGE}
 
-ARG DEPSLIST=requirements.txt
-#ARG DEPSLIST=requirements-gpu.txt
+#ARG DEPSLIST=requirements.txt
+ARG DEPSLIST=requirements-gpu.txt
 
 ENV PYTHONUNBUFFERED 1
 
 COPY *.pbmm ./
 COPY *.scorer ./
 COPY setup.py ./
-COPY autosub ./autosub
 
 RUN DEBIAN_FRONTEND=noninteractive apt update && \
-    apt -y install ffmpeg libsm6 libxext6 python3 python3-pip && \
+    apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \
     apt -y clean && \
 	rm -rf /var/lib/apt/lists/*
 
 COPY $DEPSLIST ./requirements.txt
+COPY run.sh ./run.sh
+RUN chmod +x ./run.sh
 
 # make sure pip is up-to-date
 RUN python3 -m pip install --upgrade pip
 RUN pip3 install --no-cache-dir -r requirements.txt
 
-RUN mkdir audio output
+# Mount data
+VOLUME /input
+VOLUME /output
+VOLUME /deepspeech
+RUN mkdir /audio
+
+# Copy autosub last so code changes rebuild quickly while debugging
+COPY autosub ./autosub
 
-ENTRYPOINT ["python3", "autosub/main.py"]
+ENTRYPOINT ["./run.sh"]
diff --git a/autosub/logger.py b/autosub/logger.py
index d6c0739..cc7a5fa 100644
--- a/autosub/logger.py
+++ b/autosub/logger.py
@@ -5,7 +5,7 @@
 
 def setup_applevel_logger(logger_name = APP_NAME, file_name=None): 
     logger = logging.getLogger(logger_name)
-    logger.setLevel(logging.INFO)
+    logger.setLevel(logging.DEBUG)
     formatter = logging.Formatter("[%(levelname)s] %(message)s") #%(name)s |
     sh = logging.StreamHandler(sys.stdout)
     sh.setFormatter(formatter)
diff --git a/autosub/main.py b/autosub/main.py
index 6960fd0..2bc7a15 100644
--- a/autosub/main.py
+++ b/autosub/main.py
@@ -5,11 +5,11 @@
 import re
 import sys
 import wave
-from autosub import logger
+from . import logger
 import argparse
 
 import numpy as np
-from tqdm import tqdm
+# from tqdm import tqdm
 
 from autosub.utils import *
 from autosub.writeToFile import write_to_file
@@ -95,6 +95,7 @@ def main():
     parser.add_argument("--engine", choices=supported_engines, nargs="?", default="stt",
                         help="Select either DeepSpeech or Coqui STT for inference. Latter is default")
     parser.add_argument("--file", required=False, help="Input video file")
+    parser.add_argument("--wav", required=False, help="Input wav file")
     parser.add_argument("--model", required=False, help="Input *.pbmm model file")
     parser.add_argument("--scorer", required=False, help="Input *.scorer file")
     
@@ -120,22 +121,35 @@ def main():
         else:
             _logger.error(f"Invalid file: {args.file}")
             sys.exit(1)
+    elif args.wav is not None:
+        if os.path.isfile(args.wav):
+            input_file = args.wav
+            _logger.info(f"Input file: {args.wav}")
+        else:
+            _logger.error(f"Invalid file: {args.wav}")
+            sys.exit(1)
     else:
-        _logger.error("One or more of --file or --dry-run are required")
+        _logger.error("One or more of --file or --dry-run or --wav are required")
         sys.exit(1)
 
+    # File names
     base_directory = os.getcwd()
     output_directory = os.path.join(base_directory, "output")
+    file_prefix = os.path.splitext(os.path.basename(input_file))[0]
     audio_directory = os.path.join(base_directory, "audio")
-    video_prefix = os.path.splitext(os.path.basename(input_file))[0]
-    audio_file_name = os.path.join(audio_directory, video_prefix + ".wav")
+    _logger.debug(f"Audio directory: {audio_directory}")
+    if args.wav is not None:
+        audio_file_name = input_file
+    else:
+        audio_file_name = os.path.join(audio_directory, file_prefix + ".wav")
+        os.makedirs(audio_directory, exist_ok=True)
     
     os.makedirs(output_directory, exist_ok=True)
     os.makedirs(audio_directory, exist_ok=True)
     output_file_handle_dict = {}
 
     for format in args.format:
-        output_filename = os.path.join(output_directory, video_prefix + "." + format)
+        output_filename = os.path.join(output_directory, file_prefix + "." + format)
         # print("Creating file: " + output_filename)
         output_file_handle_dict[format] = open(output_filename, "w")
         # For VTT format, write header
@@ -144,19 +158,22 @@ def main():
             output_file_handle_dict[format].write("Kind: captions\n\n")
 
     clean_folder(audio_directory)
-    extract_audio(input_file, audio_file_name)
+    if args.wav is None:
+        extract_audio(input_file, audio_file_name)
 
     _logger.info("Splitting on silent parts in audio file")
-    remove_silent_segments(audio_file_name)
+    remove_silent_segments(audio_file_name, output_dir = audio_directory)
+
+    _logger.debug(f"audio_directory: {os.listdir(audio_directory)[:5]}")
 
-    audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(video_prefix)]
+    audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(file_prefix)]
     audiofiles = sort_alphanumeric(audiofiles)
-    audiofiles.remove(os.path.basename(audio_file_name))
+    audiofiles.remove(os.path.basename(audio_file_name)) if args.wav is None else None
 
     _logger.info("Running inference...")
     ds = create_model(args.engine, ds_model, ds_scorer) 
 
-    for filename in tqdm(audiofiles):
+    for filename in audiofiles:
         audio_segment_path = os.path.join(audio_directory, filename)
         ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration)
 
diff --git a/autosub/segmentAudio.py b/autosub/segmentAudio.py
index 7a9575d..fe5f8e4 100644
--- a/autosub/segmentAudio.py
+++ b/autosub/segmentAudio.py
@@ -185,7 +185,7 @@ def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
     return seg_limits
 
 
-def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2):
+def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2, output_dir=None):
     """Remove silent segments from an audio file and split on those segments
 
     Args:
@@ -202,5 +202,7 @@ def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2):
 
     for i, s in enumerate(segmentLimits):
         strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(input_file[0:-4], s[0], s[1])
+        if output_dir is not None:
+            strOut = os.path.join(output_dir, os.path.basename(strOut))
         wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
 
diff --git a/autosub/utils.py b/autosub/utils.py
index 7f6f67a..c3867aa 100644
--- a/autosub/utils.py
+++ b/autosub/utils.py
@@ -62,10 +62,10 @@ def download_model(engine, fname):
         fname : either of "model" or "scorer"
     """
 
-    _logger.info(f"{fname.capitalize()} not found locally. Downloading")
     try:
         _file = _models[engine][fname]
-        command = ["wget", _file, "-q", "--show-progress"]
+        _logger.warning(f"{fname.capitalize()} not found locally. Downloading: {_file}")
+        command = ["wget", _file, "-q"]
         ret = subprocess.run(command).returncode
     except Exception as e:
         _logger.error(str(e))
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..5c4dc32
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,10 @@
+version: '3'
+
+services:
+  autosub:
+    build: .
+    volumes:
+      - ./data/input:/input
+      - ./data/output:/output
+      - ./deepspeech:/deepspeech
+    command: ./run.sh
\ No newline at end of file
diff --git a/getmodels.sh b/getmodels.sh
index 927256a..a1bb3fb 100755
--- a/getmodels.sh
+++ b/getmodels.sh
@@ -1,14 +1,45 @@
 #!/bin/bash
+#Downloaded models will be saved on ./model
 
-if [ -z $1 ]; then
-	echo "Please provide as argument the model number you wish to download. E.G. 0.9.3"
-	exit 1;
-else
-	model=$1
-fi
+#Flags:
+# -m or --model: Model version. Default: 0.9.3
+# -t or --type: Model type. Default: stt
+# -h or --help: Show help
+while [ "$1" != "" ]; do
+	case $1 in
+		-m | --model )          shift
+								model=$1
+								;;
+		-t | --type )           shift
+								type=$1
+								;;
+		-h | --help )           echo "Usage: getmodels.sh [-m model] [-t type] [-h help]"
+								echo "Flags:"
+								echo "-m or --model: Model version. Default: 0.9.3"
+								echo "-t or --type: Model type. Default: stt"
+								echo "-h or --help: Show help"
+								exit
+								;;
+		* )                     echo "Usage: getmodels.sh [-m model] [-t type] [-h help]"
+								echo "Flags:"
+								echo "-m or --model: Model version. Default: 0.9.3"
+								echo "-t or --type: Model type. Default: stt"
+								echo "-h or --help: Show help"
+								exit 1
+	esac
+	shift
+done
 
-model_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm
-scorer_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer
-
-wget ${model_url} && wget ${scorer_url}
+#If type is ds, download DeepSpeech model, otherwise download Mozilla TTS model
+mkdir -p ./deepspeech
+if [ "$type" = "ds" ]; then
+	model_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm
+	scorer_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer
+	wget -O ./deepspeech/deepspeech-$model-models.pbmm $model_url
+	wget -O ./deepspeech/deepspeech-$model-models.scorer $scorer_url
+elif [ "$type" = "tts" ]; then
+	model_url="https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v$model/model.tflite"
+	scorer_url="https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer"
+	wget -O ./deepspeech/model.tflite $model_url
+	wget -O ./deepspeech/huge-vocabulary.scorer $scorer_url
 
diff --git a/requirements.txt b/requirements.txt
index 87d1e57..5d5117c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,6 @@ pydub==0.23.1
 pyparsing==2.4.7
 python-dateutil==2.8.1
 scikit-learn
-scipy==1.9.3
+scipy
 six==1.15.0
 tqdm==4.44.1
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..8e02909
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Meant to be used inside Docker. It scans the directory /input
+# for WAV or MP4 files and executes the python script for each of them.
+
+input_path=/input
+
+shopt -s nullglob # prevent errors if no files are found
+
+#Get model and scorer names from the model directory.
+#Ignore if doesn't exist
+model=$(find /deepspeech -name '*.pbmm' -print -quit)
+scorer=$(find /deepspeech -name '*.scorer' -print -quit)
+
+for file in "$input_path"/*.wav; do
+    echo "Processing WAV $file"
+    #If model and scorer are not empty, use them
+    if [ -n "$model" ] && [ -n "$scorer" ]; then
+        python3 -m autosub.main --wav "$file" --model "$model" --scorer "$scorer"
+    else
+        python3 -m autosub.main --wav "$file"
+    fi
+done
+
+for file in "$input_path"/*.mp4; do
+    echo "Processing MP4 $file"
+    #If model and scorer are not empty, use them
+    if [ -n "$model" ] && [ -n "$scorer" ]; then
+        python3 -m autosub.main --file "$file" --model "$model" --scorer "$scorer"
+    else
+        python3 -m autosub.main --file "$file"
+    fi
+done

From aef82e3dd2b296e324352d9fb2909aa1eaebd695 Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 13:37:16 -0300
Subject: [PATCH 02/10] Added details about docker-compose on readme

---
 README.md | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b7b9627..afc0190 100644
--- a/README.md
+++ b/README.md
@@ -49,8 +49,9 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
 
 * If you don't have the model files, get them
     ```bash
-    $ ./getmodels.sh 0.9.3
+    $ ./getmodels.sh -m 0.9.3 -t stt
     ```
+    More options are available with `./getmodels.sh -h`
 * For a CPU build
     ```bash
     $ docker build -t autosub .
@@ -69,6 +70,22 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
     $ docker cp autosub:/output/ .
     ```
 
+## Docker-Compose
+
+The docker compose file allows for easy processing of multiple MP4 or WAV files at once. Create a folder `data/input` in the root directory and place all of the media files to be transcribed in it. Then, run:
+
+``` bash
+docker-compose up
+```
+
+If you haven't built it already, append the flag `--build` at the end of the command:
+
+``` bash
+docker-compose up --build
+```
+
+Once the process thas finished, the output SRTs will be located at `data/output`.
+
 ## How-to example
 
 * The model files should be in the repo root directory and will be loaded/downloaded automatically. Incase you have multiple versions, use the `--model` and `--scorer` args while executing

From 94a59986d412d3062a044173e0cabf7cfac79b78 Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 13:46:13 -0300
Subject: [PATCH 03/10] tidy up getmodels.sh and put tqdm back on main.py

---
 autosub/main.py |  4 +--
 getmodels.sh    | 78 +++++++++++++++++++++++++++----------------------
 2 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/autosub/main.py b/autosub/main.py
index 2bc7a15..ff04d95 100644
--- a/autosub/main.py
+++ b/autosub/main.py
@@ -9,7 +9,7 @@
 import argparse
 
 import numpy as np
-# from tqdm import tqdm
+from tqdm import tqdm
 
 from autosub.utils import *
 from autosub.writeToFile import write_to_file
@@ -173,7 +173,7 @@ def main():
     _logger.info("Running inference...")
     ds = create_model(args.engine, ds_model, ds_scorer) 
 
-    for filename in audiofiles:
+    for filename in tqdm(audiofiles, desc="Inference"):
         audio_segment_path = os.path.join(audio_directory, filename)
         ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration)
 
diff --git a/getmodels.sh b/getmodels.sh
index a1bb3fb..d034a4a 100755
--- a/getmodels.sh
+++ b/getmodels.sh
@@ -1,45 +1,53 @@
 #!/bin/bash
-#Downloaded models will be saved on ./model
+# Downloaded models will be saved in ./deepspeech
 
-#Flags:
+# Flags:
 # -m or --model: Model version. Default: 0.9.3
 # -t or --type: Model type. Default: stt
 # -h or --help: Show help
+model="0.9.3"; type="stt"  # apply the documented defaults; -m/-t below override them
 while [ "$1" != "" ]; do
-	case $1 in
-		-m | --model )          shift
-								model=$1
-								;;
-		-t | --type )           shift
-								type=$1
-								;;
-		-h | --help )           echo "Usage: getmodels.sh [-m model] [-t type] [-h help]"
-								echo "Flags:"
-								echo "-m or --model: Model version. Default: 0.9.3"
-								echo "-t or --type: Model type. Default: stt"
-								echo "-h or --help: Show help"
-								exit
-								;;
-		* )                     echo "Usage: getmodels.sh [-m model] [-t type] [-h help]"
-								echo "Flags:"
-								echo "-m or --model: Model version. Default: 0.9.3"
-								echo "-t or --type: Model type. Default: stt"
-								echo "-h or --help: Show help"
-								exit 1
-	esac
-	shift
+    case $1 in
+        -m | --model )
+            shift
+            model=$1
+            ;;
+        -t | --type )
+            shift
+            type=$1
+            ;;
+        -h | --help )
+            echo "Usage: getmodels.sh [-m model] [-t type] [-h help]"
+            echo "Flags:"
+            echo "-m or --model: Model version. Default: 0.9.3"
+            echo "-t or --type: Model type. Default: stt"
+            echo "-h or --help: Show help"
+            exit
+            ;;
+        * )
+            echo "Usage: getmodels.sh [-m model] [-t type] [-h help]"
+            echo "Flags:"
+            echo "-m or --model: Model version. Default: 0.9.3"
+            echo "-t or --type: Model type. Default: stt"
+            echo "-h or --help: Show help"
+            exit 1
+    esac
+    shift
 done
 
-#If type is ds, download DeepSpeech model, otherwise download Mozilla TTS model
+# If type is ds, download the DeepSpeech model; if type is stt, download the Coqui STT model
 mkdir -p ./deepspeech
 if [ "$type" = "ds" ]; then
-	model_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm
-	scorer_url=https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer
-	wget -O ./deepspeech/deepspeech-$model-models.pbmm $model_url
-	wget -O ./deepspeech/deepspeech-$model-models.scorer $scorer_url
-elif [ "$type" = "tts" ]; then
-	model_url="https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v$model/model.tflite"
-	scorer_url="https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer"
-	wget -O ./deepspeech/model.tflite $model_url
-	wget -O ./deepspeech/huge-vocabulary.scorer $scorer_url
-
+    model_url="https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.pbmm"
+    scorer_url="https://github.com/mozilla/DeepSpeech/releases/download/v$model/deepspeech-$model-models.scorer"
+    wget -O ./deepspeech/deepspeech-$model-models.pbmm "$model_url"
+    wget -O ./deepspeech/deepspeech-$model-models.scorer "$scorer_url"
+elif [ "$type" = "stt" ]; then
+    model_url="https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v$model/model.tflite"
+    scorer_url="https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer"
+    wget -O ./deepspeech/model.tflite "$model_url"
+    wget -O ./deepspeech/huge-vocabulary.scorer "$scorer_url"
+else
+	echo "Invalid model type. Use -t or --type to specify model type."
+	exit 1
+fi

From 1221c59827eeb353f82f6efdb3b2457c63764fcf Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 13:50:18 -0300
Subject: [PATCH 04/10] run.sh wasn't working for .tflite

---
 run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.sh b/run.sh
index 8e02909..f99102a 100644
--- a/run.sh
+++ b/run.sh
@@ -8,7 +8,7 @@ shopt -s nullglob # prevent errors if no files are found
 
 #Get model and scorer names from the model directory.
 #Ignore if doesn't exist
-model=$(find /deepspeech -name '*.pbmm' -print -quit)
+model=$(find /deepspeech \( -name '*.pbmm' -o -name '*.tflite' \) -print -quit)
 scorer=$(find /deepspeech -name '*.scorer' -print -quit)
 
 for file in "$input_path"/*.wav; do

From d11fc61219ef9e423c3f0dabe0d5a25b040a2477 Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 14:36:36 -0300
Subject: [PATCH 05/10] Changed tqdm to print every once in a while instead of
 its default progress bar

---
 Dockerfile       |  8 ++++----
 autosub/main.py  |  5 ++++-
 autosub/utils.py | 21 +++++++++++++--------
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 648f40c..28af997 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,10 @@
-#ARG BASEIMAGE=ubuntu:18.04
-ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+ARG BASEIMAGE=ubuntu:18.04
+#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
 
 FROM ${BASEIMAGE}
 
-#ARG DEPSLIST=requirements.txt
-ARG DEPSLIST=requirements-gpu.txt
+ARG DEPSLIST=requirements.txt
+#ARG DEPSLIST=requirements-gpu.txt
 
 ENV PYTHONUNBUFFERED 1
 
diff --git a/autosub/main.py b/autosub/main.py
index ff04d95..69a973f 100644
--- a/autosub/main.py
+++ b/autosub/main.py
@@ -173,9 +173,12 @@ def main():
     _logger.info("Running inference...")
     ds = create_model(args.engine, ds_model, ds_scorer) 
 
-    for filename in tqdm(audiofiles, desc="Inference"):
+    progress = tqdm(total=len(audiofiles), desc="Inference", position=0)
+    for filename in audiofiles:
         audio_segment_path = os.path.join(audio_directory, filename)
         ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration)
+        progress.update(1)
+    progress.close()
 
     for format in output_file_handle_dict:
         file_handle = output_file_handle_dict[format]
diff --git a/autosub/utils.py b/autosub/utils.py
index c3867aa..06fded9 100644
--- a/autosub/utils.py
+++ b/autosub/utils.py
@@ -123,21 +123,26 @@ def create_model(engine, model, scorer):
 
     Args:
         engine : "ds" for DeepSpeech and "stt" for Coqui STT
-        model : .pbmm model file
+        model : .pbmm or .tflite model file
         scorer : .scorer file
     """
 
-    try:
-        if engine == "ds":
-            ds = DModel(model)
-        else:
-            ds = SModel(model)
-    except:
-        _logger.error("Invalid model file")
+    if engine == "ds":
+        _logger.debug("Loading DeepSpeech model")
+        ds = DModel(model)
+        _logger.debug("Completed loading DeepSpeech model")
+    elif engine == "stt":
+        _logger.debug("Loading Coqui STT model")
+        ds = SModel(model)
+        _logger.debug("Completed loading Coqui STT model")
+    else:
+        _logger.error("Invalid engine")
         sys.exit(1)
 
     try:
+        _logger.debug("Loading scorer")
         ds.enableExternalScorer(scorer)
+        _logger.debug("Completed loading scorer")
     except:
         _logger.warn("Invalid scorer file. Running inference using only model file")
     return(ds)

From 986d79c557b911761af657ec7644b46e39b673d6 Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 14:42:28 -0300
Subject: [PATCH 06/10] Created separate Dockerfiles for CPU and GPU

---
 Dockerfile => Dockerfile.cpu |  2 --
 Dockerfile.gpu               | 35 +++++++++++++++++++++++++++++++++++
 autosub/main.py              |  1 +
 docker-compose.yml           |  1 +
 4 files changed, 37 insertions(+), 2 deletions(-)
 rename Dockerfile => Dockerfile.cpu (88%)
 create mode 100644 Dockerfile.gpu

diff --git a/Dockerfile b/Dockerfile.cpu
similarity index 88%
rename from Dockerfile
rename to Dockerfile.cpu
index 28af997..f1be9e9 100644
--- a/Dockerfile
+++ b/Dockerfile.cpu
@@ -1,10 +1,8 @@
 ARG BASEIMAGE=ubuntu:18.04
-#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
 
 FROM ${BASEIMAGE}
 
 ARG DEPSLIST=requirements.txt
-#ARG DEPSLIST=requirements-gpu.txt
 
 ENV PYTHONUNBUFFERED 1
 
diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 0000000..f4ae2a8
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,35 @@
+ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+
+FROM ${BASEIMAGE}
+
+ARG DEPSLIST=requirements-gpu.txt
+
+ENV PYTHONUNBUFFERED 1
+
+COPY *.pbmm ./
+COPY *.scorer ./
+COPY setup.py ./
+
+RUN DEBIAN_FRONTEND=noninteractive apt update && \
+    apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \
+    apt -y clean && \
+	rm -rf /var/lib/apt/lists/*
+
+COPY $DEPSLIST ./requirements.txt
+COPY run.sh ./run.sh
+RUN chmod +x ./run.sh
+
+# make sure pip is up-to-date
+RUN python3 -m pip install --upgrade pip
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Mount data
+VOLUME /input
+VOLUME /output
+VOLUME /deepspeech
+RUN mkdir /audio
+
+# Copy autosub last so code changes rebuild quickly while debugging
+COPY autosub ./autosub
+
+ENTRYPOINT ["./run.sh"]
diff --git a/autosub/main.py b/autosub/main.py
index 69a973f..affe8dd 100644
--- a/autosub/main.py
+++ b/autosub/main.py
@@ -173,6 +173,7 @@ def main():
     _logger.info("Running inference...")
     ds = create_model(args.engine, ds_model, ds_scorer) 
 
+    _logger.info("Starting transcription")
     progress = tqdm(total=len(audiofiles), desc="Inference", position=0)
     for filename in audiofiles:
         audio_segment_path = os.path.join(audio_directory, filename)
diff --git a/docker-compose.yml b/docker-compose.yml
index 5c4dc32..08db2bc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,6 +3,7 @@ version: '3'
 services:
   autosub:
     build: .
+    dockerfile: Dockerfile.gpu
     volumes:
       - ./data/input:/input
       - ./data/output:/output

From 7cba57e9331878515589a07a2a900f67d1657852 Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 14:47:01 -0300
Subject: [PATCH 07/10] Docker compose can choose dockerfile dynamically

---
 README.md          | 12 ++++++++++++
 docker-compose.yml |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index afc0190..712fab9 100644
--- a/README.md
+++ b/README.md
@@ -78,12 +78,24 @@ The docker compose file allows for easy processing of multiple MP4 or WAV files
 docker-compose up
 ```
 
+To use the CPU implementation instead of GPU, change the dockefile with the environment variable `DOCKERFILE`:
+
+``` bash
+docker-compose up -e DOCKERFILE=Dockerfile.cpu
+```
+
 If you haven't built it already, append the flag `--build` at the end of the command:
 
 ``` bash
 docker-compose up --build
 ```
 
+or
+
+``` bash
+docker-compose up -e DOCKERFILE=Dockerfile.cpu --build
+```
+
 Once the process thas finished, the output SRTs will be located at `data/output`.
 
 ## How-to example
diff --git a/docker-compose.yml b/docker-compose.yml
index 08db2bc..f1d77a7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,7 +3,7 @@ version: '3'
 services:
   autosub:
     build: .
-    dockerfile: Dockerfile.gpu
+    dockerfile: ${DOCKERFILE:-Dockerfile.gpu}
     volumes:
       - ./data/input:/input
       - ./data/output:/output

From 100b13178191925a64a1d85c6eac7b49d4fc05ac Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 15:20:03 -0300
Subject: [PATCH 08/10] Made input and output as env variables on
 docker-compose

---
 autosub/main.py    | 2 +-
 docker-compose.yml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/autosub/main.py b/autosub/main.py
index affe8dd..a66b81e 100644
--- a/autosub/main.py
+++ b/autosub/main.py
@@ -174,7 +174,7 @@ def main():
     ds = create_model(args.engine, ds_model, ds_scorer) 
 
     _logger.info("Starting transcription")
-    progress = tqdm(total=len(audiofiles), desc="Inference", position=0)
+    progress = tqdm(total=len(audiofiles), desc="Inference", position=0, dynamic_ncols=False)
     for filename in audiofiles:
         audio_segment_path = os.path.join(audio_directory, filename)
         ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration)
diff --git a/docker-compose.yml b/docker-compose.yml
index f1d77a7..5dc5d04 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,7 +5,7 @@ services:
     build: .
     dockerfile: ${DOCKERFILE:-Dockerfile.gpu}
     volumes:
-      - ./data/input:/input
-      - ./data/output:/output
+      - ${INPUT:-./data/input}:/input
+      - ${OUTPUT:-./data/output}:/output
       - ./deepspeech:/deepspeech
     command: ./run.sh
\ No newline at end of file

From 51b8fc082cfc8eae9dbf1ad188a9e657a3a6875a Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Sun, 26 Mar 2023 23:02:20 -0300
Subject: [PATCH 09/10] Minor change to docker-compose file

---
 README.md          | 10 ++++++++--
 docker-compose.yml |  7 ++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 712fab9..669d833 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ docker-compose up
 To use the CPU implementation instead of GPU, change the dockefile with the environment variable `DOCKERFILE`:
 
 ``` bash
-docker-compose up -e DOCKERFILE=Dockerfile.cpu
+DOCKERFILE=Dockerfile.cpu docker-compose up
 ```
 
 If you haven't built it already, append the flag `--build` at the end of the command:
@@ -93,11 +93,17 @@ docker-compose up --build
 or
 
 ``` bash
-docker-compose up -e DOCKERFILE=Dockerfile.cpu --build
+DOCKERFILE=Dockerfile.cpu docker-compose up --build
 ```
 
 Once the process thas finished, the output SRTs will be located at `data/output`.
 
+To specify custom paths, use the `INPUT` and `OUTPUT` variables:
+
+``` bash
+INPUT=/folder/to/input OUTPUT=/folder/to/output docker-compose up
+```
+
 ## How-to example
 
 * The model files should be in the repo root directory and will be loaded/downloaded automatically. Incase you have multiple versions, use the `--model` and `--scorer` args while executing
diff --git a/docker-compose.yml b/docker-compose.yml
index 5dc5d04..44bda2f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,10 +2,11 @@ version: '3'
 
 services:
   autosub:
-    build: .
-    dockerfile: ${DOCKERFILE:-Dockerfile.gpu}
+    build:
+        context: .
+        dockerfile: ${DOCKERFILE:-Dockerfile.gpu}
     volumes:
       - ${INPUT:-./data/input}:/input
       - ${OUTPUT:-./data/output}:/output
       - ./deepspeech:/deepspeech
-    command: ./run.sh
\ No newline at end of file
+    command: ./run.sh

From 057e7495991e68e65faa3a2b3c222db4e4cb0236 Mon Sep 17 00:00:00 2001
From: differentiablepizza <>
Date: Mon, 27 Mar 2023 01:43:19 -0300
Subject: [PATCH 10/10] Made adaptations to run GPU on container

---
 README.md          |  1 +
 docker-compose.yml |  9 ++++++++-
 run.sh             | 12 ++++++++----
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 669d833..21db9b7 100644
--- a/README.md
+++ b/README.md
@@ -149,3 +149,4 @@ I would love to follow up on any suggestions/issues you find :)
 1. https://github.com/mozilla/DeepSpeech/
 2. https://github.com/tyiannak/pyAudioAnalysis
 3. https://deepspeech.readthedocs.io/
+4. [Quick guide to install nvidia container toolkit](https://www.server-world.info/en/note?os=Ubuntu_20.04&p=nvidia&f=2)
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 44bda2f..ffedcc0 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,4 +9,11 @@ services:
       - ${INPUT:-./data/input}:/input
       - ${OUTPUT:-./data/output}:/output
       - ./deepspeech:/deepspeech
-    command: ./run.sh
+    command: ./run.sh ${ENGINE:-stt}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
\ No newline at end of file
diff --git a/run.sh b/run.sh
index f99102a..bb0426a 100644
--- a/run.sh
+++ b/run.sh
@@ -2,6 +2,10 @@
 # Meant to be used inside Docker. It scans the directory /input
 # for WAV or MP4 files and executes the python script for each of them.
 
+# Engine is $2: compose's command "./run.sh <engine>" is appended to the ENTRYPOINT as args
+engine=${2:-stt}  # fall back to stt when started without the compose command
+echo "Engine: $engine"
+
 input_path=/input
 
 shopt -s nullglob # prevent errors if no files are found
@@ -15,9 +19,9 @@ for file in "$input_path"/*.wav; do
     echo "Processing WAV $file"
     #If model and scorer are not empty, use them
     if [ -n "$model" ] && [ -n "$scorer" ]; then
-        python3 -m autosub.main --wav "$file" --model "$model" --scorer "$scorer"
+        python3 -m autosub.main --wav "$file" --model "$model" --scorer "$scorer" --engine $engine
     else
-        python3 -m autosub.main --wav "$file"
+        python3 -m autosub.main --wav "$file" --engine $engine
     fi
 done
 
@@ -25,8 +29,8 @@ for file in "$input_path"/*.mp4; do
     echo "Processing MP4 $file"
     #If model and scorer are not empty, use them
     if [ -n "$model" ] && [ -n "$scorer" ]; then
-        python3 -m autosub.main --file "$file" --model "$model" --scorer "$scorer"
+        python3 -m autosub.main --file "$file" --model "$model" --scorer "$scorer" --engine $engine
     else
-        python3 -m autosub.main --file "$file"
+        python3 -m autosub.main --file "$file" --engine $engine
     fi
 done