Added option to process WAVs directly and mass transcription with docker-compose #79

Status: Open · wants to merge 10 commits into base: master
9 changes: 7 additions & 2 deletions .gitignore
@@ -49,9 +49,14 @@ Thumbs.db

# Venv
sub/*
__pycache__
**/__pycache__/

# Folders
audio/
sub/
output/
output/
model/

.gitgitignore

data/
18 changes: 12 additions & 6 deletions Dockerfile → Dockerfile.cpu
@@ -1,29 +1,35 @@
ARG BASEIMAGE=ubuntu:18.04
#ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04

FROM ${BASEIMAGE}

ARG DEPSLIST=requirements.txt
#ARG DEPSLIST=requirements-gpu.txt

ENV PYTHONUNBUFFERED 1

COPY *.pbmm ./
COPY *.scorer ./
COPY setup.py ./
COPY autosub ./autosub

RUN DEBIAN_FRONTEND=noninteractive apt update && \
apt -y install ffmpeg libsm6 libxext6 python3 python3-pip && \
apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \
apt -y clean && \
rm -rf /var/lib/apt/lists/*

COPY $DEPSLIST ./requirements.txt
COPY run.sh ./run.sh
RUN chmod +x ./run.sh

# make sure pip is up-to-date
RUN python3 -m pip install --upgrade pip
RUN pip3 install --no-cache-dir -r requirements.txt

RUN mkdir audio output
# Mount data
VOLUME /input
VOLUME /output
VOLUME /deepspeech
RUN mkdir /audio

# Copy autosub last for faster rebuilds when debugging
COPY autosub ./autosub

ENTRYPOINT ["python3", "autosub/main.py"]
ENTRYPOINT ["./run.sh"]
35 changes: 35 additions & 0 deletions Dockerfile.gpu
@@ -0,0 +1,35 @@
ARG BASEIMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04

FROM ${BASEIMAGE}

ARG DEPSLIST=requirements-gpu.txt

ENV PYTHONUNBUFFERED 1

COPY *.pbmm ./
COPY *.scorer ./
COPY setup.py ./

RUN DEBIAN_FRONTEND=noninteractive apt update && \
apt -y install ffmpeg libsm6 libxext6 python3 python3-pip wget && \
apt -y clean && \
rm -rf /var/lib/apt/lists/*

COPY $DEPSLIST ./requirements.txt
COPY run.sh ./run.sh
RUN chmod +x ./run.sh

# make sure pip is up-to-date
RUN python3 -m pip install --upgrade pip
RUN pip3 install --no-cache-dir -r requirements.txt

# Mount data
VOLUME /input
VOLUME /output
VOLUME /deepspeech
RUN mkdir /audio

# Copy autosub last for faster rebuilds when debugging
COPY autosub ./autosub

ENTRYPOINT ["./run.sh"]
38 changes: 37 additions & 1 deletion README.md
@@ -49,8 +49,9 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr

* If you don't have the model files, get them
```bash
$ ./getmodels.sh 0.9.3
$ ./getmodels.sh -m 0.9.3 -t stt
```
More options are available with `./getmodels.sh -h`
* For a CPU build
```bash
$ docker build -t autosub .
@@ -69,6 +70,40 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
$ docker cp autosub:/output/ .
```

## Docker-Compose

The docker-compose file makes it easy to process multiple MP4 or WAV files at once. Create a folder `data/input` in the root directory and copy in all the media files to be transcribed. Then, run:

```bash
docker-compose up
```

To use the CPU implementation instead of the GPU one, select the Dockerfile with the `DOCKERFILE` environment variable:

```bash
DOCKERFILE=Dockerfile.cpu docker-compose up
```

If you haven't built the image yet, append the `--build` flag to the command:

```bash
docker-compose up --build
```

or

```bash
DOCKERFILE=Dockerfile.cpu docker-compose up --build
```

Once the process has finished, the output SRT files will be located in `data/output`.

To specify custom paths, use the `INPUT` and `OUTPUT` variables:

```bash
INPUT=/folder/to/input OUTPUT=/folder/to/output docker-compose up
```
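Compose resolves `${VAR:-default}` interpolation before starting the service: the environment variable wins when it is set and non-empty, otherwise the default applies. A rough Python mimic of that rule (a sketch for illustration only, not how Compose itself is implemented):

```python
import os
import re

def substitute(template, env=None):
    # Rough mimic of Compose's ${VAR:-default} interpolation:
    # use the variable when set and non-empty, else the default.
    env = os.environ if env is None else env
    def repl(match):
        name, default = match.group(1), match.group(2)
        return env.get(name) or default
    return re.sub(r"\$\{(\w+):-([^}]*)\}", repl, template)

print(substitute("${INPUT:-./data/input}", {"INPUT": "/folder/to/input"}))
# -> /folder/to/input
```

Note that with `:-` an empty value also falls back to the default, which is why an unset `INPUT` or `OUTPUT` quietly maps to `./data/input` and `./data/output`.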

## How-to example

* The model files should be in the repo root directory and will be loaded/downloaded automatically. In case you have multiple versions, use the `--model` and `--scorer` args while executing
@@ -114,3 +149,4 @@ I would love to follow up on any suggestions/issues you find :)
1. https://github.com/mozilla/DeepSpeech/
2. https://github.com/tyiannak/pyAudioAnalysis
3. https://deepspeech.readthedocs.io/
4. [Quick guide to install nvidia container toolkit](https://www.server-world.info/en/note?os=Ubuntu_20.04&p=nvidia&f=2)
2 changes: 1 addition & 1 deletion autosub/logger.py
@@ -5,7 +5,7 @@

def setup_applevel_logger(logger_name = APP_NAME, file_name=None):
logger = logging.getLogger(logger_name)
logger.setLevel(logging.INFO)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("[%(levelname)s] %(message)s") #%(name)s |
sh = logging.StreamHandler(sys.stdout)
sh.setFormatter(formatter)
41 changes: 31 additions & 10 deletions autosub/main.py
@@ -5,7 +5,7 @@
import re
import sys
import wave
from autosub import logger
from . import logger
import argparse

import numpy as np
@@ -95,6 +95,7 @@ def main():
parser.add_argument("--engine", choices=supported_engines, nargs="?", default="stt",
help="Select either DeepSpeech or Coqui STT for inference. Latter is default")
parser.add_argument("--file", required=False, help="Input video file")
parser.add_argument("--wav", required=False, help="Input wav file")
parser.add_argument("--model", required=False, help="Input *.pbmm model file")
parser.add_argument("--scorer", required=False, help="Input *.scorer file")

@@ -120,22 +121,35 @@
else:
_logger.error(f"Invalid file: {args.file}")
sys.exit(1)
elif args.wav is not None:
if os.path.isfile(args.wav):
input_file = args.wav
_logger.info(f"Input file: {args.wav}")
else:
_logger.error(f"Invalid file: {args.wav}")
sys.exit(1)
else:
_logger.error("One or more of --file or --dry-run are required")
_logger.error("One of --file, --wav or --dry-run is required")
sys.exit(1)
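The precedence the patch establishes between `--file` and `--wav` can be sketched as a standalone helper. `resolve_input` below is hypothetical (it is not part of the patch or the repo), assuming `--file` wins when both are given:

```python
import os

def resolve_input(file_arg, wav_arg):
    # Hypothetical sketch: --file takes precedence over --wav; either
    # must name an existing file, and at least one must be supplied.
    for arg in (file_arg, wav_arg):
        if arg is None:
            continue
        if os.path.isfile(arg):
            return arg
        raise SystemExit(f"Invalid file: {arg}")
    raise SystemExit("One of --file, --wav or --dry-run is required")
```

With a video input the file then goes through audio extraction; with `--wav` it is used as-is, which is what makes the direct-WAV path cheaper.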

# File names
base_directory = os.getcwd()
output_directory = os.path.join(base_directory, "output")
file_prefix = os.path.splitext(os.path.basename(input_file))[0]
audio_directory = os.path.join(base_directory, "audio")
video_prefix = os.path.splitext(os.path.basename(input_file))[0]
audio_file_name = os.path.join(audio_directory, video_prefix + ".wav")
_logger.debug(f"Audio directory: {audio_directory}")
if args.wav is not None:
audio_file_name = input_file
else:
audio_file_name = os.path.join(audio_directory, file_prefix + ".wav")
os.makedirs(audio_directory, exist_ok=True)

os.makedirs(output_directory, exist_ok=True)
os.makedirs(audio_directory, exist_ok=True)
output_file_handle_dict = {}

for format in args.format:
output_filename = os.path.join(output_directory, video_prefix + "." + format)
output_filename = os.path.join(output_directory, file_prefix + "." + format)
# print("Creating file: " + output_filename)
output_file_handle_dict[format] = open(output_filename, "w")
# For VTT format, write header
@@ -144,21 +158,28 @@
output_file_handle_dict[format].write("Kind: captions\n\n")
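The format loop above can be exercised in isolation. The `open_outputs` helper below is a simplified sketch (the name is invented, not from the patch) showing that the WebVTT preamble is written only for `.vtt` outputs:

```python
import os

def open_outputs(formats, file_prefix, output_directory):
    # Simplified sketch of the loop above: one handle per requested
    # format, with the WebVTT header written up front for .vtt files.
    handles = {}
    os.makedirs(output_directory, exist_ok=True)
    for fmt in formats:
        path = os.path.join(output_directory, file_prefix + "." + fmt)
        handles[fmt] = open(path, "w")
        if fmt == "vtt":
            handles[fmt].write("WEBVTT\n")
            handles[fmt].write("Kind: captions\n\n")
    return handles
```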

clean_folder(audio_directory)
extract_audio(input_file, audio_file_name)
if args.wav is None:
extract_audio(input_file, audio_file_name)

_logger.info("Splitting on silent parts in audio file")
remove_silent_segments(audio_file_name)
remove_silent_segments(audio_file_name, output_dir = audio_directory)

_logger.debug(f"audio_directory: {os.listdir(audio_directory)[:5]}")

audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(video_prefix)]
audiofiles = [file for file in os.listdir(audio_directory) if file.startswith(file_prefix)]
audiofiles = sort_alphanumeric(audiofiles)
audiofiles.remove(os.path.basename(audio_file_name))
if args.wav is None:
    audiofiles.remove(os.path.basename(audio_file_name))

_logger.info("Running inference...")
ds = create_model(args.engine, ds_model, ds_scorer)

for filename in tqdm(audiofiles):
_logger.info("Starting transcription")
progress = tqdm(total=len(audiofiles), desc="Inference", position=0, dynamic_ncols=False)
for filename in audiofiles:
audio_segment_path = os.path.join(audio_directory, filename)
ds_process_audio(ds, audio_segment_path, output_file_handle_dict, split_duration=args.split_duration)
progress.update(1)
progress.close()

for format in output_file_handle_dict:
file_handle = output_file_handle_dict[format]
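`sort_alphanumeric` matters here because the silence splitter names segments with numeric suffixes, and a plain lexicographic sort would put `seg_10` before `seg_2`. A hypothetical natural-sort implementation (the repo's actual helper may differ):

```python
import re

def sort_alphanumeric(files):
    # Natural sort: digit runs compare as numbers, everything else as
    # lowercase text, so seg_2 sorts before seg_10.
    def key(name):
        return [int(part) if part.isdigit() else part.lower()
                for part in re.split(r"(\d+)", name)]
    return sorted(files, key=key)
```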
4 changes: 3 additions & 1 deletion autosub/segmentAudio.py
@@ -185,7 +185,7 @@ def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
return seg_limits


def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2):
def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2, output_dir=None):
"""Remove silent segments from an audio file and split on those segments

Args:
@@ -202,5 +202,7 @@ def remove_silent_segments(input_file, smoothing_window=1.0, weight=0.2):

for i, s in enumerate(segmentLimits):
strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(input_file[0:-4], s[0], s[1])
if output_dir is not None:
strOut = os.path.join(output_dir, os.path.basename(strOut))
wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])

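The two added lines redirect each segment into `output_dir` while keeping the generated name, which stops WAV-input runs from scattering segment files next to the source audio. As a standalone sketch (`redirect_segment` is an invented name for illustration):

```python
import os

def redirect_segment(str_out, output_dir=None):
    # Mirrors the patch: keep the generated segment filename but
    # re-root it under output_dir when one is given.
    if output_dir is not None:
        return os.path.join(output_dir, os.path.basename(str_out))
    return str_out
```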
25 changes: 15 additions & 10 deletions autosub/utils.py
@@ -62,10 +62,10 @@ def download_model(engine, fname):
fname : either of "model" or "scorer"
"""

_logger.info(f"{fname.capitalize()} not found locally. Downloading")
try:
_file = _models[engine][fname]
command = ["wget", _file, "-q", "--show-progress"]
_logger.warning(f"{fname.capitalize()} not found locally. Downloading: {_file}")
command = ["wget", _file, "-q"]
ret = subprocess.run(command).returncode
except Exception as e:
_logger.error(str(e))
@@ -123,21 +123,26 @@ def create_model(engine, model, scorer):

Args:
engine : "ds" for DeepSpeech and "stt" for Coqui STT
model : .pbmm model file
model : .pbmm or .tflite model file
scorer : .scorer file
"""

try:
if engine == "ds":
ds = DModel(model)
else:
ds = SModel(model)
except:
_logger.error("Invalid model file")
if engine == "ds":
_logger.debug("Loading DeepSpeech model")
ds = DModel(model)
_logger.debug("Completed loading DeepSpeech model")
elif engine == "stt":
_logger.debug("Loading Coqui STT model")
ds = SModel(model)
_logger.debug("Completed loading Coqui STT model")
else:
_logger.error("Invalid engine")
sys.exit(1)

try:
_logger.debug("Loading scorer")
ds.enableExternalScorer(scorer)
_logger.debug("Completed loading scorer")
except:
_logger.warning("Invalid scorer file. Running inference using only model file")
return(ds)
19 changes: 19 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,19 @@
version: '3'

services:
autosub:
build:
context: .
dockerfile: ${DOCKERFILE:-Dockerfile.gpu}
volumes:
- ${INPUT:-./data/input}:/input
- ${OUTPUT:-./data/output}:/output
- ./deepspeech:/deepspeech
command: ./run.sh ${ENGINE:-stt}
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]