diff --git a/ovos_stt_plugin_fasterwhisper/__init__.py b/ovos_stt_plugin_fasterwhisper/__init__.py
index f51c73d..7e547f8 100644
--- a/ovos_stt_plugin_fasterwhisper/__init__.py
+++ b/ovos_stt_plugin_fasterwhisper/__init__.py
@@ -189,7 +189,7 @@ def __init__(self, *args, **kwargs):
         self.compute_type = self.config.get("compute_type", "int8")
         self.use_cuda = self.config.get("use_cuda", False)
         self.cpu_threads = self.config.get("cpu_threads", 4)
-
+        self.vad_filter = self.config.get("vad_filter", False)
         if self.use_cuda:
             device = "cuda"
         else:
@@ -203,8 +203,11 @@ def audiodata2array(audio_data):
 
     def execute(self, audio, language=None):
         lang = language or self.lang
-        segments, _ = self.engine.transcribe(self.audiodata2array(audio), beam_size=self.beam_size,
-                                             condition_on_previous_text=False, language=lang.split("-")[0].lower())
+        segments, _ = self.engine.transcribe(self.audiodata2array(audio),
+                                             beam_size=self.beam_size,
+                                             vad_filter=self.vad_filter,
+                                             condition_on_previous_text=False,
+                                             language=lang.split("-")[0].lower())
         # segments is an iterator, transcription only happens here
         transcription = "".join(segment.text for segment in segments).strip()
         return transcription
diff --git a/ovos_stt_plugin_fasterwhisper/transcribe.py b/ovos_stt_plugin_fasterwhisper/transcribe.py
new file mode 100644
index 0000000..f10890b
--- /dev/null
+++ b/ovos_stt_plugin_fasterwhisper/transcribe.py
@@ -0,0 +1,66 @@
+import click
+import os
+from speech_recognition import Recognizer, AudioFile
+
+from ovos_stt_plugin_fasterwhisper import FasterWhisperSTT
+
+
+@click.command()
+@click.option("--path", required=True)
+@click.option("--lang", default="en-us")
+@click.option("--model", default="base")
+@click.option("--format", default="wav")
+@click.option("--beam", default=5)
+@click.option("--cuda", default=False)
+@click.option("--compute", default="int8")
+@click.option("--vad", default=False)
+def transcribe(path: str, lang: str, model: str, format: str, beam: int, cuda: bool, compute: str, vad: bool):
+    # build the plugin config from the CLI options
+    config = {
+        "lang": lang,
+        "model": model,
+        "beam_size": beam,
+        "use_cuda": cuda,
+        "compute_type": compute,
+        "vad_filter": vad
+    }
+
+    b = FasterWhisperSTT(config=config)
+
+    if os.path.isfile(path):
+        # single file: transcribe it and write the text next to it
+        try:
+            with AudioFile(path) as source:
+                try:
+                    audio = Recognizer().record(source)
+                    t = b.execute(audio, language=lang)
+                    print(t)
+                    with open(path.replace(f'.{format}', '.txt'), "w") as txt:
+                        txt.write(t)
+                except Exception:
+                    print("failed to transcribe file")
+        except Exception:
+            print("failed to open file", path)
+    elif os.path.isdir(path):
+        # directory: walk it and transcribe every audio file that has no .txt yet
+        for root, folder, files in os.walk(path):
+            for f in files:
+                if f.endswith(f".{format}") and not os.path.isfile(f"{root}/{f.replace(f'.{format}', '.txt')}"):
+                    print(root, f)
+                    try:
+                        with AudioFile(f"{root}/{f}") as source:
+                            try:
+                                audio = Recognizer().record(source)
+                                t = b.execute(audio, language=lang)
+                                print(t)
+                                with open(f"{root}/{f.replace(f'.{format}', '.txt')}", "w") as txt:
+                                    txt.write(t)
+                            except Exception:
+                                print("failed to transcribe file")
+                    except Exception:
+                        print("failed to open file", f)
+                    continue
+
+
+if __name__ == "__main__":
+    transcribe()
diff --git a/setup.py b/setup.py
index 552f73e..976abf7 100755
--- a/setup.py
+++ b/setup.py
@@ -85,5 +85,9 @@ def required(requirements_file):
     keywords='mycroft ovos plugin stt',
     entry_points={'mycroft.plugin.stt': PLUGIN_ENTRY_POINT,
                   'mycroft.plugin.stt.config': CONFIG_ENTRY_POINT,
-                  'neon.plugin.audio': LANG_PLUGIN_ENTRY_POINT}
+                  'neon.plugin.audio': LANG_PLUGIN_ENTRY_POINT,
+                  'console_scripts': [
+                      'fw-transcribe=ovos_stt_plugin_fasterwhisper.transcribe:transcribe'
+                  ]
+                  }
     )
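
For reference, a minimal usage sketch of the option this diff introduces. The config keys mirror the ones read in __init__.py and transcribe.py above; the specific model and values are illustrative assumptions, not part of the diff.

    # Sketch only: construct the plugin with the new vad_filter option enabled.
    # Assumes the package and a faster-whisper model ("base") are installed locally.
    from ovos_stt_plugin_fasterwhisper import FasterWhisperSTT

    stt = FasterWhisperSTT(config={
        "model": "base",
        "beam_size": 5,
        "compute_type": "int8",
        "use_cuda": False,
        "vad_filter": True,  # forwarded to engine.transcribe() to skip non-speech segments
    })
    # text = stt.execute(audio)  # audio: a speech_recognition.AudioData instance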