feat/lang_detection_plugin
JarbasAl committed Apr 20, 2024
1 parent d4128f5 commit 535c3c5
Showing 1 changed file with 9 additions and 23 deletions.
ovos_stt_plugin_fasterwhisper/__init__.py (32 changes: 9 additions & 23 deletions)
@@ -1,17 +1,11 @@
-# this is needed to read the WAV file properly
-from typing import List
-
 import numpy as np
 from faster_whisper import WhisperModel, decode_audio
-from ovos_config.config import Configuration
-from ovos_config.locale import get_default_lang
 from ovos_plugin_manager.templates.stt import STT
-from ovos_plugin_manager.templates.transformers import AudioTransformer
-from ovos_utils.log import LOG
+from ovos_plugin_manager.templates.transformers import AudioLanguageDetector
 from speech_recognition import AudioData
 
 
-class FasterWhisperLangClassifier(AudioTransformer):
+class FasterWhisperLangClassifier(AudioLanguageDetector):
     def __init__(self, config=None):
         config = config or {}
         super().__init__("ovos-audio-transformer-plugin-fasterwhisper", 10, config)
@@ -32,14 +26,8 @@ def __init__(self, config=None):
             device = "cpu"
         self.engine = WhisperModel(model, device=device, compute_type=self.compute_type)
 
-    @property
-    def valid_langs(self) -> List[str]:
-        return list(
-            set([get_default_lang()] + Configuration().get("secondary_langs", []))
-        )
-
     @staticmethod
-    def audiochunk2array(audio_data):
+    def audiochunk2array(audio_data: bytes):
         # Convert buffer to float32 using NumPy
         audio_as_np_int16 = np.frombuffer(audio_data, dtype=np.int16)
         audio_as_np_float32 = audio_as_np_int16.astype(np.float32)
@@ -49,9 +37,9 @@ def audiochunk2array(audio_data):
         data = audio_as_np_float32 / max_int16
         return data
 
-    def detect(self, audio, valid_langs=None):
+    def detect(self, audio_data: bytes, valid_langs=None):
         valid_langs = [l.lower().split("-")[0] for l in valid_langs or self.valid_langs]
-
+        audio = self.audiochunk2array(audio_data)
         if not self.engine.model.is_multilingual:
             language = "en"
             language_probability = 1
@@ -75,12 +63,6 @@ def detect(self, audio, valid_langs=None):
             language, language_probability = results[0]
        return language, language_probability
 
-    # plugin api
-    def transform(self, audio_data):
-        lang, prob = self.detect(self.audiochunk2array(audio_data))
-        LOG.info(f"Detected speech language '{lang}' with probability {prob}")
-        return audio_data, {"stt_lang": lang, "lang_probability": prob}
-
 
 class FasterWhisperSTT(STT):
     MODELS = (
@@ -288,3 +270,7 @@ def available_languages(self) -> set:
     # 2023-04-29 17:42:30.769 - OVOS - __main__:execute:145 - INFO - Detected speech language 'en' with probability 1
     print(a)
     # And so, my fellow Americans, ask not what your country can do for you. Ask what you can do for your country.
+
+    l = FasterWhisperLangClassifier()
+    lang, prob = l.detect(audio.get_wav_data())
+    print(lang, prob)

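For reference, a minimal self-contained sketch of how the refactored classifier can be called after this commit, following the demo lines added at the bottom of the diff. The WAV path and the valid_langs values are illustrative placeholders, not part of the commit.

from speech_recognition import AudioFile, Recognizer
from ovos_stt_plugin_fasterwhisper import FasterWhisperLangClassifier

# load a WAV file and grab its raw bytes ("sample.wav" is a placeholder path)
with AudioFile("sample.wav") as source:
    audio = Recognizer().record(source)

detector = FasterWhisperLangClassifier()
# detect() now takes raw audio bytes and converts them to a numpy array internally;
# valid_langs restricts the candidate languages (here a hypothetical en/es setup)
lang, prob = detector.detect(audio.get_wav_data(), valid_langs=["en", "es"])
print(lang, prob)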