From 4505792b1b5da0506e946a322affb18000c68eb1 Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 30 Oct 2024 13:45:01 +0000 Subject: [PATCH 1/2] feat:binary handlers companion to https://github.com/JarbasHiveMind/hivemind-websocket-client/pull/33 and https://github.com/JarbasHiveMind/HiveMind-core/pull/100 --- hivemind_listener/__init__.py | 75 +++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/hivemind_listener/__init__.py b/hivemind_listener/__init__.py index 83d89b9..05e7b71 100644 --- a/hivemind_listener/__init__.py +++ b/hivemind_listener/__init__.py @@ -12,8 +12,7 @@ from ovos_bus_client import MessageBusClient from ovos_bus_client.message import Message -from hivemind_bus_client.message import HiveMessage, HiveMessageType -from hivemind_bus_client.serialization import HiveMindBinaryPayloadType +from hivemind_bus_client.message import HiveMessage, HiveMessageType, HiveMindBinaryPayloadType from hivemind_core.protocol import HiveMindListenerProtocol, HiveMindClientConnection from hivemind_core.service import HiveMindService from ovos_plugin_manager.stt import OVOSSTTFactory @@ -153,12 +152,17 @@ def handle_client_disconnected(self, client: HiveMindClientConnection): self.stop_listener(client) @classmethod - def get_b64_tts(cls, message: Message = None) -> str: + def get_tts(cls, message: Message = None) -> str: utterance = message.data['utterance'] ctxt = cls.tts._get_ctxt({"message": message}) wav, _ = cls.tts.synth(utterance, ctxt) + return str(wav) + + @classmethod + def get_b64_tts(cls, message: Message = None) -> str: + wav = cls.get_tts(message) # cast to str() to get a path, as it is a AudioFile object from tts cache - with open(str(wav), "rb") as f: + with open(wav, "rb") as f: audio = f.read() return base64.b64encode(audio).decode("utf-8") @@ -171,20 +175,67 @@ def transcribe_b64_audio(cls, message: Message = None) -> List[Tuple[str, float] utterances = cls.stt.transcribe(audio, lang) return utterances - def handle_binary_message(self, message: HiveMessage, client: HiveMindClientConnection): - assert message.msg_type == HiveMessageType.BINARY - if message.bin_type == HiveMindBinaryPayloadType.RAW_AUDIO: - bin_data = message.payload - if client.peer in self.listeners: - # LOG.debug(f"Got {len(bin_data)} bytes of audio data from {client.peer}") - m: FakeMicrophone = self.listeners[client.peer].mic + def handle_microphone_input(self, bin_data: bytes, + sample_rate: int, + sample_width: int, + client: HiveMindClientConnection): + if client.peer in self.listeners: + m: FakeMicrophone = self.listeners[client.peer].mic + if m.sample_rate != sample_rate or m.sample_width != sample_width: + LOG.debug(f"Got {len(bin_data)} bytes of audio data from {client.peer}") + LOG.error(f"sample_rate/sample_width mismatch! " + f"got: ({sample_rate}, {sample_width}) " + f"expected: ({m.sample_rate}, {m.sample_width})") + # TODO - convert sample_rate if needed + else: m.queue.put(bin_data) + def handle_stt_transcribe_request(self, bin_data: bytes, + sample_rate: int, + sample_width: int, + lang: str, + client: HiveMindClientConnection): + LOG.debug(f"Received binary STT input: {len(bin_data)} bytes") + audio = sr.AudioData(bin_data, sample_rate, sample_width) + tx = self.stt.transcribe(audio, lang) + m = Message("recognizer_loop:transcribe.response", {"transcriptions": tx, "lang": lang}) + client.send(HiveMessage(HiveMessageType.BUS, payload=m)) + + def handle_stt_handle_request(self, bin_data: bytes, + sample_rate: int, + sample_width: int, + lang: str, + client: HiveMindClientConnection): + LOG.debug(f"Received binary STT input: {len(bin_data)} bytes") + audio = sr.AudioData(bin_data, sample_rate, sample_width) + tx = self.stt.transcribe(audio, lang) + if tx: + utts = [t[0].rstrip(" '\"").lstrip(" '\"") for t in tx] + m = Message("recognizer_loop:utterance", + {"utterances": utts, "lang": lang}) + self.handle_inject_mycroft_msg(m, client) + else: + LOG.info(f"STT transcription error for client: {client.peer}") + m = Message("recognizer_loop:speech.recognition.unknown") + client.send(HiveMessage(HiveMessageType.BUS, payload=m)) + def handle_inject_mycroft_msg(self, message: Message, client: HiveMindClientConnection): """ message (Message): mycroft bus message object """ - if message.msg_type == "speak:b64_audio": + if message.msg_type == "speak:synth": + wav = self.get_tts(message) + with open(wav, "rb") as f: + bin_data = f.read() + payload = HiveMessage(HiveMessageType.BINARY, + payload=bin_data, + metadata={"lang": message.data["lang"], + "file_name": wav.split("/")[-1], + "utterance": message.data["utterance"]}, + bin_type=HiveMindBinaryPayloadType.TTS_AUDIO) + client.send(payload) + return + elif message.msg_type == "speak:b64_audio": msg: Message = message.reply("speak:b64_audio.response", message.data) msg.data["audio"] = self.get_b64_tts(message) if msg.context.get("destination") is None: From 894148bc1ec14287ca3f721cb5e5c3dd88539c7c Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 30 Oct 2024 14:03:49 +0000 Subject: [PATCH 2/2] requirements.txt --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index fe76d8e..036babb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ovos-simple-listener -hivemind_bus_client -ovos-plugin-manager -jarbas_hive_mind \ No newline at end of file +hivemind_bus_client>=0.1.0,<1.0.0 +ovos-plugin-manager<1.0.0 +jarbas_hive_mind>=0.14.0,<1.0.0 \ No newline at end of file