GPSR QA & Repeat states (#140)

* data: add 2023 robocup gpsr xml files * feat: xml parser for gpsr q/a task component * fix: require install of requirements.txt * feat: add backend for creating & querying vector db * feat: add sentence embedding utils * feat: service to create text FAISS index * feat: text query service for FAISS * feat: working FAISS text services * feat: test scripts for FAISS services * docs: add documentation for FAISS vector service * chore: remove xml files * feat: xml question answer state * feat: separate package for vector db messages * feat: working state machine for gpsr QA task * feat: tts question answering * feat: working launch file and question answering with TTS * fix: integrate @jws-1 review suggestions * feat: add listen state * chore: create gpsr commands sub folder * chore: properly fix ALSA errors * feat: speech and voice skills * feat: Q/A skill using new speech/voice skills * fix: working new question answer state machine --------- Co-authored-by: Jared Swift <[email protected]>
LASR-at-Home · Mar 11, 2024 · 8e00cad · 8e00cad
1 parent c7cbcbd
commit 8e00cad
Show file tree

Hide file tree

Showing 34 changed files with 1,257 additions and 305 deletions.
diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
-
 import os
+import sounddevice  # needed to remove ALSA error messages
 import argparse
 from typing import Optional
 from dataclasses import dataclass
@@ -16,29 +16,6 @@ import lasr_speech_recognition_msgs.msg  # type: ignore
 from std_msgs.msg import String  # type: ignore
 from lasr_speech_recognition_whisper import load_model  # type: ignore
 
-# Error handler to remove ALSA error messages taken from:
-# https://stackoverflow.com/questions/7088672/pyaudio-working-but-spits-out-error-messages-each-time/17673011#17673011
-
-from ctypes import *
-from contextlib import contextmanager
-
-ERROR_HANDLER_FUNC = CFUNCTYPE(None, c_char_p, c_int, c_char_p, c_int, c_char_p)
-
-
-def py_error_handler(filename, line, function, err, fmt):
-    pass
-
-
-c_error_handler = ERROR_HANDLER_FUNC(py_error_handler)
-
-
-@contextmanager
-def noalsaerr():
-    asound = cdll.LoadLibrary("libasound.so")
-    asound.snd_lib_error_set_handler(c_error_handler)
-    yield
-    asound.snd_lib_error_set_handler(None)
-
 
 @dataclass
 class speech_model_params:
@@ -87,40 +64,25 @@ class TranscribeSpeechAction(object):
         self._transcription_server = rospy.Publisher(
             "/live_speech_transcription", String, queue_size=10
         )
-        with noalsaerr():
-            self._model = load_model(
-                self._model_params.model_name,
-                self._model_params.device,
-                self._model_params.warmup,
-            )
-            # Configure the speech recogniser object and adjust for ambient noise
-            self.recogniser = self._configure_recogniser(ambient_adj=True)
-            # Setup the action server and register execution callback
-            self._action_server = actionlib.SimpleActionServer(
-                self._action_name,
-                lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
-                execute_cb=self.execute_cb,
-                auto_start=False,
-            )
-            self._action_server.register_preempt_callback(self.prempt_cb)
-            # Setup the timer for adjusting the microphone for ambient noise every x seconds
-            self._timer_duration = self._model_params.timer_duration
-            self._timer = rospy.Timer(
-                rospy.Duration(self._timer_duration), self._timer_cb
-            )
-            self._listening = False
 
-            self._action_server.start()
+        self._model = load_model(
+            self._model_params.model_name,
+            self._model_params.device,
+            self._model_params.warmup,
+        )
+        # Configure the speech recogniser object and adjust for ambient noise
+        self.recogniser = self._configure_recogniser(ambient_adj=True)
+        # Setup the action server and register execution callback
+        self._action_server = actionlib.SimpleActionServer(
+            self._action_name,
+            lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
+            execute_cb=self.execute_cb,
+            auto_start=False,
+        )
+        self._action_server.register_preempt_callback(self.prempt_cb)
+        self._listening = False
 
-    def _timer_cb(self, _) -> None:
-        return
-        """Adjusts the microphone for ambient noise, unless the action server is listening."""
-        if self._listening:
-            return
-        rospy.loginfo("Adjusting microphone for ambient noise...")
-        with noalsaerr():
-            with self._configure_microphone() as source:
-                self.recogniser.adjust_for_ambient_noise(source)
+        self._action_server.start()
 
     def _reset_timer(self) -> None:
         """Resets the timer for adjusting the microphone for ambient noise."""
@@ -194,17 +156,13 @@ class TranscribeSpeechAction(object):
         rospy.loginfo("Request Received")
         if self._action_server.is_preempt_requested():
             return
-        # Since we are about to listen, reset the timer for adjusting the microphone for ambient noise
-        # as this assumes self_timer_duration seconds of silence before adjusting
-        self._reset_timer()
-        with noalsaerr():
-            with self._configure_microphone() as src:
-                self._listening = True
-                wav_data = self.recogniser.listen(
-                    src,
-                    timeout=self._model_params.start_timeout,
-                    phrase_time_limit=self._model_params.end_timeout,
-                ).get_wav_data()
+        with self._configure_microphone() as src:
+            self._listening = True
+            wav_data = self.recogniser.listen(
+                src,
+                timeout=self._model_params.start_timeout,
+                phrase_time_limit=self._model_params.end_timeout,
+            ).get_wav_data()
         # Magic number 32768.0 is the maximum value of a 16-bit signed integer
         float_data = (
             np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
@@ -293,12 +251,6 @@ def parse_args() -> dict:
         default=None,
         help="Microphone device index or name",
     )
-    parser.add_argument(
-        "--timer_duration",
-        type=int,
-        default=20,
-        help="Number of seconds of silence before the ambient noise adjustment is called.",
-    )
     parser.add_argument(
         "--no_warmup",
         action="store_true",
@@ -331,8 +283,6 @@ def configure_model_params(config: dict) -> speech_model_params:
         model_params.sample_rate = config["sample_rate"]
     if config["mic_device"]:
         model_params.mic_device = config["mic_device"]
-    if config["timer_duration"]:
-        model_params.timer_duration = config["timer_duration"]
     if config["no_warmup"]:
         model_params.warmup = False
 

diff --git a/common/speech/lasr_speech_recognition_whisper/requirements.in b/common/speech/lasr_speech_recognition_whisper/requirements.in
@@ -1,5 +1,6 @@
 SpeechRecognition==3.10.0
-openai-whisper==20230314
+sounddevice==0.4.6
+openai-whisper==20231117
 PyAudio==0.2.13
 PyYaml==6.0.1
 rospkg==1.5.0
diff --git a/common/speech/lasr_speech_recognition_whisper/requirements.txt b/common/speech/lasr_speech_recognition_whisper/requirements.txt
@@ -1,51 +1,54 @@
-catkin-pkg==0.5.2         # via rospkg
-certifi==2023.7.22        # via requests
-charset-normalizer==3.2.0  # via requests
-cmake==3.27.2             # via triton
-distro==1.8.0             # via rospkg
+--extra-index-url https://pypi.ngc.nvidia.com
+--trusted-host pypi.ngc.nvidia.com
+
+catkin-pkg==1.0.0         # via rospkg
+certifi==2024.2.2         # via requests
+cffi==1.16.0              # via sounddevice
+charset-normalizer==3.3.2  # via requests
+distro==1.9.0             # via rospkg
 docutils==0.20.1          # via catkin-pkg
-ffmpeg-python==0.2.0      # via openai-whisper
-filelock==3.12.2          # via torch, triton
-future==0.18.3            # via ffmpeg-python
-idna==3.4                 # via requests
-jinja2==3.1.2             # via torch
-lit==16.0.6               # via triton
-llvmlite==0.40.1          # via numba
-markupsafe==2.1.3         # via jinja2
-more-itertools==10.1.0    # via openai-whisper
+filelock==3.13.1          # via torch, triton
+fsspec==2024.2.0          # via torch
+idna==3.6                 # via requests
+jinja2==3.1.3             # via torch
+llvmlite==0.42.0          # via numba
+markupsafe==2.1.5         # via jinja2
+more-itertools==10.2.0    # via openai-whisper
 mpmath==1.3.0             # via sympy
-networkx==3.1             # via torch
-numba==0.57.1             # via openai-whisper
-numpy==1.24.4             # via numba, openai-whisper
-nvidia-cublas-cu11==11.10.3.66  # via nvidia-cudnn-cu11, nvidia-cusolver-cu11, torch
-nvidia-cuda-cupti-cu11==11.7.101  # via torch
-nvidia-cuda-nvrtc-cu11==11.7.99  # via torch
-nvidia-cuda-runtime-cu11==11.7.99  # via torch
-nvidia-cudnn-cu11==8.5.0.96  # via torch
-nvidia-cufft-cu11==10.9.0.58  # via torch
-nvidia-curand-cu11==10.2.10.91  # via torch
-nvidia-cusolver-cu11==11.4.0.1  # via torch
-nvidia-cusparse-cu11==11.7.4.91  # via torch
-nvidia-nccl-cu11==2.14.3  # via torch
-nvidia-nvtx-cu11==11.7.91  # via torch
-openai-whisper==20230314  # via -r requirements.in
+networkx==3.2.1           # via torch
+numba==0.59.0             # via openai-whisper
+numpy==1.26.4             # via numba, openai-whisper
+nvidia-cublas-cu12==12.1.3.1  # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch
+nvidia-cuda-cupti-cu12==12.1.105  # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105  # via torch
+nvidia-cuda-runtime-cu12==12.1.105  # via torch
+nvidia-cudnn-cu12==8.9.2.26  # via torch
+nvidia-cufft-cu12==11.0.2.54  # via torch
+nvidia-curand-cu12==10.3.2.106  # via torch
+nvidia-cusolver-cu12==11.4.5.107  # via torch
+nvidia-cusparse-cu12==12.1.0.106  # via nvidia-cusolver-cu12, torch
+nvidia-nccl-cu12==2.19.3  # via torch
+nvidia-nvjitlink-cu12==12.4.99  # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105  # via torch
+openai-whisper==20231117  # via -r requirements.in
 pyaudio==0.2.13           # via -r requirements.in
-pyparsing==3.1.1          # via catkin-pkg
-python-dateutil==2.8.2    # via catkin-pkg
+pycparser==2.21           # via cffi
+pyparsing==3.1.2          # via catkin-pkg
+python-dateutil==2.9.0.post0  # via catkin-pkg
 pyyaml==6.0.1             # via -r requirements.in, rospkg
-regex==2023.8.8           # via tiktoken
+regex==2023.12.25         # via tiktoken
 requests==2.31.0          # via speechrecognition, tiktoken
 rospkg==1.5.0             # via -r requirements.in
 six==1.16.0               # via python-dateutil
+sounddevice==0.4.6        # via -r requirements.in
 speechrecognition==3.10.0  # via -r requirements.in
 sympy==1.12               # via torch
-tiktoken==0.3.1           # via openai-whisper
-torch==2.0.1              # via openai-whisper, triton
-tqdm==4.66.1              # via openai-whisper
-triton==2.0.0             # via openai-whisper, torch
-typing-extensions==4.7.1  # via torch
-urllib3==2.0.4            # via requests
-wheel==0.41.1             # via nvidia-cublas-cu11, nvidia-cuda-cupti-cu11, nvidia-cuda-runtime-cu11, nvidia-curand-cu11, nvidia-cusparse-cu11, nvidia-nvtx-cu11
+tiktoken==0.6.0           # via openai-whisper
+torch==2.2.1              # via openai-whisper
+tqdm==4.66.2              # via openai-whisper
+triton==2.2.0             # via openai-whisper, torch
+typing-extensions==4.10.0  # via torch
+urllib3==2.2.1            # via requests
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
diff --git a/common/vector_databases/lasr_vector_databases_faiss/CMakeLists.txt b/common/vector_databases/lasr_vector_databases_faiss/CMakeLists.txt
@@ -7,7 +7,10 @@ project(lasr_vector_databases_faiss)
 ## Find catkin macros and libraries
 ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
 ## is used, also find other catkin packages
-find_package(catkin REQUIRED catkin_virtualenv)
+find_package(catkin REQUIRED catkin_virtualenv COMPONENTS
+rospy
+lasr_vector_databases_msgs
+)
 
 ## System dependencies are found with CMake's conventions
 # find_package(Boost REQUIRED COMPONENTS system)
@@ -55,8 +58,6 @@ catkin_generate_virtualenv(
 ## Generate services in the 'srv' folder
 # add_service_files(
 #   FILES
-#   Service1.srv
-#   Service2.srv
 # )
 
 # Generate actions in the 'action' folder
@@ -68,8 +69,7 @@ catkin_generate_virtualenv(
 # Generate added messages and services with any dependencies listed here
 # generate_messages(
 #   DEPENDENCIES
-#   actionlib_msgs
-#   geometry_msgs
+#   std_msgs
 # )
 
 ################################################
@@ -157,22 +157,13 @@ include_directories(
 
 ## Mark executable scripts (Python etc.) for installation
 ## in contrast to setup.py, you can choose the destination
-# catkin_install_python(PROGRAMS
-#   nodes/qualification
-#   nodes/actions/wait_greet
-#   nodes/actions/identify
-#   nodes/actions/greet
-#   nodes/actions/get_name
-#   nodes/actions/learn_face
-#   nodes/actions/get_command
-#   nodes/actions/guide
-#   nodes/actions/find_person
-#   nodes/actions/detect_people
-#   nodes/actions/receive_object
-#   nodes/actions/handover_object
-#   nodes/better_qualification
-#   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
-# )
+catkin_install_python(PROGRAMS
+  nodes/txt_index_service
+  nodes/txt_query_service
+  scripts/test_index_service.py
+  scripts/test_query_service.py
+  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+)
 
 ## Mark executables for installation
 ## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
@@ -196,11 +187,10 @@ include_directories(
 # )
 
 ## Mark other files for installation (e.g. launch and bag files, etc.)
-# install(FILES
-#   # myfile1
-#   # myfile2
-#   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
-# )
+install(FILES
+  requirements.txt
+  DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+)
 
 #############
 ## Testing ##

diff --git a/common/vector_databases/lasr_vector_databases_faiss/doc/TECHNICAL.md b/common/vector_databases/lasr_vector_databases_faiss/doc/TECHNICAL.md
diff --git a/common/vector_databases/lasr_vector_databases_faiss/doc/USAGE.md b/common/vector_databases/lasr_vector_databases_faiss/doc/USAGE.md
@@ -0,0 +1,33 @@
+This package currently contains two services `txt_index_service` and `txt_query_service`. These services are used to create and search (respectively) a vector database of natural language sentence embeddings.
+
+# Index Service
+The Index service is used to create a [FAISS](https://github.com/facebookresearch/faiss) index object containing a set of sentence embeddings, where each sentence is assumed to be a line in a given `.txt` file. This Index object is saved to disk at a specified location, and can be thought of as a Vector Database. 
+
+## Request
+The request takes two string parameters: `txt_path` which is the path to the `.txt` file we wish to create sentence embeddings for, where each line in this file is treated as a sentence; and `index_path` which is the path to a `.index` file that will be created by the Service.
+
+## Response
+The response message has no fields: the service returns an empty `TxtIndexResponse` once the index has been created.
+
+## Example Usage
+Please see the `scripts/test_index_service.py` script for a simple example of sending a request to the service.
+
+# Query Service
+The query service is used to search the `.index` file created by the Index Service to find the most similar sentences given an input query sentence.
+
+## Request
+The request requires four fields:
+
+1. `txt_path` -- this is a `string` that is the path to the txt file that contains the original sentences that the `.index` file was populated with.
+2. `index_path` -- this is a `string` that is the path to the `.index` file that was created with the Index Service, on the same txt file as the `txt_path`.
+3. `query_sentence` -- this is a `string` that is the sentence that you wish to query the index with and find the most similar sentence.
+4. `k` -- this is a `uint8` that is the number of closest sentences you wish to return.
+
+## Response
+The response contains two fields:
+
+1. `closest_sentences` -- this is an ordered list of `string`s that contain the closest sentences to the given query sentence.
+2. `cosine_similarities` -- this is an ordered list of `float32`s that contains the cosine similarity scores of the closest sentences.
+
+## Example Usage
+Please see the `scripts/test_query_service.py` script for a simple example of sending a request to the service.
diff --git a/common/vector_databases/lasr_vector_databases_faiss/nodes/txt_index_service b/common/vector_databases/lasr_vector_databases_faiss/nodes/txt_index_service
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+import rospy
+import numpy as np
+from lasr_vector_databases_msgs.srv import TxtIndexRequest, TxtIndexResponse, TxtIndex
+from lasr_vector_databases_faiss import (
+    load_model,
+    parse_txt_file,
+    get_sentence_embeddings,
+    create_vector_database,
+)
+
+
+class TxtIndexService:
+    """ROS node exposing the ``lasr_faiss/txt_index`` service.
+
+    Each request names a ``.txt`` file (one sentence per line) and an output
+    ``.index`` path; the service embeds every sentence and stores the
+    resulting vector index at that path.
+    """
+
+    def __init__(self) -> None:
+        """Initialise the node, load the embedding model and advertise the service."""
+        rospy.init_node("txt_index_service")
+        # Load the model BEFORE advertising the service: a request may be
+        # dispatched to execute_cb as soon as the service is registered, and
+        # the callback reads self._sentence_embedding_model.
+        self._sentence_embedding_model = load_model()
+        rospy.Service("lasr_faiss/txt_index", TxtIndex, self.execute_cb)
+        rospy.loginfo("Text index service started")
+
+    def execute_cb(self, req: TxtIndexRequest) -> TxtIndexResponse:
+        """Build a vector index from the sentences in ``req.txt_path``.
+
+        Args:
+            req: request carrying ``txt_path`` (input text file) and
+                ``index_path`` (where the index is created).
+
+        Returns:
+            An empty ``TxtIndexResponse``; the response carries no data.
+        """
+        txt_fp: str = req.txt_path
+        sentences_to_embed: list[str] = parse_txt_file(txt_fp)
+        sentence_embeddings: np.ndarray = get_sentence_embeddings(
+            sentences_to_embed, self._sentence_embedding_model
+        )
+        index_path: str = req.index_path
+        create_vector_database(sentence_embeddings, index_path)
+        return TxtIndexResponse()
+
+
+# Script entry point: constructing TxtIndexService initialises the ROS node
+# and registers the service; rospy.spin() then keeps the process alive so
+# incoming requests can be served.
+if __name__ == "__main__":
+    TxtIndexService()
+    rospy.spin()