Skip to content

Commit

Permalink
GPSR QA & Repeat states (#140)
Browse files Browse the repository at this point in the history
* data: add 2023 robocup gpsr xml files

* feat: xml parser for gpsr q/a task component

* fix: require install of requirements.txt

* feat: add backend for creating & querying vector db

* feat: add sentence embedding utils

* feat: service to create text FAISS index

* feat: text query service for FAISS

* feat: working FAISS text services

* feat: test scripts for FAISS services

* docs: add documentation for FAISS vector service

* chore: remove xml files

* feat: xml question answer state

* feat: separate package for vector db messages

* feat: working state machine for gpsr QA task

* feat: tts question answering

* feat: working launch file and question answering with TTS

* fix: integrate @jws-1 review suggestions

* feat: add listen state

* chore: create gpsr commands sub folder

* chore: properly fix ALSA errors

* feat: speech and voice skills

* feat: Q/A skill using new speech/voice skills

* fix: working new question answer state machine

---------

Co-authored-by: Jared Swift <[email protected]>
  • Loading branch information
m-barker and jws-1 authored Mar 11, 2024
1 parent c7cbcbd commit 8e00cad
Show file tree
Hide file tree
Showing 34 changed files with 1,257 additions and 305 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

import os
import sounddevice # needed to remove ALSA error messages
import argparse
from typing import Optional
from dataclasses import dataclass
Expand All @@ -16,29 +16,6 @@ import lasr_speech_recognition_msgs.msg # type: ignore
from std_msgs.msg import String # type: ignore
from lasr_speech_recognition_whisper import load_model # type: ignore

# Error handler to remove ALSA error messages taken from:
# https://stackoverflow.com/questions/7088672/pyaudio-working-but-spits-out-error-messages-each-time/17673011#17673011

from ctypes import *
from contextlib import contextmanager

ERROR_HANDLER_FUNC = CFUNCTYPE(None, c_char_p, c_int, c_char_p, c_int, c_char_p)


def py_error_handler(filename, line, function, err, fmt):
pass


c_error_handler = ERROR_HANDLER_FUNC(py_error_handler)


@contextmanager
def noalsaerr():
asound = cdll.LoadLibrary("libasound.so")
asound.snd_lib_error_set_handler(c_error_handler)
yield
asound.snd_lib_error_set_handler(None)


@dataclass
class speech_model_params:
Expand Down Expand Up @@ -87,40 +64,25 @@ class TranscribeSpeechAction(object):
self._transcription_server = rospy.Publisher(
"/live_speech_transcription", String, queue_size=10
)
with noalsaerr():
self._model = load_model(
self._model_params.model_name,
self._model_params.device,
self._model_params.warmup,
)
# Configure the speech recogniser object and adjust for ambient noise
self.recogniser = self._configure_recogniser(ambient_adj=True)
# Setup the action server and register execution callback
self._action_server = actionlib.SimpleActionServer(
self._action_name,
lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
execute_cb=self.execute_cb,
auto_start=False,
)
self._action_server.register_preempt_callback(self.prempt_cb)
# Setup the timer for adjusting the microphone for ambient noise every x seconds
self._timer_duration = self._model_params.timer_duration
self._timer = rospy.Timer(
rospy.Duration(self._timer_duration), self._timer_cb
)
self._listening = False

self._action_server.start()
self._model = load_model(
self._model_params.model_name,
self._model_params.device,
self._model_params.warmup,
)
# Configure the speech recogniser object and adjust for ambient noise
self.recogniser = self._configure_recogniser(ambient_adj=True)
# Setup the action server and register execution callback
self._action_server = actionlib.SimpleActionServer(
self._action_name,
lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
execute_cb=self.execute_cb,
auto_start=False,
)
self._action_server.register_preempt_callback(self.prempt_cb)
self._listening = False

def _timer_cb(self, _) -> None:
return
"""Adjusts the microphone for ambient noise, unless the action server is listening."""
if self._listening:
return
rospy.loginfo("Adjusting microphone for ambient noise...")
with noalsaerr():
with self._configure_microphone() as source:
self.recogniser.adjust_for_ambient_noise(source)
self._action_server.start()

def _reset_timer(self) -> None:
"""Resets the timer for adjusting the microphone for ambient noise."""
Expand Down Expand Up @@ -194,17 +156,13 @@ class TranscribeSpeechAction(object):
rospy.loginfo("Request Received")
if self._action_server.is_preempt_requested():
return
# Since we are about to listen, reset the timer for adjusting the microphone for ambient noise
# as this assumes self_timer_duration seconds of silence before adjusting
self._reset_timer()
with noalsaerr():
with self._configure_microphone() as src:
self._listening = True
wav_data = self.recogniser.listen(
src,
timeout=self._model_params.start_timeout,
phrase_time_limit=self._model_params.end_timeout,
).get_wav_data()
with self._configure_microphone() as src:
self._listening = True
wav_data = self.recogniser.listen(
src,
timeout=self._model_params.start_timeout,
phrase_time_limit=self._model_params.end_timeout,
).get_wav_data()
# Magic number 32768.0 is the maximum value of a 16-bit signed integer
float_data = (
np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
Expand Down Expand Up @@ -293,12 +251,6 @@ def parse_args() -> dict:
default=None,
help="Microphone device index or name",
)
parser.add_argument(
"--timer_duration",
type=int,
default=20,
help="Number of seconds of silence before the ambient noise adjustment is called.",
)
parser.add_argument(
"--no_warmup",
action="store_true",
Expand Down Expand Up @@ -331,8 +283,6 @@ def configure_model_params(config: dict) -> speech_model_params:
model_params.sample_rate = config["sample_rate"]
if config["mic_device"]:
model_params.mic_device = config["mic_device"]
if config["timer_duration"]:
model_params.timer_duration = config["timer_duration"]
if config["no_warmup"]:
model_params.warmup = False

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
SpeechRecognition==3.10.0
openai-whisper==20230314
sounddevice==0.4.6
openai-whisper==20231117
PyAudio==0.2.13
PyYaml==6.0.1
rospkg==1.5.0
81 changes: 42 additions & 39 deletions common/speech/lasr_speech_recognition_whisper/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,51 +1,54 @@
catkin-pkg==0.5.2 # via rospkg
certifi==2023.7.22 # via requests
charset-normalizer==3.2.0 # via requests
cmake==3.27.2 # via triton
distro==1.8.0 # via rospkg
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com

catkin-pkg==1.0.0 # via rospkg
certifi==2024.2.2 # via requests
cffi==1.16.0 # via sounddevice
charset-normalizer==3.3.2 # via requests
distro==1.9.0 # via rospkg
docutils==0.20.1 # via catkin-pkg
ffmpeg-python==0.2.0 # via openai-whisper
filelock==3.12.2 # via torch, triton
future==0.18.3 # via ffmpeg-python
idna==3.4 # via requests
jinja2==3.1.2 # via torch
lit==16.0.6 # via triton
llvmlite==0.40.1 # via numba
markupsafe==2.1.3 # via jinja2
more-itertools==10.1.0 # via openai-whisper
filelock==3.13.1 # via torch, triton
fsspec==2024.2.0 # via torch
idna==3.6 # via requests
jinja2==3.1.3 # via torch
llvmlite==0.42.0 # via numba
markupsafe==2.1.5 # via jinja2
more-itertools==10.2.0 # via openai-whisper
mpmath==1.3.0 # via sympy
networkx==3.1 # via torch
numba==0.57.1 # via openai-whisper
numpy==1.24.4 # via numba, openai-whisper
nvidia-cublas-cu11==11.10.3.66 # via nvidia-cudnn-cu11, nvidia-cusolver-cu11, torch
nvidia-cuda-cupti-cu11==11.7.101 # via torch
nvidia-cuda-nvrtc-cu11==11.7.99 # via torch
nvidia-cuda-runtime-cu11==11.7.99 # via torch
nvidia-cudnn-cu11==8.5.0.96 # via torch
nvidia-cufft-cu11==10.9.0.58 # via torch
nvidia-curand-cu11==10.2.10.91 # via torch
nvidia-cusolver-cu11==11.4.0.1 # via torch
nvidia-cusparse-cu11==11.7.4.91 # via torch
nvidia-nccl-cu11==2.14.3 # via torch
nvidia-nvtx-cu11==11.7.91 # via torch
openai-whisper==20230314 # via -r requirements.in
networkx==3.2.1 # via torch
numba==0.59.0 # via openai-whisper
numpy==1.26.4 # via numba, openai-whisper
nvidia-cublas-cu12==12.1.3.1 # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch
nvidia-cuda-cupti-cu12==12.1.105 # via torch
nvidia-cuda-nvrtc-cu12==12.1.105 # via torch
nvidia-cuda-runtime-cu12==12.1.105 # via torch
nvidia-cudnn-cu12==8.9.2.26 # via torch
nvidia-cufft-cu12==11.0.2.54 # via torch
nvidia-curand-cu12==10.3.2.106 # via torch
nvidia-cusolver-cu12==11.4.5.107 # via torch
nvidia-cusparse-cu12==12.1.0.106 # via nvidia-cusolver-cu12, torch
nvidia-nccl-cu12==2.19.3 # via torch
nvidia-nvjitlink-cu12==12.4.99 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105 # via torch
openai-whisper==20231117 # via -r requirements.in
pyaudio==0.2.13 # via -r requirements.in
pyparsing==3.1.1 # via catkin-pkg
python-dateutil==2.8.2 # via catkin-pkg
pycparser==2.21 # via cffi
pyparsing==3.1.2 # via catkin-pkg
python-dateutil==2.9.0.post0 # via catkin-pkg
pyyaml==6.0.1 # via -r requirements.in, rospkg
regex==2023.8.8 # via tiktoken
regex==2023.12.25 # via tiktoken
requests==2.31.0 # via speechrecognition, tiktoken
rospkg==1.5.0 # via -r requirements.in
six==1.16.0 # via python-dateutil
sounddevice==0.4.6 # via -r requirements.in
speechrecognition==3.10.0 # via -r requirements.in
sympy==1.12 # via torch
tiktoken==0.3.1 # via openai-whisper
torch==2.0.1 # via openai-whisper, triton
tqdm==4.66.1 # via openai-whisper
triton==2.0.0 # via openai-whisper, torch
typing-extensions==4.7.1 # via torch
urllib3==2.0.4 # via requests
wheel==0.41.1 # via nvidia-cublas-cu11, nvidia-cuda-cupti-cu11, nvidia-cuda-runtime-cu11, nvidia-curand-cu11, nvidia-cusparse-cu11, nvidia-nvtx-cu11
tiktoken==0.6.0 # via openai-whisper
torch==2.2.1 # via openai-whisper
tqdm==4.66.2 # via openai-whisper
triton==2.2.0 # via openai-whisper, torch
typing-extensions==4.10.0 # via torch
urllib3==2.2.1 # via requests

# The following packages are considered to be unsafe in a requirements file:
# setuptools
42 changes: 16 additions & 26 deletions common/vector_databases/lasr_vector_databases_faiss/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ project(lasr_vector_databases_faiss)
## Find catkin macros and libraries
## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
## is used, also find other catkin packages
find_package(catkin REQUIRED catkin_virtualenv)
find_package(catkin REQUIRED catkin_virtualenv COMPONENTS
rospy
lasr_vector_databases_msgs
)

## System dependencies are found with CMake's conventions
# find_package(Boost REQUIRED COMPONENTS system)
Expand Down Expand Up @@ -55,8 +58,6 @@ catkin_generate_virtualenv(
## Generate services in the 'srv' folder
# add_service_files(
# FILES
# Service1.srv
# Service2.srv
# )

# Generate actions in the 'action' folder
Expand All @@ -68,8 +69,7 @@ catkin_generate_virtualenv(
# Generate added messages and services with any dependencies listed here
# generate_messages(
# DEPENDENCIES
# actionlib_msgs
# geometry_msgs
# std_msgs
# )

################################################
Expand Down Expand Up @@ -157,22 +157,13 @@ include_directories(

## Mark executable scripts (Python etc.) for installation
## in contrast to setup.py, you can choose the destination
# catkin_install_python(PROGRAMS
# nodes/qualification
# nodes/actions/wait_greet
# nodes/actions/identify
# nodes/actions/greet
# nodes/actions/get_name
# nodes/actions/learn_face
# nodes/actions/get_command
# nodes/actions/guide
# nodes/actions/find_person
# nodes/actions/detect_people
# nodes/actions/receive_object
# nodes/actions/handover_object
# nodes/better_qualification
# DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
# )
catkin_install_python(PROGRAMS
nodes/txt_index_service
nodes/txt_query_service
scripts/test_index_service.py
scripts/test_query_service.py
DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)

## Mark executables for installation
## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
Expand All @@ -196,11 +187,10 @@ include_directories(
# )

## Mark other files for installation (e.g. launch and bag files, etc.)
# install(FILES
# # myfile1
# # myfile2
# DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
# )
install(FILES
requirements.txt
DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
)

#############
## Testing ##
Expand Down
Empty file.
33 changes: 33 additions & 0 deletions common/vector_databases/lasr_vector_databases_faiss/doc/USAGE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
This package currently contains two services, `txt_index_service` and `txt_query_service`, which are used to create and search (respectively) a vector database of natural language sentence embeddings.

# Index Service
The Index service is used to create a [FAISS](https://github.com/facebookresearch/faiss) index object containing a set of sentence embeddings, where each sentence is assumed to be a line in a given `.txt` file. This Index object is saved to disk at a specified location, and can be thought of as a Vector Database.

## Request
The request takes two string parameters: `txt_path` which is the path to the `.txt` file we wish to create sentence embeddings for, where each line in this file is treated as a sentence; and `index_path` which is the path to a `.index` file that will be created by the Service.

## Response
The service returns an empty `TxtIndexResponse` once the index has been created; the response carries no data.

## Example Usage
Please see the `scripts/test_index_service.py` script for a simple example of sending a request to the service.

# Query Service
The query service is used to search the `.index` file created by the Index Service to find the most similar sentences given an input query sentence.

## Request
The request requires four fields:

1. `txt_path` -- this is a `string` that is the path to the txt file that contains the original sentences that the `.index` file was populated with.
2. `index_path` -- this is a `string` that is the path to the `.index` file that was created by the Index Service from the same txt file given in `txt_path`.
3. `query_sentence` -- this is a `string` that is the sentence you wish to query the index with in order to find the most similar sentences.
4. `k` -- this is a `uint8` that is the number of closest sentences you wish to return.

## Response
The response contains two fields:

1. `closest_sentences` -- this is an ordered list of `string`s that contain the closest sentences to the given query sentence.
2. `cosine_similarities` -- this is an ordered list of `float32`s that contain the cosine similarity scores of the closest sentences.

## Example Usage
Please see the `scripts/test_query_service.py` script for a simple example of sending a request to the service.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
import rospy
import numpy as np
from lasr_vector_databases_msgs.srv import TxtIndexRequest, TxtIndexResponse, TxtIndex
from lasr_vector_databases_faiss import (
load_model,
parse_txt_file,
get_sentence_embeddings,
create_vector_database,
)


class TxtIndexService:
    """ROS node exposing a service that builds a vector index from a text file.

    The node registers the ``lasr_faiss/txt_index`` service. Each request
    names a text file to embed and the path the resulting index should be
    created at.
    """

    def __init__(self):
        """Initialise the node, load the embedding model, advertise the service."""
        rospy.init_node("txt_index_service")
        # Load the model BEFORE advertising the service: the callback reads
        # self._sentence_embedding_model, so a request handled while the
        # attribute did not yet exist would fail with AttributeError.
        self._sentence_embedding_model = load_model()
        # Keep a handle on the service so it can be referenced (e.g. shut
        # down) later instead of being an anonymous object.
        self._service = rospy.Service(
            "lasr_faiss/txt_index", TxtIndex, self.execute_cb
        )
        rospy.loginfo("Text index service started")

    def execute_cb(self, req: TxtIndexRequest) -> TxtIndexResponse:
        """Embed the sentences of ``req.txt_path`` and create the index.

        Args:
            req: request carrying ``txt_path`` (the text file to parse into
                sentences) and ``index_path`` (passed to
                ``create_vector_database`` as the index location).

        Returns:
            A default-constructed ``TxtIndexResponse``.
        """
        txt_fp: str = req.txt_path
        sentences_to_embed: list[str] = parse_txt_file(txt_fp)
        sentence_embeddings: np.ndarray = get_sentence_embeddings(
            sentences_to_embed, self._sentence_embedding_model
        )
        index_path: str = req.index_path
        create_vector_database(sentence_embeddings, index_path)
        return TxtIndexResponse()


if __name__ == "__main__":
    # Bring the node up (constructing it also advertises the service), then
    # hand control to rospy until the node is shut down.
    index_service_node = TxtIndexService()
    rospy.spin()
Loading

0 comments on commit 8e00cad

Please sign in to comment.