Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPSR QA & Repeat states #140

Merged
merged 24 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5d6e21c
data: add 2023 robocup gpsr xml files
m-barker Mar 3, 2024
4cd6361
feat: xml parser for gpsr q/a task component
m-barker Mar 3, 2024
4c049d6
fix: require install of requirements.tt
m-barker Mar 3, 2024
0673914
feat: add backend for creating & querying vector db
m-barker Mar 4, 2024
e8d898a
feat: add sentence embedding utils
m-barker Mar 4, 2024
b4a4ce8
feat: service to create text FAISS index
m-barker Mar 5, 2024
3d3007c
feat: text query service for FAISS
m-barker Mar 5, 2024
898ed1d
feat: working FAISS text services
m-barker Mar 5, 2024
04ddae3
feat: test scripts for FAISS services
m-barker Mar 5, 2024
7321696
docs: add documenation for FAISS vector service
m-barker Mar 5, 2024
86b964b
chore: remove xml files
m-barker Mar 5, 2024
db0750f
feat: xml question answer state
m-barker Mar 6, 2024
9d8a9f1
feat: separate package for vector db messages
m-barker Mar 6, 2024
b012e23
feat: working state machine for gpsr QA task
m-barker Mar 6, 2024
014c6e9
feat: tts question answering
m-barker Mar 6, 2024
e234f9d
feat: working launch file and quesiton answering with TTS
m-barker Mar 6, 2024
d726542
fix: integrate @jws-1 review suggestions
m-barker Mar 11, 2024
397ac6f
feat: add listen state
m-barker Mar 11, 2024
85f5439
chore: create gpsr commands sub folder
m-barker Mar 11, 2024
2c8f257
chore: properly fix ALSA errors
m-barker Mar 11, 2024
d6dde82
feat: speech and voice skills
m-barker Mar 11, 2024
e41b4dd
feat: Q/A skill using new speech/voice skills
m-barker Mar 11, 2024
d7a9b22
fix: working new question answer state machine
m-barker Mar 11, 2024
c2690f1
Merge branch 'main' into gpsr_question_answer
jws-1 Mar 11, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

import os
import sounddevice # needed to remove ALSA error messages
import argparse
from typing import Optional
from dataclasses import dataclass
Expand All @@ -16,29 +16,6 @@ import lasr_speech_recognition_msgs.msg # type: ignore
from std_msgs.msg import String # type: ignore
from lasr_speech_recognition_whisper import load_model # type: ignore

# Error handler to remove ALSA error messages taken from:
# https://stackoverflow.com/questions/7088672/pyaudio-working-but-spits-out-error-messages-each-time/17673011#17673011

from ctypes import *
from contextlib import contextmanager

ERROR_HANDLER_FUNC = CFUNCTYPE(None, c_char_p, c_int, c_char_p, c_int, c_char_p)


def py_error_handler(filename, line, function, err, fmt):
pass


c_error_handler = ERROR_HANDLER_FUNC(py_error_handler)


@contextmanager
def noalsaerr():
asound = cdll.LoadLibrary("libasound.so")
asound.snd_lib_error_set_handler(c_error_handler)
yield
asound.snd_lib_error_set_handler(None)


@dataclass
class speech_model_params:
Expand Down Expand Up @@ -87,40 +64,25 @@ class TranscribeSpeechAction(object):
self._transcription_server = rospy.Publisher(
"/live_speech_transcription", String, queue_size=10
)
with noalsaerr():
self._model = load_model(
self._model_params.model_name,
self._model_params.device,
self._model_params.warmup,
)
# Configure the speech recogniser object and adjust for ambient noise
self.recogniser = self._configure_recogniser(ambient_adj=True)
# Setup the action server and register execution callback
self._action_server = actionlib.SimpleActionServer(
self._action_name,
lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
execute_cb=self.execute_cb,
auto_start=False,
)
self._action_server.register_preempt_callback(self.prempt_cb)
# Setup the timer for adjusting the microphone for ambient noise every x seconds
self._timer_duration = self._model_params.timer_duration
self._timer = rospy.Timer(
rospy.Duration(self._timer_duration), self._timer_cb
)
self._listening = False

self._action_server.start()
self._model = load_model(
self._model_params.model_name,
self._model_params.device,
self._model_params.warmup,
)
# Configure the speech recogniser object and adjust for ambient noise
self.recogniser = self._configure_recogniser(ambient_adj=True)
# Setup the action server and register execution callback
self._action_server = actionlib.SimpleActionServer(
self._action_name,
lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
execute_cb=self.execute_cb,
auto_start=False,
)
self._action_server.register_preempt_callback(self.prempt_cb)
self._listening = False

def _timer_cb(self, _) -> None:
return
"""Adjusts the microphone for ambient noise, unless the action server is listening."""
if self._listening:
return
rospy.loginfo("Adjusting microphone for ambient noise...")
with noalsaerr():
with self._configure_microphone() as source:
self.recogniser.adjust_for_ambient_noise(source)
self._action_server.start()

def _reset_timer(self) -> None:
"""Resets the timer for adjusting the microphone for ambient noise."""
Expand Down Expand Up @@ -194,17 +156,13 @@ class TranscribeSpeechAction(object):
rospy.loginfo("Request Received")
if self._action_server.is_preempt_requested():
return
# Since we are about to listen, reset the timer for adjusting the microphone for ambient noise
# as this assumes self_timer_duration seconds of silence before adjusting
self._reset_timer()
with noalsaerr():
with self._configure_microphone() as src:
self._listening = True
wav_data = self.recogniser.listen(
src,
timeout=self._model_params.start_timeout,
phrase_time_limit=self._model_params.end_timeout,
).get_wav_data()
with self._configure_microphone() as src:
self._listening = True
wav_data = self.recogniser.listen(
src,
timeout=self._model_params.start_timeout,
phrase_time_limit=self._model_params.end_timeout,
).get_wav_data()
# Magic number 32768.0 is the maximum value of a 16-bit signed integer
float_data = (
np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
Expand Down Expand Up @@ -293,12 +251,6 @@ def parse_args() -> dict:
default=None,
help="Microphone device index or name",
)
parser.add_argument(
"--timer_duration",
type=int,
default=20,
help="Number of seconds of silence before the ambient noise adjustment is called.",
)
parser.add_argument(
"--no_warmup",
action="store_true",
Expand Down Expand Up @@ -331,8 +283,6 @@ def configure_model_params(config: dict) -> speech_model_params:
model_params.sample_rate = config["sample_rate"]
if config["mic_device"]:
model_params.mic_device = config["mic_device"]
if config["timer_duration"]:
model_params.timer_duration = config["timer_duration"]
if config["no_warmup"]:
model_params.warmup = False

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
SpeechRecognition==3.10.0
openai-whisper==20230314
sounddevice==0.4.6
openai-whisper==20231117
PyAudio==0.2.13
PyYaml==6.0.1
rospkg==1.5.0
81 changes: 42 additions & 39 deletions common/speech/lasr_speech_recognition_whisper/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,51 +1,54 @@
catkin-pkg==0.5.2 # via rospkg
certifi==2023.7.22 # via requests
charset-normalizer==3.2.0 # via requests
cmake==3.27.2 # via triton
distro==1.8.0 # via rospkg
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com

catkin-pkg==1.0.0 # via rospkg
certifi==2024.2.2 # via requests
cffi==1.16.0 # via sounddevice
charset-normalizer==3.3.2 # via requests
distro==1.9.0 # via rospkg
docutils==0.20.1 # via catkin-pkg
ffmpeg-python==0.2.0 # via openai-whisper
filelock==3.12.2 # via torch, triton
future==0.18.3 # via ffmpeg-python
idna==3.4 # via requests
jinja2==3.1.2 # via torch
lit==16.0.6 # via triton
llvmlite==0.40.1 # via numba
markupsafe==2.1.3 # via jinja2
more-itertools==10.1.0 # via openai-whisper
filelock==3.13.1 # via torch, triton
fsspec==2024.2.0 # via torch
idna==3.6 # via requests
jinja2==3.1.3 # via torch
llvmlite==0.42.0 # via numba
markupsafe==2.1.5 # via jinja2
more-itertools==10.2.0 # via openai-whisper
mpmath==1.3.0 # via sympy
networkx==3.1 # via torch
numba==0.57.1 # via openai-whisper
numpy==1.24.4 # via numba, openai-whisper
nvidia-cublas-cu11==11.10.3.66 # via nvidia-cudnn-cu11, nvidia-cusolver-cu11, torch
nvidia-cuda-cupti-cu11==11.7.101 # via torch
nvidia-cuda-nvrtc-cu11==11.7.99 # via torch
nvidia-cuda-runtime-cu11==11.7.99 # via torch
nvidia-cudnn-cu11==8.5.0.96 # via torch
nvidia-cufft-cu11==10.9.0.58 # via torch
nvidia-curand-cu11==10.2.10.91 # via torch
nvidia-cusolver-cu11==11.4.0.1 # via torch
nvidia-cusparse-cu11==11.7.4.91 # via torch
nvidia-nccl-cu11==2.14.3 # via torch
nvidia-nvtx-cu11==11.7.91 # via torch
openai-whisper==20230314 # via -r requirements.in
networkx==3.2.1 # via torch
numba==0.59.0 # via openai-whisper
numpy==1.26.4 # via numba, openai-whisper
nvidia-cublas-cu12==12.1.3.1 # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch
nvidia-cuda-cupti-cu12==12.1.105 # via torch
nvidia-cuda-nvrtc-cu12==12.1.105 # via torch
nvidia-cuda-runtime-cu12==12.1.105 # via torch
nvidia-cudnn-cu12==8.9.2.26 # via torch
nvidia-cufft-cu12==11.0.2.54 # via torch
nvidia-curand-cu12==10.3.2.106 # via torch
nvidia-cusolver-cu12==11.4.5.107 # via torch
nvidia-cusparse-cu12==12.1.0.106 # via nvidia-cusolver-cu12, torch
nvidia-nccl-cu12==2.19.3 # via torch
nvidia-nvjitlink-cu12==12.4.99 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105 # via torch
openai-whisper==20231117 # via -r requirements.in
pyaudio==0.2.13 # via -r requirements.in
pyparsing==3.1.1 # via catkin-pkg
python-dateutil==2.8.2 # via catkin-pkg
pycparser==2.21 # via cffi
pyparsing==3.1.2 # via catkin-pkg
python-dateutil==2.9.0.post0 # via catkin-pkg
pyyaml==6.0.1 # via -r requirements.in, rospkg
regex==2023.8.8 # via tiktoken
regex==2023.12.25 # via tiktoken
requests==2.31.0 # via speechrecognition, tiktoken
rospkg==1.5.0 # via -r requirements.in
six==1.16.0 # via python-dateutil
sounddevice==0.4.6 # via -r requirements.in
speechrecognition==3.10.0 # via -r requirements.in
sympy==1.12 # via torch
tiktoken==0.3.1 # via openai-whisper
torch==2.0.1 # via openai-whisper, triton
tqdm==4.66.1 # via openai-whisper
triton==2.0.0 # via openai-whisper, torch
typing-extensions==4.7.1 # via torch
urllib3==2.0.4 # via requests
wheel==0.41.1 # via nvidia-cublas-cu11, nvidia-cuda-cupti-cu11, nvidia-cuda-runtime-cu11, nvidia-curand-cu11, nvidia-cusparse-cu11, nvidia-nvtx-cu11
tiktoken==0.6.0 # via openai-whisper
torch==2.2.1 # via openai-whisper
tqdm==4.66.2 # via openai-whisper
triton==2.2.0 # via openai-whisper, torch
typing-extensions==4.10.0 # via torch
urllib3==2.2.1 # via requests

# The following packages are considered to be unsafe in a requirements file:
# setuptools
42 changes: 16 additions & 26 deletions common/vector_databases/lasr_vector_databases_faiss/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ project(lasr_vector_databases_faiss)
## Find catkin macros and libraries
## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
## is used, also find other catkin packages
find_package(catkin REQUIRED catkin_virtualenv)
find_package(catkin REQUIRED catkin_virtualenv COMPONENTS
rospy
lasr_vector_databases_msgs
)

## System dependencies are found with CMake's conventions
# find_package(Boost REQUIRED COMPONENTS system)
Expand Down Expand Up @@ -55,8 +58,6 @@ catkin_generate_virtualenv(
## Generate services in the 'srv' folder
# add_service_files(
# FILES
# Service1.srv
# Service2.srv
# )

# Generate actions in the 'action' folder
Expand All @@ -68,8 +69,7 @@ catkin_generate_virtualenv(
# Generate added messages and services with any dependencies listed here
# generate_messages(
# DEPENDENCIES
# actionlib_msgs
# geometry_msgs
# std_msgs
# )

################################################
Expand Down Expand Up @@ -157,22 +157,13 @@ include_directories(

## Mark executable scripts (Python etc.) for installation
## in contrast to setup.py, you can choose the destination
# catkin_install_python(PROGRAMS
# nodes/qualification
# nodes/actions/wait_greet
# nodes/actions/identify
# nodes/actions/greet
# nodes/actions/get_name
# nodes/actions/learn_face
# nodes/actions/get_command
# nodes/actions/guide
# nodes/actions/find_person
# nodes/actions/detect_people
# nodes/actions/receive_object
# nodes/actions/handover_object
# nodes/better_qualification
# DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
# )
catkin_install_python(PROGRAMS
nodes/txt_index_service
nodes/txt_query_service
scripts/test_index_service.py
scripts/test_query_service.py
DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)

## Mark executables for installation
## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
Expand All @@ -196,11 +187,10 @@ include_directories(
# )

## Mark other files for installation (e.g. launch and bag files, etc.)
# install(FILES
# # myfile1
# # myfile2
# DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
# )
install(FILES
requirements.txt
DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
)

#############
## Testing ##
Expand Down
Empty file.
33 changes: 33 additions & 0 deletions common/vector_databases/lasr_vector_databases_faiss/doc/USAGE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
This package currently contains two services `txt_index_service` and `txt_query_service`. These services are used to create and search (respectively) a vector database of natural language sentence embeddings.

# Index Service
The Index service is used to create a [FAISS](https://github.com/facebookresearch/faiss) index object containing a set of sentence embeddings, where each sentence is assumed to be a line in a given `.txt` file. This Index object is saved to disk at a specified location, and can be thought of as a Vector Database.

## Request
The request takes two string parameters: `txt_path` which is the path to the `.txt` file we wish to create sentence embeddings for, where each line in this file is treated as a sentence; and `index_path` which is the path to a `.index` file that will be created by the Service.

## Response
The response message is empty (it carries no fields); the service call returns once the index has been created.

## Example Usage
Please see the `scripts/test_index_service.py` script for a simple example of sending a request to the service.

# Query Service
The query service is used to search the `.index` file created by the Index Service to find the most similar sentences given an input query sentence.

## Request
The request requires four fields:

1. `txt_path` -- this is a `string` that is the path to the txt file that contains the original sentences that the `.index` file was populated with.
2. `index_path` -- this is a `string` that is the path to the `.index` file that was created by the Index Service from the same txt file that `txt_path` points to.
3. `query_sentence` -- this is a `string` that is the sentence that you wish to query the index with and find the most similar sentence.
4. `k` -- this is a `uint8` that is the number of closest sentences you wish to return.

## Response
The response contains two fields:

1. `closest_sentences` -- this is an ordered list of `string`s that contain the closest sentences to the given query sentence.
2. `cosine_similarities` -- this is an ordered list of `float32`s that contain the cosine similarity scores of the closest sentences.

## Example Usage
Please see the `scripts/test_query_service.py` script for a simple example of sending a request to the service.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
import rospy
import numpy as np
from lasr_vector_databases_msgs.srv import TxtIndexRequest, TxtIndexResponse, TxtIndex
from lasr_vector_databases_faiss import (
load_model,
parse_txt_file,
get_sentence_embeddings,
create_vector_database,
)


class TxtIndexService:
    """ROS node exposing a service that builds a vector index from a text file.

    Each request names a text file of sentences and a destination path for
    the index. The sentences are embedded with the model loaded at start-up
    and handed to ``create_vector_database`` together with the index path.
    """

    def __init__(self):
        """Initialise the node, load the embedding model, then advertise the service."""
        rospy.init_node("txt_index_service")
        # Load the model *before* advertising the service: rospy begins
        # dispatching requests as soon as the Service object is constructed,
        # so a request that arrived while the model was still loading would
        # otherwise find ``_sentence_embedding_model`` missing in execute_cb.
        self._sentence_embedding_model = load_model()
        # Keep an explicit handle on the service for the lifetime of the node.
        self._service = rospy.Service(
            "lasr_faiss/txt_index", TxtIndex, self.execute_cb
        )
        rospy.loginfo("Text index service started")

    def execute_cb(self, req: TxtIndexRequest) -> TxtIndexResponse:
        """Embed the sentences in ``req.txt_path`` and build the index at ``req.index_path``.

        Args:
            req: request carrying ``txt_path`` (the text file to parse into
                sentences) and ``index_path`` (passed to
                ``create_vector_database`` as the index location).

        Returns:
            An empty ``TxtIndexResponse``.
        """
        txt_fp: str = req.txt_path
        sentences_to_embed: list[str] = parse_txt_file(txt_fp)
        sentence_embeddings: np.ndarray = get_sentence_embeddings(
            sentences_to_embed, self._sentence_embedding_model
        )
        index_path: str = req.index_path
        create_vector_database(sentence_embeddings, index_path)
        return TxtIndexResponse()


if __name__ == "__main__":
    # Constructing the service object initialises the ROS node and
    # advertises the service; spin() then blocks until shutdown.
    index_service = TxtIndexService()
    rospy.spin()
Loading
Loading