Commit bab2d71: Merge branch 'main' into find-person-skill
jws-1 committed Apr 18, 2024
2 parents 9d06de8 + a43e982
Showing 46 changed files with 1,400 additions and 874 deletions.
@@ -256,7 +256,8 @@ def parse_args() -> dict:
action="store_true",
help="Disable warming up the model by running inference on a test file.",
)
args, unknown = parser.parse_known_args()

args,unknown = parser.parse_known_args()
return vars(args)


@@ -300,8 +301,6 @@ def configure_whisper_cache() -> None:
if __name__ == "__main__":
configure_whisper_cache()
config = parse_args()
rospy.init_node("speech_transcription_node")
server = TranscribeSpeechAction(
config["action_name"], configure_model_params(config)
)
rospy.init_node("transcribe_speech_server")
server = TranscribeSpeechAction("transcribe_speech", configure_model_params(config))
rospy.spin()
34 changes: 10 additions & 24 deletions common/vision/lasr_vision_clip/CMakeLists.txt
@@ -19,7 +19,7 @@ find_package(catkin REQUIRED catkin_virtualenv)
catkin_python_setup()
catkin_generate_virtualenv(
INPUT_REQUIREMENTS requirements.in
PYTHON_INTERPRETER python3.9
PYTHON_INTERPRETER python3.10
)
################################################
## Declare ROS messages, services and actions ##
@@ -48,15 +48,14 @@ catkin_generate_virtualenv(
## Generate messages in the 'msg' folder
# add_message_files(
# FILES
# Message1.msg
# Message2.msg
# VqaResult.msg
# VqaResult.msg
# )

## Generate services in the 'srv' folder
# Generate services in the 'srv' folder
# add_service_files(
# FILES
# Service1.srv
# Service2.srv
# Vqa.srv
# )

# Generate actions in the 'action' folder
@@ -68,8 +67,7 @@ catkin_generate_virtualenv(
# Generate added messages and services with any dependencies listed here
# generate_messages(
# DEPENDENCIES
# actionlib_msgs
# geometry_msgs
# sensor_msgs
# )

################################################
@@ -157,22 +155,10 @@ include_directories(

## Mark executable scripts (Python etc.) for installation
## in contrast to setup.py, you can choose the destination
# catkin_install_python(PROGRAMS
# nodes/qualification
# nodes/actions/wait_greet
# nodes/actions/identify
# nodes/actions/greet
# nodes/actions/get_name
# nodes/actions/learn_face
# nodes/actions/get_command
# nodes/actions/guide
# nodes/actions/find_person
# nodes/actions/detect_people
# nodes/actions/receive_object
# nodes/actions/handover_object
# nodes/better_qualification
# DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
# )
catkin_install_python(PROGRAMS
nodes/vqa
DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)

## Mark executables for installation
## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html
File renamed without changes.
50 changes: 50 additions & 0 deletions common/vision/lasr_vision_clip/nodes/vqa
@@ -0,0 +1,50 @@
#!/usr/bin/env python3
import rospy
from typing import List
from lasr_vision_clip.clip_utils import load_model, query_image_stream
from lasr_vision_msgs.srv import VqaRequest, VqaResponse, Vqa
from sensor_msgs.msg import Image


class VqaService:
def __init__(self, model_device: str = "cuda") -> None:
"""Caches the clip model.
Args:
model_device (str, optional): device to load model onto. Defaults to "cuda".
"""

self._model = load_model(model_device)
self._debug_pub = rospy.Publisher("/clip_vqa/debug", Image, queue_size=1)
rospy.loginfo("Clip VQA service started")

def query_clip(self, request: VqaRequest) -> VqaResponse:
"""Queries CLIP from the robot's image stream and returns
the most likely answer and cosine similarity score.
Args:
request (VqaRequest): service request containing the possible answers.
Returns:
VqaResponse
"""
possible_answers = request.possible_answers
answer, cos_score, annotated_img = query_image_stream(
self._model, possible_answers, annotate=True
)

self._debug_pub.publish(annotated_img)

result = VqaResponse()
result.answer = answer
rospy.loginfo(f"Answer: {answer}")
result.similarity = float(cos_score)
return result


if __name__ == "__main__":
rospy.init_node("clip_vqa_service")
service = VqaService()
rospy.Service("/clip_vqa/query_service", Vqa, service.query_clip)
rospy.spin()
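
For illustration, a minimal client sketch (not part of this commit) showing how the new /clip_vqa/query_service service could be called; the node name and the candidate answers below are hypothetical:

#!/usr/bin/env python3
import rospy
from lasr_vision_msgs.srv import Vqa, VqaRequest

# Hypothetical example client; assumes the VQA service node above is running.
rospy.init_node("clip_vqa_example_client")
rospy.wait_for_service("/clip_vqa/query_service")
query = rospy.ServiceProxy("/clip_vqa/query_service", Vqa)
# The candidate answers are illustrative only.
response = query(VqaRequest(possible_answers=["a person waving", "a person sitting down"]))
rospy.loginfo(f"Most likely answer: {response.answer} (similarity: {response.similarity:.2f})")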
5 changes: 5 additions & 0 deletions common/vision/lasr_vision_clip/package.xml
@@ -50,6 +50,11 @@
<!-- <doc_depend>doxygen</doc_depend> -->
<buildtool_depend>catkin</buildtool_depend>
<build_depend>catkin_virtualenv</build_depend>
<build_depend>message_generation</build_depend>
<exec_depend>message_runtime</exec_depend>
<build_depend>sensor_msgs</build_depend>
<exec_depend>sensor_msgs</exec_depend>
<depend>lasr_vision_msgs</depend>
<!-- The export tag contains other, unspecified, tags -->
<export>
<!-- Other tools can request additional information be placed here -->
54 changes: 22 additions & 32 deletions common/vision/lasr_vision_clip/requirements.txt
@@ -1,22 +1,15 @@
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com

certifi==2024.2.2 # via requests
charset-normalizer==3.3.2 # via requests
click==8.1.7 # via nltk
clip @ git+https://github.com/openai/CLIP.git # via -r requirements.in
filelock==3.13.1 # via huggingface-hub, torch, transformers, triton
fsspec==2024.2.0 # via huggingface-hub, torch
ftfy==6.1.3 # via -r requirements.in, clip
huggingface-hub==0.20.3 # via sentence-transformers, tokenizers, transformers
idna==3.6 # via requests
filelock==3.13.4 # via huggingface-hub, torch, transformers, triton
fsspec==2024.3.1 # via huggingface-hub, torch
huggingface-hub==0.22.2 # via sentence-transformers, tokenizers, transformers
idna==3.7 # via requests
jinja2==3.1.3 # via torch
joblib==1.3.2 # via nltk, scikit-learn
joblib==1.4.0 # via scikit-learn
markupsafe==2.1.5 # via jinja2
mpmath==1.3.0 # via sympy
networkx==3.2.1 # via torch
nltk==3.8.1 # via sentence-transformers
numpy==1.26.3 # via opencv-python, scikit-learn, scipy, sentence-transformers, torchvision, transformers
numpy==1.26.4 # via opencv-python, scikit-learn, scipy, sentence-transformers, transformers
nvidia-cublas-cu12==12.1.3.1 # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch
nvidia-cuda-cupti-cu12==12.1.105 # via torch
nvidia-cuda-nvrtc-cu12==12.1.105 # via torch
@@ -27,27 +20,24 @@ nvidia-curand-cu12==10.3.2.106 # via torch
nvidia-cusolver-cu12==11.4.5.107 # via torch
nvidia-cusparse-cu12==12.1.0.106 # via nvidia-cusolver-cu12, torch
nvidia-nccl-cu12==2.19.3 # via torch
nvidia-nvjitlink-cu12==12.3.101 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
nvidia-nvjitlink-cu12==12.4.127 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105 # via torch
opencv-python==4.9.0.80 # via -r requirements.in
packaging==23.2 # via huggingface-hub, transformers
pillow==10.2.0 # via sentence-transformers, torchvision
packaging==24.0 # via huggingface-hub, transformers
pillow==10.3.0 # via sentence-transformers
pyyaml==6.0.1 # via huggingface-hub, transformers
regex==2023.12.25 # via -r requirements.in, clip, nltk, transformers
requests==2.31.0 # via huggingface-hub, torchvision, transformers
safetensors==0.4.2 # via transformers
scikit-learn==1.4.0 # via sentence-transformers
scipy==1.12.0 # via scikit-learn, sentence-transformers
sentence-transformers==2.3.1 # via -r requirements.in
sentencepiece==0.1.99 # via sentence-transformers
regex==2024.4.16 # via transformers
requests==2.31.0 # via huggingface-hub, transformers
safetensors==0.4.3 # via transformers
scikit-learn==1.4.2 # via sentence-transformers
scipy==1.13.0 # via scikit-learn, sentence-transformers
sentence-transformers==2.7.0 # via -r requirements.in
sympy==1.12 # via torch
threadpoolctl==3.2.0 # via scikit-learn
tokenizers==0.15.1 # via transformers
torch==2.2.0 # via clip, sentence-transformers, torchvision
torchvision==0.17.0 # via clip
tqdm==4.66.1 # via -r requirements.in, clip, huggingface-hub, nltk, sentence-transformers, transformers
transformers==4.37.2 # via sentence-transformers
threadpoolctl==3.4.0 # via scikit-learn
tokenizers==0.15.2 # via transformers
torch==2.2.2 # via sentence-transformers
tqdm==4.66.2 # via huggingface-hub, sentence-transformers, transformers
transformers==4.39.3 # via sentence-transformers
triton==2.2.0 # via torch
typing-extensions==4.9.0 # via huggingface-hub, torch
urllib3==2.2.0 # via requests
wcwidth==0.2.13 # via ftfy
typing-extensions==4.11.0 # via huggingface-hub, torch
urllib3==2.2.1 # via requests
@@ -0,0 +1 @@

@@ -97,4 +97,4 @@ def query_image_stream(
)

img = cv2_img.cv2_img_to_msg(cv2_im)
return answers[max_score], cos_scores, img
return answers[max_score], cos_scores[0, max_score], img
1 change: 1 addition & 0 deletions common/vision/lasr_vision_msgs/CMakeLists.txt
@@ -63,6 +63,7 @@ add_service_files(
TorchFaceFeatureDetection.srv
Recognise.srv
LearnFace.srv
Vqa.srv
PointingDirection.srv
)

9 changes: 9 additions & 0 deletions common/vision/lasr_vision_msgs/srv/Vqa.srv
@@ -0,0 +1,9 @@
string[] possible_answers

---

# most likely answer
string answer

# cosine similarity
float32 similarity
1 change: 1 addition & 0 deletions skills/src/lasr_skills/__init__.py
@@ -17,3 +17,4 @@
from .receive_object import ReceiveObject
from .handover_object import HandoverObject
from .ask_and_listen import AskAndListen
from .clip_vqa import QueryImage
101 changes: 77 additions & 24 deletions skills/src/lasr_skills/ask_and_listen.py
@@ -6,39 +6,92 @@


class AskAndListen(smach.StateMachine):
def __init__(self, question: Union[str, None] = None):
if question is not None:
def __init__(
self,
tts_phrase: Union[str, None] = None,
tts_phrase_format_str: Union[str, None] = None,
):

if tts_phrase is not None:
smach.StateMachine.__init__(
self,
outcomes=["succeeded", "failed"],
output_keys=["transcribed_speech"],
)
else:
with self:
smach.StateMachine.add(
"SAY",
Say(text=tts_phrase),
transitions={
"succeeded": "LISTEN",
"aborted": "failed",
"preempted": "failed",
},
)
smach.StateMachine.add(
"LISTEN",
Listen(),
transitions={
"succeeded": "succeeded",
"aborted": "failed",
"preempted": "failed",
},
remapping={"sequence": "transcribed_speech"},
)
elif tts_phrase_format_str is not None:
smach.StateMachine.__init__(
self,
outcomes=["succeeded", "failed"],
input_keys=["tts_phrase"],
output_keys=["transcribed_speech"],
input_keys=["tts_phrase_placeholders"],
)
with self:
smach.StateMachine.add(
"SAY",
Say(format_str=tts_phrase_format_str),
transitions={
"succeeded": "LISTEN",
"aborted": "failed",
"preempted": "failed",
},
remapping={"placeholders": "tts_phrase_placeholders"},
)
smach.StateMachine.add(
"LISTEN",
Listen(),
transitions={
"succeeded": "succeeded",
"aborted": "failed",
"preempted": "failed",
},
remapping={"sequence": "transcribed_speech"},
)

with self:
smach.StateMachine.add(
"SAY",
Say(question),
transitions={
"succeeded": "LISTEN",
"aborted": "failed",
"preempted": "failed",
},
remapping={"text": "tts_phrase"} if question is None else {},
)
smach.StateMachine.add(
"LISTEN",
Listen(),
transitions={
"succeeded": "succeeded",
"aborted": "failed",
"preempted": "failed",
},
remapping={"sequence": "transcribed_speech"},
else:
smach.StateMachine.__init__(
self,
outcomes=["succeeded", "failed"],
output_keys=["transcribed_speech"],
input_keys=["tts_phrase"],
)
with self:
smach.StateMachine.add(
"SAY",
Say(),
transitions={
"succeeded": "LISTEN",
"aborted": "failed",
"preempted": "failed",
},
remapping={"text": "tts_phrase"},
)
smach.StateMachine.add(
"LISTEN",
Listen(),
transitions={
"succeeded": "succeeded",
"aborted": "failed",
"preempted": "failed",
},
remapping={"sequence": "transcribed_speech"},
)
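
A rough usage sketch of the refactored AskAndListen skill (not from this commit); the phrase, node name, and outcome wiring are illustrative, and it assumes the speech servers used by Say and Listen are running:

import rospy
import smach
from lasr_skills import AskAndListen

rospy.init_node("ask_and_listen_example")
sm = smach.StateMachine(outcomes=["succeeded", "failed"])
with sm:
    # Static phrase: uses the tts_phrase constructor argument.
    smach.StateMachine.add(
        "ASK_NAME",
        AskAndListen(tts_phrase="What is your name?"),
        transitions={"succeeded": "succeeded", "failed": "failed"},
    )
outcome = sm.execute()
# On success, sm.userdata.transcribed_speech should hold the recognised reply.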
24 changes: 24 additions & 0 deletions skills/src/lasr_skills/clip_vqa.py
@@ -0,0 +1,24 @@
import smach_ros
from lasr_vision_msgs.srv import Vqa, VqaRequest

from typing import List, Union


class QueryImage(smach_ros.ServiceState):

def __init__(self, possible_answers: Union[None, List[str]] = None):

if possible_answers is not None:
super(QueryImage, self).__init__(
"/clip_vqa/query_service",
Vqa,
request=VqaRequest(possible_answers=possible_answers),
response_slots=["answer", "similarity"],
)
else:
super(QueryImage, self).__init__(
"/clip_vqa/query_service",
Vqa,
request_slots=["possible_answers"],
response_slots=["answer", "similarity"],
)
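
A minimal sketch (illustrative, not part of the commit) of how the new QueryImage state might be composed; the answer list is made up and the /clip_vqa/query_service node is assumed to be running:

import rospy
import smach
from lasr_skills import QueryImage

rospy.init_node("query_image_example")
sm = smach.StateMachine(outcomes=["succeeded", "failed"])
with sm:
    smach.StateMachine.add(
        "QUERY_IMAGE",
        QueryImage(possible_answers=["the person is standing", "the person is seated"]),
        transitions={"succeeded": "succeeded", "aborted": "failed", "preempted": "failed"},
    )
sm.execute()
# sm.userdata.answer and sm.userdata.similarity are filled from the service response slots.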