Commit e06fa7b

Merge branch 'main' into face-skills

jws-1 authored May 7, 2024
2 parents b5930f6 + d8fbf4f
Showing 18 changed files with 390 additions and 104 deletions.
33 changes: 15 additions & 18 deletions common/speech/lasr_speech_recognition_whisper/requirements.txt
@@ -1,22 +1,19 @@
 --extra-index-url https://pypi.ngc.nvidia.com
 --trusted-host pypi.ngc.nvidia.com
 
 catkin-pkg==1.0.0 # via rospkg
 certifi==2024.2.2 # via requests
 cffi==1.16.0 # via sounddevice
 charset-normalizer==3.3.2 # via requests
 distro==1.9.0 # via rospkg
-docutils==0.20.1 # via catkin-pkg
-filelock==3.13.1 # via torch, triton
-fsspec==2024.2.0 # via torch
-idna==3.6 # via requests
-jinja2==3.1.3 # via torch
+docutils==0.21.2 # via catkin-pkg
+filelock==3.14.0 # via torch, triton
+fsspec==2024.3.1 # via torch
+idna==3.7 # via requests
+jinja2==3.1.4 # via torch
 llvmlite==0.42.0 # via numba
 markupsafe==2.1.5 # via jinja2
 more-itertools==10.2.0 # via openai-whisper
 mpmath==1.3.0 # via sympy
-networkx==3.2.1 # via torch
-numba==0.59.0 # via openai-whisper
+networkx==3.3 # via torch
+numba==0.59.1 # via openai-whisper
 numpy==1.26.4 # via numba, openai-whisper
 nvidia-cublas-cu12==12.1.3.1 # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch
 nvidia-cuda-cupti-cu12==12.1.105 # via torch
@@ -27,27 +24,27 @@ nvidia-cufft-cu12==11.0.2.54 # via torch
 nvidia-curand-cu12==10.3.2.106 # via torch
 nvidia-cusolver-cu12==11.4.5.107 # via torch
 nvidia-cusparse-cu12==12.1.0.106 # via nvidia-cusolver-cu12, torch
-nvidia-nccl-cu12==2.19.3 # via torch
-nvidia-nvjitlink-cu12==12.4.99 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
+nvidia-nccl-cu12==2.20.5 # via torch
+nvidia-nvjitlink-cu12==12.4.127 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
 nvidia-nvtx-cu12==12.1.105 # via torch
 openai-whisper==20231117 # via -r requirements.in
 pyaudio==0.2.13 # via -r requirements.in
-pycparser==2.21 # via cffi
+pycparser==2.22 # via cffi
 pyparsing==3.1.2 # via catkin-pkg
 python-dateutil==2.9.0.post0 # via catkin-pkg
 pyyaml==6.0.1 # via -r requirements.in, rospkg
-regex==2023.12.25 # via tiktoken
+regex==2024.4.28 # via tiktoken
 requests==2.31.0 # via speechrecognition, tiktoken
 rospkg==1.5.0 # via -r requirements.in
 six==1.16.0 # via python-dateutil
 sounddevice==0.4.6 # via -r requirements.in
 speechrecognition==3.10.0 # via -r requirements.in
 sympy==1.12 # via torch
 tiktoken==0.6.0 # via openai-whisper
-torch==2.2.1 # via openai-whisper
-tqdm==4.66.2 # via openai-whisper
-triton==2.2.0 # via openai-whisper, torch
-typing-extensions==4.10.0 # via torch
+torch==2.3.0 # via openai-whisper
+tqdm==4.66.4 # via openai-whisper
+triton==2.3.0 # via openai-whisper, torch
+typing-extensions==4.11.0 # via torch
 urllib3==2.2.1 # via requests
 
 # The following packages are considered to be unsafe in a requirements file:
@@ -5,11 +5,13 @@
 import numpy as np
 
 from PIL import Image
+import re
 import tensorflow as tf
 from tf_bodypix.api import download_model, load_model, BodyPixModelPaths
 
 from sensor_msgs.msg import Image as SensorImage
-from lasr_vision_msgs.msg import BodyPixMask, BodyPixPose
+
+from lasr_vision_msgs.msg import BodyPixMask, BodyPixPose, BodyPixKeypoint
 from lasr_vision_msgs.srv import BodyPixDetectionRequest, BodyPixDetectionResponse
 
 import rospkg
@@ -19,6 +21,15 @@
 r = rospkg.RosPack()
 
 
+def snake_to_camel(snake_str):
+    components = snake_str.split("_")
+    return components[0] + "".join(x.title() for x in components[1:])
+
+
+def camel_to_snake(name):
+    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()
+
+
 def load_model_cached(dataset: str) -> None:
     """
     Load a model into cache
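The two name-mangling helpers added above are inverses for simple identifiers: tf-bodypix reports body-part names in camelCase, while the request's part lists use snake_case (as the filter later in detect() implies). A self-contained sketch of the round trip:

    import re


    def snake_to_camel(snake_str):
        components = snake_str.split("_")
        return components[0] + "".join(x.title() for x in components[1:])


    def camel_to_snake(name):
        return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()


    # Part names as reported by tf-bodypix (camelCase) vs. the request (snake_case).
    assert snake_to_camel("left_shoulder") == "leftShoulder"
    assert camel_to_snake("leftShoulder") == "left_shoulder"
    assert camel_to_snake(snake_to_camel("right_upper_arm_front")) == "right_upper_arm_front"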
@@ -41,7 +52,10 @@ def load_model_cached(dataset: str) -> None:
 
 
 def detect(
-    request: BodyPixDetectionRequest, debug_publisher: rospy.Publisher | None
+    request: BodyPixDetectionRequest,
+    debug_publisher: rospy.Publisher = rospy.Publisher(
+        "/bodypix/debug", SensorImage, queue_size=1
+    ),
 ) -> BodyPixDetectionResponse:
     """
     Run BodyPix inference on given detection request
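One side effect of giving debug_publisher a rospy.Publisher default: Python evaluates default arguments once, at function-definition time, so the /bodypix/debug publisher is created as soon as this module is imported. Callers can still supply their own publisher explicitly; a minimal sketch (node and topic names hypothetical):

    import rospy
    from sensor_msgs.msg import Image as SensorImage
    from lasr_vision_msgs.srv import BodyPixDetectionRequest

    rospy.init_node("bodypix_debug_example")  # hypothetical node name
    my_debug = rospy.Publisher("/bodypix/debug_custom", SensorImage, queue_size=1)  # hypothetical topic

    request = BodyPixDetectionRequest()  # request fields (image, confidence, masks, ...) filled in elsewhere
    response = detect(request, debug_publisher=my_debug)  # assumes detect() is imported from this module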
@@ -58,6 +72,7 @@ def detect(
     # run inference
     rospy.loginfo("Running inference")
     result = model.predict_single(img)
+
     mask = result.get_mask(threshold=request.confidence)
     rospy.loginfo("Inference complete")
 
@@ -69,43 +84,13 @@ def detect(
         ).squeeze()
 
         bodypix_mask = BodyPixMask()
-        bodypix_mask.mask = part_mask.flatten().astype(bool)
+        bodypix_mask.mask = part_mask.flatten().astype(bool).tolist()
+        bodypix_mask.shape = list(part_mask.shape)
         masks.append(bodypix_mask)
 
-    # construct poses response and neck coordinates
     poses = result.get_poses()
-    rospy.loginfo(str(poses))
-
-    neck_coordinates = []
-    for pose in poses:
-        left_shoulder_keypoint = pose.keypoints.get(
-            5
-        )  # 5 is the typical index for left shoulder
-        right_shoulder_keypoint = pose.keypoints.get(
-            6
-        )  # 6 is the typical index for right shoulder
-
-        if left_shoulder_keypoint and right_shoulder_keypoint:
-            # If both shoulders are detected, calculate neck as midpoint
-            left_shoulder = left_shoulder_keypoint.position
-            right_shoulder = right_shoulder_keypoint.position
-            neck_x = (left_shoulder.x + right_shoulder.x) / 2
-            neck_y = (left_shoulder.y + right_shoulder.y) / 2
-        elif left_shoulder_keypoint:
-            # Only left shoulder detected, use it as neck coordinate
-            left_shoulder = left_shoulder_keypoint.position
-            neck_x = left_shoulder.x
-            neck_y = left_shoulder.y
-        elif right_shoulder_keypoint:
-            # Only right shoulder detected, use it as neck coordinate
-            right_shoulder = right_shoulder_keypoint.position
-            neck_x = right_shoulder.x
-            neck_y = right_shoulder.y
-
-        pose = BodyPixPose()
-        pose.coord = np.array([neck_x, neck_y]).astype(np.int32)
-        neck_coordinates.append(pose)
 
     # publish to debug topic
     if debug_publisher is not None:
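Since BodyPixMask now ships the mask as a flattened list of booleans alongside its shape, subscribers have to reshape it before use. A minimal sketch, assuming the message fields as defined in this diff:

    import numpy as np


    def mask_to_numpy(bodypix_mask):
        # .mask is the flattened boolean mask; .shape restores its 2-D layout.
        return np.array(bodypix_mask.mask, dtype=bool).reshape(bodypix_mask.shape)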
@@ -122,7 +107,25 @@ def detect(
         )
         debug_publisher.publish(cv2_img.cv2_img_to_msg(coloured_mask))
 
+    output_poses = []
+
+    for pose in poses:
+        pose_msg = BodyPixPose()
+        keypoints_msg = []
+
+        for i, keypoint in pose.keypoints.items():
+
+            if camel_to_snake(keypoint.part) in request.masks[0].parts:
+                keypoint_msg = BodyPixKeypoint()
+                keypoint_msg.xy = [int(keypoint.position.x), int(keypoint.position.y)]
+                keypoint_msg.score = keypoint.score
+                keypoint_msg.part = keypoint.part
+                keypoints_msg.append(keypoint_msg)
+
+        pose_msg.keypoints = keypoints_msg
+        output_poses.append(pose_msg)
+
     response = BodyPixDetectionResponse()
+    response.poses = output_poses
     response.masks = masks
-    response.poses = neck_coordinates
     return response
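With this change each returned pose carries named, scored keypoints instead of a single neck coordinate. A hedged sketch of a client reading them back (the service name and type are assumptions based on the Request/Response names, not confirmed by this diff):

    import rospy
    from lasr_vision_msgs.srv import BodyPixDetection, BodyPixDetectionRequest

    rospy.init_node("bodypix_client_example")  # hypothetical node name
    rospy.wait_for_service("/bodypix/detect")  # assumed service name
    detect_bodypix = rospy.ServiceProxy("/bodypix/detect", BodyPixDetection)

    request = BodyPixDetectionRequest()  # request fields (image, confidence, masks, ...) set elsewhere
    response = detect_bodypix(request)

    for pose in response.poses:
        for kp in pose.keypoints:
            x, y = kp.xy
            rospy.loginfo(f"{kp.part}: ({x}, {y}), score {kp.score:.2f}")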
3 changes: 2 additions & 1 deletion common/vision/lasr_vision_msgs/CMakeLists.txt
@@ -48,6 +48,7 @@ add_message_files(
   Detection.msg
   Detection3D.msg
   BodyPixPose.msg
+  BodyPixKeypoint.msg
   BodyPixMask.msg
   BodyPixMaskRequest.msg
 )
@@ -204,4 +205,4 @@ include_directories(
 # endif()
 
 ## Add folders to be run by python nosetests
-# catkin_add_nosetests(test)
+# catkin_add_nosetests(test)
12 changes: 12 additions & 0 deletions common/vision/lasr_vision_msgs/msg/BodyPixKeypoint.msg
@@ -0,0 +1,12 @@
+# Keypoint.msg
+
+# int number of the parts following
+# https://github.com/de-code/python-tf-bodypix/blob/develop/tf_bodypix/bodypix_js_utils/part_channels.py#L5
+
+string part
+
+# the score of the body part
+float64 score
+
+# the x and y coordinates of the body part
+int32[] xy
4 changes: 2 additions & 2 deletions common/vision/lasr_vision_msgs/msg/BodyPixPose.msg
@@ -1,2 +1,2 @@
-# x and y coordinates
-float32[] coord
+# keypoints
+BodyPixKeypoint[] keypoints
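For reference, the revised messages compose like any other ROS message; a small sketch with hypothetical values:

    from lasr_vision_msgs.msg import BodyPixKeypoint, BodyPixPose

    # Hypothetical keypoint: part name as reported by tf-bodypix, pixel coordinates, confidence.
    kp = BodyPixKeypoint(part="leftShoulder", score=0.91, xy=[320, 180])
    pose = BodyPixPose(keypoints=[kp])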
18 changes: 9 additions & 9 deletions common/vision/lasr_vision_pointing/requirements.txt
@@ -1,24 +1,24 @@
 absl-py==2.1.0 # via mediapipe
 attrs==23.2.0 # via mediapipe
 cffi==1.16.0 # via sounddevice
-contourpy==1.2.0 # via matplotlib
+contourpy==1.2.1 # via matplotlib
 cycler==0.12.1 # via matplotlib
-flatbuffers==24.3.7 # via mediapipe
-fonttools==4.50.0 # via matplotlib
-jax==0.4.25 # via mediapipe
+flatbuffers==24.3.25 # via mediapipe
+fonttools==4.51.0 # via matplotlib
+jax==0.4.26 # via mediapipe
 kiwisolver==1.4.5 # via matplotlib
-matplotlib==3.8.3 # via mediapipe
+matplotlib==3.8.4 # via mediapipe
 mediapipe==0.10.10 # via -r requirements.in
-ml-dtypes==0.3.2 # via jax
+ml-dtypes==0.4.0 # via jax
 numpy==1.26.4 # via contourpy, jax, matplotlib, mediapipe, ml-dtypes, opencv-contrib-python, opt-einsum, scipy
 opencv-contrib-python==4.9.0.80 # via mediapipe
 opt-einsum==3.3.0 # via jax
 packaging==24.0 # via matplotlib
-pillow==10.2.0 # via matplotlib
+pillow==10.3.0 # via matplotlib
 protobuf==3.20.3 # via mediapipe
-pycparser==2.21 # via cffi
+pycparser==2.22 # via cffi
 pyparsing==3.1.2 # via matplotlib
 python-dateutil==2.9.0.post0 # via matplotlib
-scipy==1.12.0 # via jax
+scipy==1.13.0 # via jax
 six==1.16.0 # via python-dateutil
 sounddevice==0.4.6 # via mediapipe
22 changes: 13 additions & 9 deletions common/vision/lasr_vision_pointing/scripts/detect_pointing.py
@@ -37,7 +37,9 @@ def excute(self, req: PointingDirectionRequest) -> PointingDirectionResponse:
             keypoints = self.detect_keypoints(
                 img
             )  # Detect keypoints using MediaPipe
-            direction = self.determine_pointing_direction(keypoints)
+            direction = self.determine_pointing_direction(
+                keypoints, buffer_width=25
+            )
             rospy.loginfo(f"Person detected pointing: {direction}")
 
             # Visualize pointing direction with landmarks
@@ -52,7 +54,7 @@ def excute(self, req: PointingDirectionRequest) -> PointingDirectionResponse:
 
             resp.direction = direction
         else:
-            resp.direction = "Err"
+            resp.direction = "NONE"
 
         self.counter += 1
         return resp
@@ -117,16 +119,16 @@ def determine_pointing_direction(self, keypoints, buffer_width=50):
 
         # Determine pointing direction based on the difference in x-coordinates
         if abs(left_diff - right_diff) < buffer_width:
-            return "Front"
+            return "FORWARDS"
         elif abs(left_diff) > buffer_width and abs(left_diff) > abs(right_diff):
-            return "Left" if left_diff > 0 else "Right"
+            return "LEFT" if left_diff > 0 else "RIGHT"
         elif abs(right_diff) > buffer_width and abs(right_diff) > abs(
             left_diff
         ):
-            return "Right" if right_diff > 0 else "Left"
+            return "RIGHT" if right_diff > 0 else "LEFT"
 
         # Default: Determine direction based on the relative position to the center of the image
-        return "Front"
+        return "NONE"
 
     def visualize_pointing_direction_with_landmarks(
         self, image_path, person_bbox, pointing_direction, keypoints
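The renamed outcomes make the decision rule easier to read: if the two x-displacements agree to within buffer_width pixels, the result is FORWARDS; otherwise, whichever displacement exceeds buffer_width and dominates decides, with its sign picking LEFT or RIGHT; anything else is NONE. A standalone restatement of the rule with a few worked cases (how left_diff and right_diff are computed from the keypoints lies outside this hunk):

    def pointing_direction(left_diff, right_diff, buffer_width=25):
        # left_diff / right_diff: signed x-offsets (pixels) of each hand from its reference keypoint.
        if abs(left_diff - right_diff) < buffer_width:
            return "FORWARDS"
        if abs(left_diff) > buffer_width and abs(left_diff) > abs(right_diff):
            return "LEFT" if left_diff > 0 else "RIGHT"
        if abs(right_diff) > buffer_width and abs(right_diff) > abs(left_diff):
            return "RIGHT" if right_diff > 0 else "LEFT"
        return "NONE"


    assert pointing_direction(10, 12) == "FORWARDS"  # near-symmetric offsets
    assert pointing_direction(60, 5) == "LEFT"       # left hand dominates, positive offset
    assert pointing_direction(5, -40) == "LEFT"      # right hand dominates, negative offset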
@@ -142,12 +144,14 @@
 
         # Calculate endpoint of arrow based on pointing direction
         arrow_length = min(w, h) // 2
-        if pointing_direction == "Left":
+        if pointing_direction == "LEFT":
             endpoint = (center_x - arrow_length, center_y)
-        elif pointing_direction == "Right":
+        elif pointing_direction == "RIGHT":
             endpoint = (center_x + arrow_length, center_y)
-        else:
+        elif pointing_direction == "FORWARDS":
             endpoint = (center_x, center_y)
+        else:
+            return  # No pointing direction detected
 
         # Draw arrow on image
         color = (0, 255, 0)  # Green color
22 changes: 21 additions & 1 deletion skills/config/motions.yaml
@@ -1,6 +1,11 @@
 play_motion:
   motions:
-    reach_arm:
+    reach_arm_vertical_gripper:
+      joints: [arm_1_joint, arm_2_joint, arm_3_joint, arm_4_joint, arm_5_joint, arm_6_joint, arm_7_joint]
+      points:
+        - positions: [1.61, -0.93, -3.14, 1.83, -1.58, -0.53, 0.00]
+          time_from_start: 0.0
+    reach_arm_horizontal_gripper:
       joints: [arm_1_joint, arm_2_joint, arm_3_joint, arm_4_joint, arm_5_joint, arm_6_joint, arm_7_joint]
       points:
         - positions: [1.61, -0.93, -3.14, 1.83, -1.58, -0.53, 0.00]
@@ -19,4 +24,19 @@ play_motion:
       joints: [torso_lift_joint, head_1_joint, head_2_joint]
       points:
         - positions: [0.25, 0.0, 0.0]
           time_from_start: 0.0
+    look_left:
+      joints: [head_1_joint, head_2_joint]
+      points:
+        - positions: [0.5, 0.0]
+          time_from_start: 0.0
+    look_right:
+      joints: [head_1_joint, head_2_joint]
+      points:
+        - positions: [-0.5, 0.0]
+          time_from_start: 0.0
+    look_centre:
+      joints: [head_1_joint, head_2_joint]
+      points:
+        - positions: [0.0, 0.0]
+          time_from_start: 0.0
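The new look_left / look_right / look_centre entries are head-only motions in the same format as the existing ones. They would typically be triggered through the play_motion action server; a hedged sketch using the standard play_motion_msgs interface (node name hypothetical):

    import actionlib
    import rospy
    from play_motion_msgs.msg import PlayMotionAction, PlayMotionGoal

    rospy.init_node("look_around_example")  # hypothetical node name

    client = actionlib.SimpleActionClient("play_motion", PlayMotionAction)
    client.wait_for_server()

    goal = PlayMotionGoal()
    goal.motion_name = "look_left"  # any motion key from the YAML above
    goal.skip_planning = True       # simple joint-space motion, no planning needed
    client.send_goal(goal)
    client.wait_for_result()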
2 changes: 2 additions & 0 deletions skills/src/lasr_skills/__init__.py
@@ -19,3 +19,5 @@
 from .clip_vqa import QueryImage
 from .detect_faces import DetectFaces
 from .recognise import Recognise
+from .detect_pointing import DetectPointingDirection
+from .detect_gesture import DetectGesture
(Diffs for the remaining changed files were not loaded.)