Merge remote-tracking branch 'upstream/receptionist-polishing'
Benteng Ma committed Jul 5, 2024
2 parents d0f17b4 + 925ab2f commit d0d4ade
Showing 41 changed files with 1,736 additions and 611 deletions.
@@ -34,7 +34,7 @@ def main(args: dict) -> None:
output_dir = args["output_dir"]

r = sr.Recognizer()
with sr.Microphone(device_index=13,sample_rate=16000) as source:
with sr.Microphone(device_index=13, sample_rate=16000) as source:
print("Say something!")
audio = r.listen(source, timeout=5, phrase_time_limit=5)
print("Finished listening")
@@ -23,7 +23,7 @@
import rospkg

# model cache
# preload resnet 50 model so that it won't waste the time
# preload resnet 50 model so that it won't waste the time
# doing that in the middle of the task.
loaded_models = {
"resnet50": load_model(download_model(BodyPixModelPaths.RESNET50_FLOAT_STRIDE_16))
@@ -171,7 +171,11 @@ def detect_keypoints(
BodyPixKeypoint(keypoint_name=keypoint.part, x=x, y=y)
)
detected_keypoints_normalized.append(
BodyPixKeypointNormalized(keypoint_name=keypoint.part, x=float(x)/mask.shape[1], y=float(y)/mask.shape[0])
BodyPixKeypointNormalized(
keypoint_name=keypoint.part,
x=float(x) / mask.shape[1],
y=float(y) / mask.shape[0],
)
)

# publish to debug topic
@@ -201,4 +205,6 @@ def detect_keypoints(
)
debug_publisher.publish(cv2_img.cv2_img_to_msg(coloured_mask))

return BodyPixKeypointDetectionResponse(keypoints=detected_keypoints, normalized_keypoints=detected_keypoints_normalized)
return BodyPixKeypointDetectionResponse(
keypoints=detected_keypoints, normalized_keypoints=detected_keypoints_normalized
)
1 change: 1 addition & 0 deletions common/vision/lasr_vision_clip/CMakeLists.txt
@@ -158,6 +158,7 @@ include_directories(
catkin_install_python(PROGRAMS
nodes/vqa
nodes/img_encoder.py
nodes/learn_face.py
examples/encode_image_example.py
DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)
107 changes: 107 additions & 0 deletions common/vision/lasr_vision_clip/examples/test_person_detector.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
from lasr_vision_msgs.srv import (
ClipLearnFaceRequest,
ClipLearnFace,
ClipLearnFaceResponse,
CroppedDetection,
CroppedDetectionRequest,
CroppedDetectionResponse,
ClipRecogniseFaceRequest,
ClipRecogniseFace,
ClipRecogniseFaceResponse,
)
from lasr_vision_msgs.msg import CDRequest
from sensor_msgs.msg import Image
import cv2
from cv2_img import msg_to_cv2_img, cv2_img_to_msg
import rospy
from typing import List
import numpy as np


if __name__ == "__main__":
rospy.init_node("clip_encoder_test")
cropped_detector = rospy.ServiceProxy("/vision/cropped_detection", CroppedDetection)
learn_face_service = rospy.ServiceProxy("/vision/learn_face", ClipLearnFace)
detect_face_service = rospy.ServiceProxy(
"/vision/face_detection", ClipRecogniseFace
)
debug_pub = rospy.Publisher("/clip/recognise/debug", Image, queue_size=1)
input_str = ""
while True:
input_str = input("Please enter your name and hit enter to learn your face: ")
if input_str == "done":
break
person_1_imgs = []
for i in range(10):
cropped_response = cropped_detector(
CroppedDetectionRequest(
[
CDRequest(
method="centered",
use_mask=True,
object_names=["person"],
yolo_model="yolov8x-seg.pt",
yolo_model_confidence=0.8,
yolo_nms_threshold=0.4,
)
]
)
)
rospy.sleep(0.1)
try:
person_1_imgs.append(cropped_response.responses[0].cropped_imgs[0])
except:
continue

learn_face_service(ClipLearnFaceRequest(raw_imgs=person_1_imgs, name=input_str))

# Run inference
while not rospy.is_shutdown():
cropped_response = cropped_detector(
CroppedDetectionRequest(
[
CDRequest(
method="centered",
use_mask=True,
object_names=["person"],
yolo_model="yolov8x-seg.pt",
yolo_model_confidence=0.8,
yolo_nms_threshold=0.4,
)
]
)
)

try:
names = []
xywhs = []
for cropped_img in cropped_response.responses[0].cropped_imgs:
response = detect_face_service(
ClipRecogniseFaceRequest(image_raw=cropped_img)
)
names.append(response.name)
xywhs.append(response.xywh)
rospy.loginfo(f"Recognised face: {response.name}")

# Add names to image
cv2_img = msg_to_cv2_img(cropped_response.responses[0].masked_img)
for name, xywh in zip(names, xywhs):
x, y, w, h = xywh[0], xywh[1], xywh[2], xywh[3]
cv2.rectangle(cv2_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
cv2.putText(
cv2_img,
name,
(x, y),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(0, 255, 0),
2,
cv2.LINE_AA,
)
debug_pub.publish(cv2_img_to_msg(cv2_img))
except Exception as e:
rospy.loginfo(e)
continue

rospy.spin()
8 changes: 8 additions & 0 deletions common/vision/lasr_vision_clip/nodes/learn_face.py
@@ -0,0 +1,8 @@
import rospy
from lasr_vision_clip import FaceService


if __name__ == "__main__":
rospy.init_node("clip_vqa_service")
face_service = FaceService()
rospy.spin()
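
The new learn_face node simply instantiates FaceService, which advertises the /vision/learn_face and /vision/face_detection services added in this commit. As a rough client-side sketch (not part of this commit; it assumes the node above is running and that face_images is a list of sensor_msgs/Image face crops collected elsewhere), learning a face reduces to a single service call:

#!/usr/bin/env python3
import rospy
from lasr_vision_msgs.srv import ClipLearnFace, ClipLearnFaceRequest

rospy.init_node("learn_face_client")
# Wait for the service advertised by FaceService.
rospy.wait_for_service("/vision/learn_face")
learn_face = rospy.ServiceProxy("/vision/learn_face", ClipLearnFace)
# face_images: a list of sensor_msgs/Image face crops gathered elsewhere
# (e.g. via the /vision/cropped_detection service, as in the example above).
learn_face(ClipLearnFaceRequest(raw_imgs=face_images, name="guest"))

The full end-to-end flow, including recognition, is shown in examples/test_person_detector.py above.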
4 changes: 3 additions & 1 deletion common/vision/lasr_vision_clip/requirements.in
@@ -1,2 +1,4 @@
facenet-pytorch
sentence-transformers
opencv-python
opencv-python
opencv-contrib-python
17 changes: 10 additions & 7 deletions common/vision/lasr_vision_clip/requirements.txt
@@ -1,5 +1,6 @@
certifi==2024.7.4 # via requests
charset-normalizer==3.3.2 # via requests
facenet-pytorch==2.6.0 # via -r requirements.in
filelock==3.15.4 # via huggingface-hub, torch, transformers, triton
fsspec==2024.6.1 # via huggingface-hub, torch
huggingface-hub==0.23.4 # via sentence-transformers, tokenizers, transformers
@@ -9,7 +10,7 @@ joblib==1.4.2 # via scikit-learn
markupsafe==2.1.5 # via jinja2
mpmath==1.3.0 # via sympy
networkx==3.2.1 # via torch
numpy==1.26.4 # via opencv-python, scikit-learn, scipy, sentence-transformers, transformers
numpy==1.26.4 # via facenet-pytorch, opencv-contrib-python, opencv-python, scikit-learn, scipy, sentence-transformers, torchvision, transformers
nvidia-cublas-cu12==12.1.3.1 # via nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch
nvidia-cuda-cupti-cu12==12.1.105 # via torch
nvidia-cuda-nvrtc-cu12==12.1.105 # via torch
@@ -19,25 +20,27 @@ nvidia-cufft-cu12==11.0.2.54 # via torch
nvidia-curand-cu12==10.3.2.106 # via torch
nvidia-cusolver-cu12==11.4.5.107 # via torch
nvidia-cusparse-cu12==12.1.0.106 # via nvidia-cusolver-cu12, torch
nvidia-nccl-cu12==2.20.5 # via torch
nvidia-nccl-cu12==2.19.3 # via torch
nvidia-nvjitlink-cu12==12.5.82 # via nvidia-cusolver-cu12, nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105 # via torch
opencv-contrib-python==4.10.0.84 # via -r requirements.in
opencv-python==4.10.0.84 # via -r requirements.in
packaging==24.1 # via huggingface-hub, transformers
pillow==10.4.0 # via sentence-transformers
pillow==10.2.0 # via facenet-pytorch, sentence-transformers, torchvision
pyyaml==6.0.1 # via huggingface-hub, transformers
regex==2024.5.15 # via transformers
requests==2.32.3 # via huggingface-hub, transformers
requests==2.32.3 # via facenet-pytorch, huggingface-hub, transformers
safetensors==0.4.3 # via transformers
scikit-learn==1.5.1 # via sentence-transformers
scipy==1.13.1 # via scikit-learn, sentence-transformers
sentence-transformers==3.0.1 # via -r requirements.in
sympy==1.12.1 # via torch
threadpoolctl==3.5.0 # via scikit-learn
tokenizers==0.19.1 # via transformers
torch==2.3.1 # via sentence-transformers
tqdm==4.66.4 # via huggingface-hub, sentence-transformers, transformers
torch==2.2.2 # via facenet-pytorch, sentence-transformers, torchvision
torchvision==0.17.2 # via facenet-pytorch
tqdm==4.66.4 # via facenet-pytorch, huggingface-hub, sentence-transformers, transformers
transformers==4.42.3 # via sentence-transformers
triton==2.3.1 # via torch
triton==2.2.0 # via torch
typing-extensions==4.12.2 # via huggingface-hub, torch
urllib3==2.2.2 # via requests
@@ -1 +1,2 @@

from .clip_utils import load_model, encode_img, load_face_model, infer
from .learn_face import FaceService
26 changes: 21 additions & 5 deletions common/vision/lasr_vision_clip/src/lasr_vision_clip/clip_utils.py
@@ -6,7 +6,6 @@
import numpy as np
from copy import deepcopy
from sentence_transformers import SentenceTransformer, util

from sensor_msgs.msg import Image


@@ -41,12 +40,29 @@ def run_clip(
txt = model.encode(labels)
img = model.encode(img)
with torch.no_grad():
torch
cos_scores = util.cos_sim(img, txt)
return cos_scores


def encode_img(model: SentenceTransformer, img_msg: Image) -> np.ndarray:
def load_face_model():
from transformers import AutoImageProcessor, AutoModel

processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = AutoModel.from_pretrained("google/vit-base-patch16-224").to("cuda")

return processor, model


def infer(image, processor, model):
image = cv2_img.msg_to_cv2_img(image)
inputs = processor(image, return_tensors="pt").to("cuda")
outputs = model(**inputs)
# squeeze and flatten
outputs.pooler_output = outputs.pooler_output.squeeze(0).flatten()
return outputs.pooler_output.detach().cpu().numpy()


def encode_img(model, img_msg: Image) -> np.ndarray:
"""Run the CLIP model.
Args:
@@ -56,8 +72,8 @@ def encode_img(model: SentenceTransformer, img_msg: Image) -> np.ndarray:
Returns:
np.ndarray: the image embedding
"""

return model.encode(cv2_img.msg_to_pillow_img(img_msg))
img = cv2_img.msg_to_cv2_img(img_msg)
return model(img.unsqueeze(0)).detach().numpy()


def query_image_stream(
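
load_face_model and infer wrap the google/vit-base-patch16-224 encoder from Hugging Face: infer converts a sensor_msgs/Image to a cv2 image, runs the processor and model on "cuda", and returns the flattened pooler output as a numpy vector. Below is a minimal sketch (not part of this commit) of how these helpers could be used to compare two face crops by embedding distance, which is essentially what FaceService does; it assumes a CUDA-capable GPU and a running ROS master, and uses random placeholder crops in place of real detections:

#!/usr/bin/env python3
import numpy as np
import rospy
from cv2_img import cv2_img_to_msg
from lasr_vision_clip import load_face_model, infer

rospy.init_node("face_embedding_sketch")
processor, model = load_face_model()

def embed(crop: np.ndarray) -> np.ndarray:
    # infer() expects a sensor_msgs/Image, so convert the BGR crop first.
    return infer(cv2_img_to_msg(crop), processor, model)

# Placeholder crops standing in for real face detections.
crop_a = (np.random.rand(112, 112, 3) * 255).astype(np.uint8)
crop_b = (np.random.rand(112, 112, 3) * 255).astype(np.uint8)

# Smaller distance means more similar faces; FaceService.face_detection
# returns the learned name with the minimum distance to the query embedding.
distance = np.linalg.norm(embed(crop_a) - embed(crop_b))
print(f"Embedding distance: {distance}")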
112 changes: 112 additions & 0 deletions common/vision/lasr_vision_clip/src/lasr_vision_clip/learn_face.py
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
import os
import cv2
import rospy
from typing import Dict
import numpy as np
import rospkg
from lasr_vision_msgs.srv import (
ClipRecogniseFaceRequest,
ClipRecogniseFaceResponse,
ClipLearnFace,
ClipRecogniseFace,
ClipLearnFaceRequest,
ClipLearnFaceResponse,
)
from sensor_msgs.msg import Image
from cv2_img import msg_to_cv2_img, cv2_img_to_msg
from lasr_vision_clip import load_face_model, encode_img, infer


class FaceService:
def __init__(self, similarity_threshold: float = 6.0) -> None:
self._face_classifier = cv2.CascadeClassifier(
os.path.join(
rospkg.RosPack().get_path("lasr_vision_clip"),
"data",
"haarcascade_frontalface_default.xml",
)
)
self.learned_faces: Dict[str, np.ndarray] = {}
self._similarity_threshold = similarity_threshold
self.processor, self.model = load_face_model()
self._face_pub = rospy.Publisher("/clip/face_detection", Image, queue_size=1)

rospy.Service("/vision/face_detection", ClipRecogniseFace, self.face_detection)
rospy.Service("/vision/learn_face", ClipLearnFace, self.learn_face)

rospy.loginfo("Face detector service started")

def _detect_faces(self, img: np.ndarray):
faces = self._face_classifier.detectMultiScale(
img, 1.1, minNeighbors=5, minSize=(10, 10)
)
return faces

def face_detection(
self, req: ClipRecogniseFaceRequest
) -> ClipRecogniseFaceResponse:
img = req.image_raw
cv2_img = msg_to_cv2_img(img)
# cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_BGR2GRAY)
try:
faces = self._detect_faces(cv2_img)

# Assume only one face in image
encoded_face = None
closest_name = "Unknown"
min_dist = float("inf")
min_xywh = None
for x, y, w, h in faces:
cv2_face = cv2_img[y : y + h, x : x + w]
# cv2_face = cv2.cvtColor(cv2_face, cv2.COLOR_GRAY2BGR)
face_msg = cv2_img_to_msg(cv2_face)
self._face_pub.publish(face_msg)
encoded_face = infer(
cv2_img_to_msg(cv2_img), self.processor, self.model
)
encoded_face = encoded_face.flatten()
for name, face in self.learned_faces.items():
distance = np.linalg.norm(encoded_face - face)
rospy.loginfo(f"Distance to {name} : {distance}")
if distance < min_dist:
min_dist = distance
min_xywh = [x, y, w, h]
closest_name = name
return ClipRecogniseFaceResponse(
name=closest_name, distance=min_dist, xywh=min_xywh
)
except Exception as e:
rospy.loginfo(e)
return ClipRecogniseFaceResponse(name="Unknown", distance=None, xywh=None)

def learn_face(self, request: ClipLearnFaceRequest) -> ClipLearnFaceResponse:
imgs = request.raw_imgs

embedding_vectors = []
for img in imgs:
cv2_img = msg_to_cv2_img(img)
# cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_BGR2GRAY)
rospy.loginfo(f"Image shape: {cv2_img.shape}")
try:
faces = self._detect_faces(cv2_img)
except Exception as e: # No face detected
rospy.loginfo(e)
continue
for x, y, w, h in faces:
cv2_face = cv2_img[y : y + h, x : x + w]
# cv2_face = cv2.cvtColor(cv2_face, cv2.COLOR_GRAY2BGR)
face_msg = cv2_img_to_msg(cv2_face)
self._face_pub.publish(face_msg)
encoded_face = infer(
cv2_img_to_msg(cv2_img), self.processor, self.model
)
encoded_face = encoded_face.flatten()
embedding_vectors.append(encoded_face)

embedding_vectors = np.array(embedding_vectors)
embedding_vector = np.mean(embedding_vectors, axis=0)
self.learned_faces[request.name] = embedding_vector
rospy.loginfo(f"Learned {request.name}")

return ClipLearnFaceResponse()