diff --git a/pyroengine/vision.py b/pyroengine/vision.py index 50f5f3d..5b2b684 100644 --- a/pyroengine/vision.py +++ b/pyroengine/vision.py @@ -11,7 +11,7 @@ from typing import Optional, Tuple from urllib.request import urlretrieve -import ncnn +import ncnn # type: ignore import numpy as np import onnxruntime from huggingface_hub import HfApi # type: ignore[import-untyped] @@ -148,7 +148,6 @@ def prep_process(self, pil_img: Image.Image) -> Tuple[np.ndarray, Tuple[int, int - The resized and normalized image of shape (1, C, H, W). - Padding information as a tuple of integers (pad_height, pad_width). """ - np_img, pad = letterbox(np.array(pil_img), self.imgsz) # Applies letterbox resize with padding if self.format == "ncnn": @@ -156,59 +155,69 @@ def prep_process(self, pil_img: Image.Image) -> Tuple[np.ndarray, Tuple[int, int mean = [0, 0, 0] std = [1 / 255, 1 / 255, 1 / 255] np_img.substract_mean_normalize(mean=mean, norm=std) - else: - - np_img = np.expand_dims(np_img.astype("float"), axis=0) # Add batch dimension + np_img = np.expand_dims(np_img.astype("float32"), axis=0) # Add batch dimension np_img = np.ascontiguousarray(np_img.transpose((0, 3, 1, 2))) # Convert from BHWC to BCHW format - np_img = np_img.astype("float32") / 255 # Normalize to [0, 1] + np_img /= 255.0 # Normalize to [0, 1] return np_img, pad - def post_process(self, pred: np.ndarray, pad: int) -> Tuple[np.ndarray, Tuple[int, int]]: + def post_process(self, pred: np.ndarray, pad: Tuple[int, int]) -> np.ndarray: + """Post-process model predictions. - # Drop low conf for speed-up - pred = pred[:, pred[-1, :] > self.conf] - # Post processing + Args: + pred: Raw predictions from the model. + pad: Padding information as (left_pad, top_pad). + + Returns: + Processed predictions as a numpy array. + """ + pred = pred[:, pred[-1, :] > self.conf] # Drop low-confidence predictions pred = np.transpose(pred) pred = xywh2xyxy(pred) - # Sort by confidence - pred = pred[pred[:, 4].argsort()] + pred = pred[pred[:, 4].argsort()] # Sort by confidence pred = nms(pred) - pred = pred[::-1] + pred = pred[::-1] # Reverse for highest confidence first - # Normalize preds if len(pred) > 0: - # Remove padding - left_pad, top_pad = pad + left_pad, top_pad = pad # Unpack the tuple pred[:, :4:2] -= left_pad pred[:, 1:4:2] -= top_pad pred[:, :4:2] /= self.imgsz - 2 * left_pad pred[:, 1:4:2] /= self.imgsz - 2 * top_pad pred = np.clip(pred, 0, 1) - pred = np.reshape(pred, (-1, 5)) else: - pred = np.zeros((0, 5)) # normalize output + pred = np.zeros((0, 5)) # Return empty prediction array return pred def __call__(self, pil_img: Image.Image, occlusion_mask: Optional[np.ndarray] = None) -> np.ndarray: + """Run the classifier on an input image. + Args: + pil_img: The input PIL image. + occlusion_mask: Optional occlusion mask to exclude certain areas. + + Returns: + Processed predictions. + """ np_img, pad = self.prep_process(pil_img) if self.format == "ncnn": - extractor = self.model.create_extractor() extractor.set_light_mode(True) extractor.input("in0", np_img) pred = ncnn.Mat() extractor.extract("out0", pred) pred = np.asarray(pred) - else: pred = self.ort_session.run(["output0"], {"images": np_img})[0][0] - pred = self.post_process(pred, pad) + # Convert pad to a tuple if required + if isinstance(pad, list): + pad = tuple(pad) + + pred = self.post_process(pred, pad) # Ensure pad is passed as a tuple # drop big detections pred = np.clip(pred, 0, 1)