YOLOv5 Base

YOLOv5 base detector class.

YOLOV5Base

Bases: BaseDetector

Base detector class for YOLOv5. This class provides utility methods for loading the model, generating results, and performing single and batch image detections.

Source code in PytorchWildlife/models/detection/ultralytics_based/yolov5_base.py
class YOLOV5Base(BaseDetector):
    """
    Base detector class for YOLOv5. This class provides utility methods for
    loading the model, generating results, and performing single and batch image detections.
    """
    def __init__(self, weights=None, device="cpu", url=None, transform=None):
        """
        Initialize the YOLOv5 detector.

        Args:
            weights (str, optional): 
                Path to the model weights. Defaults to None.
            device (str, optional): 
                Device for model inference. Defaults to "cpu".
            url (str, optional): 
                URL to fetch the model weights. Defaults to None.
            transform (callable, optional):
                Optional transform to be applied on the image. Defaults to None.
        """
        self.transform = transform
        super().__init__(weights=weights, device=device, url=url)
        self._load_model(weights, device, url)

    def _load_model(self, weights=None, device="cpu", url=None):
        """
        Load the YOLOv5 model weights.

        Args:
            weights (str, optional): 
                Path to the model weights. Defaults to None.
            device (str, optional): 
                Device for model inference. Defaults to "cpu".
            url (str, optional): 
                URL to fetch the model weights. Defaults to None.
        Raises:
            Exception: If weights are not provided.
        """
        if weights:
            checkpoint = torch.load(weights, map_location=torch.device(device))
        elif url:
            checkpoint = load_state_dict_from_url(url, map_location=torch.device(device))
        else:
            raise Exception("Need weights for inference.")
        self.model = checkpoint["model"].float().fuse().eval().to(device)

        if not self.transform:
            self.transform = pw_trans.MegaDetector_v5_Transform(target_size=self.IMAGE_SIZE,
                                                                stride=self.STRIDE)

    def results_generation(self, preds, img_id, id_strip=None) -> dict:
        """
        Generate results for detection based on model predictions.

        Args:
            preds (numpy.ndarray): 
                Model predictions.
            img_id (str): 
                Image identifier.
            id_strip (str, optional): 
                Strip specific characters from img_id. Defaults to None.

        Returns:
            dict: Dictionary containing image ID, detections, and labels.
        """
        results = {"img_id": str(img_id).strip(id_strip)}
        results["detections"] = sv.Detections(
            xyxy=preds[:, :4],
            confidence=preds[:, 4],
            class_id=preds[:, 5].astype(int)
        )
        results["labels"] = [
            f"{self.CLASS_NAMES[class_id]} {confidence:0.2f}"
            for confidence, class_id in zip(results["detections"].confidence, results["detections"].class_id)
        ]
        return results

    def single_image_detection(self, img, img_path=None, det_conf_thres=0.2, id_strip=None) -> dict:
        """
        Perform detection on a single image.

        Args:
            img (str or ndarray): 
                Image path or image as a numpy ndarray.
            img_path (str, optional): 
                Image path or identifier.
            det_conf_thres (float, optional): 
                Confidence threshold for predictions. Defaults to 0.2.
            id_strip (str, optional): 
                Characters to strip from img_id. Defaults to None.

        Returns:
            dict: Detection results.
        """
        if isinstance(img, str):
            if img_path is None:
                img_path = img
            img = np.array(Image.open(img_path).convert("RGB"))
        img_size = img.shape  # Original (H, W, C) shape, needed later for coordinate scaling
        img = self.transform(img)

        preds = self.model(img.unsqueeze(0).to(self.device))[0]
        preds = torch.cat(non_max_suppression(prediction=preds, conf_thres=det_conf_thres), dim=0).cpu().numpy()
        # scale_coords was renamed to scale_boxes in newer YOLOv5 releases
        preds[:, :4] = scale_boxes([self.IMAGE_SIZE] * 2, preds[:, :4], img_size).round()
        res = self.results_generation(preds, img_path, id_strip)

        normalized_coords = [[x1 / img_size[1], y1 / img_size[0], x2 / img_size[1], y2 / img_size[0]] for x1, y1, x2, y2 in preds[:, :4]]
        res["normalized_coords"] = normalized_coords

        return res

    def batch_image_detection(self, data_path, batch_size: int = 16, det_conf_thres: float = 0.2, id_strip: str = None) -> list[dict]:
        """
        Perform detection on a batch of images.

        Args:
            data_path (str): Path containing all images for inference.
            batch_size (int, optional): Batch size for inference. Defaults to 16.
            det_conf_thres (float, optional): Confidence threshold for predictions. Defaults to 0.2.
            id_strip (str, optional): Characters to strip from img_id. Defaults to None.

        Returns:
            list[dict]: List of detection results for all images.
        """

        dataset = pw_data.DetectionImageFolder(
            data_path,
            transform=self.transform,
        )

        # Creating a DataLoader for batching and parallel processing of the images
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, 
                            pin_memory=True, num_workers=0, drop_last=False)

        results = []
        with tqdm(total=len(loader)) as pbar:
            for imgs, paths, sizes in loader:
                imgs = imgs.to(self.device)
                predictions = self.model(imgs)[0].detach().cpu()
                predictions = non_max_suppression(predictions, conf_thres=det_conf_thres)

                batch_results = []
                for i, pred in enumerate(predictions):
                    if pred.size(0) == 0:  
                        continue
                    pred = pred.numpy()
                    size = sizes[i].numpy()
                    path = paths[i]
                    # scale_coords was renamed to scale_boxes in newer YOLOv5 releases
                    pred[:, :4] = scale_boxes([self.IMAGE_SIZE] * 2, pred[:, :4], size).round()
                    # Normalize the coordinates for timelapse compatibility
                    normalized_coords = [[x1 / size[1], y1 / size[0], x2 / size[1], y2 / size[0]] for x1, y1, x2, y2 in pred[:, :4]]
                    res = self.results_generation(pred, path, id_strip)
                    res["normalized_coords"] = normalized_coords
                    batch_results.append(res)
                pbar.update(1)
                results.extend(batch_results)
            return results
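
YOLOV5Base is meant to be subclassed rather than instantiated directly: the methods above read IMAGE_SIZE, STRIDE, and CLASS_NAMES from the concrete detector class (as the MegaDetector v5 wrapper does). A minimal sketch of such a subclass, assuming the module path shown above; the attribute values and weight path are illustrative, not taken from the library:

from PytorchWildlife.models.detection.ultralytics_based.yolov5_base import YOLOV5Base

class ToyAnimalDetector(YOLOV5Base):
    # Class attributes the base class expects; the values here are illustrative.
    IMAGE_SIZE = 1280   # target size for the default MegaDetector_v5_Transform
    STRIDE = 64         # stride passed to the default transform
    CLASS_NAMES = {0: "animal", 1: "person", 2: "vehicle"}  # maps class_id -> label

detector = ToyAnimalDetector(weights="weights/toy_detector.pt", device="cpu")  # hypothetical checkpoint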

__init__(weights=None, device='cpu', url=None, transform=None)

Initialize the YOLOv5 detector.

Parameters:

Name       Type      Description                                                        Default
weights    str       Path to the model weights. Defaults to None.                       None
device     str       Device for model inference. Defaults to "cpu".                     'cpu'
url        str       URL to fetch the model weights. Defaults to None.                  None
transform  callable  Optional transform to be applied on the image. Defaults to None.   None
Source code in PytorchWildlife/models/detection/ultralytics_based/yolov5_base.py
def __init__(self, weights=None, device="cpu", url=None, transform=None):
    """
    Initialize the YOLOv5 detector.

    Args:
        weights (str, optional): 
            Path to the model weights. Defaults to None.
        device (str, optional): 
            Device for model inference. Defaults to "cpu".
        url (str, optional): 
            URL to fetch the model weights. Defaults to None.
        transform (callable, optional):
            Optional transform to be applied on the image. Defaults to None.
    """
    self.transform = transform
    super().__init__(weights=weights, device=device, url=url)
    self._load_model(weights, device, url)
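
For illustration, the two ways of supplying weights at construction time, reusing the hypothetical ToyAnimalDetector subclass sketched above (the URL is a placeholder); omitting both raises the Exception from _load_model:

import torch

# Local checkpoint, loaded via torch.load in _load_model
det = ToyAnimalDetector(weights="weights/toy_detector.pt", device="cpu")

# Remote checkpoint, fetched via load_state_dict_from_url; use a GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
det = ToyAnimalDetector(url="https://example.com/toy_detector.pt", device=device)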

batch_image_detection(data_path, batch_size=16, det_conf_thres=0.2, id_strip=None)

Perform detection on a batch of images.

Parameters:

Name            Type   Description                                              Default
data_path       str    Path containing all images for inference.                required
batch_size      int    Batch size for inference. Defaults to 16.                16
det_conf_thres  float  Confidence threshold for predictions. Defaults to 0.2.   0.2
id_strip        str    Characters to strip from img_id. Defaults to None.       None

Returns:

Type        Description
list[dict]  List of detection results for all images.

Source code in PytorchWildlife/models/detection/ultralytics_based/yolov5_base.py
def batch_image_detection(self, data_path, batch_size: int = 16, det_conf_thres: float = 0.2, id_strip: str = None) -> list[dict]:
    """
    Perform detection on a batch of images.

    Args:
        data_path (str): Path containing all images for inference.
        batch_size (int, optional): Batch size for inference. Defaults to 16.
        det_conf_thres (float, optional): Confidence threshold for predictions. Defaults to 0.2.
        id_strip (str, optional): Characters to strip from img_id. Defaults to None.

    Returns:
        list[dict]: List of detection results for all images.
    """

    dataset = pw_data.DetectionImageFolder(
        data_path,
        transform=self.transform,
    )

    # Creating a DataLoader for batching and parallel processing of the images
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, 
                        pin_memory=True, num_workers=0, drop_last=False)

    results = []
    with tqdm(total=len(loader)) as pbar:
        for imgs, paths, sizes in loader:
            imgs = imgs.to(self.device)
            predictions = self.model(imgs)[0].detach().cpu()
            predictions = non_max_suppression(predictions, conf_thres=det_conf_thres)

            batch_results = []
            for i, pred in enumerate(predictions):
                if pred.size(0) == 0:  
                    continue
                pred = pred.numpy()
                size = sizes[i].numpy()
                path = paths[i]
                # scale_coords was renamed to scale_boxes in newer YOLOv5 releases
                pred[:, :4] = scale_boxes([self.IMAGE_SIZE] * 2, pred[:, :4], size).round()
                # Normalize the coordinates for timelapse compatibility
                normalized_coords = [[x1 / size[1], y1 / size[0], x2 / size[1], y2 / size[0]] for x1, y1, x2, y2 in pred[:, :4]]
                res = self.results_generation(pred, path, id_strip)
                res["normalized_coords"] = normalized_coords
                batch_results.append(res)
            pbar.update(1)
            results.extend(batch_results)
        return results
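
A usage sketch under the same assumptions as the earlier examples (det is the hypothetical ToyAnimalDetector instance; the folder path is a placeholder). Note that images with no detections above the threshold are skipped, so the returned list can be shorter than the number of images in the folder:

results = det.batch_image_detection(
    data_path="camera_trap_images/",  # placeholder folder, read by pw_data.DetectionImageFolder
    batch_size=8,
    det_conf_thres=0.2,
)
for res in results:
    n = len(res["detections"].xyxy)  # sv.Detections built by results_generation
    print(f'{res["img_id"]}: {n} detection(s), labels={res["labels"]}')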

results_generation(preds, img_id, id_strip=None)

Generate results for detection based on model predictions.

Parameters:

Name      Type     Description                                                Default
preds     ndarray  Model predictions.                                         required
img_id    str      Image identifier.                                          required
id_strip  str      Strip specific characters from img_id. Defaults to None.   None

Returns:

Type  Description
dict  Dictionary containing image ID, detections, and labels.

Source code in PytorchWildlife/models/detection/ultralytics_based/yolov5_base.py
def results_generation(self, preds, img_id, id_strip=None) -> dict:
    """
    Generate results for detection based on model predictions.

    Args:
        preds (numpy.ndarray): 
            Model predictions.
        img_id (str): 
            Image identifier.
        id_strip (str, optional): 
            Strip specific characters from img_id. Defaults to None.

    Returns:
        dict: Dictionary containing image ID, detections, and labels.
    """
    results = {"img_id": str(img_id).strip(id_strip)}
    results["detections"] = sv.Detections(
        xyxy=preds[:, :4],
        confidence=preds[:, 4],
        class_id=preds[:, 5].astype(int)
    )
    results["labels"] = [
        f"{self.CLASS_NAMES[class_id]} {confidence:0.2f}"
        for confidence, class_id in zip(results["detections"].confidence, results["detections"].class_id)
    ]
    return results
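
To make the returned structure concrete, a small sketch with a hand-built prediction array (one row per detection: x1, y1, x2, y2, confidence, class_id); the printed label assumes a CLASS_NAMES mapping like the hypothetical one above:

import numpy as np

preds = np.array([[10.0, 20.0, 110.0, 220.0, 0.91, 0]])  # a single detection with class_id 0
res = det.results_generation(preds, img_id="cam01/IMG_0001.JPG")

print(res["img_id"])      # "cam01/IMG_0001.JPG"
print(res["detections"])  # sv.Detections(xyxy=..., confidence=..., class_id=...)
print(res["labels"])      # e.g. ["animal 0.91"] given CLASS_NAMES = {0: "animal", ...}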

single_image_detection(img, img_path=None, det_conf_thres=0.2, id_strip=None)

Perform detection on a single image.

Parameters:

Name            Type            Description                                              Default
img             str or ndarray  Image path or image as a numpy ndarray.                  required
img_path        str             Image path or identifier.                                None
det_conf_thres  float           Confidence threshold for predictions. Defaults to 0.2.   0.2
id_strip        str             Characters to strip from img_id. Defaults to None.       None

Returns:

Type  Description
dict  Detection results.

Source code in PytorchWildlife/models/detection/ultralytics_based/yolov5_base.py
def single_image_detection(self, img, img_path=None, det_conf_thres=0.2, id_strip=None) -> dict:
    """
    Perform detection on a single image.

    Args:
        img (str or ndarray): 
            Image path or image as a numpy ndarray.
        img_path (str, optional): 
            Image path or identifier.
        det_conf_thres (float, optional): 
            Confidence threshold for predictions. Defaults to 0.2.
        id_strip (str, optional): 
            Characters to strip from img_id. Defaults to None.

    Returns:
        dict: Detection results.
    """
    if isinstance(img, str):
        if img_path is None:
            img_path = img
        img = np.array(Image.open(img_path).convert("RGB"))
    img_size = img.shape  # Original (H, W, C) shape, needed later for coordinate scaling
    img = self.transform(img)

    preds = self.model(img.unsqueeze(0).to(self.device))[0]
    preds = torch.cat(non_max_suppression(prediction=preds, conf_thres=det_conf_thres), dim=0).cpu().numpy()
    # scale_coords was renamed to scale_boxes in newer YOLOv5 releases
    preds[:, :4] = scale_boxes([self.IMAGE_SIZE] * 2, preds[:, :4], img_size).round()
    res = self.results_generation(preds, img_path, id_strip)

    normalized_coords = [[x1 / img_size[1], y1 / img_size[0], x2 / img_size[1], y2 / img_size[0]] for x1, y1, x2, y2 in preds[:, :4]]
    res["normalized_coords"] = normalized_coords

    return res
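
Finally, a single-image sketch under the same assumptions (the image path is a placeholder). The pixel-space boxes live in res["detections"].xyxy, while res["normalized_coords"] holds the same boxes rescaled to [0, 1] by the original image width and height:

res = det.single_image_detection("camera_trap_images/IMG_0001.JPG", det_conf_thres=0.25)

for (x1, y1, x2, y2), label in zip(res["detections"].xyxy, res["labels"]):
    print(label, (x1, y1, x2, y2))   # pixel coordinates in the original image
print(res["normalized_coords"])      # the same boxes scaled to [0, 1]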