From 2abf37cf0656725ef4227d81be388ed47b8b072a Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 14:10:06 +0200 Subject: [PATCH 01/34] Trim superanimal_humanbody.yaml default project config --- .../modelzoo/project_configs/superanimal_humanbody.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deeplabcut/modelzoo/project_configs/superanimal_humanbody.yaml b/deeplabcut/modelzoo/project_configs/superanimal_humanbody.yaml index 5b6e05a76d..d1e665c17f 100644 --- a/deeplabcut/modelzoo/project_configs/superanimal_humanbody.yaml +++ b/deeplabcut/modelzoo/project_configs/superanimal_humanbody.yaml @@ -65,10 +65,10 @@ alphavalue: colormap: rainbow # Training,Evaluation and Analysis configuration -TrainingFraction: [0.95] -iteration: 0 -default_net_type: rtmpose_x -default_augmenter: +TrainingFraction: +iteration: +default_net_type: +default_augmenter: snapshotindex: detector_snapshotindex: -1 batch_size: 1 From a1a6be126ad56e038807f835dd5f6f79c62308b8 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 14:12:26 +0200 Subject: [PATCH 02/34] Trim superanimal_humanbody_colors --- deeplabcut/modelzoo/utils.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/deeplabcut/modelzoo/utils.py b/deeplabcut/modelzoo/utils.py index 99f9be61d8..99b4c7b7f6 100644 --- a/deeplabcut/modelzoo/utils.py +++ b/deeplabcut/modelzoo/utils.py @@ -383,35 +383,6 @@ def get_superanimal_colormaps(): [220, 255, 0], [180, 255, 0], [140, 255, 0], - [100, 255, 0], - [60, 255, 0], - [20, 255, 0], - [0, 255, 0], - [0, 255, 40], - [0, 255, 80], - [0, 255, 120], - [0, 255, 160], - [0, 255, 200], - [0, 255, 240], - [0, 220, 255], - [0, 180, 255], - [0, 140, 255], - [0, 100, 255], - [0, 60, 255], - [0, 20, 255], - [0, 0, 255], - [40, 0, 255], - [80, 0, 255], - [120, 0, 255], - [160, 0, 255], - [200, 0, 255], - [240, 0, 255], - [255, 0, 240], - [255, 0, 200], - [255, 0, 160], - [255, 0, 120], - [255, 0, 80], - [255, 0, 40], ] ) / 255 From dfbce1d61fa24ac735ca7760ccf3c9a5244e34c2 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 14:18:11 +0200 Subject: [PATCH 03/34] Correct get_checkpoint_epoch --- deeplabcut/modelzoo/video_inference.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/deeplabcut/modelzoo/video_inference.py b/deeplabcut/modelzoo/video_inference.py index 5b597275b2..46c4fdbfd8 100644 --- a/deeplabcut/modelzoo/video_inference.py +++ b/deeplabcut/modelzoo/video_inference.py @@ -45,9 +45,8 @@ def get_checkpoint_epoch(checkpoint_path): Returns: int: Current epoch number, or 0 if not found """ - # Use CUDA if available, otherwise use CPU - device = "cuda" if torch.cuda.is_available() else "cpu" - checkpoint = torch.load(checkpoint_path, map_location=device) + # For reading metadata, it is recommended to load onto the CPU + checkpoint = torch.load(checkpoint_path, map_location="cpu") if "metadata" in checkpoint and "epoch" in checkpoint["metadata"]: return checkpoint["metadata"]["epoch"] else: From 1432a73ac5a72fa4ca01d8c8fdfdf7549922cf41 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 14:26:17 +0200 Subject: [PATCH 04/34] Add rtmpose_x modelzoo model config --- .../modelzoo/model_configs/rtmpose_x.yaml | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 deeplabcut/modelzoo/model_configs/rtmpose_x.yaml diff --git a/deeplabcut/modelzoo/model_configs/rtmpose_x.yaml b/deeplabcut/modelzoo/model_configs/rtmpose_x.yaml new file mode 100644 index 0000000000..9a7df70196 --- /dev/null +++ b/deeplabcut/modelzoo/model_configs/rtmpose_x.yaml @@ -0,0 +1,159 @@ +data: + colormode: RGB + inference: + normalize_images: true + top_down_crop: + width: 288 + height: 384 + train: + affine: + p: 0.5 + rotation: 30 + scaling: + - 1.0 + - 1.0 + translation: 0 + collate: + covering: false + gaussian_noise: 12.75 + hist_eq: false + motion_blur: false + normalize_images: true + top_down_crop: + width: 288 + height: 384 +detector: null +device: auto +metadata: + project_path: null + pose_config_path: rtmpose_x_body7_pytorch_config.yaml + bodyparts: + - nose + - left_eye + - right_eye + - left_ear + - right_ear + - left_shoulder + - right_shoulder + - left_elbow + - right_elbow + - left_wrist + - right_wrist + - left_hip + - right_hip + - left_knee + - right_knee + - left_ankle + - right_ankle + unique_bodyparts: [] + individuals: + - idv0 + - idv1 + - idv2 + - idv3 + - idv4 + - idv5 + - idv6 + - idv7 + - idv8 + - idv9 + with_identity: false +method: td +model: + backbone: + type: CSPNeXt + model_name: cspnext_p5 + freeze_bn_stats: false + freeze_bn_weights: false + expand_ratio: 0.5 + deepen_factor: 1.33 + widen_factor: 1.25 + channel_attention: true + norm_layer: SyncBN + activation_fn: SiLU + backbone_output_channels: 1280 + heads: + bodypart: + type: RTMCCHead + weight_init: RTMPose + target_generator: + type: SimCCGenerator + input_size: + - 288 + - 384 + smoothing_type: gaussian + sigma: + - 6.0 + - 6.93 + simcc_split_ratio: 2.0 + label_smooth_weight: 0.0 + normalize: false + criterion: + x: + type: KLDiscreteLoss + use_target_weight: true + beta: 10.0 + label_softmax: true + y: + type: KLDiscreteLoss + use_target_weight: true + beta: 10.0 + label_softmax: true + predictor: + type: SimCCPredictor + simcc_split_ratio: 2.0 + input_size: + - 288 + - 384 + in_channels: 1280 + out_channels: 17 + in_featuremap_size: + - 9 + - 12 + simcc_split_ratio: 2.0 + final_layer_kernel_size: 7 + gau_cfg: + hidden_dims: 256 + s: 128 + expansion_factor: 2 + dropout_rate: 0 + drop_path: 0.0 + act_fn: SiLU + use_rel_bias: false + pos_enc: false +net_type: rtmpose_x +runner: + type: PoseTrainingRunner + gpus: + key_metric: test.mAP + key_metric_asc: true + eval_interval: 10 + optimizer: + type: AdamW + params: + lr: 0.0005 + scheduler: + type: SequentialLR + params: + schedulers: + - type: ConstantLR + params: + factor: 0.001 + total_iters: 5 + - type: CosineAnnealingLR + params: + T_max: 250 + eta_min: 1e-05 + milestones: + - 100 + snapshots: + max_snapshots: 5 + save_epochs: 25 + save_optimizer_state: false +train_settings: + batch_size: 1 + dataloader_workers: 0 + dataloader_pin_memory: false + display_iters: 500 + epochs: 200 + seed: 42 From a4d74ccdb2e3c7533bd9d5f6f09f3fd91a07ef63 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 14:30:33 +0200 Subject: [PATCH 05/34] Add FilteredDetector --- .../models/detectors/filtered_detector.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py diff --git a/deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py b/deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py new file mode 100644 index 0000000000..e4278dba65 --- /dev/null +++ b/deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py @@ -0,0 +1,40 @@ +import torch +from torch import nn + + +class FilteredDetector(nn.Module): + def __init__(self, base_model: nn.Module, class_id: int): + """ + Wrap a torchvision detector to return predictions only for a single class. + + Args: + base_model: A torchvision-style object detector. + class_id: The integer class ID to keep (e.g., 1 for 'person' in COCO). + """ + super().__init__() + self.base_model = base_model + self.class_id = class_id + + def forward(self, images: list[torch.Tensor]) -> list[dict[str, torch.Tensor]]: + """ + Arguments: + images: list of input images as Tensors + + Returns: + List of dicts, each containing boxes/scores/labels filtered to the specified class. + """ + with torch.no_grad(): + outputs = self.base_model(images) + + filtered_outputs = [] + for output in outputs: + mask = output["labels"] == self.class_id + filtered_output = { + "boxes": output["boxes"][mask], + "scores": output["scores"][mask], + "labels": output["labels"][mask], + } + filtered_outputs.append(filtered_output) + + losses = {} + return losses, filtered_outputs \ No newline at end of file From 0cbfe59fd5193dac13d1fb14102bb03f530345c7 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 14:39:41 +0200 Subject: [PATCH 06/34] Add get_filtered_coco_detector_inference_runner() method --- .../pose_estimation_pytorch/apis/utils.py | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/deeplabcut/pose_estimation_pytorch/apis/utils.py b/deeplabcut/pose_estimation_pytorch/apis/utils.py index ea36e92178..6751b25a61 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/utils.py +++ b/deeplabcut/pose_estimation_pytorch/apis/utils.py @@ -19,6 +19,15 @@ import numpy as np import pandas as pd +from torchvision.models import detection +from torchvision.models.detection import ( + fasterrcnn_resnet50_fpn, + fasterrcnn_mobilenet_v3_large_fpn, + FasterRCNN_ResNet50_FPN_Weights, + FasterRCNN_ResNet50_FPN_V2_Weights, + FasterRCNN_MobileNet_V3_Large_FPN_Weights, +) + from deeplabcut.core.config import read_config_as_dict from deeplabcut.core.engine import Engine from deeplabcut.pose_estimation_pytorch.data.ctd import CondFromModel @@ -38,6 +47,7 @@ ) from deeplabcut.pose_estimation_pytorch.data.transforms import build_transforms from deeplabcut.pose_estimation_pytorch.models import DETECTORS, PoseModel +from deeplabcut.pose_estimation_pytorch.models.detectors.filtered_detector import FilteredDetector from deeplabcut.pose_estimation_pytorch.runners import ( build_inference_runner, CTDTrackingConfig, @@ -713,6 +723,119 @@ def get_detector_inference_runner( return runner +TORCHVISION_DETECTORS = { + "fasterrcnn_resnet50_fpn": { + "fn": fasterrcnn_resnet50_fpn, + "weights": FasterRCNN_ResNet50_FPN_Weights.DEFAULT, + }, + "fasterrcnn_resnet50_fpn_v2": { + "fn": detection.fasterrcnn_resnet50_fpn_v2, + "weights": FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT, + }, + "fasterrcnn_mobilenet_v3_large_fpn": { + "fn": fasterrcnn_mobilenet_v3_large_fpn, + "weights": FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT, + }, +} +def get_filtered_coco_detector_inference_runner( + model_name: str, + category_id: int, + batch_size: int = 1, + device: str | None = None, + box_score_thresh: float = 0.6, + max_individuals: int | None = None, + color_mode: str | None = None, + model_config: dict | None = None, + transform: A.BaseCompose | None = None, +) -> DetectorInferenceRunner: + """ + Builds a detector inference runner using a pretrained COCO detector from torchvision. + + This function loads a pretrained object detection model from `torchvision.models.detection`, + wraps it in a `FilteredDetector` that keeps only detections for a specified COCO category, + and packages it into a `DetectorInferenceRunner` ready for inference. + + You can optionally provide a model configuration dictionary to resolve `device`, `max_individuals`, + and `color_mode`. If no `model_config` is given, these must be specified explicitly. + + Args: + model_name (str): Name of the torchvision detection model to load. + Supported values include: + "fasterrcnn_resnet50_fpn", + "fasterrcnn_resnet50_fpn_v2", + "fasterrcnn_mobilenet_v3_large_fpn". + category_id (int): The COCO category ID to retain in the detections. + batch_size (int, optional): Batch size for inference. Defaults to 1. + device (str or None, optional): Device to run the model on (e.g., "cuda", "cpu", or "mps"). + If None, resolved from model_config or defaults to CUDA. + box_score_thresh (float, optional): Confidence threshold for filtering bounding boxes. + Defaults to 0.6. + max_individuals (int or None, optional): Maximum number of individuals to retain per image. + If None, resolved from model_config. + color_mode (str or None, optional): Color mode used for preprocessing (e.g., "RGB"). + If None, resolved from model_config. + model_config (dict or None, optional): Optional configuration dictionary used to resolve + `device`, `max_individuals`, and `color_mode`. + transform (A.BaseCompose or None, optional): Optional preprocessing pipeline. + If None, uses the model's default transform. + + Returns: + DetectorInferenceRunner: A configured detector inference runner. + + Raises: + ValueError: If `model_config` is not provided and required fields are missing. + """ + if model_name not in TORCHVISION_DETECTORS: + raise ValueError(f"Unsupported model: {model_name}") + + if model_config is not None: + if device is None: + device = resolve_device(model_config) + if max_individuals is None: + max_individuals = len(model_config["metadata"]["individuals"]) + if color_mode is None: + color_mode = model_config["data"]["colormode"] + else: + missing = [] + if device is None: + missing.append("device") + if max_individuals is None: + missing.append("max_individuals") + if color_mode is None: + missing.append("color_mode") + if missing: + raise ValueError( + f"If `model_config` is not provided, you must explicitly specify: {', '.join(missing)}." + ) + if device == "mps": + device = "cpu" + + if transform is None: + transform = build_transforms({"scale_to_unit_range": True}) + + entry = TORCHVISION_DETECTORS[model_name] + weights = entry["weights"] + detector = entry["fn"](weights=weights, box_score_thresh=box_score_thresh) + + detector.eval().to(device) + filtered_detector = FilteredDetector(detector, class_id=category_id).to(device) + detector_runner = build_inference_runner( + task=Task.DETECT, + model=filtered_detector, + device=device, + snapshot_path=None, + batch_size=batch_size, + preprocessor=build_bottom_up_preprocessor( + color_mode=color_mode, + transform=transform, + ), + postprocessor=build_detector_postprocessor( + max_individuals=max_individuals, + ), + ) + return detector_runner + + def get_pose_inference_runner( model_config: dict, snapshot_path: str | Path, From 84b230ebe3e3f398a0ddf07e283d50c5729907be Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 14:43:37 +0200 Subject: [PATCH 07/34] Add ScaleToUnitRange transform --- deeplabcut/pose_estimation_pytorch/data/transforms.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/deeplabcut/pose_estimation_pytorch/data/transforms.py b/deeplabcut/pose_estimation_pytorch/data/transforms.py index b11c736498..7a296525c1 100644 --- a/deeplabcut/pose_estimation_pytorch/data/transforms.py +++ b/deeplabcut/pose_estimation_pytorch/data/transforms.py @@ -143,6 +143,9 @@ def build_transforms(augmentations: dict) -> A.BaseCompose: A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ) + if augmentations.get("scale_to_unit_range"): + transforms.append(ScaleToUnitRange()) + return A.Compose( transforms, keypoint_params=A.KeypointParams( @@ -669,3 +672,11 @@ def _sample( return low + (delta * np.random.random(size)) raise ValueError(f"Unknown sampling: {self.sampling}") + + +class ScaleToUnitRange(A.ImageOnlyTransform): + def __init__(self, always_apply=True, p=1.0): + super().__init__(always_apply=always_apply, p=p) + + def apply(self, img, **params): + return img.astype(np.float32) / 255.0 From c4c1318317c6d1d6ad42845dff3766f775ce1bde Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 15:03:34 +0200 Subject: [PATCH 08/34] Superanimal humanbody inference: use filtered detector runner --- deeplabcut/modelzoo/video_inference.py | 84 ++++--------------- .../modelzoo/inference.py | 50 ++++++++--- 2 files changed, 56 insertions(+), 78 deletions(-) diff --git a/deeplabcut/modelzoo/video_inference.py b/deeplabcut/modelzoo/video_inference.py index 46c4fdbfd8..46ed21e7d3 100644 --- a/deeplabcut/modelzoo/video_inference.py +++ b/deeplabcut/modelzoo/video_inference.py @@ -231,8 +231,8 @@ def video_inference_superanimal( https://pytorch.org/vision/stable/models/faster_rcnn.html (Model Explanation) SuperAnimal-Bird: - `superanimal_superbird` model aims to work on various bird species. It was developed - during the 2024 DLC AI Residency Program. More info can be + `superanimal_superbird` model aims to work on various bird species. It was developed + during the 2024 DLC AI Residency Program. More info can be [found here](https://deeplabcut.medium.com/deeplabcut-ai-residency-2024-recap-working-with-the-superanimal-bird-model-and-dlc-3-0-live-e55807ca2c7c) (Model Explanation) SuperAnimal-HumanBody: @@ -250,10 +250,12 @@ def video_inference_superanimal( keypoints. When selecting this variant, a `detector_name` must be set with one of the provided object detectors. This model uses 17 body parts in the COCO body7 format. - - We provide an object detector (PyTorch engine): - - `fasterrcnn_mobilenet_v3_large_fpn` - This is a FasterRCNN model with a MobileNet backbone, see - https://pytorch.org/vision/stable/models/faster_rcnn.html + - The following object detectors can be used: + - `fasterrcnn_mobilenet_v3_large_fpn` (default) + This is a FasterRCNN model with a MobileNet backbone + - `fasterrcnn_resnet50_fpn` + - `fasterrcnn_resnet50_fpn_v2` + For more info, see https://pytorch.org/vision/stable/models/faster_rcnn.html Examples (PyTorch Engine) -------- @@ -345,62 +347,17 @@ def video_inference_superanimal( pseudo_threshold, ) elif framework == "pytorch": - if detector_name is None: + torchvision_detector_name = None + if superanimal_name != "superanimal_humanbody" and detector_name is None: raise ValueError( "You have to specify a detector_name when using the Pytorch framework." ) - - # Special handling for superanimal_humanbody - use dedicated implementation - if superanimal_name == "superanimal_humanbody": - from deeplabcut.pose_estimation_pytorch.modelzoo.superanimal_humanbody_video_inference import ( - analyze_videos_superanimal_humanbody, - ) - - # Convert videos to list if needed - if isinstance(videos, str): - videos = [videos] - - # Set destination folder - if dest_folder is None: - dest_folder = Path(videos[0]).parent - else: - dest_folder = Path(dest_folder) - - if not dest_folder.exists(): - dest_folder.mkdir(parents=True, exist_ok=True) - - # Map parameters to the dedicated function - # Note: analyze_videos_superanimal_humanbody has its own parameter set - # Handle device parameter - convert "auto" to actual device - if device == "auto": - import torch - actual_device = "cuda" if torch.cuda.is_available() else "cpu" + elif superanimal_name == "superanimal_humanbody": + if detector_name is not None: + torchvision_detector_name = detector_name else: - actual_device = device - - dedicated_kwargs = { - "videotype": videotype, - "destfolder": str(dest_folder), - "bbox_threshold": bbox_threshold, - "pose_threshold": pcutoff, - "device": actual_device, - "cropping": cropping, - "batch_size": batch_size, - "detector_batch_size": detector_batch_size, - } - - # Use a dummy config path since the dedicated function loads its own config - dummy_config = "superanimal_humanbody" - - results = analyze_videos_superanimal_humanbody( - dummy_config, - videos, - **dedicated_kwargs, - ) - - return results + torchvision_detector_name = "fasterrcnn_mobilenet_v3_large_fpn" - # Standard PyTorch implementation for other models from deeplabcut.pose_estimation_pytorch.modelzoo.inference import ( _video_inference_superanimal, ) @@ -411,7 +368,7 @@ def video_inference_superanimal( config = load_super_animal_config( super_animal=superanimal_name, model_name=model_name, - detector_name=detector_name, + detector_name=detector_name if superanimal_name != "superanimal_humanbody" else None, ) pose_model_path = customized_pose_checkpoint @@ -422,23 +379,17 @@ def video_inference_superanimal( ) detector_path = customized_detector_checkpoint - if detector_path is None: + if detector_path is None and superanimal_name != "superanimal_humanbody": detector_path = get_super_animal_snapshot_path( dataset=superanimal_name, model_name=detector_name, ) dlc_scorer = get_super_animal_scorer( - superanimal_name, pose_model_path, detector_path + superanimal_name, pose_model_path, detector_path, torchvision_detector_name ) - # Add superanimal_name to config metadata for all superanimal models (needed for detector routing) - if "metadata" not in config: - config["metadata"] = {} - config["metadata"]["superanimal_name"] = superanimal_name - config = update_config(config, max_individuals, device) - output_suffix = "_before_adapt" if video_adapt: # the users can pass in many videos. For now, we only use one video for @@ -638,4 +589,5 @@ def video_inference_superanimal( output_suffix=output_suffix, plot_bboxes=plot_bboxes, bboxes_pcutoff=bbox_threshold, + torchvision_detector_name=torchvision_detector_name, ) diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py b/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py index 27ee16d6ea..970e977865 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py @@ -21,7 +21,11 @@ video_inference, VideoIterator, ) -from deeplabcut.pose_estimation_pytorch.apis.utils import get_inference_runners +from deeplabcut.pose_estimation_pytorch.apis.utils import ( + get_inference_runners, + get_pose_inference_runner, + get_filtered_coco_detector_inference_runner +) from deeplabcut.pose_estimation_pytorch.modelzoo.utils import ( raise_warning_if_called_directly, ) @@ -60,6 +64,7 @@ def _video_inference_superanimal( output_suffix: str = "", plot_bboxes: bool = True, bboxes_pcutoff: float = 0.9, + torchvision_detector_name: str | None = None, ) -> dict: """ Perform inference on a video using a superanimal model from the model zoo specified by `superanimal_name`. @@ -91,6 +96,7 @@ def _video_inference_superanimal( output_suffix: The suffix to add to output file names (e.g. _before_adapt) plot_bboxes: Whether to plot bounding boxes in the output video bboxes_pcutoff: Confidence threshold for bounding box plotting + torchvision_detector_name: If using a filtered torchvision detector, the torchvision model name Returns: results: Dictionary with the result pd.DataFrame for each video @@ -99,16 +105,36 @@ def _video_inference_superanimal( Warning: If the function is called directly. """ raise_warning_if_called_directly() - pose_runner, detector_runner = get_inference_runners( - model_config=model_cfg, - snapshot_path=model_snapshot_path, - max_individuals=max_individuals, - num_bodyparts=len(model_cfg["metadata"]["bodyparts"]), - num_unique_bodyparts=0, - batch_size=batch_size, - detector_batch_size=detector_batch_size, - detector_path=detector_snapshot_path, - ) + + if superanimal_name == "superanimal_humanbody": + if torchvision_detector_name is None: + torchvision_detector_name = "fasterrcnn_mobilenet_v3_large_fpn" + COCO_PERSON = 1 # COCO class ID for person + detector_runner = get_filtered_coco_detector_inference_runner( + model_name=torchvision_detector_name, + category_id=COCO_PERSON, + batch_size=detector_batch_size, + max_individuals=max_individuals, + model_config=model_cfg, + ) + pose_runner = get_pose_inference_runner( + model_cfg, + snapshot_path=model_snapshot_path, + batch_size=batch_size, + max_individuals=max_individuals, + ) + else: + pose_runner, detector_runner = get_inference_runners( + model_config=model_cfg, + snapshot_path=model_snapshot_path, + max_individuals=max_individuals, + num_bodyparts=len(model_cfg["metadata"]["bodyparts"]), + num_unique_bodyparts=0, + batch_size=batch_size, + detector_batch_size=detector_batch_size, + detector_path=detector_snapshot_path, + ) + results = {} if isinstance(video_paths, str): @@ -124,7 +150,7 @@ def _video_inference_superanimal( print(f"Processing video {video_path}") dlc_scorer = get_super_animal_scorer( - superanimal_name, model_snapshot_path, detector_snapshot_path + superanimal_name, model_snapshot_path, detector_snapshot_path, torchvision_detector_name ) output_prefix = f"{Path(video_path).stem}_{dlc_scorer}" From dc511cdc8e2a785311a467a99c3e77a9d99a8928 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 16:07:18 +0200 Subject: [PATCH 09/34] ModelZoo tab: make humanbody general case --- deeplabcut/gui/tabs/modelzoo.py | 121 ++++++++++---------------------- 1 file changed, 37 insertions(+), 84 deletions(-) diff --git a/deeplabcut/gui/tabs/modelzoo.py b/deeplabcut/gui/tabs/modelzoo.py index ede5bf0e3e..54e74d7462 100644 --- a/deeplabcut/gui/tabs/modelzoo.py +++ b/deeplabcut/gui/tabs/modelzoo.py @@ -414,92 +414,45 @@ def run_video_adaptation(self): self.run_button.setStyleSheet("background-color: #9E9E9E; color: white; font-weight: bold;") # Gray when disabled self.root._progress_bar.show() try: - # Use dedicated function for superanimal_humanbody - if supermodel_name == "superanimal_humanbody": - # Download config from HuggingFace (needed for the dedicated function) - from deeplabcut.pose_estimation_pytorch.modelzoo.utils import get_snapshot_folder_path - import huggingface_hub - - model_files = get_snapshot_folder_path() - model_files.mkdir(exist_ok=True) - - # Download config file from HuggingFace - config_path = Path( - huggingface_hub.hf_hub_download( - "DeepLabCut/HumanBody", - "rtmpose-x_simcc-body7_pytorch_config.yaml", - local_dir=model_files, - ) + # Use standard function for other models + if can_run_in_background: + func = partial( + deeplabcut.video_inference_superanimal, + files, + supermodel_name, + dest_folder=self._destfolder, + **kwargs, ) - - # Map GUI parameters to dedicated function parameters - dedicated_kwargs = { - "destfolder": self._destfolder, - "bbox_threshold": kwargs.get("bbox_threshold", 0.1), - "pose_threshold": kwargs.get("pseudo_threshold", 0.4), - "device": "cuda" if torch.cuda.is_available() else "cpu", - "detector_name": kwargs.get("detector_name", "fasterrcnn_mobilenet_v3_large_fpn"), - } - - if can_run_in_background: - func = partial( - deeplabcut.analyze_videos_superanimal_humanbody, - config_path, - files, - **dedicated_kwargs, - ) - self.worker, self.thread = move_to_separate_thread(func) - self.worker.finished.connect(self.signal_analysis_complete) - self.thread.start() - else: - print(f"Calling analyze_videos_superanimal_humanbody with config={config_path}, kwargs={dedicated_kwargs}") - results = deeplabcut.analyze_videos_superanimal_humanbody( - config_path, - files, - **dedicated_kwargs, - ) - # Patch: Call signal_analysis_complete for non-background execution - self.signal_analysis_complete() + self.worker, self.thread = move_to_separate_thread(func) + self.worker.finished.connect(self.signal_analysis_complete) + self.thread.start() else: - # Use standard function for other models - if can_run_in_background: - func = partial( - deeplabcut.video_inference_superanimal, - files, - supermodel_name, - dest_folder=self._destfolder, - **kwargs, - ) - self.worker, self.thread = move_to_separate_thread(func) - self.worker.finished.connect(self.signal_analysis_complete) - self.thread.start() - else: - print(f"Calling video_inference_superanimal with kwargs={kwargs}") - results = deeplabcut.video_inference_superanimal( - files, - supermodel_name, - dest_folder=self._destfolder, - **kwargs, - ) - # Check for skipped frames and show warning if needed - for video_path in files: - try: - df = results[video_path] - n_processed = len(df) - cap = cv2.VideoCapture(video_path) - n_total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - cap.release() - if n_processed < n_total: - msg = QtWidgets.QMessageBox() - msg.setIcon(QtWidgets.QMessageBox.Warning) - msg.setText(f"Warning: Only {n_processed} out of {n_total} frames had detections. The output movie and results include only those frames.") - msg.setWindowTitle("Partial Detections") - msg.setMinimumWidth(400) - msg.setStandardButtons(QtWidgets.QMessageBox.Ok) - msg.exec_() - except Exception as e: - print(f"[GUI Warning] Could not check processed frames: {e}") - self.signal_analysis_complete() + print(f"Calling video_inference_superanimal with kwargs={kwargs}") + results = deeplabcut.video_inference_superanimal( + files, + supermodel_name, + dest_folder=self._destfolder, + **kwargs, + ) + # Check for skipped frames and show warning if needed + for video_path in files: + try: + df = results[video_path] + n_processed = len(df) + cap = cv2.VideoCapture(video_path) + n_total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + cap.release() + if n_processed < n_total: + msg = QtWidgets.QMessageBox() + msg.setIcon(QtWidgets.QMessageBox.Warning) + msg.setText(f"Warning: Only {n_processed} out of {n_total} frames had detections. The output movie and results include only those frames.") + msg.setWindowTitle("Partial Detections") + msg.setMinimumWidth(400) + msg.setStandardButtons(QtWidgets.QMessageBox.Ok) + msg.exec_() + except Exception as e: + print(f"[GUI Warning] Could not check processed frames: {e}") + self.signal_analysis_complete() except Exception as e: print(f"[Error] {e}") self.run_button.setEnabled(True) From 479cc66c4698c1ae63d9756f1c4b77e5ce482832 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 16:08:12 +0200 Subject: [PATCH 10/34] get_super_animal_scorer(): add torchvision_detector_name arg --- deeplabcut/modelzoo/utils.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/deeplabcut/modelzoo/utils.py b/deeplabcut/modelzoo/utils.py index 99b4c7b7f6..00de90afb9 100644 --- a/deeplabcut/modelzoo/utils.py +++ b/deeplabcut/modelzoo/utils.py @@ -62,17 +62,23 @@ def get_super_animal_project_cfg(super_animal: str) -> dict: def get_super_animal_scorer( super_animal: str, model_snapshot_path: Path, - detector_snapshot_path: Path | str | None, + detector_snapshot_path: Path | None, + torchvision_detector_name: str | None = None, ) -> str: """ Args: super_animal: The SuperAnimal dataset on which the models were trained model_snapshot_path: The path for the SuperAnimal pose model snapshot - detector_snapshot_path: The path or name for the SuperAnimal detector, if a detector is being used. + detector_snapshot_path: The path for the SuperAnimal detector snapshot, if a + detector is being used. + torchvision_detector_name: The name of a pretrained COCO detector from torchvision, + if such a detector is used instead of a snapshot. Returns: The DLC scorer name to use for the given SuperAnimal models. """ + if detector_snapshot_path is not None and torchvision_detector_name is not None: + raise ValueError("Provide only one of `detector_snapshot_path` or `torchvision_detector_name`, not both.") super_animal_prefix = super_animal + "_" # Always use model name first model_name = model_snapshot_path.stem if hasattr(model_snapshot_path, "stem") else str(model_snapshot_path) @@ -81,12 +87,13 @@ def get_super_animal_scorer( dlc_scorer = f"{super_animal_prefix}{model_name}" # Then add detector name if provided - if detector_snapshot_path: - if isinstance(detector_snapshot_path, (str, Path)): - detector_name = Path(detector_snapshot_path).stem if hasattr(detector_snapshot_path, "stem") else str(detector_snapshot_path) - if detector_name.startswith(super_animal_prefix): - detector_name = detector_name[len(super_animal_prefix) :] - dlc_scorer += f"_{detector_name}" + if detector_snapshot_path is not None: + detector_name = detector_snapshot_path.stem + if detector_name.startswith(super_animal_prefix): + detector_name = detector_name[len(super_animal_prefix):] + dlc_scorer += f"_{detector_name}_" + elif torchvision_detector_name is not None: + dlc_scorer += f"_{torchvision_detector_name}_" return dlc_scorer From 6a14584148a8e29eb378630a91180f90684412de Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 16:09:12 +0200 Subject: [PATCH 11/34] Remove superanimal_humanbody_video_inference.py module --- deeplabcut/__init__.py | 1 - .../modelzoo/__init__.py | 4 - .../superanimal_humanbody_video_inference.py | 487 ------------------ 3 files changed, 492 deletions(-) delete mode 100644 deeplabcut/pose_estimation_pytorch/modelzoo/superanimal_humanbody_video_inference.py diff --git a/deeplabcut/__init__.py b/deeplabcut/__init__.py index f157d06bb0..72dac1e3ce 100644 --- a/deeplabcut/__init__.py +++ b/deeplabcut/__init__.py @@ -60,7 +60,6 @@ ) from deeplabcut.modelzoo.video_inference import video_inference_superanimal -from deeplabcut.pose_estimation_pytorch.modelzoo.superanimal_humanbody_video_inference import analyze_videos_superanimal_humanbody from deeplabcut.utils import ( create_labeled_video, diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/__init__.py b/deeplabcut/pose_estimation_pytorch/modelzoo/__init__.py index 138ac3901c..e8232cd895 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/__init__.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/__init__.py @@ -16,7 +16,3 @@ get_super_animal_snapshot_path, load_super_animal_config, ) - -from deeplabcut.pose_estimation_pytorch.modelzoo.superanimal_humanbody_video_inference import ( - analyze_videos_superanimal_humanbody, -) diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/superanimal_humanbody_video_inference.py b/deeplabcut/pose_estimation_pytorch/modelzoo/superanimal_humanbody_video_inference.py deleted file mode 100644 index e867207b10..0000000000 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/superanimal_humanbody_video_inference.py +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/env python3 -""" -Dedicated video inference implementation for superanimal_humanbody with torchvision detector. -This avoids modifying core functions and provides a clean, specific implementation. -""" - -import cv2 -import numpy as np -from pathlib import Path -from typing import List, Dict, Any, Union -import torch -import torchvision.models.detection as detection -from PIL import Image -from tqdm import tqdm -import json -import logging -import yaml -import pandas as pd - -from deeplabcut.pose_estimation_pytorch.apis.videos import VideoIterator -from deeplabcut.pose_estimation_pytorch.apis.utils import get_inference_runners -from deeplabcut.pose_estimation_pytorch.modelzoo.inference import _video_inference_superanimal -from deeplabcut.modelzoo.utils import get_super_animal_scorer, get_superanimal_colormaps - - -def torchvision_detector_inference(images, threshold=0.1, device="cpu"): - """ - Run the exact torchvision detector on a list of images. - This is the working implementation that matches the Colab code. - - Args: - images: list of np.ndarray or PIL.Image - threshold: float, detection threshold - device: str, device to run on - Returns: - list of dicts with 'bboxes', 'scores', and 'labels' - """ - weights = detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT - detector = detection.fasterrcnn_mobilenet_v3_large_fpn( - weights=weights, box_score_thresh=threshold - ) - detector.eval() - detector.to(device) - preprocess = weights.transforms() - - results = [] - for image in images: - if isinstance(image, np.ndarray): - image = Image.fromarray(image).convert("RGB") - elif not isinstance(image, Image.Image): - image = Image.open(image).convert("RGB") - - batch = [preprocess(image).to(device)] - with torch.no_grad(): - predictions = detector(batch)[0] - - bboxes = predictions["boxes"].cpu().numpy() - labels = predictions["labels"].cpu().numpy() - scores = predictions["scores"].cpu().numpy() - - # Filter for humans (COCO class 1) - human_mask = labels == 1 - human_bboxes = bboxes[human_mask] - human_scores = scores[human_mask] - human_labels = labels[human_mask] - - # Convert to xywh format - if len(human_bboxes) > 0: - human_bboxes[:, 2] -= human_bboxes[:, 0] # width = x2 - x1 - human_bboxes[:, 3] -= human_bboxes[:, 1] # height = y2 - y1 - - results.append({ - "bboxes": human_bboxes, - "scores": human_scores, - "labels": human_labels - }) - - return results - - -def video_inference_superanimal_humanbody( - video: Union[str, Path, VideoIterator], - model_config: dict, - model_snapshot_path: Union[str, Path], - detector_snapshot_path: Union[str, Path] = None, - max_individuals: int = 1, - bbox_threshold: float = 0.1, - device: str = "cpu", - cropping: List[int] = None, - dest_folder: str = None, - output_suffix: str = "", -) -> List[Dict[str, np.ndarray]]: - """ - Dedicated video inference for superanimal_humanbody with torchvision detector. - - This implementation: - 1. Uses the exact torchvision detector - 2. Preserves detection scores and labels - 3. Handles missing detections gracefully - 4. Doesn't modify core DeepLabCut functions - - Args: - video: Video path or VideoIterator - model_config: Model configuration - model_snapshot_path: Path to pose model snapshot - detector_snapshot_path: Path to detector snapshot (not used, we use Colab-style detector) - max_individuals: Maximum number of individuals to detect - bbox_threshold: Detection threshold - device: Device to run on - cropping: Video cropping parameters - dest_folder: Output folder - output_suffix: Output file suffix - - Returns: - List of predictions for each frame - """ - - # Initialize video iterator - if not isinstance(video, VideoIterator): - video = VideoIterator(str(video), cropping=cropping) - elif cropping is not None: - video.set_crop(cropping) - - n_frames = video.get_n_frames(robust=False) - vid_w, vid_h = video.dimensions - - print(f"Starting superanimal_humanbody analysis of {video.video_path}") - print( - f"Video metadata: \n" - f" Overall # of frames: {n_frames}\n" - f" Duration of video [s]: {n_frames / max(1, video.fps):.2f}\n" - f" fps: {video.fps}\n" - f" resolution: w={vid_w}, h={vid_h}\n" - ) - - # Step 1: Run Colab-style torchvision detector - print(f"Using torchvision detector with threshold {bbox_threshold}") - - detector_progress = tqdm(video, desc="Detector") - bbox_predictions = [] - - for i, frame in enumerate(detector_progress): - result = torchvision_detector_inference( - images=[frame], - threshold=bbox_threshold, - device=device - ) - bbox_predictions.extend(result) - - # Handle missing detections by padding with full-frame bboxes - if len(bbox_predictions) < n_frames: - print(f"Detector returned {len(bbox_predictions)} predictions for {n_frames} frames. Padding with full-frame bboxes.") - for _ in range(n_frames - len(bbox_predictions)): - bbox_predictions.append({ - 'bboxes': np.array([[0, 0, vid_w, vid_h]]), - 'scores': np.array([0.0]), - 'labels': np.array([1]) - }) - elif len(bbox_predictions) > n_frames: - print(f"Detector returned more predictions than frames. Truncating to {n_frames}.") - bbox_predictions = bbox_predictions[:n_frames] - - # Rename scores to bbox_scores to match DeepLabCut expectations - for pred in bbox_predictions: - if 'scores' in pred: - pred['bbox_scores'] = pred.pop('scores') - - video.set_context(bbox_predictions) - - # Step 2: Run pose estimation - print(f"Running pose estimation") - - # Get pose inference runner - pose_runner, _ = get_inference_runners( - model_config=model_config, - snapshot_path=model_snapshot_path, - max_individuals=max_individuals, - num_bodyparts=len(model_config["metadata"]["bodyparts"]), - num_unique_bodyparts=len(model_config["metadata"]["unique_bodyparts"]), - device=device, - detector_path=None, # We don't use the detector runner since we already have bboxes - ) - - pose_progress = tqdm(video, desc="Pose") - predictions = [] - - for i, frame in enumerate(pose_progress): - result = pose_runner.inference(images=[frame]) - predictions.extend(result) - - # Add detection context back to predictions - for i, pred in enumerate(predictions): - if i < len(bbox_predictions): - pred['bboxes'] = bbox_predictions[i]['bboxes'] - pred['bbox_scores'] = bbox_predictions[i]['bbox_scores'] - if 'labels' in bbox_predictions[i]: - pred['bbox_labels'] = bbox_predictions[i]['labels'] - - # Log detection statistics - frames_with_detections = sum( - 1 for pred in predictions if ( - 'bboxes' in pred and len(pred['bboxes']) > 0 and - not np.all(pred['bboxes'] == np.array([0, 0, vid_w, vid_h])) - ) - ) - logging.info(f"Detected individuals in {frames_with_detections} of {n_frames} frames") - - return predictions - - -def analyze_videos_superanimal_humanbody( - config: str, - videos: Union[str, List[str]], - videotype: str = None, - shuffle: int = 1, - trainingsetindex: int = 0, - save_as_csv: bool = False, - in_random_order: bool = False, - snapshot_index: Union[int, str] = None, - detector_snapshot_index: Union[int, str] = None, - device: str = None, - destfolder: str = None, - batch_size: int = None, - detector_batch_size: int = None, - dynamic: tuple = (False, 0.5, 10), - ctd_conditions: dict = None, - ctd_tracking: bool = False, - top_down_dynamic: dict = None, - modelprefix: str = "", - use_shelve: bool = False, - robust_nframes: bool = False, - transform = None, - auto_track: bool = True, - n_tracks: int = None, - animal_names: List[str] = None, - calibrate: bool = False, - identity_only: bool = None, - overwrite: bool = False, - cropping: List[int] = None, - save_as_df: bool = False, - bbox_threshold: float = 0.1, - pose_threshold: float = 0.4, # Add pose threshold parameter - model_snapshot_path: str = None, - detector_name: str = "fasterrcnn_mobilenet_v3_large_fpn", -) -> str: - """ - Wrapper function that uses the dedicated superanimal_humanbody implementation. - - This function mimics the interface of the standard analyze_videos function - but uses our dedicated implementation for superanimal_humanbody. - """ - - # Load model configuration using the standard function (which handles detector config) - from deeplabcut.pose_estimation_pytorch.modelzoo.utils import load_super_animal_config - - # Use the standard function to get the complete config with detector - model_config = load_super_animal_config( - super_animal="superanimal_humanbody", - model_name="rtmpose_x", - detector_name=detector_name, - max_individuals=10, # Default value - device=device - ) - - # Use provided model snapshot path or get it from dlclibrary - if model_snapshot_path is None: - from deeplabcut.pose_estimation_pytorch.modelzoo.utils import get_super_animal_snapshot_path - - # Get the model snapshot path using dlclibrary - model_snapshot_path = get_super_animal_snapshot_path( - dataset="superanimal_humanbody", - model_name="rtmpose_x", - download=True - ) - - # Convert videos to list - if isinstance(videos, str): - videos = [videos] - - # Set destination folder - if destfolder is None: - destfolder = Path(videos[0]).parent - else: - destfolder = Path(destfolder) - - if not destfolder.exists(): - destfolder.mkdir(parents=True, exist_ok=True) - - results = {} - - for video_path in videos: - print(f"Processing video {video_path}") - video_name = Path(video_path).stem - # Use detector_name in scorer and output file names - dlc_scorer = get_super_animal_scorer( - "superanimal_humanbody", model_snapshot_path, detector_name - ) - output_prefix = f"{video_name}_{dlc_scorer}" - output_json = destfolder / f"{output_prefix}_before_adapt.json" - - if output_json.exists(): - print(f"Predictions already exist for {video_path}, skipping inference.") - # Load predictions from existing JSON file - with open(output_json, "r") as f: - predictions = json.load(f) - results[video_path] = predictions - else: - # Run our dedicated inference - predictions = video_inference_superanimal_humanbody( - video=video_path, - model_config=model_config, - model_snapshot_path=model_snapshot_path, - max_individuals=len(model_config["metadata"]["individuals"]), - bbox_threshold=bbox_threshold, - device=device, - cropping=cropping, - dest_folder=str(destfolder), - ) - with open(output_json, "w") as f: - json.dump(predictions, f, cls=NumpyEncoder, indent=2) - print(f"Results saved to {output_json}") - results[video_path] = predictions - - # Always create labeled video, regardless of whether predictions already existed - # Create labeled video just like other superanimal_* models - # Note: This always runs regardless of whether predictions were loaded or newly created - try: - from deeplabcut.pose_estimation_pytorch.apis.videos import create_df_from_prediction - from deeplabcut.utils.make_labeled_video import create_video - - # Convert our predictions to the format expected by create_df_from_prediction - def convert_predictions_format(predictions, model_config): - """Convert our prediction format to the format expected by create_df_from_prediction.""" - bodyparts = model_config['metadata']['bodyparts'] - individuals = model_config['metadata'].get('individuals', ['individual_0']) - - converted_predictions = [] - for frame_pred in predictions: - # Create the expected numpy array: (num_individuals, num_bodyparts, 3) - num_individuals = len(individuals) - num_bodyparts = len(bodyparts) - - # Initialize with NaN values - bodyparts_array = np.full((num_individuals, num_bodyparts, 3), np.nan) - - # Handle different prediction formats - if 'bodyparts' in frame_pred: - if isinstance(frame_pred['bodyparts'], list): - # Handle list format (from JSON loading) - for i, individual_preds in enumerate(frame_pred['bodyparts']): - if i < num_individuals and isinstance(individual_preds, list): - for j, pred in enumerate(individual_preds): - if j < num_bodyparts and len(pred) >= 3: - bodyparts_array[i, j] = [pred[0], pred[1], pred[2]] - elif isinstance(frame_pred['bodyparts'], np.ndarray): - # Handle numpy array format (from fresh predictions after postprocessing) - poses = frame_pred['bodyparts'] - if poses.shape[1] == num_bodyparts: - # poses shape: (num_individuals, num_bodyparts, 3) - num_detected = min(poses.shape[0], num_individuals) - bodyparts_array[:num_detected] = poses[:num_detected] - elif 'bodypart' in frame_pred and 'poses' in frame_pred['bodypart']: - # Handle pose runner format (fresh predictions before postprocessing) - poses = frame_pred['bodypart']['poses'] - if isinstance(poses, np.ndarray) and poses.shape[1] == num_bodyparts: - # poses shape: (num_individuals, num_bodyparts, 3) - num_detected = min(poses.shape[0], num_individuals) - bodyparts_array[:num_detected] = poses[:num_detected] - - # Create the converted prediction - converted_pred = { - 'bodyparts': bodyparts_array - } - - # Add bbox info if available - if 'bboxes' in frame_pred: - converted_pred['bboxes'] = frame_pred['bboxes'] - if 'bbox_scores' in frame_pred: - converted_pred['bbox_scores'] = frame_pred['bbox_scores'] - - converted_predictions.append(converted_pred) - - return converted_predictions - - # Convert predictions to the expected format - converted_predictions = convert_predictions_format(predictions, model_config) - - # Get the proper scorer name - dlc_scorer = get_super_animal_scorer( - "superanimal_humanbody", model_snapshot_path, detector_name - ) - - output_path = destfolder - output_h5 = output_path / f"{output_prefix}.h5" - - # Convert predictions to DataFrame format - df = create_df_from_prediction( - predictions=converted_predictions, - dlc_scorer=dlc_scorer, - multi_animal=True, - model_cfg=model_config, - output_path=output_path, - output_prefix=output_prefix, - ) - - # Save HDF5 file - df.to_hdf(output_h5, key='df_with_missing', mode='w') - print(f"Created HDF5 file: {output_h5}") - - # Create labeled video using the same approach as other superanimal models - output_video = output_path / f"{output_prefix}_labeled.mp4" - - # Get colormap for humanbody - superanimal_colormaps = get_superanimal_colormaps() - colormap = superanimal_colormaps.get("superanimal_humanbody", "rainbow") - - # Load skeleton from the superanimal_humanbody.yaml config - skeleton_edges = None - try: - import yaml - import os - # Get the correct path to the config file using DeepLabCut's path resolution - from deeplabcut.utils.auxiliaryfunctions import get_deeplabcut_path - dlc_root_path = get_deeplabcut_path() - config_path = os.path.join(dlc_root_path, "modelzoo", "project_configs", "superanimal_humanbody.yaml") - with open(config_path, 'r') as f: - config = yaml.safe_load(f) - skeleton_indices = config.get('skeleton', None) - if skeleton_indices: - # Convert skeleton indices to bodypart names - bodyparts = model_config['metadata']['bodyparts'] - skeleton_edges = [] - for idx1, idx2 in skeleton_indices: - # Fix 1-based indexing (subtract 1 to convert to 0-based) - idx1_0based = idx1 - 1 if idx1 > 0 else idx1 - idx2_0based = idx2 - 1 if idx2 > 0 else idx2 - if idx1_0based < len(bodyparts) and idx2_0based < len(bodyparts) and idx1_0based >= 0 and idx2_0based >= 0: - skeleton_edges.append((bodyparts[idx1_0based], bodyparts[idx2_0based])) - else: - print(f"Warning: Skeleton indices {idx1}->{idx1_0based}, {idx2}->{idx2_0based} out of range for {len(bodyparts)} bodyparts") - print(f"Loaded skeleton with {len(skeleton_edges)} connections") - else: - print("No skeleton found in config, skeleton plotting will be disabled") - except Exception as e: - print(f"Could not load skeleton from config: {e}") - skeleton_edges = None - - # Get bbox info for video creation - bbox_keys_in_predictions = {"bboxes", "bbox_scores"} - bboxes_list = [ - {key: value for key, value in p.items() if key in bbox_keys_in_predictions} - for p in predictions - ] - - # Get cropping info - bbox = cropping if cropping is not None else (0, 1920, 0, 1080) # Default bbox - - print(f"Creating labeled video for {video_path}...") - create_video( - video_path, - output_h5, - pcutoff=pose_threshold, - fps=30, # Default fps - bbox=bbox, - cmap=colormap, - output_path=str(output_video), - plot_bboxes=True, - bboxes_list=bboxes_list, - bboxes_pcutoff=bbox_threshold, - skeleton_edges=skeleton_edges, # Add skeleton support - ) - print(f"Labeled video created: {output_video}") - - except Exception as e: - print(f"[Warning] Could not create labeled video for {video_path}: {e}") - import traceback - traceback.print_exc() - - return str(destfolder) - - -class NumpyEncoder(json.JSONEncoder): - """JSON encoder that handles numpy arrays""" - def default(self, obj): - if isinstance(obj, np.ndarray): - return obj.tolist() - return super().default(obj) \ No newline at end of file From 677481228433dd27f50a193004edbd562bbfc93c Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 17:22:04 +0200 Subject: [PATCH 12/34] Regularize get_super_animal_model_config_path() --- deeplabcut/pose_estimation_pytorch/modelzoo/utils.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py index 2c1cbf66b7..ec941e49f7 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py @@ -40,20 +40,15 @@ def get_snapshot_folder_path() -> Path: return Path(auxiliaryfunctions.get_deeplabcut_path()) / "modelzoo" / "checkpoints" -def get_super_animal_model_config_path(model_name: str, super_animal: str = None) -> Path: +def get_super_animal_model_config_path(model_name: str) -> Path: """Gets the path to the configuration file for a SuperAnimal model. Args: model_name: The name of the model for which to get the path. - super_animal: The name of the SuperAnimal (used for specific model configs). Returns: The path to the config file for a SuperAnimal model. """ - # Special case for superanimal_humanbody with rtmpose_x - if model_name == "rtmpose_x" and super_animal == "superanimal_humanbody": - return get_model_configs_folder_path() / "superanimal_humanbody_rtmpose_x.yaml" - return get_model_configs_folder_path() / f"{model_name}.yaml" From 424ac92941fbdf30ff61c432a2b6575fc3836ca6 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 17:27:09 +0200 Subject: [PATCH 13/34] Regularize load_super_animal_config() --- .../pose_estimation_pytorch/modelzoo/utils.py | 50 +++---------------- 1 file changed, 8 insertions(+), 42 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py index ec941e49f7..e455b972cb 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py @@ -15,7 +15,6 @@ import torch from dlclibrary import download_huggingface_model -import huggingface_hub import deeplabcut.pose_estimation_pytorch.config.utils as config_utils from deeplabcut.core.config import read_config_as_dict @@ -108,52 +107,19 @@ def load_super_animal_config( project_cfg_path = get_super_animal_project_config_path(super_animal=super_animal) project_config = read_config_as_dict(project_cfg_path) - # Special handling for superanimal_humanbody with rtmpose_x - download config from HuggingFace - if super_animal == "superanimal_humanbody" and model_name == "rtmpose_x": - # Download config from HuggingFace - model_files = get_snapshot_folder_path() - model_files.mkdir(exist_ok=True) - - path_model_config = Path( - huggingface_hub.hf_hub_download( - "DeepLabCut/HumanBody", - "rtmpose-x_simcc-body7_pytorch_config.yaml", - local_dir=model_files, - ) - ) - model_config = read_config_as_dict(path_model_config) - else: - # Use local config file for other models - model_cfg_path = get_super_animal_model_config_path(model_name=model_name, super_animal=super_animal) - model_config = read_config_as_dict(model_cfg_path) - - model_config = add_metadata(project_config, model_config, model_cfg_path if 'model_cfg_path' in locals() else path_model_config) + model_cfg_path = get_super_animal_model_config_path(model_name=model_name) + model_config = read_config_as_dict(model_cfg_path) + model_config = add_metadata(project_config, model_config, model_cfg_path) + model_config = update_config(model_config, max_individuals, device) - if detector_name is None: + if detector_name is None and super_animal != "superanimal_humanbody": model_config["method"] = "BU" else: - # Check if this is a torchvision detector (not in dlclibrary) - if super_animal == "superanimal_humanbody" and detector_name == "fasterrcnn_mobilenet_v3_large_fpn": - # Use torchvision detector - set method to TD and load detector config - model_config["method"] = "TD" - detector_cfg_path = get_super_animal_model_config_path(model_name=detector_name, super_animal=super_animal) + model_config["method"] = "TD" + if super_animal != "superanimal_humanbody": + detector_cfg_path = get_super_animal_model_config_path(model_name=detector_name) detector_cfg = read_config_as_dict(detector_cfg_path) model_config["detector"] = detector_cfg - else: - # Load detector config from dlclibrary - detector_cfg_path = get_super_animal_model_config_path(model_name=detector_name, super_animal=super_animal) - detector_cfg = read_config_as_dict(detector_cfg_path) - model_config["method"] = "TD" - model_config["detector"] = detector_cfg - - # Update config after detector is added (if any) - model_config = update_config(model_config, max_individuals, device) - - # Add superanimal_name to metadata for all superanimal models (needed for detector routing) - if "metadata" not in model_config: - model_config["metadata"] = {} - model_config["metadata"]["superanimal_name"] = super_animal - return model_config From 04438f844ea1b53e2b23a50e25866158ad606a18 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 17:28:23 +0200 Subject: [PATCH 14/34] Regularize download_super_animal_snapshot() --- .../pose_estimation_pytorch/modelzoo/utils.py | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py index e455b972cb..8b54317d0a 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py @@ -137,25 +137,15 @@ def download_super_animal_snapshot(dataset: str, model_name: str) -> Path: RuntimeError if the model fails to download. """ snapshot_dir = get_snapshot_folder_path() - full_model_name = f"{dataset}_{model_name}" - model_path = snapshot_dir / f"{full_model_name}.pt" + model_name = f"{dataset}_{model_name}" + model_filename = f"{model_name}.pt" + model_path = snapshot_dir / model_filename - # Use the full name for dlclibrary lookup (consistent with dlclibrary naming) - download_huggingface_model(full_model_name, target_dir=str(snapshot_dir)) - - # Check if the file was downloaded with the expected name + download_huggingface_model(model_name, target_dir=str(snapshot_dir), rename_mapping=model_filename) if not model_path.exists(): - # If not, look for the actual downloaded filename and rename it - if dataset == "superanimal_humanbody" and model_name == "rtmpose_x": - actual_file = snapshot_dir / "rtmpose-x_simcc-body7.pt" - if actual_file.exists(): - actual_file.rename(model_path) - else: - raise RuntimeError(f"Failed to download {model_name} to {model_path}") - else: - raise RuntimeError(f"Failed to download {model_name} to {model_path}") - - return snapshot_dir / f"{full_model_name}.pt" + raise RuntimeError(f"Failed to download {model_name} to {model_path}") + + return snapshot_dir / f"{model_name}.pt" def get_gpu_memory_map(): From 256977bfa8b7f2b15c3528df8aa0724fd924a7c8 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 17:29:27 +0200 Subject: [PATCH 15/34] update_config(): superanimal_humanbody - compatible --- deeplabcut/pose_estimation_pytorch/modelzoo/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py index 8b54317d0a..9ff66b4240 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py @@ -190,8 +190,6 @@ def update_config(config: dict, max_individuals: int, device: str): Returns: The model configuration for a SuperAnimal-pretrained model. """ - - config = config_utils.replace_default_values( config, num_bodyparts=len(config["metadata"]["bodyparts"]), @@ -201,6 +199,7 @@ def update_config(config: dict, max_individuals: int, device: str): config["metadata"]["individuals"] = [f"animal{i}" for i in range(max_individuals)] config["device"] = device - if "detector" in config: + if config.get("detector", None) is not None: config["detector"]["device"] = device + return config From 8d4cf202a1bd2bdd65ab0e85d92f9deb08356557 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 17:49:48 +0200 Subject: [PATCH 16/34] Revert video_inference() --- .../pose_estimation_pytorch/apis/videos.py | 38 +++++-------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/apis/videos.py b/deeplabcut/pose_estimation_pytorch/apis/videos.py index 33b807fae8..03dd703403 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/videos.py +++ b/deeplabcut/pose_estimation_pytorch/apis/videos.py @@ -194,45 +194,25 @@ def video_inference( if detector_runner is not None: print(f"Running detector with batch size {detector_runner.batch_size}") - - detector_progress = tqdm(video, desc="Detector") - bbox_predictions = [] - for i, frame in enumerate(detector_progress): - result = detector_runner.inference(images=[frame]) - bbox_predictions.extend(result) - - # PATCH: Ensure bbox_predictions is always length n_frames - if len(bbox_predictions) < n_frames: - print(f"[PATCH] Detector returned {len(bbox_predictions)} predictions for {n_frames} frames. Padding with empty bboxes.") - for _ in range(n_frames - len(bbox_predictions)): - bbox_predictions.append({'bboxes': np.zeros((0, 4))}) - elif len(bbox_predictions) > n_frames: - print(f"[PATCH] Detector returned more predictions than frames. Truncating to {n_frames}.") - bbox_predictions = bbox_predictions[:n_frames] + bbox_predictions = detector_runner.inference(images=tqdm(video)) video.set_context(bbox_predictions) print(f"Running pose prediction with batch size {pose_runner.batch_size}") if shelf_writer is not None: shelf_writer.open() - - pose_progress = tqdm(video, desc="Pose") - predictions = [] - for i, frame in enumerate(pose_progress): - result = pose_runner.inference(images=[frame]) - predictions.extend(result) - + + predictions = pose_runner.inference(images=tqdm(video), shelf_writer=shelf_writer) if shelf_writer is not None: shelf_writer.close() if shelf_writer is None and len(predictions) != n_frames: - frames_with_detections = sum( - 1 for pred in predictions if ( - ('bodyparts' in pred and pred['bodyparts'].shape[0] > 0) or - ('bboxes' in pred and len(pred['bboxes']) > 0) - ) - ) + tip_url = "https://deeplabcut.github.io/DeepLabCut/docs/recipes/io.html" + header = "#tips-on-video-re-encoding-and-preprocessing" logging.warning( - f"Only {frames_with_detections} of {n_frames} frames had detections!" + f"The video metadata indicates that there {n_frames} in the video, but " + f"only {len(predictions)} were able to be processed. This can happen if " + "the video is corrupted. You can try to fix the issue by re-encoding your " + f"video (tips on how to do that: {tip_url}{header})" ) return predictions From edf4f9ddf7c00c22cfb01418f479cd58abc39a29 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 17:50:11 +0200 Subject: [PATCH 17/34] Revert create_df_from_prediction() --- .../pose_estimation_pytorch/apis/videos.py | 54 +------------------ 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/apis/videos.py b/deeplabcut/pose_estimation_pytorch/apis/videos.py index 03dd703403..96cc879ecb 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/videos.py +++ b/deeplabcut/pose_estimation_pytorch/apis/videos.py @@ -783,59 +783,7 @@ def create_df_from_prediction( output_prefix: str | Path, save_as_csv: bool = False, ) -> pd.DataFrame: - # Check if any predictions were made - if not predictions: - raise ValueError( - "No objects were detected in the video. This can happen if:\n" - "1. The video doesn't contain the type of objects the model was trained to detect\n" - "2. The objects are too small, blurry, or occluded\n" - "3. The detector confidence threshold is too high\n" - "4. The video quality is poor\n\n" - "Try:\n" - "- Using a different video with clearer objects\n" - "- Adjusting the detector confidence threshold\n" - "- Checking if the model is appropriate for your use case" - ) - - # Check if any predictions contain valid detections (non-empty bboxes) - valid_predictions = [] - for pred in predictions: - if "bboxes" in pred and len(pred["bboxes"]) > 0: - valid_predictions.append(pred) - elif "bodyparts" in pred and pred["bodyparts"].shape[0] > 0: - valid_predictions.append(pred) - - if not valid_predictions: - raise ValueError( - "No objects were detected in the video. This can happen if:\n" - "1. The video doesn't contain the type of objects the model was trained to detect\n" - "2. The objects are too small, blurry, or occluded\n" - "3. The detector confidence threshold is too high\n" - "4. The video quality is poor\n\n" - "Try:\n" - "- Using a different video with clearer objects\n" - "- Adjusting the detector confidence threshold\n" - "- Checking if the model is appropriate for your use case" - ) - - # Ensure all predictions have the same shape by padding with zeros if needed - max_individuals = max(p["bodyparts"].shape[0] for p in predictions) if predictions else 0 - num_bodyparts = predictions[0]["bodyparts"].shape[1] if predictions else 0 - - # Pad all predictions to have the same number of individuals - padded_predictions = [] - for p in predictions: - current_individuals = p["bodyparts"].shape[0] - if current_individuals < max_individuals: - # Pad with zeros for missing individuals - padding = np.zeros((max_individuals - current_individuals, num_bodyparts, 3)) - padded_bodyparts = np.concatenate([p["bodyparts"][..., :3], padding], axis=0) - else: - padded_bodyparts = p["bodyparts"][..., :3] - padded_predictions.append(padded_bodyparts) - - pred_bodyparts = np.stack(padded_predictions) - + pred_bodyparts = np.stack([p["bodyparts"][..., :3] for p in predictions]) pred_unique_bodyparts = None if len(predictions) > 0 and "unique_bodyparts" in predictions[0]: pred_unique_bodyparts = np.stack([p["unique_bodyparts"] for p in predictions]) From cded5817dde33eb89b1f8a9571115549ad436400 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 18:56:25 +0200 Subject: [PATCH 18/34] Restore CTDInferenceRunner --- deeplabcut/pose_estimation_pytorch/runners/inference.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/runners/inference.py b/deeplabcut/pose_estimation_pytorch/runners/inference.py index 7c38468117..be851cc44d 100644 --- a/deeplabcut/pose_estimation_pytorch/runners/inference.py +++ b/deeplabcut/pose_estimation_pytorch/runners/inference.py @@ -590,12 +590,13 @@ def predict( raw_predictions = self.model.get_predictions(outputs) predictions = [ { - "detection": { - "bboxes": item["boxes"].cpu().numpy().reshape(-1, 4), - "bbox_scores": item["scores"].cpu().numpy().reshape(-1), + head: { + pred_name: pred[b].cpu().numpy() + for pred_name, pred in head_outputs.items() } + for head, head_outputs in raw_predictions.items() } - for item in raw_predictions + for b in range(len(inputs)) ] return predictions From 7eaf923a015384ae895e4404ad766a8e90e68309 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 19:05:15 +0200 Subject: [PATCH 19/34] Remove TorchvisionDetectorInferenceRunner --- .../runners/__init__.py | 1 - .../runners/inference.py | 71 +------------------ 2 files changed, 1 insertion(+), 71 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/runners/__init__.py b/deeplabcut/pose_estimation_pytorch/runners/__init__.py index 29ab9724c6..6d3aa5fac7 100644 --- a/deeplabcut/pose_estimation_pytorch/runners/__init__.py +++ b/deeplabcut/pose_estimation_pytorch/runners/__init__.py @@ -24,7 +24,6 @@ from deeplabcut.pose_estimation_pytorch.runners.inference import ( build_inference_runner, DetectorInferenceRunner, - TorchvisionDetectorInferenceRunner, InferenceRunner, PoseInferenceRunner, ) diff --git a/deeplabcut/pose_estimation_pytorch/runners/inference.py b/deeplabcut/pose_estimation_pytorch/runners/inference.py index be851cc44d..bdb01a2f4f 100644 --- a/deeplabcut/pose_estimation_pytorch/runners/inference.py +++ b/deeplabcut/pose_estimation_pytorch/runners/inference.py @@ -20,7 +20,6 @@ import numpy as np import torch import torch.nn as nn -import torchvision import deeplabcut.pose_estimation_pytorch.post_processing.nms as nms import deeplabcut.pose_estimation_pytorch.runners.ctd as ctd @@ -889,69 +888,7 @@ def inference(self, images) -> list[dict[str, np.ndarray]]: return super().inference(images) -class TorchvisionDetectorInferenceRunner(DetectorInferenceRunner): - """Runner for torchvision detector inference that bypasses standard preprocessing""" - - def __init__(self, model: BaseDetector, **kwargs): - """ - Args: - model: The torchvision detector to use for inference. - **kwargs: Inference runner kwargs. - """ - super().__init__(model, **kwargs) - - def predict( - self, inputs: torch.Tensor, **kwargs - ) -> list[dict[str, dict[str, np.ndarray]]]: - """Makes predictions from a model input and output - - Args: - inputs: the inputs to the model, of shape (batch_size, ...) - - Returns: - predictions for each of the 'batch_size' inputs, made by each head - """ - if self.device and "cuda" in str(self.device): - with torch.autocast(device_type=str(self.device)): - _, raw_predictions = self.model(inputs.to(self.device)) - else: - _, raw_predictions = self.model(inputs.to(self.device)) - - predictions = [] - for item in raw_predictions: - if isinstance(item, dict) and "boxes" in item: - predictions.append({ - "detection": { - "bboxes": item["boxes"].cpu().numpy().reshape(-1, 4), - "bbox_scores": item["scores"].cpu().numpy().reshape(-1), - } - }) - else: - # Handle unexpected output format - predictions.append({ - "detection": { - "bboxes": np.zeros((0, 4)), - "bbox_scores": np.zeros(0), - } - }) - return predictions - - def inference(self, images) -> list[dict[str, np.ndarray]]: - """Run inference using the torchvision detector's inference method - - Args: - images: List of image paths, PIL Images, or numpy arrays - - Returns: - List of detection results with bboxes in xywh format - """ - # Always use the detector's own inference method for torchvision detectors - if hasattr(self.model, 'inference'): - return self.model.inference(images) - else: - # This should never happen for torchvision detectors - raise RuntimeError("TorchvisionDetectorInferenceRunner requires model to have inference method") def build_inference_runner( @@ -1019,13 +956,7 @@ def build_inference_runner( f"The DynamicCropper can only be used for pose estimation; not object " f"detection. Please turn off dynamic cropping." ) - - # Simple check: if superanimal_humanbody, use torchvision inference - # Otherwise, use standard inference - if hasattr(model, 'superanimal_name') and model.superanimal_name == "superanimal_humanbody": - return TorchvisionDetectorInferenceRunner(**kwargs) - else: - return DetectorInferenceRunner(**kwargs) + return DetectorInferenceRunner(**kwargs) if task != Task.BOTTOM_UP: if dynamic is not None and not isinstance(dynamic, TopDownDynamicCropper): From feb34d13f7ffc4280a6f2618e962be66f601acc5 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 19:05:52 +0200 Subject: [PATCH 20/34] Revert DetectorInferenceRunner --- .../runners/inference.py | 47 ++++--------------- 1 file changed, 9 insertions(+), 38 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/runners/inference.py b/deeplabcut/pose_estimation_pytorch/runners/inference.py index bdb01a2f4f..e4a51deaf9 100644 --- a/deeplabcut/pose_estimation_pytorch/runners/inference.py +++ b/deeplabcut/pose_estimation_pytorch/runners/inference.py @@ -850,44 +850,15 @@ def predict( _, raw_predictions = self.model(inputs.to(self.device)) else: _, raw_predictions = self.model(inputs.to(self.device)) - - predictions = [] - for item in raw_predictions: - if isinstance(item, dict) and "boxes" in item and "scores" in item: - predictions.append({ - "detection": { - "bboxes": item["boxes"].cpu().numpy().reshape(-1, 4), - "bbox_scores": item["scores"].cpu().numpy().reshape(-1), - } - }) - else: - # Handle unexpected output format - predictions.append({ - "detection": { - "bboxes": np.zeros((0, 4)), - "bbox_scores": np.zeros(0), - } - }) - - return predictions - - def inference(self, images) -> list[dict[str, np.ndarray]]: - """Run inference using the detector's own inference method if available - - Args: - images: List of image paths, PIL Images, or numpy arrays - - Returns: - List of detection results with bboxes in xywh format - """ - # Use the detector's own inference method if it exists - if hasattr(self.model, 'inference'): - return self.model.inference(images) - else: - # Fall back to standard inference pipeline - return super().inference(images) - - + predictions = [ + { + "detection": { + "bboxes": item["boxes"].cpu().numpy().reshape(-1, 4), + "scores": item["scores"].cpu().numpy().reshape(-1), + } + } + for item in raw_predictions + ] return predictions From 711a47eb7d9e7ab8e7aa4812c68fd5f74f313074 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 19:52:21 +0200 Subject: [PATCH 21/34] superanimal_analyze_images() - make humanbody compatible --- .../apis/analyze_images.py | 64 +++++++++++++------ 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py b/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py index 0880923177..aca51296df 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py +++ b/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py @@ -35,7 +35,7 @@ get_pose_inference_runner, get_scorer_name, get_scorer_uid, - parse_snapshot_index_for_analysis, + parse_snapshot_index_for_analysis, get_filtered_coco_detector_inference_runner, ) from deeplabcut.pose_estimation_pytorch.modelzoo.utils import update_config from deeplabcut.pose_estimation_pytorch.task import Task @@ -159,19 +159,30 @@ def superanimal_analyze_images( else: snapshot_path = Path(customized_pose_checkpoint) - if customized_detector_checkpoint is None: + detector_path = customized_detector_checkpoint + if detector_path is None and superanimal_name != "superanimal_humanbody": detector_path = modelzoo.get_super_animal_snapshot_path( dataset=superanimal_name, model_name=detector_name, ) - else: - detector_path = Path(customized_detector_checkpoint) + + filtered_detector_config = None + if superanimal_name == "superanimal_humanbody": + if detector_name is not None: + torchvision_detector_name = detector_name + else: + torchvision_detector_name = "fasterrcnn_mobilenet_v3_large_fpn" + COCO_PERSON = 1 # COCO class ID for person + filtered_detector_config = { + "torchvision_detector_name": torchvision_detector_name, + "category_id": COCO_PERSON, + } if customized_model_config is None: config = modelzoo.load_super_animal_config( super_animal=superanimal_name, model_name=model_name, - detector_name=detector_name, + detector_name=detector_name if superanimal_name != "superanimal_humanbody" else None, ) elif isinstance(customized_model_config, (str, Path)): config = config_utils.read_config_as_dict(customized_model_config) @@ -180,7 +191,7 @@ def superanimal_analyze_images( config = update_config(config, max_individuals, device) config["metadata"]["individuals"] = [f"animal{i}" for i in range(max_individuals)] - if "detector" in config: + if config.get("detector") is not None: config["detector"]["model"]["box_score_thresh"] = bbox_threshold predictions = analyze_image_folder( @@ -191,6 +202,7 @@ def superanimal_analyze_images( max_individuals=max_individuals, device=device, progress_bar=progress_bar, + filtered_detector_config=filtered_detector_config, ) skeleton_bodyparts = config.get("skeleton", []) @@ -394,6 +406,7 @@ def analyze_image_folder( device: str | None = None, max_individuals: int | None = None, progress_bar: bool = True, + filtered_detector_config: dict | None = None, ) -> dict[str, dict[str, np.ndarray | np.ndarray]]: """Runs pose inference on a folder of images and returns the predictions @@ -411,6 +424,8 @@ def analyze_image_folder( max_individuals: The maximum number of individuals to detect in each image. Set to the number of individuals in the project if None. progress_bar: Whether to display a progress bar when running inference. + filtered_detector_config: If using a filtered torchvision detector instead of a saved detector snapshot, + specify the filtered detector configuration Returns: A dictionary mapping each image filename to the different types of predictions @@ -423,15 +438,11 @@ def analyze_image_folder( model_cfg = config_utils.read_config_as_dict(model_cfg) pose_task = Task(model_cfg["method"]) - if pose_task == Task.TOP_DOWN and detector_path is None: - detector_variant = model_cfg.get("detector", {}).get("model", {}).get("variant", "") - # Allow torchvision detectors to be loaded without a checkpoint - if detector_variant not in ["fasterrcnn_mobilenet_v3_large_fpn", "fasterrcnn_resnet50_fpn_v2"]: - raise ValueError( - "A detector path must be specified for image analysis using top-down models" - f" Please specify the `detector_path` parameter." - ) - # else: will be handled by TorchvisionDetectorAdaptor + if pose_task == Task.TOP_DOWN and detector_path is None and filtered_detector_config is None: + raise ValueError( + "A detector path or filtered_detector_config must be specified for image analysis using top-down models" + f" Please specify the `detector_path` parameter or the `filtered_detector_config` parameter." + ) if max_individuals is None: max_individuals = len(model_cfg["metadata"]["individuals"]) @@ -452,6 +463,8 @@ def analyze_image_folder( image_paths = parse_images_and_image_folders(images, image_suffixes) pose_inputs = image_paths + + detector_runner = None if detector_path is not None: logging.info(f"Running object detection with {detector_path}") detector_runner = get_detector_inference_runner( @@ -460,14 +473,27 @@ def analyze_image_folder( device=device, max_individuals=max_individuals, ) + elif filtered_detector_config is not None: + model_name = filtered_detector_config["torchvision_detector_name"] + category_id = filtered_detector_config["category_id"] + + logging.info(f"Running object detection with filtered torchvision detector '{model_name}', category_id={category_id}") + detector_runner = get_filtered_coco_detector_inference_runner( + model_name=model_name, + category_id=category_id, + batch_size=1, + device=device, + max_individuals=max_individuals, + color_mode=model_cfg["data"]["colormode"], + model_config=model_cfg, + ) - detector_image_paths = image_paths - if progress_bar: - detector_image_paths = tqdm(detector_image_paths) + if detector_runner is not None: + detector_image_paths = tqdm(image_paths) if progress_bar else image_paths bbox_predictions = detector_runner.inference(images=detector_image_paths) pose_inputs = list(zip(image_paths, bbox_predictions)) - logging.info(f"Running pose estimation with {detector_path}") + logging.info(f"Running pose estimation with {snapshot_path}") if progress_bar: pose_inputs = tqdm(pose_inputs) From 28e4dd046c7effb2916bc33ea14fbac369d00dc3 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 20:07:19 +0200 Subject: [PATCH 22/34] Revert build_predictions_dataframe() --- .../pose_estimation_pytorch/apis/utils.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/apis/utils.py b/deeplabcut/pose_estimation_pytorch/apis/utils.py index 6751b25a61..4f0361396c 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/utils.py +++ b/deeplabcut/pose_estimation_pytorch/apis/utils.py @@ -418,25 +418,6 @@ def build_predictions_dataframe( """ image_names = [] prediction_data = [] - - # Check if this is a humanbody model by looking at the first prediction - if predictions: - first_pred = next(iter(predictions.values())) - if "bodyparts" in first_pred: - actual_num_individuals = first_pred["bodyparts"].shape[0] - expected_num_individuals = len(parameters.individuals) - - # For humanbody models, if the actual number of individuals differs from expected, - # we need to adjust the parameters to match the actual predictions - if actual_num_individuals != expected_num_individuals: - # Create adjusted parameters with the actual number of individuals - adjusted_individuals = [f"individual_{i}" for i in range(actual_num_individuals)] - parameters = PoseDatasetParameters( - bodyparts=parameters.bodyparts, - unique_bpts=parameters.unique_bpts, - individuals=adjusted_individuals, - ) - for image_name, image_predictions in predictions.items(): image_data = image_predictions["bodyparts"][..., :3].reshape(-1) if "unique_bodyparts" in image_predictions: From 193f935688080d852d959580fc17864b7bca3c73 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 20:07:48 +0200 Subject: [PATCH 23/34] Revert get_inference_runners() --- .../pose_estimation_pytorch/apis/utils.py | 39 ++++--------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/apis/utils.py b/deeplabcut/pose_estimation_pytorch/apis/utils.py index 4f0361396c..958146142a 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/utils.py +++ b/deeplabcut/pose_estimation_pytorch/apis/utils.py @@ -579,45 +579,20 @@ def get_inference_runners( if device == "mps": detector_device = "cpu" - # Get superanimal name for filtering logic - superanimal_name = model_config.get("metadata", {}).get("superanimal_name", "") - - if detector_path is not None or "detector" in model_config: - if detector_path is not None: - detector_path = str(detector_path) + if detector_path is not None: + detector_path = str(detector_path) if detector_transform is None: detector_transform = build_transforms( model_config["detector"]["data"]["inference"] ) - print(f"DEBUG: Creating detector for superanimal_name: '{superanimal_name}'") - if superanimal_name == "superanimal_humanbody": - # Only for superanimal_humanbody, use torchvision detector - from deeplabcut.pose_estimation_pytorch.models.detectors.torchvision import TorchvisionDetectorAdaptor - detector_config = model_config["detector"]["model"].copy() - expected_fields = { - "model", "weights", "num_classes", "freeze_bn_stats", "freeze_bn_weights", - "box_score_thresh", "model_kwargs", "model_name", "superanimal_name" - } - unexpected_fields = [k for k in detector_config.keys() if k not in expected_fields] - for field in unexpected_fields: - detector_config.pop(field, None) - if detector_path is not None: - detector_config["weights"] = None - detector_model = TorchvisionDetectorAdaptor(**detector_config) - detector_model.superanimal_name = superanimal_name - print(f"DEBUG: Created TorchvisionDetectorAdaptor for {superanimal_name}") - else: - # For all other superanimal models, use the original logic (pre-humanbody integration) - detector_config = model_config["detector"]["model"].copy() - pretrained = False if detector_path is not None else True - detector_model = DETECTORS.build(detector_config, pretrained=pretrained) - detector_model.superanimal_name = superanimal_name - print(f"DEBUG: Created custom detector from DETECTORS registry for {superanimal_name}") - print(f"DEBUG: Custom detector type: {type(detector_model)}") + detector_config = model_config["detector"]["model"] + if "pretrained" in detector_config: + detector_config["pretrained"] = False + detector_runner = build_inference_runner( task=Task.DETECT, - model=detector_model, + model=DETECTORS.build(detector_config), device=detector_device, snapshot_path=detector_path, batch_size=detector_batch_size, From 613b2ac5ba8b75dce188f42ce49b90457959030a Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 20:42:32 +0200 Subject: [PATCH 24/34] Revert detectors/fasterRCNN.py --- .../models/detectors/fasterRCNN.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/models/detectors/fasterRCNN.py b/deeplabcut/pose_estimation_pytorch/models/detectors/fasterRCNN.py index 8079682cb5..edfdbe8a23 100644 --- a/deeplabcut/pose_estimation_pytorch/models/detectors/fasterRCNN.py +++ b/deeplabcut/pose_estimation_pytorch/models/detectors/fasterRCNN.py @@ -59,17 +59,16 @@ def __init__( super().__init__( model=variant, - weights=None, # Always pass None to ensure num_classes=2 is used - num_classes=2, # Always use 2 classes for superanimal models + weights=("COCO_V1" if pretrained else None), + num_classes=None, freeze_bn_stats=freeze_bn_stats, freeze_bn_weights=freeze_bn_weights, box_score_thresh=box_score_thresh, ) - if not pretrained: - num_classes = 2 - in_features = self.model.roi_heads.box_predictor.cls_score.in_features - self.model.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor( - in_features, num_classes - ) - + # Modify the base predictor to output the correct number of classes + num_classes = 2 + in_features = self.model.roi_heads.box_predictor.cls_score.in_features + self.model.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor( + in_features, num_classes + ) From d848741b7ae73c64475ba588a5109399408cf10d Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 20:43:10 +0200 Subject: [PATCH 25/34] Revert detectors/torchvision.py --- .../models/detectors/torchvision.py | 245 +----------------- 1 file changed, 11 insertions(+), 234 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/models/detectors/torchvision.py b/deeplabcut/pose_estimation_pytorch/models/detectors/torchvision.py index eb91bdbdf3..6c700377f7 100644 --- a/deeplabcut/pose_estimation_pytorch/models/detectors/torchvision.py +++ b/deeplabcut/pose_estimation_pytorch/models/detectors/torchvision.py @@ -13,9 +13,6 @@ import torch import torchvision.models.detection as detection -import numpy as np -from PIL import Image -import torchvision from deeplabcut.pose_estimation_pytorch.models.detectors.base import ( BaseDetector, @@ -51,58 +48,38 @@ class TorchvisionDetectorAdaptor(BaseDetector): freeze_bn_weights: Whether to freeze weights for BatchNorm layers. box_score_thresh: during inference, only return proposals with a classification score greater than box_score_thresh - model_name: Optional name of the model - superanimal_name: Optional name of the superanimal model """ def __init__( self, - model: str = "fasterrcnn_resnet50_fpn_v2", + model: str, weights: str | None = None, num_classes: int | None = 2, freeze_bn_stats: bool = False, freeze_bn_weights: bool = False, box_score_thresh: float = 0.01, model_kwargs: dict | None = None, - model_name: str | None = None, - superanimal_name: str | None = None, ) -> None: super().__init__( freeze_bn_stats=freeze_bn_stats, freeze_bn_weights=freeze_bn_weights, pretrained=weights is not None, ) - self.model_name = model_name - self.superanimal_name = superanimal_name + # Load the model model_fn = getattr(detection, model) if model_kwargs is None: model_kwargs = {} - # Get the proper weights class - if weights == "COCO_V1" or weights is None: - if model == "fasterrcnn_mobilenet_v3_large_fpn": - weights = detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT - elif model == "fasterrcnn_resnet50_fpn_v2" and self.superanimal_name == "superanimal_humanbody": - weights = detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT - else: - weights = None - - if weights is not None: - self.model = model_fn( - weights=weights, - box_score_thresh=box_score_thresh, - **model_kwargs, - ) - else: - self.model = model_fn( - weights=weights, - box_score_thresh=box_score_thresh, - num_classes=num_classes, - **model_kwargs, - ) + self.model = model_fn( + weights=weights, + box_score_thresh=box_score_thresh, + num_classes=num_classes, + **model_kwargs, + ) - self.transforms = weights.transforms() if weights is not None else None + # See source: https://stackoverflow.com/a/65347721 + self.model.eager_outputs = lambda losses, detections: (losses, detections) def forward( self, x: torch.Tensor, targets: list[dict[str, torch.Tensor]] | None = None @@ -118,152 +95,7 @@ def forward( losses: {'loss_name': loss_value} detections: for each of the b images, {"boxes": bounding_boxes} """ - result = self.model(x, targets) - - # Handle different return formats from torchvision models - if isinstance(result, tuple): - if len(result) == 2: - # Standard format: (losses, predictions) - return result - elif len(result) > 2: - # Some models return additional values, take first two - return result[0], result[1] - else: - # Single value, assume it's predictions - # Return zero loss tensor for training compatibility - device = x.device - dummy_loss = torch.tensor(0.0, device=device, requires_grad=True) - return {"total_loss": dummy_loss}, result[0] - else: - # Single value, assume it's predictions - # Return zero loss tensor for training compatibility - device = x.device - dummy_loss = torch.tensor(0.0, device=device, requires_grad=True) - return {"total_loss": dummy_loss}, result - - def inference(self, images) -> list[dict[str, np.ndarray]]: - """ - Run inference on images using the torchvision detector - - Args: - images: List of PIL Images or numpy arrays - - Returns: - List of detection results, each containing "bboxes" in xywh format - """ - - - self.model.eval() - device = next(self.model.parameters()).device - - results = [] - - with torch.no_grad(): - for i, image in enumerate(images): - # Convert to PIL Image if needed - if isinstance(image, np.ndarray): - image = Image.fromarray(image).convert("RGB") - elif not isinstance(image, Image.Image): - image = Image.open(image).convert("RGB") - - # Apply proper preprocessing - if self.transforms is not None: - batch = [self.transforms(image).to(device)] - else: - # For SSD models, we need specific preprocessing - if hasattr(self.model, 'roi_heads'): - # FasterRCNN preprocessing - import torchvision.transforms as transforms - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ]) - else: - # SSD preprocessing - resize to 320x320 - import torchvision.transforms as transforms - preprocess = transforms.Compose([ - transforms.Resize((320, 320)), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ]) - batch = [preprocess(image).to(device)] - - # Run detection - call model directly without going through forward method - # Check if model has roi_heads (FasterRCNN) or not (SSD) - if hasattr(self.model, 'roi_heads'): - pass # Model has roi_heads (FasterRCNN) - else: - pass # Model is SSD - no roi_heads attribute - - # Call the underlying torchvision model directly for inference - predictions = self.model(batch) - - # Handle the output format - during inference, should be list of dicts - if isinstance(predictions, (list, tuple)) and len(predictions) > 0: - prediction = predictions[0] # First image - else: - prediction = predictions - - # Check if predictions are empty due to threshold - if isinstance(prediction, dict) and len(prediction) > 0: - if 'scores' in prediction: - print(f"DEBUG: Max score: {prediction['scores'].max() if len(prediction['scores']) > 0 else 'No scores'}") - # Check if model has roi_heads (FasterRCNN) or not (SSD) - # Skip threshold check for SSD models that don't have roi_heads - - if not isinstance(prediction, dict) or "boxes" not in prediction: - # Unexpected output, return empty - results.append({ - "bboxes": np.zeros((0, 4)), - "bbox_scores": np.zeros(0) - }) - continue - - bboxes = prediction["boxes"].cpu().numpy() - labels = prediction["labels"].cpu().numpy() - scores = prediction["scores"].cpu().numpy() - - # Handle empty detections - if len(bboxes) == 0: - detected_bboxes = np.zeros((0, 4)) - detected_scores = np.zeros(0) - else: - # For humanbody models, filter for humans (COCO class 1) - # For quadruped and other models, return all detections - if self.superanimal_name == 'superanimal_humanbody': - detection_mask = labels == 1 - else: - detection_mask = np.ones(len(bboxes), dtype=bool) - detected_bboxes = bboxes[detection_mask] - detected_scores = scores[detection_mask] - detected_labels = labels[detection_mask] - - # Convert to xywh format - if len(detected_bboxes) > 0: - # Convert from (x1, y1, x2, y2) to (x, y, w, h) - detected_bboxes[:, 2] -= detected_bboxes[:, 0] # width = x2 - x1 - detected_bboxes[:, 3] -= detected_bboxes[:, 1] # height = y2 - y1 - - # Sort by confidence and keep top detections - sorted_indices = np.argsort(detected_scores)[::-1] - detected_bboxes = detected_bboxes[sorted_indices] - detected_scores = detected_scores[sorted_indices] - - # Limit to reasonable number of detections - max_detections = 10 - if len(detected_bboxes) > max_detections: - detected_bboxes = detected_bboxes[:max_detections] - detected_scores = detected_scores[:max_detections] - else: - detected_bboxes = np.zeros((0, 4)) - detected_scores = np.zeros(0) - - results.append({ - "bboxes": detected_bboxes, - "bbox_scores": detected_scores - }) - - return results + return self.model(x, targets) def get_target(self, labels: dict) -> list[dict[str, torch.Tensor]]: """ @@ -328,58 +160,3 @@ def get_target(self, labels: dict) -> list[dict[str, torch.Tensor]]: ) return res - -def torchvision_detector_inference(images, threshold=0.1, device="cpu"): - """ - Run the Colab-style torchvision detector on a list of images. - Args: - images: list of np.ndarray or PIL.Image - threshold: float, detection threshold - device: str, device to run on - Returns: - list of dicts with 'bboxes' and 'scores' - """ - import torchvision.models.detection as detection - from PIL import Image - import numpy as np - import torch - - # Use the exact working logic from colab_style_detector_test.py - weights = detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT - detector = detection.fasterrcnn_mobilenet_v3_large_fpn( - weights=weights, box_score_thresh=threshold - ) - detector.eval() - detector.to(device) - preprocess = weights.transforms() - - results = [] - for image in images: - if isinstance(image, np.ndarray): - image = Image.fromarray(image).convert("RGB") - elif not isinstance(image, Image.Image): - image = Image.open(image).convert("RGB") - - batch = [preprocess(image).to(device)] - with torch.no_grad(): - predictions = detector(batch)[0] - - bboxes = predictions["boxes"].cpu().numpy() - labels = predictions["labels"].cpu().numpy() - scores = predictions["scores"].cpu().numpy() - - # Filter for humans (COCO class 1) - human_mask = labels == 1 - human_bboxes = bboxes[human_mask] - human_scores = scores[human_mask] - - # Convert to xywh format - if len(human_bboxes) > 0: - human_bboxes[:, 2] -= human_bboxes[:, 0] - human_bboxes[:, 3] -= human_bboxes[:, 1] - - results.append({ - "bboxes": human_bboxes, - "scores": human_scores - }) - return results From bba86890a4ed252e48801702f2ab062492ae6668 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Mon, 21 Jul 2025 21:05:22 +0200 Subject: [PATCH 26/34] Revert base Runner --- .../pose_estimation_pytorch/runners/base.py | 38 +------------------ 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/runners/base.py b/deeplabcut/pose_estimation_pytorch/runners/base.py index c5fae450c2..ee8dbd6d1a 100644 --- a/deeplabcut/pose_estimation_pytorch/runners/base.py +++ b/deeplabcut/pose_estimation_pytorch/runners/base.py @@ -114,43 +114,7 @@ def load_snapshot( The content of the snapshot file. """ snapshot = attempt_snapshot_load(snapshot_path, device, weights_only) - - # Handle the case where snapshot keys have 'model.' prefix - snapshot_weights = snapshot["model"] - model_state_dict = model.state_dict() - - # Diagnostic: Always add 'model.' prefix for superanimal_topviewmouse detectors - is_topviewmouse = hasattr(model, 'superanimal_name') and getattr(model, 'superanimal_name', None) == 'superanimal_topviewmouse' - is_detector = 'FasterRCNN' in str(type(model)) or 'SSDLite' in str(type(model)) - if is_topviewmouse and is_detector: - print(f"DEBUG: Forcing prefix ADD for superanimal_topviewmouse detector!") - cleaned_weights = {} - for key, value in snapshot_weights.items(): - if not key.startswith('model.'): - cleaned_key = 'model.' + key # Add 'model.' prefix - cleaned_weights[cleaned_key] = value - else: - cleaned_weights[key] = value - print(f"DEBUG: Loading cleaned weights with {len(cleaned_weights)} keys") - model.load_state_dict(cleaned_weights) - elif (any(key.startswith('model.') for key in snapshot_weights.keys()) and - not any(key.startswith('model.') for key in model_state_dict.keys())): - print(f"DEBUG: Detected 'model.' prefix mismatch, cleaning keys...") - # Strip the 'model.' prefix from snapshot keys - cleaned_weights = {} - for key, value in snapshot_weights.items(): - if key.startswith('model.'): - cleaned_key = key[6:] # Remove 'model.' prefix - cleaned_weights[cleaned_key] = value - else: - cleaned_weights[key] = value - print(f"DEBUG: Loading cleaned weights with {len(cleaned_weights)} keys") - model.load_state_dict(cleaned_weights) - else: - print(f"DEBUG: No prefix mismatch, loading original weights") - # Use original snapshot weights - model.load_state_dict(snapshot["model"]) - + model.load_state_dict(snapshot["model"]) return snapshot From 25fa08dfc43dd69db2ee0d94996341bb8ebdd056 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Thu, 31 Jul 2025 13:27:05 +0200 Subject: [PATCH 27/34] Fix superanimal_humanbody unit test --- test_superanimal_humanbody.py | 71 ------------------- ...test_filtered_detector_inference_runner.py | 54 ++++++++++++++ 2 files changed, 54 insertions(+), 71 deletions(-) delete mode 100644 test_superanimal_humanbody.py create mode 100644 tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py diff --git a/test_superanimal_humanbody.py b/test_superanimal_humanbody.py deleted file mode 100644 index d5b9d8af75..0000000000 --- a/test_superanimal_humanbody.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script for superanimal_humanbody with torchvision detector -""" - -import torch -import torchvision.models.detection as detection -from deeplabcut.pose_estimation_pytorch.modelzoo import load_super_animal_config - -def test_torchvision_detector(): - """Test that the torchvision detector works with superanimal_humanbody""" - - # Load the superanimal_humanbody config - config = load_super_animal_config( - super_animal="superanimal_humanbody", - model_name="rtmpose_x", - detector_name="fasterrcnn_mobilenet_v3_large_fpn", - ) - - print("Config loaded successfully!") - print(f"Model method: {config['method']}") - print(f"Detector variant: {config['detector']['model']['variant']}") - - # Check if the detector is configured to use torchvision - detector_config = config['detector']['model'] - print(f"Detector config: {detector_config}") - - # Test loading the torchvision detector directly - print("\nTesting torchvision detector loading...") - weights = detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT - detector = detection.fasterrcnn_mobilenet_v3_large_fpn( - weights=weights, box_score_thresh=0.6, - ) - detector.eval() - print("Torchvision detector loaded successfully!") - - # Test that the detector config matches what we expect for torchvision - print("\nTesting detector config compatibility...") - expected_variant = "fasterrcnn_mobilenet_v3_large_fpn" - actual_variant = detector_config.get("variant", "") - - if actual_variant == expected_variant: - print(f"✅ Detector variant matches expected: {expected_variant}") - else: - print(f"❌ Detector variant mismatch. Expected: {expected_variant}, Got: {actual_variant}") - return False - - # Test that the config has the correct structure for torchvision detector - if "type" in detector_config and detector_config["type"] == "FasterRCNN": - print("✅ Detector type is correctly set to FasterRCNN") - else: - print("❌ Detector type is not correctly set") - return False - - # Test that the config allows for torchvision weights (no pretrained field or pretrained=False) - if "pretrained" not in detector_config or detector_config.get("pretrained") is False: - print("✅ Detector config allows torchvision weights") - else: - print("❌ Detector config has pretrained=True, which may conflict with torchvision weights") - return False - - print("\n✅ All tests passed! The torchvision detector integration is working correctly.") - return True - -if __name__ == "__main__": - print("Testing superanimal_humanbody with torchvision detector...") - success = test_torchvision_detector() - if success: - print("\n✅ Test passed! The torchvision detector works with superanimal_humanbody") - else: - print("\n❌ Test failed! There's an issue with the torchvision detector integration") \ No newline at end of file diff --git a/tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py b/tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py new file mode 100644 index 0000000000..8fa3242850 --- /dev/null +++ b/tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +""" +Test script for superanimal_humanbody with torchvision detector +""" + +from deeplabcut.pose_estimation_pytorch.apis.utils import TORCHVISION_DETECTORS, \ + get_filtered_coco_detector_inference_runner +from deeplabcut.pose_estimation_pytorch.models.detectors.filtered_detector import FilteredDetector +from deeplabcut.pose_estimation_pytorch.modelzoo import load_super_animal_config + +def test_torchvision_detector(): + """Test that the torchvision detector works with superanimal_humanbody""" + for detector_name in TORCHVISION_DETECTORS: + + # Load the superanimal_humanbody config + superanimal_config = load_super_animal_config( + super_animal="superanimal_humanbody", + model_name="rtmpose_x", + detector_name=detector_name, + ) + print("Config loaded successfully!") + + # Test loading the torchvision detector directly + print("\nTesting torchvision detector loading...") + entry = TORCHVISION_DETECTORS[detector_name] + weights = entry["weights"] + coco_detector = entry["fn"](weights=weights, box_score_thresh=0.6) + coco_detector.eval() + print("Torchvision detector loaded successfully!") + + # Test loading the FilteredDetector + COCO_PERSON = 1 # COCO class ID for person + person_detector = FilteredDetector(coco_detector, class_id=COCO_PERSON) + person_detector.eval() + print("Filtered detector loaded successfully!") + + _ = get_filtered_coco_detector_inference_runner( + model_name=detector_name, + category_id=COCO_PERSON, + batch_size=1, + model_config=superanimal_config, + ) + print("Filtered detector runner created successfully!") + + print("\n✅ All tests passed! The torchvision detector integration is working correctly.") + return True + +if __name__ == "__main__": + print("Testing superanimal_humanbody with torchvision detector...") + success = test_torchvision_detector() + if success: + print("\n✅ Test passed! The torchvision detector works with superanimal_humanbody") + else: + print("\n❌ Test failed! There's an issue with the torchvision detector integration") \ No newline at end of file From 425484b5d24c4b2365e10add9b315e998571db77 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Thu, 31 Jul 2025 13:27:33 +0200 Subject: [PATCH 28/34] Disable video adaptation for superanimal_humanbody --- deeplabcut/modelzoo/video_inference.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/deeplabcut/modelzoo/video_inference.py b/deeplabcut/modelzoo/video_inference.py index 46ed21e7d3..339772ca6d 100644 --- a/deeplabcut/modelzoo/video_inference.py +++ b/deeplabcut/modelzoo/video_inference.py @@ -390,7 +390,13 @@ def video_inference_superanimal( ) config = update_config(config, max_individuals, device) + output_suffix = "_before_adapt" + + if superanimal_name == "superanimal_humanbody" and video_adapt: + print(f"Video adaptation currently not supported for {superanimal_name}. Setting it to false.") + video_adapt = False + if video_adapt: # the users can pass in many videos. For now, we only use one video for # video adaptation. As reported in Ye et al. 2024, one video should be @@ -414,6 +420,7 @@ def video_inference_superanimal( output_suffix=output_suffix, plot_bboxes=plot_bboxes, bboxes_pcutoff=bbox_threshold, + torchvision_detector_name=torchvision_detector_name, ) # we prepare the pseudo dataset in the same folder of the target video From 4b04013819ac8f07107b961dd228e6dd7f83a673 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Thu, 31 Jul 2025 13:28:00 +0200 Subject: [PATCH 29/34] Fix testscript_superanimal_inference.py --- examples/testscript_superanimal_inference.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/examples/testscript_superanimal_inference.py b/examples/testscript_superanimal_inference.py index ee32b3b02a..c0a042e08a 100644 --- a/examples/testscript_superanimal_inference.py +++ b/examples/testscript_superanimal_inference.py @@ -47,14 +47,3 @@ videotype=".avi", scale_list=scale_list, ) - - print("testing superanimal_humanbody") - superanimal_name = "superanimal_humanbody" - deeplabcut.video_inference_superanimal( - video, - superanimal_name, - model_name="rtmpose_x", - detector_name="fasterrcnn_mobilenet_v3_large_fpn", - videotype=".avi", - scale_list=scale_list, - ) From eea470e7f1a2e2f98616d5ed685e2daa3f988ff6 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Thu, 31 Jul 2025 13:28:18 +0200 Subject: [PATCH 30/34] Remove debug print --- examples/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/utils.py b/examples/utils.py index 6b735a1c40..3877b335b4 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -256,7 +256,6 @@ def generate_video_from_images(image_dir: Path, output_video: Path) -> None: def create_fake_project(path: Path, params: SyntheticProjectParameters) -> None: if path.exists(): - print(f"[DEBUG] Path exists: {path} (is_dir={path.is_dir()}, is_file={path.is_file()})") raise ValueError(f"Cannot create a fake project at an existing path") scorer = "synthetic" From 3d47a36e438321a2ef8ef1200c3e440d64bd4673 Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Thu, 31 Jul 2025 13:45:23 +0200 Subject: [PATCH 31/34] Black formatting --- deeplabcut/modelzoo/utils.py | 12 +- deeplabcut/modelzoo/video_inference.py | 10 +- .../apis/analyze_images.py | 25 ++-- .../pose_estimation_pytorch/apis/utils.py | 6 +- .../pose_estimation_pytorch/apis/videos.py | 109 +++++++++++++----- .../data/transforms.py | 4 +- .../models/detectors/filtered_detector.py | 2 +- .../modelzoo/inference.py | 7 +- .../pose_estimation_pytorch/modelzoo/utils.py | 8 +- .../pose_estimation_pytorch/runners/base.py | 6 +- examples/utils.py | 1 + ...test_filtered_detector_inference_runner.py | 24 +++- 12 files changed, 155 insertions(+), 59 deletions(-) diff --git a/deeplabcut/modelzoo/utils.py b/deeplabcut/modelzoo/utils.py index 00de90afb9..066c5f6dbc 100644 --- a/deeplabcut/modelzoo/utils.py +++ b/deeplabcut/modelzoo/utils.py @@ -78,10 +78,16 @@ def get_super_animal_scorer( The DLC scorer name to use for the given SuperAnimal models. """ if detector_snapshot_path is not None and torchvision_detector_name is not None: - raise ValueError("Provide only one of `detector_snapshot_path` or `torchvision_detector_name`, not both.") + raise ValueError( + "Provide only one of `detector_snapshot_path` or `torchvision_detector_name`, not both." + ) super_animal_prefix = super_animal + "_" # Always use model name first - model_name = model_snapshot_path.stem if hasattr(model_snapshot_path, "stem") else str(model_snapshot_path) + model_name = ( + model_snapshot_path.stem + if hasattr(model_snapshot_path, "stem") + else str(model_snapshot_path) + ) if model_name.startswith(super_animal_prefix): model_name = model_name[len(super_animal_prefix) :] dlc_scorer = f"{super_animal_prefix}{model_name}" @@ -90,7 +96,7 @@ def get_super_animal_scorer( if detector_snapshot_path is not None: detector_name = detector_snapshot_path.stem if detector_name.startswith(super_animal_prefix): - detector_name = detector_name[len(super_animal_prefix):] + detector_name = detector_name[len(super_animal_prefix) :] dlc_scorer += f"_{detector_name}_" elif torchvision_detector_name is not None: dlc_scorer += f"_{torchvision_detector_name}_" diff --git a/deeplabcut/modelzoo/video_inference.py b/deeplabcut/modelzoo/video_inference.py index 339772ca6d..e2dfaca256 100644 --- a/deeplabcut/modelzoo/video_inference.py +++ b/deeplabcut/modelzoo/video_inference.py @@ -368,7 +368,11 @@ def video_inference_superanimal( config = load_super_animal_config( super_animal=superanimal_name, model_name=model_name, - detector_name=detector_name if superanimal_name != "superanimal_humanbody" else None, + detector_name=( + detector_name + if superanimal_name != "superanimal_humanbody" + else None + ), ) pose_model_path = customized_pose_checkpoint @@ -394,7 +398,9 @@ def video_inference_superanimal( output_suffix = "_before_adapt" if superanimal_name == "superanimal_humanbody" and video_adapt: - print(f"Video adaptation currently not supported for {superanimal_name}. Setting it to false.") + print( + f"Video adaptation currently not supported for {superanimal_name}. Setting it to false." + ) video_adapt = False if video_adapt: diff --git a/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py b/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py index aca51296df..05668cf8ba 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py +++ b/deeplabcut/pose_estimation_pytorch/apis/analyze_images.py @@ -35,7 +35,8 @@ get_pose_inference_runner, get_scorer_name, get_scorer_uid, - parse_snapshot_index_for_analysis, get_filtered_coco_detector_inference_runner, + parse_snapshot_index_for_analysis, + get_filtered_coco_detector_inference_runner, ) from deeplabcut.pose_estimation_pytorch.modelzoo.utils import update_config from deeplabcut.pose_estimation_pytorch.task import Task @@ -103,7 +104,7 @@ def superanimal_analyze_images( The device to use to run image analysis. pose_threshold: float, default=0.4 - The cutoff score when plotting pose predictions. To note, this is called + The cutoff score when plotting pose predictions. To note, this is called pcutoff in other parts of the code. Must be in (0, 1). bbox_threshold: float, default=0.1 @@ -182,7 +183,9 @@ def superanimal_analyze_images( config = modelzoo.load_super_animal_config( super_animal=superanimal_name, model_name=model_name, - detector_name=detector_name if superanimal_name != "superanimal_humanbody" else None, + detector_name=( + detector_name if superanimal_name != "superanimal_humanbody" else None + ), ) elif isinstance(customized_model_config, (str, Path)): config = config_utils.read_config_as_dict(customized_model_config) @@ -211,9 +214,7 @@ def superanimal_analyze_images( skeleton = [] bodyparts = config["metadata"]["bodyparts"] for bpt_0, bpt_1 in skeleton_bodyparts: - skeleton.append( - (bodyparts.index(bpt_0), bodyparts.index(bpt_1)) - ) + skeleton.append((bodyparts.index(bpt_0), bodyparts.index(bpt_1))) visualization.create_labeled_images( predictions=predictions, @@ -438,7 +439,11 @@ def analyze_image_folder( model_cfg = config_utils.read_config_as_dict(model_cfg) pose_task = Task(model_cfg["method"]) - if pose_task == Task.TOP_DOWN and detector_path is None and filtered_detector_config is None: + if ( + pose_task == Task.TOP_DOWN + and detector_path is None + and filtered_detector_config is None + ): raise ValueError( "A detector path or filtered_detector_config must be specified for image analysis using top-down models" f" Please specify the `detector_path` parameter or the `filtered_detector_config` parameter." @@ -459,7 +464,7 @@ def analyze_image_folder( image_suffixes = ".png", ".jpg", ".jpeg" if frame_type is not None: - image_suffixes = (frame_type, ) + image_suffixes = (frame_type,) image_paths = parse_images_and_image_folders(images, image_suffixes) pose_inputs = image_paths @@ -477,7 +482,9 @@ def analyze_image_folder( model_name = filtered_detector_config["torchvision_detector_name"] category_id = filtered_detector_config["category_id"] - logging.info(f"Running object detection with filtered torchvision detector '{model_name}', category_id={category_id}") + logging.info( + f"Running object detection with filtered torchvision detector '{model_name}', category_id={category_id}" + ) detector_runner = get_filtered_coco_detector_inference_runner( model_name=model_name, category_id=category_id, diff --git a/deeplabcut/pose_estimation_pytorch/apis/utils.py b/deeplabcut/pose_estimation_pytorch/apis/utils.py index 958146142a..58314ccd79 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/utils.py +++ b/deeplabcut/pose_estimation_pytorch/apis/utils.py @@ -47,7 +47,9 @@ ) from deeplabcut.pose_estimation_pytorch.data.transforms import build_transforms from deeplabcut.pose_estimation_pytorch.models import DETECTORS, PoseModel -from deeplabcut.pose_estimation_pytorch.models.detectors.filtered_detector import FilteredDetector +from deeplabcut.pose_estimation_pytorch.models.detectors.filtered_detector import ( + FilteredDetector, +) from deeplabcut.pose_estimation_pytorch.runners import ( build_inference_runner, CTDTrackingConfig, @@ -693,6 +695,8 @@ def get_detector_inference_runner( "weights": FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT, }, } + + def get_filtered_coco_detector_inference_runner( model_name: str, category_id: int, diff --git a/deeplabcut/pose_estimation_pytorch/apis/videos.py b/deeplabcut/pose_estimation_pytorch/apis/videos.py index 96cc879ecb..29d6d55225 100644 --- a/deeplabcut/pose_estimation_pytorch/apis/videos.py +++ b/deeplabcut/pose_estimation_pytorch/apis/videos.py @@ -396,7 +396,10 @@ def analyze_videos( pose_cfg = auxiliaryfunctions.read_plainconfig(pose_cfg_path) snapshot_index, detector_snapshot_index = utils.parse_snapshot_index_for_analysis( - loader.project_cfg, loader.model_cfg, snapshot_index, detector_snapshot_index, + loader.project_cfg, + loader.model_cfg, + snapshot_index, + detector_snapshot_index, ) if cropping is None and loader.project_cfg.get("cropping", False): @@ -456,10 +459,12 @@ def analyze_videos( except (ValueError, IndexError) as e: print(f"Error loading snapshot with index {snapshot_index}: {e}") print("Attempting to find available snapshots...") - + # Try to get all available snapshots try: - all_snapshots = utils.get_model_snapshots("all", loader.model_folder, loader.pose_task) + all_snapshots = utils.get_model_snapshots( + "all", loader.model_folder, loader.pose_task + ) if all_snapshots: # Try to find a "best" snapshot first best_snapshots = [s for s in all_snapshots if s.best] @@ -469,21 +474,31 @@ def analyze_videos( else: # Use the last available snapshot snapshot = all_snapshots[-1] - print(f"No best snapshot found, using last available: {snapshot.path}") + print( + f"No best snapshot found, using last available: {snapshot.path}" + ) else: raise FileNotFoundError(f"No snapshots found in {loader.model_folder}") except Exception as fallback_error: - raise FileNotFoundError(f"Failed to load any snapshots from {loader.model_folder}. Original error: {e}. Fallback error: {fallback_error}") + raise FileNotFoundError( + f"Failed to load any snapshots from {loader.model_folder}. Original error: {e}. Fallback error: {fallback_error}" + ) # Additional validation for best snapshots if "best" in str(snapshot.path) and not snapshot.path.exists(): - print(f"Warning: Best snapshot path {snapshot.path} does not exist. Checking for alternative snapshots...") + print( + f"Warning: Best snapshot path {snapshot.path} does not exist. Checking for alternative snapshots..." + ) # Try to find any available snapshot try: - all_snapshots = utils.get_model_snapshots("all", loader.model_folder, loader.pose_task) + all_snapshots = utils.get_model_snapshots( + "all", loader.model_folder, loader.pose_task + ) if all_snapshots: # Try to find a different best snapshot - best_snapshots = [s for s in all_snapshots if s.best and s.path.exists()] + best_snapshots = [ + s for s in all_snapshots if s.best and s.path.exists() + ] if best_snapshots: snapshot = best_snapshots[0] print(f"Using alternative best snapshot: {snapshot.path}") @@ -499,7 +514,7 @@ def analyze_videos( # Verify the snapshot file exists if not snapshot.path.exists(): raise FileNotFoundError(f"Snapshot file not found: {snapshot.path}") - + print(f"Successfully loaded snapshot: {snapshot.path}") # Load the BU model for the conditions provider @@ -512,7 +527,8 @@ def analyze_videos( ) elif isinstance(ctd_conditions, dict): cond_provider = get_condition_provider( - condition_cfg=ctd_conditions, config=config, + condition_cfg=ctd_conditions, + config=config, ) else: cond_provider = ctd_conditions @@ -551,50 +567,83 @@ def analyze_videos( detector_snapshot_index, loader.model_folder, Task.DETECT )[0] except (ValueError, IndexError) as e: - print(f"Error loading detector snapshot with index {detector_snapshot_index}: {e}") + print( + f"Error loading detector snapshot with index {detector_snapshot_index}: {e}" + ) print("Attempting to find available detector snapshots...") - + # Try to get all available detector snapshots try: - all_detector_snapshots = utils.get_model_snapshots("all", loader.model_folder, Task.DETECT) + all_detector_snapshots = utils.get_model_snapshots( + "all", loader.model_folder, Task.DETECT + ) if all_detector_snapshots: # Try to find a "best" detector snapshot first - best_detector_snapshots = [s for s in all_detector_snapshots if s.best] + best_detector_snapshots = [ + s for s in all_detector_snapshots if s.best + ] if best_detector_snapshots: detector_snapshot = best_detector_snapshots[0] - print(f"Found and using best detector snapshot: {detector_snapshot.path}") + print( + f"Found and using best detector snapshot: {detector_snapshot.path}" + ) else: # Use the last available detector snapshot detector_snapshot = all_detector_snapshots[-1] - print(f"No best detector snapshot found, using last available: {detector_snapshot.path}") + print( + f"No best detector snapshot found, using last available: {detector_snapshot.path}" + ) else: - raise FileNotFoundError(f"No detector snapshots found in {loader.model_folder}") + raise FileNotFoundError( + f"No detector snapshots found in {loader.model_folder}" + ) except Exception as fallback_error: - raise FileNotFoundError(f"Failed to load any detector snapshots from {loader.model_folder}. Original error: {e}. Fallback error: {fallback_error}") + raise FileNotFoundError( + f"Failed to load any detector snapshots from {loader.model_folder}. Original error: {e}. Fallback error: {fallback_error}" + ) # Additional validation for detector snapshots - if "best" in str(detector_snapshot.path) and not detector_snapshot.path.exists(): - print(f"Warning: Best detector snapshot path {detector_snapshot.path} does not exist. Checking for alternative detector snapshots...") + if ( + "best" in str(detector_snapshot.path) + and not detector_snapshot.path.exists() + ): + print( + f"Warning: Best detector snapshot path {detector_snapshot.path} does not exist. Checking for alternative detector snapshots..." + ) try: - all_detector_snapshots = utils.get_model_snapshots("all", loader.model_folder, Task.DETECT) + all_detector_snapshots = utils.get_model_snapshots( + "all", loader.model_folder, Task.DETECT + ) if all_detector_snapshots: # Try to find a different best detector snapshot - best_detector_snapshots = [s for s in all_detector_snapshots if s.best and s.path.exists()] + best_detector_snapshots = [ + s for s in all_detector_snapshots if s.best and s.path.exists() + ] if best_detector_snapshots: detector_snapshot = best_detector_snapshots[0] - print(f"Using alternative best detector snapshot: {detector_snapshot.path}") + print( + f"Using alternative best detector snapshot: {detector_snapshot.path}" + ) else: # Use the last available detector snapshot detector_snapshot = all_detector_snapshots[-1] - print(f"Using alternative detector snapshot: {detector_snapshot.path}") + print( + f"Using alternative detector snapshot: {detector_snapshot.path}" + ) else: - raise FileNotFoundError(f"No detector snapshots found in {loader.model_folder}") + raise FileNotFoundError( + f"No detector snapshots found in {loader.model_folder}" + ) except Exception as e: - raise FileNotFoundError(f"Failed to find alternative detector snapshots: {e}") + raise FileNotFoundError( + f"Failed to find alternative detector snapshots: {e}" + ) # Verify the detector snapshot file exists if not detector_snapshot.path.exists(): - raise FileNotFoundError(f"Detector snapshot file not found: {detector_snapshot.path}") + raise FileNotFoundError( + f"Detector snapshot file not found: {detector_snapshot.path}" + ) print(f" -> Using detector {detector_snapshot.path}") detector_runner = utils.get_detector_inference_runner( @@ -610,7 +659,7 @@ def analyze_videos( # Reading video and init variables videos = utils.list_videos_in_folder(videos, videotype, shuffle=in_random_order) h5_files_created = False # Track if any .h5 files were created - + for video in videos: if destfolder is None: output_path = video.parent @@ -706,7 +755,9 @@ def analyze_videos( for i in range(num_frames): frame_data = full_data.get("frame" + str(i).zfill(str_width)) if frame_data is None: - pose = np.full((len(individuals), len(bodyparts), 3), np.nan) + pose = np.full( + (len(individuals), len(bodyparts), 3), np.nan + ) ctd_predictions.append(dict(bodyparts=pose)) continue diff --git a/deeplabcut/pose_estimation_pytorch/data/transforms.py b/deeplabcut/pose_estimation_pytorch/data/transforms.py index 7a296525c1..cb321a0aca 100644 --- a/deeplabcut/pose_estimation_pytorch/data/transforms.py +++ b/deeplabcut/pose_estimation_pytorch/data/transforms.py @@ -127,7 +127,7 @@ def build_transforms(augmentations: dict) -> A.BaseCompose: noise = 0.05 * 255 transforms.append( A.GaussNoise( - var_limit=(0, noise ** 2), + var_limit=(0, noise**2), mean=0, per_channel=True, # Albumentations doesn't support per_channel = 0.5 @@ -475,7 +475,7 @@ def __init__( p, ) self._neighbor_dist = 3 - self._neighbor_dist_square = self._neighbor_dist ** 2 + self._neighbor_dist_square = self._neighbor_dist**2 def apply_to_keypoints( self, keypoints: Sequence[float], random_state: int | None = None, **params diff --git a/deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py b/deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py index e4278dba65..7ea4da46a5 100644 --- a/deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py +++ b/deeplabcut/pose_estimation_pytorch/models/detectors/filtered_detector.py @@ -37,4 +37,4 @@ def forward(self, images: list[torch.Tensor]) -> list[dict[str, torch.Tensor]]: filtered_outputs.append(filtered_output) losses = {} - return losses, filtered_outputs \ No newline at end of file + return losses, filtered_outputs diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py b/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py index 970e977865..e7a8c64909 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py @@ -24,7 +24,7 @@ from deeplabcut.pose_estimation_pytorch.apis.utils import ( get_inference_runners, get_pose_inference_runner, - get_filtered_coco_detector_inference_runner + get_filtered_coco_detector_inference_runner, ) from deeplabcut.pose_estimation_pytorch.modelzoo.utils import ( raise_warning_if_called_directly, @@ -150,7 +150,10 @@ def _video_inference_superanimal( print(f"Processing video {video_path}") dlc_scorer = get_super_animal_scorer( - superanimal_name, model_snapshot_path, detector_snapshot_path, torchvision_detector_name + superanimal_name, + model_snapshot_path, + detector_snapshot_path, + torchvision_detector_name, ) output_prefix = f"{Path(video_path).stem}_{dlc_scorer}" diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py index 9ff66b4240..2cb06e2ffe 100644 --- a/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py +++ b/deeplabcut/pose_estimation_pytorch/modelzoo/utils.py @@ -117,7 +117,9 @@ def load_super_animal_config( else: model_config["method"] = "TD" if super_animal != "superanimal_humanbody": - detector_cfg_path = get_super_animal_model_config_path(model_name=detector_name) + detector_cfg_path = get_super_animal_model_config_path( + model_name=detector_name + ) detector_cfg = read_config_as_dict(detector_cfg_path) model_config["detector"] = detector_cfg return model_config @@ -141,7 +143,9 @@ def download_super_animal_snapshot(dataset: str, model_name: str) -> Path: model_filename = f"{model_name}.pt" model_path = snapshot_dir / model_filename - download_huggingface_model(model_name, target_dir=str(snapshot_dir), rename_mapping=model_filename) + download_huggingface_model( + model_name, target_dir=str(snapshot_dir), rename_mapping=model_filename + ) if not model_path.exists(): raise RuntimeError(f"Failed to download {model_name} to {model_path}") diff --git a/deeplabcut/pose_estimation_pytorch/runners/base.py b/deeplabcut/pose_estimation_pytorch/runners/base.py index ee8dbd6d1a..f0b4dd735a 100644 --- a/deeplabcut/pose_estimation_pytorch/runners/base.py +++ b/deeplabcut/pose_estimation_pytorch/runners/base.py @@ -23,8 +23,9 @@ ModelType = TypeVar("ModelType", bound=nn.Module) -_load_weights_only: bool = ( - os.getenv("TORCH_LOAD_WEIGHTS_ONLY", "true").lower() in ("true", "1") +_load_weights_only: bool = os.getenv("TORCH_LOAD_WEIGHTS_ONLY", "true").lower() in ( + "true", + "1", ) @@ -218,6 +219,7 @@ def _add_numpy_to_torch_safe_globals(): try: from numpy.core.multiarray import scalar from numpy.dtypes import Float64DType + torch.serialization.add_safe_globals([np.dtype, Float64DType, scalar]) except Exception: pass diff --git a/examples/utils.py b/examples/utils.py index 3877b335b4..656f15f300 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -18,6 +18,7 @@ from typing import Any import matplotlib + matplotlib.use("Agg") # Non-interactive backend, for CI/CD on Windows import cv2 diff --git a/tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py b/tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py index 8fa3242850..5d52050026 100644 --- a/tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py +++ b/tests/pose_estimation_pytorch/runners/test_filtered_detector_inference_runner.py @@ -3,11 +3,16 @@ Test script for superanimal_humanbody with torchvision detector """ -from deeplabcut.pose_estimation_pytorch.apis.utils import TORCHVISION_DETECTORS, \ - get_filtered_coco_detector_inference_runner -from deeplabcut.pose_estimation_pytorch.models.detectors.filtered_detector import FilteredDetector +from deeplabcut.pose_estimation_pytorch.apis.utils import ( + TORCHVISION_DETECTORS, + get_filtered_coco_detector_inference_runner, +) +from deeplabcut.pose_estimation_pytorch.models.detectors.filtered_detector import ( + FilteredDetector, +) from deeplabcut.pose_estimation_pytorch.modelzoo import load_super_animal_config + def test_torchvision_detector(): """Test that the torchvision detector works with superanimal_humanbody""" for detector_name in TORCHVISION_DETECTORS: @@ -42,13 +47,20 @@ def test_torchvision_detector(): ) print("Filtered detector runner created successfully!") - print("\n✅ All tests passed! The torchvision detector integration is working correctly.") + print( + "\n✅ All tests passed! The torchvision detector integration is working correctly." + ) return True + if __name__ == "__main__": print("Testing superanimal_humanbody with torchvision detector...") success = test_torchvision_detector() if success: - print("\n✅ Test passed! The torchvision detector works with superanimal_humanbody") + print( + "\n✅ Test passed! The torchvision detector works with superanimal_humanbody" + ) else: - print("\n❌ Test failed! There's an issue with the torchvision detector integration") \ No newline at end of file + print( + "\n❌ Test failed! There's an issue with the torchvision detector integration" + ) From 03456da84c05fa5e6575c52ed3e0c38b12ebb2bb Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Fri, 22 Aug 2025 10:16:05 +0200 Subject: [PATCH 32/34] SimCCPredictor: add visilibity computation --- .../models/predictors/sim_cc.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py b/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py index 25d953c178..3f769fc633 100644 --- a/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py +++ b/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py @@ -42,21 +42,33 @@ class SimCCPredictor(BasePredictor): def __init__( self, simcc_split_ratio: float = 2.0, - apply_softmax: bool = False, normalize_outputs: bool = False, + apply_softmax: bool = True, + sigma: float | int | tuple[float, ...] = 6.0, + decode_beta: float = 150.0, ) -> None: super().__init__() self.simcc_split_ratio = simcc_split_ratio - self.apply_softmax = apply_softmax self.normalize_outputs = normalize_outputs + self.apply_softmax = apply_softmax + + if isinstance(sigma, (float, int)): + self.sigma = np.array([sigma, sigma]) + else: + self.sigma = np.array(sigma) + self.decode_beta = decode_beta def forward( self, stride: float, outputs: dict[str, torch.Tensor] ) -> dict[str, torch.Tensor]: x, y = outputs["x"].detach(), outputs["y"].detach() + if self.normalize_outputs: x = get_simcc_normalized(x) y = get_simcc_normalized(y) + else: + x = x * (self.sigma[0] * self.decode_beta) + y = y * (self.sigma[1] * self.decode_beta) keypoints, scores = get_simcc_maximum( x.cpu().numpy(), y.cpu().numpy(), self.apply_softmax From c6bd02b59a9f8da4bdf06972fe0e706583d03c6b Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Fri, 22 Aug 2025 10:17:43 +0200 Subject: [PATCH 33/34] Add new SimCCPredictor params to model configs --- deeplabcut/modelzoo/model_configs/rtmpose_s.yaml | 4 ++++ deeplabcut/modelzoo/model_configs/rtmpose_x.yaml | 4 ++++ .../config/ctd/ctd_prenet_rtmpose_m.yaml | 2 ++ .../config/ctd/ctd_prenet_rtmpose_s.yaml | 2 ++ .../config/ctd/ctd_prenet_rtmpose_x.yaml | 2 ++ .../config/ctd/ctd_prenet_rtmpose_x_human.yaml | 2 ++ .../pose_estimation_pytorch/config/rtmpose/rtmpose_m.yaml | 2 ++ .../pose_estimation_pytorch/config/rtmpose/rtmpose_s.yaml | 2 ++ .../pose_estimation_pytorch/config/rtmpose/rtmpose_x.yaml | 2 ++ 9 files changed, 22 insertions(+) diff --git a/deeplabcut/modelzoo/model_configs/rtmpose_s.yaml b/deeplabcut/modelzoo/model_configs/rtmpose_s.yaml index 2e7d693ba0..9c80b0583c 100644 --- a/deeplabcut/modelzoo/model_configs/rtmpose_s.yaml +++ b/deeplabcut/modelzoo/model_configs/rtmpose_s.yaml @@ -51,6 +51,10 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: + - 5.66 + - 5.66 + decode_beta: 150.0 input_size: - 256 - 256 diff --git a/deeplabcut/modelzoo/model_configs/rtmpose_x.yaml b/deeplabcut/modelzoo/model_configs/rtmpose_x.yaml index 9a7df70196..0d1fb8a547 100644 --- a/deeplabcut/modelzoo/model_configs/rtmpose_x.yaml +++ b/deeplabcut/modelzoo/model_configs/rtmpose_x.yaml @@ -102,6 +102,10 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: + - 6.0 + - 6.93 + decode_beta: 150.0 input_size: - 288 - 384 diff --git a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_m.yaml b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_m.yaml index ca35c8a3ed..88b708cc39 100644 --- a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_m.yaml +++ b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_m.yaml @@ -55,6 +55,8 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: [5.66, 5.66] + decode_beta: 150.0 input_size: [256, 256] in_channels: 768 out_channels: "num_bodyparts" diff --git a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_s.yaml b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_s.yaml index c1870e8160..6ae8b5364a 100644 --- a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_s.yaml +++ b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_s.yaml @@ -55,6 +55,8 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: [5.66, 5.66] + decode_beta: 150.0 input_size: [256, 256] in_channels: 512 out_channels: "num_bodyparts" diff --git a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x.yaml b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x.yaml index 75c7228d21..f809a0c569 100644 --- a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x.yaml +++ b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x.yaml @@ -55,6 +55,8 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: [6.93, 6.93] + decode_beta: 150.0 input_size: [384, 384] in_channels: 1280 out_channels: "num_bodyparts" diff --git a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x_human.yaml b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x_human.yaml index ad6579fe5e..1d47cb3306 100644 --- a/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x_human.yaml +++ b/deeplabcut/pose_estimation_pytorch/config/ctd/ctd_prenet_rtmpose_x_human.yaml @@ -56,6 +56,8 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: [6., 6.93] + decode_beta: 150.0 input_size: [288, 384] in_channels: 1280 out_channels: "num_bodyparts" diff --git a/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_m.yaml b/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_m.yaml index d6bc515f94..d2ed3ae52e 100644 --- a/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_m.yaml +++ b/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_m.yaml @@ -49,6 +49,8 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: [5.66, 5.66] + decode_beta: 150.0 input_size: [256, 256] in_channels: 768 out_channels: "num_bodyparts" diff --git a/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_s.yaml b/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_s.yaml index fbc4ff7ed4..463e104654 100644 --- a/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_s.yaml +++ b/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_s.yaml @@ -49,6 +49,8 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: [5.66, 5.66] + decode_beta: 150.0 input_size: [256, 256] in_channels: 512 out_channels: "num_bodyparts" diff --git a/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_x.yaml b/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_x.yaml index 0a49baec75..f1d6f61e67 100644 --- a/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_x.yaml +++ b/deeplabcut/pose_estimation_pytorch/config/rtmpose/rtmpose_x.yaml @@ -49,6 +49,8 @@ model: predictor: type: SimCCPredictor simcc_split_ratio: 2.0 + sigma: [6.93, 6.93] + decode_beta: 150.0 input_size: [384, 384] in_channels: 1280 out_channels: "num_bodyparts" From bd9e618c206ffc8ac6918ba9ee4267fa41971acf Mon Sep 17 00:00:00 2001 From: maximpavliv Date: Tue, 2 Sep 2025 14:02:16 +0200 Subject: [PATCH 34/34] SimCCPredictor constructor: restore args order --- .../pose_estimation_pytorch/models/predictors/sim_cc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py b/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py index 3f769fc633..b36a9639ab 100644 --- a/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py +++ b/deeplabcut/pose_estimation_pytorch/models/predictors/sim_cc.py @@ -42,15 +42,15 @@ class SimCCPredictor(BasePredictor): def __init__( self, simcc_split_ratio: float = 2.0, - normalize_outputs: bool = False, apply_softmax: bool = True, + normalize_outputs: bool = False, sigma: float | int | tuple[float, ...] = 6.0, decode_beta: float = 150.0, ) -> None: super().__init__() self.simcc_split_ratio = simcc_split_ratio - self.normalize_outputs = normalize_outputs self.apply_softmax = apply_softmax + self.normalize_outputs = normalize_outputs if isinstance(sigma, (float, int)): self.sigma = np.array([sigma, sigma])