From c6927f3b55269fa6bcc9d06c5318ebcf645bf47c Mon Sep 17 00:00:00 2001
From: Steffen Schneider
Date: Tue, 12 Apr 2022 17:17:38 +0200
Subject: [PATCH 1/8] Add benchmark API

Co-authored-by: Jessy <30733203+jeylau@users.noreply.github.com>
---
 deeplabcut/benchmark/__init__.py   | 102 ++++++++++++++
 deeplabcut/benchmark/_crypt.py     |  46 +++++++
 deeplabcut/benchmark/base.py       | 205 +++++++++++++++++++++++++++++
 deeplabcut/benchmark/benchmarks.py | 132 +++++++++++++++++++
 deeplabcut/benchmark/cli.py        |  35 +++++
 deeplabcut/benchmark/metrics.py    | 139 +++++++++++++++++++
 deeplabcut/benchmark/utils.py      |  59 +++++++++
 7 files changed, 718 insertions(+)
 create mode 100644 deeplabcut/benchmark/__init__.py
 create mode 100644 deeplabcut/benchmark/_crypt.py
 create mode 100644 deeplabcut/benchmark/base.py
 create mode 100644 deeplabcut/benchmark/benchmarks.py
 create mode 100644 deeplabcut/benchmark/cli.py
 create mode 100644 deeplabcut/benchmark/metrics.py
 create mode 100644 deeplabcut/benchmark/utils.py

diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py
new file mode 100644
index 0000000000..08748040de
--- /dev/null
+++ b/deeplabcut/benchmark/__init__.py
@@ -0,0 +1,102 @@
+"""The DeepLabCut benchmark suite."""
+
+import json
+import os
+from typing import Container
+
+from benchmark.base import Benchmark, Result, ResultCollection
+
+DATA_ROOT = os.path.join(os.path.dirname(__file__), "data")
+CACHE = os.path.join(os.path.dirname(__file__), ".results")
+
+__registry = []
+
+
+def register(cls):
+    """Add a benchmark to the list of evaluations to run.
+
+    Apply this function as a decorator to a class. Note that the
+    class needs to be a subclass of the ``benchmark.base.Benchmark``
+    base class.
+
+    In most situations, it will be a subclass of one of the pre-defined
+    benchmarks in ``benchmark.benchmarks``.
+
+    Throws:
+        ``ValueError`` if the decorator is applied to a class that is
+        not a subclass of ``benchmark.base.Benchmark``.
+    """
+    if not issubclass(cls, Benchmark):
+        raise ValueError(
+            f"Can only register subclasses of {Benchmark}, "
+            f"but got {cls}."
+        )
+    __registry.append(cls)
+    return cls
+
+
+def evaluate(
+    include_benchmarks: Container[str] = None,
+    results: ResultCollection = None,
+    on_error="return",
+) -> ResultCollection:
+    """Run evaluation for all benchmarks and methods.
+
+    Note that in order for your custom benchmark to be included during
+    evaluation, the following conditions need to be met:
+
+    - The benchmark subclasses one of the benchmark definitions in
+      ``benchmark.benchmarks``
+    - The benchmark is registered by applying the ``@benchmark.register``
+      decorator to the class
+    - The benchmark was imported. This is done automatically for all
+      benchmarks that are defined in submodules or subpackages of the
+      ``benchmark.submissions`` module. For all other locations, make
+      sure to manually import the packages **before** calling the
+      ``evaluate()`` function.
+
+    Args:
+        include_benchmarks:
+            If ``None``, run all benchmarks that were discovered. If a container
+            is passed, only include methods that were defined on benchmarks with
+            the specified names. E.g., ``include_benchmarks = ["trimouse"]`` would
+            only evaluate methods of the trimouse benchmark dataset.
+        on_error:
+            see documentation in ``benchmark.base.Benchmark.evaluate()``
+
+    Returns:
+        A collection of all results, which can be printed or exported to
+        ``pd.DataFrame`` or ``json`` file formats.
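+
+    Example:
+        A minimal sketch of a full evaluation run. The module
+        ``mysubmissions.examplenet`` is a hypothetical placeholder; any
+        module that registers at least one submission via
+        ``@benchmark.register`` works the same way:
+
+        >>> import benchmark
+        >>> import mysubmissions.examplenet  # hypothetical; registers a submission
+        >>> results = benchmark.evaluate(include_benchmarks=["trimouse"])
+        >>> print(results.toframe())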
+ """ + if results is None: + results = ResultCollection() + for benchmark_cls in __registry: + if include_benchmarks is not None: + if benchmark_cls.name not in include_benchmarks: + continue + benchmark = benchmark_cls() + for name in benchmark.names(): + if Result(method_name=name, benchmark_name=benchmark_cls.name) in results: + continue + else: + result = benchmark.evaluate(name, on_error=on_error) + results.add(result) + return results + + +def get_filepath(basename: str): + return os.path.join(DATA_ROOT, basename) + + +def savecache(results: ResultCollection): + with open(CACHE, "w") as fh: + json.dump(results.todicts(), fh, indent=2) + + +def loadcache() -> ResultCollection: + if not os.path.exists(CACHE): + return ResultCollection() + with open(CACHE, "r") as fh: + try: + data = json.load(fh) + except json.decoder.JSONDecodeError: + return ResultCollection() + return ResultCollection.fromdicts(data) diff --git a/deeplabcut/benchmark/_crypt.py b/deeplabcut/benchmark/_crypt.py new file mode 100644 index 0000000000..60d523db89 --- /dev/null +++ b/deeplabcut/benchmark/_crypt.py @@ -0,0 +1,46 @@ +"""Routines for handling (non-public) ground truth labels. +""" + +import glob +import io +import pickle +import random + +import cryptography +from cryptography.fernet import Fernet + + +def encrypt(file: str, key: bytes): + """Encrypt the given file (passed as filename).""" + f = Fernet(key) + with open(file, "rb") as fh: + data = fh.read() + encrypted_data = f.encrypt(data) + with open(file + ".secret", "wb") as fh: + fh.write(encrypted_data) + + +class EncryptedFile: + """Contextmanager for opening encrypted files""" + + def __init__(self, filename: str, key: bytes): + if not isinstance(key, bytes): + raise ValueError( + "Pass a bytes object as the key. If key " + "is supplied as a string, make sure to call " + "encode() before passing the key to this " + "function." + ) + self.filename = filename + self.key = key + + def __enter__(self): + crypt = Fernet(self.key) + with open(self.filename, "rb") as fh: + data = crypt.decrypt(fh.read()) + self._stream = io.BytesIO(data) + self._stream.seek(0) + return self._stream + + def __exit__(self, type, value, traceback): + self._stream.close() diff --git a/deeplabcut/benchmark/base.py b/deeplabcut/benchmark/base.py new file mode 100644 index 0000000000..10d7390d16 --- /dev/null +++ b/deeplabcut/benchmark/base.py @@ -0,0 +1,205 @@ +"""Base classes for benchmark and result definition + +Benchmarks subclass the abstract ``Benchmark`` class and are defined by ``name``, their +``keypoints`` names, as well as groundtruth and metadata necessary to run evaluation. +Right now, the metrics to compute and report for each of the multi-animal benchmarks is the +root mean-squared-error (RMSE) and the mean average precision (mAP). + +Note for contributors: +--------------------- + +If you decide to contribute a benchmark which does not fit into this evaluation framework, +please feel free to extend the base classes (e.g. to support additional metrics). +""" +import abc +import dataclasses +import os +from typing import Iterable +from typing import List +from typing import Tuple + +import pandas as pd + +import benchmark.metrics + + +class BenchmarkEvaluationError(RuntimeError): + pass + + +class Benchmark(abc.ABC): + """Abstract benchmark baseclass. + + All benchmarks should subclass this class. + """ + + @abc.abstractmethod + def names(self): + """A unique key to describe this submission, e.g. the model name. 
+
+        These are also the names that will later appear in the benchmark table.
+        Each name needs to be unique across the whole benchmark. Non-unique names
+        will raise an error during submission of a PR.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_predictions(self, name: str):
+        """Return the named submission's predictions for all images in the benchmark."""
+        raise NotImplementedError()
+
+    def __init__(self):
+        keys = ["name", "keypoints", "ground_truth", "metadata"]
+        for key in keys:
+            if not hasattr(self, key):
+                raise NotImplementedError(
+                    f"Subclasses of the abstract Benchmark class need "
+                    f"to define the {key} property."
+                )
+
+    def compute_pose_rmse(self, results_objects):
+        return benchmark.metrics.calc_rmse_from_obj(
+            results_objects, h5_file=self.ground_truth, metadata_file=self.metadata
+        )
+
+    def compute_pose_map(self, results_objects):
+        return benchmark.metrics.calc_map_from_obj(
+            results_objects, h5_file=self.ground_truth, metadata_file=self.metadata
+        )
+
+    def evaluate(self, name: str, on_error="raise"):
+        """Evaluate the submission with the given name on this benchmark."""
+
+        if name not in self.names():
+            raise ValueError(
+                f"{name} is not registered. Valid names are {self.names()}"
+            )
+        if on_error not in ("ignore", "return", "raise"):
+            raise ValueError(f"on_error got an undefined value: {on_error}")
+        mean_avg_precision = float("nan")
+        root_mean_squared_error = float("nan")
+        try:
+            predictions = self.get_predictions(name)
+            mean_avg_precision = self.compute_pose_map(predictions)
+            root_mean_squared_error = self.compute_pose_rmse(predictions)
+        except Exception as exception:
+            if on_error == "ignore":
+                # ignore the exception and continue with the next evaluation, without
+                # yielding a result value.
+                return
+            elif on_error == "return":
+                # return the result value, with NaN as the result for all metrics that
+                # could not be computed due to the error.
+                pass
+            elif on_error == "raise":
+                # raise the error and stop evaluation
+                raise BenchmarkEvaluationError(
+                    f"Error during benchmark evaluation for model {name}"
+                ) from exception
+            else:
+                raise NotImplementedError() from exception
+        return Result(
+            method_name=name,
+            benchmark_name=self.name,
+            mean_avg_precision=mean_avg_precision,
+            root_mean_squared_error=root_mean_squared_error,
+        )
+
+
+@dataclasses.dataclass
+class Result:
+    """Benchmark result."""
+
+    method_name: str
+    benchmark_name: str
+    root_mean_squared_error: float = float("nan")
+    mean_avg_precision: float = float("nan")
+    benchmark_version: str = benchmark.__version__
+
+    _export_mapping = dict(
+        benchmark_name="benchmark",
+        method_name="method",
+        benchmark_version="version",
+        root_mean_squared_error="RMSE",
+        mean_avg_precision="mAP",
+    )
+
+    _primary_key = ("benchmark_name", "method_name", "benchmark_version")
+
+    @property
+    def primary_key(self) -> Tuple[str]:
+        """The primary key to uniquely identify this result."""
+        return tuple(getattr(self, k) for k in self._primary_key)
+
+    @property
+    def primary_key_names(self) -> Tuple[str]:
+        """Names of the primary keys."""
+        return tuple(self._export_mapping.get(k) for k in self._primary_key)
+
+    def __str__(self):
+        return (
+            f"{self.method_name}, {self.benchmark_name}: "
+            f"{self.mean_avg_precision} mAP, "
+            f"{self.root_mean_squared_error} RMSE"
+        )
+
+    @classmethod
+    def fromdict(cls, data: dict):
+        """Construct result object from dictionary."""
+        kwargs = {attr: data[key] for attr, key in cls._export_mapping.items()}
+        return cls(**kwargs)
+
+    def todict(self) -> dict:
+        """Export result object to dictionary, with less verbose key names."""
+        return {key: getattr(self, attr) for attr, key in self._export_mapping.items()}
+
+
+class ResultCollection:
+    def __init__(self, *results):
+        self.results = {result.primary_key: result for result in results}
+
+    @property
+    def primary_key_names(self):
+        return next(iter(self.results.values())).primary_key_names
+
+    def toframe(self) -> pd.DataFrame:
+        """Convert results to a pandas dataframe."""
+        return pd.DataFrame(
+            [result.todict() for result in self.results.values()]
+        ).set_index(list(self.primary_key_names))
+
+    def add(self, result: Result):
+        """Add a result to the collection."""
+        if result.primary_key in self.results:
+            raise ValueError(
+                f"An entry for {result.primary_key} already "
+                "exists in this collection. Did you try to add the "
+                "same result twice?"
+            )
+        if len(self) > 0:
+            if result.primary_key_names != self.primary_key_names:
+                raise ValueError("Incompatible result format.")
+        self.results[result.primary_key] = result
+
+    @classmethod
+    def fromdicts(cls, data: Iterable[dict]):
+        return cls(*[Result.fromdict(entry) for entry in data])
+
+    def todicts(self):
+        return [result.todict() for result in self.results.values()]
+
+    def __len__(self):
+        return len(self.results)
+
+    def __contains__(self, other: Result):
+        if not isinstance(other, Result):
+            raise ValueError(
+                f"{type(self)} can only store objects of type Result, "
+                f"but got {type(other)}."
+            )
+        return other.primary_key in self.results
+
+    def __eq__(self, other):
+        if not isinstance(other, ResultCollection):
+            return False
+        return other.results == self.results
diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py
new file mode 100644
index 0000000000..b14f4b6675
--- /dev/null
+++ b/deeplabcut/benchmark/benchmarks.py
@@ -0,0 +1,132 @@
+"""The actual benchmark definitions."""
+import benchmark
+import benchmark.base
+
+
+class TriMouseBenchmark(benchmark.base.Benchmark):
+    """Datasets with three mice with a top-view camera.
+
+    Three wild-type (C57BL/6J) male mice ran on a paper spool following odor trails (Mathis et al 2018). These experiments were carried out in the laboratory of Venkatesh N. Murthy at Harvard University. Data were recorded at 30 Hz with 640 x 480 pixels resolution acquired with a Point Grey Firefly FMVU-03MTM-CS. One human annotator was instructed to localize the 12 keypoints (snout, left ear, right ear, shoulder, four spine points, tail base and three tail points). All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 161 frames were labeled, making this a real-world sized laboratory dataset.
+    """
+
+    name = "trimouse"
+    keypoints = (
+        "snout",
+        "leftear",
+        "rightear",
+        "shoulder",
+        "spine1",
+        "spine2",
+        "spine3",
+        "spine4",
+        "tailbase",
+        "tail1",
+        "tail2",
+        "tailend",
+    )
+    ground_truth = benchmark.get_filepath("CollectedData_Daniel.h5")
+    metadata = benchmark.get_filepath("Documentation_data-MultiMouse_70shuffle1.pickle")
+    num_animals = 3
+
+
+class ParentingMouseBenchmark(benchmark.base.Benchmark):
+    """Datasets with three mice, one parenting, two pups.
+
+    Parenting behavior is a pup directed behavior observed in adult mice involving complex motor actions directed towards the benefit of the offspring. These experiments were carried out in the laboratory of Catherine Dulac at Harvard University. The behavioral assay was performed in the homecage of singly housed adult female mice in dark/red light conditions. For these videos, the adult mice was monitored for several minutes in the cage followed by the introduction of pup (4 days old) in one corner of the cage. The behavior of the adult and pup was monitored for a duration of 15 minutes. Video was recorded at 30Hz using a Microsoft LifeCam camera (Part#: 6CH-00001) with a resolution of 1280 x 720 pixels or a Geovision camera (model no.: GV-BX4700-3V) also acquired at 30 frames per second at a resolution of 704 x 480 pixels. A human annotator labeled on the adult animal the same 12 body points as in the tri-mouse dataset, and five body points on the pup along its spine. Initially only the two ends were labeled, and intermediate points were added by interpolation and their positions was manually adjusted if necessary. All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 542 frames were labeled, making this a real-world sized laboratory dataset.
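+
+    Example:
+        A sketch of how a submission against this benchmark could be
+        defined (the class name, submission name, and loader function
+        below are hypothetical placeholders):
+
+        >>> import benchmark
+        >>> @benchmark.register
+        ... class MyParentingSubmission(ParentingMouseBenchmark):
+        ...     def names(self):
+        ...         return ("my-model",)
+        ...     def get_predictions(self, name):
+        ...         return load_my_predictions(name)  # hypothetical loader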
+ """ + + name = "parenting" + keypoints = ( + "end1", + "interm1", + "interm2", + "interm3", + "end2", + "snout", + "leftear", + "rightear", + "shoulder", + "spine1", + "spine2", + "spine3", + "spine4", + "tailbase", + "tail1", + "tail2", + "tailend", + ) + + ground_truth = benchmark.get_filepath("CollectedData_Mostafizur.h5") + metadata = benchmark.get_filepath( + "Documentation_data-CrackingParenting_70shuffle1.pickle" + ) + num_animals = 2 + + def compute_pose_map(self, results_objects): + return benchmark.metrics.calc_map_from_obj( + results_objects, + h5_file=self.ground_truth, + metadata_file=self.metadata, + oks_sigma=0.15, + margin=10, + symmetric_kpts=[(0, 4), (1, 3)], + ) + + +class MarmosetBenchmark(benchmark.base.Benchmark): + """Dataset with two marmosets. + + All animal procedures are overseen by veterinary staff of the MIT and Broad Institute Department of Comparative Medicine, in compliance with the NIH guide for the care and use of laboratory animals and approved by the MIT and Broad Institute animal care and use committees. Video of common marmosets (Callithrix jacchus) was collected in the laboratory of Guoping Feng at MIT. Marmosets were recorded using Kinect V2 cameras (Microsoft) with a resolution of 1080p and frame rate of 30 Hz. After acquisition, images to be used for training the network were manually cropped to 1000 x 1000 pixels or smaller. The dataset is 7,600 labeled frames from 40 different marmosets collected from 3 different colonies (in different facilities). Each cage contains a pair of marmosets, where one marmoset had light blue dye applied to its tufts. One human annotator labeled the 15 marker points on each animal present in the frame (frames contained either 1 or 2 animals). + + """ + + name = "marmosets" + keypoints = ( + "Front", + "Right", + "Middle", + "Left", + "FL1", + "BL1", + "FR1", + "BR1", + "BL2", + "BR2", + "FL2", + "FR2", + "Body1", + "Body2", + "Body3", + ) + ground_truth = benchmark.get_filepath("CollectedData_Mackenzie.h5") + metadata = benchmark.get_filepath("Documentation_data-Marmoset_70shuffle1.pickle") + num_animals = 2 + + +class FishBenchmark(benchmark.base.Benchmark): + """Dataset with multiple fish, filmed from top-view + + Schools of inland silversides (Menidia beryllina, n=14 individuals per school) were recorded in the Lauder Lab at Harvard University while swimming at 15 speeds (0.5 to 8 BL/s, body length, at 0.5 BL/s intervals) in a flow tank with a total working section of 28 x 28 x 40 cm as described in previous work, at a constant temperature (18±1°C) and salinity (33 ppt), at a Reynolds number of approximately 10,000 (based on BL). Dorsal views of steady swimming across these speeds were recorded by high-speed video cameras (FASTCAM Mini AX50, Photron USA, San Diego, CA, USA) at 60-125 frames per second (feeding videos at 60 fps, swimming alone 125 fps). The dorsal view was recorded above the swim tunnel and a floating Plexiglas panel at the water surface prevented surface ripples from interfering with dorsal view videos. Five keypoints were labeled (tip, gill, peduncle, dorsal fin tip, caudal tip). 100 frames were labeled, making this a real-world sized laboratory dataset. 
+ """ + + name = "fish" + keypoints = ("tip", "gill", "peduncle", "caudaltip", "dfintip") + ground_truth = benchmark.get_filepath("CollectedData_Valentina.h5") + metadata = benchmark.get_filepath("Documentation_data-Schooling_70shuffle1.pickle") + num_animals = 14 + + def compute_pose_rmse(self, results_objects): + return benchmark.metrics.calc_rmse_from_obj( + results_objects, + h5_file=self.ground_truth, + metadata_file=self.metadata, + drop_kpts=[4, 5], + ) + + def compute_pose_map(self, results_objects): + return benchmark.metrics.calc_map_from_obj( + results_objects, + h5_file=self.ground_truth, + metadata_file=self.metadata, + drop_kpts=[4, 5], + ) diff --git a/deeplabcut/benchmark/cli.py b/deeplabcut/benchmark/cli.py new file mode 100644 index 0000000000..6bb111cafe --- /dev/null +++ b/deeplabcut/benchmark/cli.py @@ -0,0 +1,35 @@ +"""Command line interface for DeepLabCut benchmark.""" + +import argparse + +import deeplabcut.benchmark + + +def _parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--include", nargs="+", default=None, required=False) + parser.add_argument( + "--onerror", + default="return", + required=False, + choices=("ignore", "return", "raise") + ) + parser.add_argument("--nocache", action="store_true") + return parser.parse_args() + + +def main(): + """Main CLI entry point for generating benchmark results.""" + args = _parse_args() + if not args.nocache: + results = benchmark.loadcache() + else: + results = None + results = benchmark.evaluate( + include_benchmarks=args.include, + results=results, + on_error=args.onerror, + ) + if not args.nocache: + benchmark.savecache(results) + print(results.toframe()) diff --git a/deeplabcut/benchmark/metrics.py b/deeplabcut/benchmark/metrics.py new file mode 100644 index 0000000000..d8317cb759 --- /dev/null +++ b/deeplabcut/benchmark/metrics.py @@ -0,0 +1,139 @@ +import sys +import unittest.mock + +# TODO(stes) mocking a few modules to rely in fewer dependencies, without +# causing import errors when using deeplabcut. +MOCK_MODULES = ["statsmodels", "statsmodels.api", "pytables"] +for mod_name in MOCK_MODULES: + sys.modules[mod_name] = unittest.mock.MagicMock() + +import numpy as np +import pandas as pd +import pickle +from collections import defaultdict + +try: + from deeplabcut.pose_estimation_tensorflow.lib import inferenceutils + from deeplabcut.pose_estimation_tensorflow.core import evaluate_multianimal +except (ImportError, ModuleNotFoundError) as e: + import warnings + + warnings.warn( + "Did not find DeepLabCut. You will be able to use and extend the benchmark, " + "but it will not be possible to run evaluations." 
+    )
+
+import benchmark.utils
+
+
+def conv_obj_to_assemblies(eval_results_obj, keypoint_names):
+    """Convert predictions to deeplabcut assemblies."""
+    assemblies = {}
+    for image_path, results in eval_results_obj.items():
+        lst = []
+        for dict_ in results:
+            ass = inferenceutils.Assembly(len(keypoint_names))
+            for i, kpt in enumerate(keypoint_names):
+                xy = dict_["pose"][kpt]
+                if ~np.isnan(xy).all():
+                    joint = inferenceutils.Joint(pos=(xy), label=i)
+                    ass.add_joint(joint)
+            # TODO(jeylau) add affinity.setter to Assembly
+            ass._affinity = dict_["score"]
+            ass._links = [None]
+            if len(ass):
+                lst.append(ass)
+        assemblies[image_path] = lst
+    return assemblies
+
+
+def calc_map_from_obj(
+    eval_results_obj,
+    h5_file,
+    metadata_file,
+    oks_sigma=0.1,
+    margin=0,
+    symmetric_kpts=None,
+    drop_kpts=None,
+):
+    """Calculate mean average precision (mAP) based on predictions."""
+    df = pd.read_hdf(h5_file)
+    try:
+        df.drop("single", level="individuals", axis=1, inplace=True)
+    except KeyError:
+        pass
+    n_animals = len(df.columns.get_level_values("individuals").unique())
+    kpts = list(df.columns.get_level_values("bodyparts").unique())
+    image_paths = list(eval_results_obj)
+    ground_truth = (
+        df.loc[image_paths].to_numpy().reshape((len(image_paths), n_animals, -1, 2))
+    )
+    temp = np.ones((*ground_truth.shape[:3], 3))
+    temp[..., :2] = ground_truth
+    assemblies_gt = inferenceutils._parse_ground_truth_data(temp)
+    with open(metadata_file, "rb") as f:
+        inds_test = set(pickle.load(f)[2])
+    assemblies_gt_test = {k: v for k, v in assemblies_gt.items() if k in inds_test}
+
+    # TODO(stes): remove/rewrite
+    if drop_kpts is not None:
+        temp = {}
+        for k, v in assemblies_gt_test.items():
+            lst = []
+            for a in v:
+                arr = np.delete(a.data[:, :3], drop_kpts, axis=0)
+                a = inferenceutils.Assembly.from_array(arr)
+                lst.append(a)
+            temp[k] = lst
+        assemblies_gt_test = temp
+        for ind in sorted(drop_kpts, reverse=True):
+            kpts.pop(ind)
+
+    assemblies_pred_ = conv_obj_to_assemblies(eval_results_obj, kpts)
+    assemblies_pred = dict(enumerate(assemblies_pred_.values()))
+
+    with benchmark.utils.DisableOutput():
+        oks = inferenceutils.evaluate_assembly(
+            assemblies_pred,
+            assemblies_gt_test,
+            oks_sigma,
+            margin=margin,
+            symmetric_kpts=symmetric_kpts,
+        )
+    return oks["mAP"]
+
+
+def calc_rmse_from_obj(
+    eval_results_obj,
+    h5_file,
+    metadata_file,
+    drop_kpts=None,
+):
+    """Calc prediction errors for submissions."""
+    gt = evaluate_multianimal._format_gt_data(h5_file)
+    kpts = gt["metadata"]["keypoints"]
+    if drop_kpts:
+        for k, v in gt["annotations"].items():
+            gt["annotations"][k] = np.delete(v, drop_kpts, axis=1)
+        for ind in sorted(drop_kpts, reverse=True):
+            kpts.pop(ind)
+    with open(metadata_file, "rb") as f:
+        inds_test = set(pickle.load(f)[2])
+    test_objects = {
+        k: v for i, (k, v) in enumerate(eval_results_obj.items()) if i in inds_test
+    }
+    assemblies_pred = conv_obj_to_assemblies(test_objects, kpts)
+    preds = defaultdict(dict)
+    preds["metadata"]["keypoints"] = kpts
+    for image, assemblies in assemblies_pred.items():
+        if assemblies:
+            arr = np.stack([a.data for a in assemblies]).swapaxes(0, 1)
+            data = [xy[~np.isnan(xy).any(axis=1)] for xy in arr[..., :2]]
+            temp = {
+                "coordinates": tuple([data]),
+                "confidence": list(np.expand_dims(arr[..., 2], axis=2)),
+            }
+            preds["predictions"][image] = temp
+    with benchmark.utils.DisableOutput():
+        errors = evaluate_multianimal.calc_prediction_errors(preds, gt)
+    return np.nanmean(errors[..., 0])
diff --git a/deeplabcut/benchmark/utils.py
b/deeplabcut/benchmark/utils.py new file mode 100644 index 0000000000..b2717793fc --- /dev/null +++ b/deeplabcut/benchmark/utils.py @@ -0,0 +1,59 @@ +"""Helper functions in this file are not affected by the main repositories +license. They are independent from the remainder of the benchmarking code. +""" +import importlib +import os +import pkgutil +import sys + + +class RedirectStdStreams(object): + """Context manager for redirecting stdout and stderr + Reference: + https://stackoverflow.com/a/6796752 + CC BY-SA 3.0, https://stackoverflow.com/users/46690/rob-cowie + """ + + def __init__(self, stdout=None, stderr=None): + self._stdout = stdout or sys.stdout + self._stderr = stderr or sys.stderr + + def __enter__(self): + self.old_stdout, self.old_stderr = sys.stdout, sys.stderr + self.old_stdout.flush() + self.old_stderr.flush() + sys.stdout, sys.stderr = self._stdout, self._stderr + + def __exit__(self, exc_type, exc_value, traceback): + self._stdout.flush() + self._stderr.flush() + sys.stdout = self.old_stdout + sys.stderr = self.old_stderr + + +class DisableOutput(RedirectStdStreams): + def __init__(self): + devnull = open(os.devnull, "w") + super().__init__(stdout=devnull, stderr=devnull) + + +def import_submodules(package, recursive=True): + """Import all submodules of a module, recursively, including subpackages + + :param package: package (name or actual module) + :type package: str | module + :rtype: dict[str, types.ModuleType] + + Reference: + https://stackoverflow.com/a/25562415 + CC BY-SA 3.0, https://stackoverflow.com/users/712522/mr-b + """ + if isinstance(package, str): + package = importlib.import_module(package) + results = {} + for loader, name, is_pkg in pkgutil.walk_packages(package.__path__): + full_name = package.__name__ + "." + name + results[full_name] = importlib.import_module(full_name) + if recursive and is_pkg: + results.update(import_submodules(full_name)) + return results From 07b95c35fb9ab44c0d4e68213e12cf1a7157e0ee Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Tue, 12 Apr 2022 17:42:42 +0200 Subject: [PATCH 2/8] Move benchmark files into subpackage --- deeplabcut/benchmark/.results | 93 +++++++++++++++++++++ deeplabcut/benchmark/__init__.py | 12 ++- deeplabcut/benchmark/_crypt.py | 11 ++- deeplabcut/benchmark/base.py | 17 +++- deeplabcut/benchmark/benchmarks.py | 34 ++++---- deeplabcut/benchmark/cli.py | 6 +- deeplabcut/benchmark/metrics.py | 127 +++++++++++++++++++++++------ 7 files changed, 249 insertions(+), 51 deletions(-) create mode 100644 deeplabcut/benchmark/.results diff --git a/deeplabcut/benchmark/.results b/deeplabcut/benchmark/.results new file mode 100644 index 0000000000..8d5d62725d --- /dev/null +++ b/deeplabcut/benchmark/.results @@ -0,0 +1,93 @@ +[ + { + "benchmark": "trimouse", + "method": "DLCRNet_ms5 (30K)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "trimouse", + "method": "EfficientNet B7 (30K)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "trimouse", + "method": "ResNet50 (30K)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "parenting", + "method": "DLCRNet_ms5 (30k)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "parenting", + "method": "EfficientNet B7 (30k) ", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "parenting", + "method": "EfficientNet B7_s4 (30k)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "marmosets", + "method": "DLCRNet_ms5 (200k)", + 
"version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "marmosets", + "method": "DLCRNet_ms5_s4 (200k)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "marmosets", + "method": "EfficientNet B7 (200k)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "marmosets", + "method": "EfficientNet B7_s4 (200k) ", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "fish", + "method": "DLCRNet_ms5 (30k)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "fish", + "method": "EfficientNet B7 (30K)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + }, + { + "benchmark": "fish", + "method": "ResNet50 (30K)", + "version": "2.2.1", + "RMSE": NaN, + "mAP": NaN + } +] \ No newline at end of file diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py index 08748040de..561325a0d0 100644 --- a/deeplabcut/benchmark/__init__.py +++ b/deeplabcut/benchmark/__init__.py @@ -1,10 +1,18 @@ -"""The DeepLabCut benchmark suite.""" +""" +DeepLabCut2.0 Toolbox (deeplabcut.org) +© A. & M. Mathis Labs +https://github.com/AlexEMG/DeepLabCut +Please see AUTHORS for contributors. + +https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +Licensed under GNU Lesser General Public License v3.0 +""" import json import os from typing import Container -from benchmark.base import Benchmark, Result, ResultCollection +from deeplabcut.benchmark.base import Benchmark, Result, ResultCollection DATA_ROOT = os.path.join(os.path.dirname(__file__), "data") CACHE = os.path.join(os.path.dirname(__file__), ".results") diff --git a/deeplabcut/benchmark/_crypt.py b/deeplabcut/benchmark/_crypt.py index 60d523db89..736ee42c97 100644 --- a/deeplabcut/benchmark/_crypt.py +++ b/deeplabcut/benchmark/_crypt.py @@ -1,5 +1,12 @@ -"""Routines for handling (non-public) ground truth labels. -""" +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Routines for handling (non-public) ground truth labels.""" import glob import io diff --git a/deeplabcut/benchmark/base.py b/deeplabcut/benchmark/base.py index 10d7390d16..0c23c31e61 100644 --- a/deeplabcut/benchmark/base.py +++ b/deeplabcut/benchmark/base.py @@ -1,3 +1,11 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + """Base classes for benchmark and result definition Benchmarks subclass the abstract ``Benchmark`` class and are defined by ``name``, their @@ -11,6 +19,7 @@ If you decide to contribute a benchmark which does not fit into this evaluation framework, please feel free to extend the base classes (e.g. to support additional metrics). 
""" + import abc import dataclasses import os @@ -20,7 +29,7 @@ import pandas as pd -import benchmark.metrics +import deeplabcut.benchmark.metrics class BenchmarkEvaluationError(RuntimeError): @@ -58,12 +67,12 @@ def __init__(self): ) def compute_pose_rmse(self, results_objects): - return benchmark.metrics.calc_rmse_from_obj( + return deeplabcut.benchmark.metrics.calc_rmse_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata ) def compute_pose_map(self, results_objects): - return benchmark.metrics.calc_map_from_obj( + return deeplabcut.benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata ) @@ -114,7 +123,7 @@ class Result: benchmark_name: str root_mean_squared_error: float = float("nan") mean_avg_precision: float = float("nan") - benchmark_version: str = benchmark.__version__ + benchmark_version: str = deeplabcut.__version__ _export_mapping = dict( benchmark_name="benchmark", diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py index b14f4b6675..b83a31c5a7 100644 --- a/deeplabcut/benchmark/benchmarks.py +++ b/deeplabcut/benchmark/benchmarks.py @@ -1,9 +1,9 @@ """The actual benchmark definitions.""" -import benchmark -import benchmark.base +import deeplabcut.benchmark +import deeplabcut.benchmark.base -class TriMouseBenchmark(benchmark.base.Benchmark): +class TriMouseBenchmark(deeplabcut.benchmark.base.Benchmark): """Datasets with three mice with a top-view camera. Three wild-type (C57BL/6J) male mice ran on a paper spool following odor trails (Mathis et al 2018). These experiments were carried out in the laboratory of Venkatesh N. Murthy at Harvard University. Data were recorded at 30 Hz with 640 x 480 pixels resolution acquired with a Point Grey Firefly FMVU-03MTM-CS. One human annotator was instructed to localize the 12 keypoints (snout, left ear, right ear, shoulder, four spine points, tail base and three tail points). All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 161 frames were labeled, making this a real-world sized laboratory dataset. @@ -24,12 +24,12 @@ class TriMouseBenchmark(benchmark.base.Benchmark): "tail2", "tailend", ) - ground_truth = benchmark.get_filepath("CollectedData_Daniel.h5") - metadata = benchmark.get_filepath("Documentation_data-MultiMouse_70shuffle1.pickle") + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Daniel.h5") + metadata = deeplabcut.benchmark.get_filepath("Documentation_data-MultiMouse_70shuffle1.pickle") num_animals = 3 -class ParentingMouseBenchmark(benchmark.base.Benchmark): +class ParentingMouseBenchmark(deeplabcut.benchmark.base.Benchmark): """Datasets with three mice, one parenting, two pups. Parenting behavior is a pup directed behavior observed in adult mice involving complex motor actions directed towards the benefit of the offspring. These experiments were carried out in the laboratory of Catherine Dulac at Harvard University. The behavioral assay was performed in the homecage of singly housed adult female mice in dark/red light conditions. For these videos, the adult mice was monitored for several minutes in the cage followed by the introduction of pup (4 days old) in one corner of the cage. The behavior of the adult and pup was monitored for a duration of 15 minutes. 
Video was recorded at 30Hz using a Microsoft LifeCam camera (Part#: 6CH-00001) with a resolution of 1280 x 720 pixels or a Geovision camera (model no.: GV-BX4700-3V) also acquired at 30 frames per second at a resolution of 704 x 480 pixels. A human annotator labeled on the adult animal the same 12 body points as in the tri-mouse dataset, and five body points on the pup along its spine. Initially only the two ends were labeled, and intermediate points were added by interpolation and their positions was manually adjusted if necessary. All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 542 frames were labeled, making this a real-world sized laboratory dataset. @@ -56,14 +56,14 @@ class ParentingMouseBenchmark(benchmark.base.Benchmark): "tailend", ) - ground_truth = benchmark.get_filepath("CollectedData_Mostafizur.h5") - metadata = benchmark.get_filepath( + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mostafizur.h5") + metadata = deeplabcut.benchmark.get_filepath( "Documentation_data-CrackingParenting_70shuffle1.pickle" ) num_animals = 2 def compute_pose_map(self, results_objects): - return benchmark.metrics.calc_map_from_obj( + return deeplabcut.benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, @@ -73,7 +73,7 @@ def compute_pose_map(self, results_objects): ) -class MarmosetBenchmark(benchmark.base.Benchmark): +class MarmosetBenchmark(deeplabcut.benchmark.base.Benchmark): """Dataset with two marmosets. All animal procedures are overseen by veterinary staff of the MIT and Broad Institute Department of Comparative Medicine, in compliance with the NIH guide for the care and use of laboratory animals and approved by the MIT and Broad Institute animal care and use committees. Video of common marmosets (Callithrix jacchus) was collected in the laboratory of Guoping Feng at MIT. Marmosets were recorded using Kinect V2 cameras (Microsoft) with a resolution of 1080p and frame rate of 30 Hz. After acquisition, images to be used for training the network were manually cropped to 1000 x 1000 pixels or smaller. The dataset is 7,600 labeled frames from 40 different marmosets collected from 3 different colonies (in different facilities). Each cage contains a pair of marmosets, where one marmoset had light blue dye applied to its tufts. One human annotator labeled the 15 marker points on each animal present in the frame (frames contained either 1 or 2 animals). 
@@ -98,12 +98,12 @@ class MarmosetBenchmark(benchmark.base.Benchmark): "Body2", "Body3", ) - ground_truth = benchmark.get_filepath("CollectedData_Mackenzie.h5") - metadata = benchmark.get_filepath("Documentation_data-Marmoset_70shuffle1.pickle") + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mackenzie.h5") + metadata = deeplabcut.benchmark.get_filepath("Documentation_data-Marmoset_70shuffle1.pickle") num_animals = 2 -class FishBenchmark(benchmark.base.Benchmark): +class FishBenchmark(deeplabcut.benchmark.base.Benchmark): """Dataset with multiple fish, filmed from top-view Schools of inland silversides (Menidia beryllina, n=14 individuals per school) were recorded in the Lauder Lab at Harvard University while swimming at 15 speeds (0.5 to 8 BL/s, body length, at 0.5 BL/s intervals) in a flow tank with a total working section of 28 x 28 x 40 cm as described in previous work, at a constant temperature (18±1°C) and salinity (33 ppt), at a Reynolds number of approximately 10,000 (based on BL). Dorsal views of steady swimming across these speeds were recorded by high-speed video cameras (FASTCAM Mini AX50, Photron USA, San Diego, CA, USA) at 60-125 frames per second (feeding videos at 60 fps, swimming alone 125 fps). The dorsal view was recorded above the swim tunnel and a floating Plexiglas panel at the water surface prevented surface ripples from interfering with dorsal view videos. Five keypoints were labeled (tip, gill, peduncle, dorsal fin tip, caudal tip). 100 frames were labeled, making this a real-world sized laboratory dataset. @@ -111,12 +111,12 @@ class FishBenchmark(benchmark.base.Benchmark): name = "fish" keypoints = ("tip", "gill", "peduncle", "caudaltip", "dfintip") - ground_truth = benchmark.get_filepath("CollectedData_Valentina.h5") - metadata = benchmark.get_filepath("Documentation_data-Schooling_70shuffle1.pickle") + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Valentina.h5") + metadata = deeplabcut.benchmark.get_filepath("Documentation_data-Schooling_70shuffle1.pickle") num_animals = 14 def compute_pose_rmse(self, results_objects): - return benchmark.metrics.calc_rmse_from_obj( + return deeplabcut.benchmark.metrics.calc_rmse_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, @@ -124,7 +124,7 @@ def compute_pose_rmse(self, results_objects): ) def compute_pose_map(self, results_objects): - return benchmark.metrics.calc_map_from_obj( + return deeplabcut.benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, diff --git a/deeplabcut/benchmark/cli.py b/deeplabcut/benchmark/cli.py index 6bb111cafe..8bd227ed76 100644 --- a/deeplabcut/benchmark/cli.py +++ b/deeplabcut/benchmark/cli.py @@ -22,14 +22,14 @@ def main(): """Main CLI entry point for generating benchmark results.""" args = _parse_args() if not args.nocache: - results = benchmark.loadcache() + results = deeplabcut.benchmark.loadcache() else: results = None - results = benchmark.evaluate( + results = deeplabcut.benchmark.evaluate( include_benchmarks=args.include, results=results, on_error=args.onerror, ) if not args.nocache: - benchmark.savecache(results) + deeplabcut.benchmark.savecache(results) print(results.toframe()) diff --git a/deeplabcut/benchmark/metrics.py b/deeplabcut/benchmark/metrics.py index d8317cb759..68e26f6af6 100644 --- a/deeplabcut/benchmark/metrics.py +++ b/deeplabcut/benchmark/metrics.py @@ -7,23 +7,109 @@ for mod_name in MOCK_MODULES: sys.modules[mod_name] = 
unittest.mock.MagicMock() -import numpy as np -import pandas as pd +import os import pickle from collections import defaultdict -try: - from deeplabcut.pose_estimation_tensorflow.lib import inferenceutils - from deeplabcut.pose_estimation_tensorflow.core import evaluate_multianimal -except (ImportError, ModuleNotFoundError) as e: - import warnings +import numpy as np +import pandas as pd + +import deeplabcut.benchmark.utils +from deeplabcut.pose_estimation_tensorflow.core import evaluate_multianimal +from deeplabcut.pose_estimation_tensorflow.lib import inferenceutils +from deeplabcut.utils import auxiliaryfunctions +from deeplabcut.utils.conversioncode import guarantee_multiindex_rows - warnings.warn( - "Did not find DeepLabCut. You will be able to use and extend the benchmark, " - "but it will not be possible to run evaluations." + +def _format_gt_data(h5file): + df = pd.read_hdf(h5file) + + def _get_unique_level_values(header, level): + return header.get_level_values(level).unique().to_list() + + animals = _get_unique_level_values(df.columns, "individuals") + kpts = _get_unique_level_values(df.columns, "bodyparts") + try: + n_unique = len( + _get_unique_level_values( + df.xs("single", level="individuals", axis=1).columns, "bodyparts" + ) + ) + except KeyError: + n_unique = 0 + guarantee_multiindex_rows(df) + file_paths = [os.path.join(*row) for row in df.index.to_list()] + temp = ( + df.stack("individuals", dropna=False) + .reindex(animals, level="individuals") + .reindex(kpts, level="bodyparts", axis=1) ) + data = temp.to_numpy().reshape((len(file_paths), len(animals), -1, 2)) + meta = {"animals": animals, "keypoints": kpts, "n_unique": n_unique} + return { + "annotations": dict(zip(file_paths, data)), + "metadata": meta, + } + -import benchmark.utils +def calc_prediction_errors(preds, gt): + kpts_gt = gt["metadata"]["keypoints"] + kpts_pred = preds["metadata"]["keypoints"] + map_ = {kpts_gt.index(kpt): i for i, kpt in enumerate(kpts_pred)} + annot = gt["annotations"] + + # Map image paths from predicted data to GT as the first are typically + # absolute whereas the latter are relative to the project path. + def _map(strings, substrings): + lookup = dict() + strings_ = strings.copy() + substrings_ = substrings.copy() + while strings_: + string = strings_.pop() + for s in substrings_: + if string.endswith(s): + lookup[string] = s + substrings_.remove(s) + break + return lookup + + map_images = _map(list(preds["predictions"]), list(annot)) + + errors = np.full( + ( + len(preds["predictions"]), + len(gt["metadata"]["animals"]), + len(kpts_gt), + 2, # Hold distance to GT and confidence + ), + np.nan, + ) + for n, (path, preds_) in enumerate(preds["predictions"].items()): + if not preds_: + continue + xy_gt = annot[map_images[path]].swapaxes(0, 1) + xy_pred = preds_["coordinates"][0] + conf_pred = preds_["confidence"] + for i, xy_gt_ in enumerate(xy_gt): + visible = np.flatnonzero(np.all(~np.isnan(xy_gt_), axis=1)) + xy_pred_ = xy_pred[map_[i]] + if visible.size and xy_pred_.size: + # Pick the predictions closest to ground truth, + # rather than the ones the model has most confident in. 
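+                # (Assumed matching semantics, for illustration: each visible
+                # GT point is paired with one of its k=3 nearest predicted
+                # points, and unmatched points come back as -1, which is what
+                # the ``found`` mask below filters out.)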
+ neighbors = evaluate_multianimal._find_closest_neighbors( + xy_gt_[visible], xy_pred_, k=3 + ) + found = neighbors != -1 + if ~np.any(found): + continue + min_dists = np.linalg.norm( + xy_gt_[visible][found] - xy_pred_[neighbors[found]], + axis=1, + ) + conf_pred_ = conf_pred[map_[i]] + errors[n, visible[found], i, 0] = min_dists + errors[n, visible[found], i, 1] = conf_pred_[neighbors[found], 0] + return errors def conv_obj_to_assemblies(eval_results_obj, keypoint_names): @@ -31,18 +117,13 @@ def conv_obj_to_assemblies(eval_results_obj, keypoint_names): assemblies = {} for image_path, results in eval_results_obj.items(): lst = [] - for dict_ in results: + for pose in results: ass = inferenceutils.Assembly(len(keypoint_names)) for i, kpt in enumerate(keypoint_names): - xy = dict_["pose"][kpt] - if ~np.isnan(xy).all(): - joint = inferenceutils.Joint(pos=(xy), label=i) - ass.add_joint(joint) - # TODO(jeylau) add affinity.setter to Assembly - ass._affinity = dict_["score"] - ass._links = [None] - if len(ass): - lst.append(ass) + xy = pose[kpt] + joint = inferenceutils.Joint(pos=(xy), label=i) + ass.add_joint(joint) + lst.append(ass) assemblies[image_path] = lst return assemblies @@ -110,7 +191,7 @@ def calc_rmse_from_obj( drop_kpts=None, ): """Calc prediction errors for submissions.""" - gt = evaluate_multianimal._format_gt_data(h5_file) + gt = _format_gt_data(h5_file) kpts = gt["metadata"]["keypoints"] if drop_kpts: for k, v in gt["annotations"].items(): @@ -135,5 +216,5 @@ def calc_rmse_from_obj( } preds["predictions"][image] = temp with benchmark.utils.DisableOutput(): - errors = evaluate_multianimal.calc_prediction_errors(preds, gt) + errors = calc_prediction_errors(preds, gt) return np.nanmean(errors[..., 0]) From af63c4f6736888b8be20599395c925982e46f593 Mon Sep 17 00:00:00 2001 From: Jessy <30733203+jeylau@users.noreply.github.com> Date: Wed, 18 May 2022 11:03:22 +0200 Subject: [PATCH 3/8] Fix ImportErrors --- deeplabcut/benchmark/__init__.py | 2 +- deeplabcut/benchmark/base.py | 11 +++++----- deeplabcut/benchmark/benchmarks.py | 34 +++++++++++++++--------------- deeplabcut/benchmark/cli.py | 19 ++++++++++------- deeplabcut/benchmark/metrics.py | 3 +-- 5 files changed, 35 insertions(+), 34 deletions(-) diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py index 561325a0d0..b8c88ce044 100644 --- a/deeplabcut/benchmark/__init__.py +++ b/deeplabcut/benchmark/__init__.py @@ -12,7 +12,7 @@ import os from typing import Container -from deeplabcut.benchmark.base import Benchmark, Result, ResultCollection +from benchmark.base import Benchmark, Result, ResultCollection DATA_ROOT = os.path.join(os.path.dirname(__file__), "data") CACHE = os.path.join(os.path.dirname(__file__), ".results") diff --git a/deeplabcut/benchmark/base.py b/deeplabcut/benchmark/base.py index 0c23c31e61..f5b70b3e14 100644 --- a/deeplabcut/benchmark/base.py +++ b/deeplabcut/benchmark/base.py @@ -22,14 +22,13 @@ import abc import dataclasses -import os from typing import Iterable -from typing import List from typing import Tuple import pandas as pd -import deeplabcut.benchmark.metrics +import benchmark.metrics +from deeplabcut import __version__ class BenchmarkEvaluationError(RuntimeError): @@ -67,12 +66,12 @@ def __init__(self): ) def compute_pose_rmse(self, results_objects): - return deeplabcut.benchmark.metrics.calc_rmse_from_obj( + return benchmark.metrics.calc_rmse_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata ) def compute_pose_map(self, results_objects): - 
return deeplabcut.benchmark.metrics.calc_map_from_obj( + return benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata ) @@ -123,7 +122,7 @@ class Result: benchmark_name: str root_mean_squared_error: float = float("nan") mean_avg_precision: float = float("nan") - benchmark_version: str = deeplabcut.__version__ + benchmark_version: str = __version__ _export_mapping = dict( benchmark_name="benchmark", diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py index b83a31c5a7..b14f4b6675 100644 --- a/deeplabcut/benchmark/benchmarks.py +++ b/deeplabcut/benchmark/benchmarks.py @@ -1,9 +1,9 @@ """The actual benchmark definitions.""" -import deeplabcut.benchmark -import deeplabcut.benchmark.base +import benchmark +import benchmark.base -class TriMouseBenchmark(deeplabcut.benchmark.base.Benchmark): +class TriMouseBenchmark(benchmark.base.Benchmark): """Datasets with three mice with a top-view camera. Three wild-type (C57BL/6J) male mice ran on a paper spool following odor trails (Mathis et al 2018). These experiments were carried out in the laboratory of Venkatesh N. Murthy at Harvard University. Data were recorded at 30 Hz with 640 x 480 pixels resolution acquired with a Point Grey Firefly FMVU-03MTM-CS. One human annotator was instructed to localize the 12 keypoints (snout, left ear, right ear, shoulder, four spine points, tail base and three tail points). All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 161 frames were labeled, making this a real-world sized laboratory dataset. @@ -24,12 +24,12 @@ class TriMouseBenchmark(deeplabcut.benchmark.base.Benchmark): "tail2", "tailend", ) - ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Daniel.h5") - metadata = deeplabcut.benchmark.get_filepath("Documentation_data-MultiMouse_70shuffle1.pickle") + ground_truth = benchmark.get_filepath("CollectedData_Daniel.h5") + metadata = benchmark.get_filepath("Documentation_data-MultiMouse_70shuffle1.pickle") num_animals = 3 -class ParentingMouseBenchmark(deeplabcut.benchmark.base.Benchmark): +class ParentingMouseBenchmark(benchmark.base.Benchmark): """Datasets with three mice, one parenting, two pups. Parenting behavior is a pup directed behavior observed in adult mice involving complex motor actions directed towards the benefit of the offspring. These experiments were carried out in the laboratory of Catherine Dulac at Harvard University. The behavioral assay was performed in the homecage of singly housed adult female mice in dark/red light conditions. For these videos, the adult mice was monitored for several minutes in the cage followed by the introduction of pup (4 days old) in one corner of the cage. The behavior of the adult and pup was monitored for a duration of 15 minutes. Video was recorded at 30Hz using a Microsoft LifeCam camera (Part#: 6CH-00001) with a resolution of 1280 x 720 pixels or a Geovision camera (model no.: GV-BX4700-3V) also acquired at 30 frames per second at a resolution of 704 x 480 pixels. A human annotator labeled on the adult animal the same 12 body points as in the tri-mouse dataset, and five body points on the pup along its spine. Initially only the two ends were labeled, and intermediate points were added by interpolation and their positions was manually adjusted if necessary. 
All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 542 frames were labeled, making this a real-world sized laboratory dataset. @@ -56,14 +56,14 @@ class ParentingMouseBenchmark(deeplabcut.benchmark.base.Benchmark): "tailend", ) - ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mostafizur.h5") - metadata = deeplabcut.benchmark.get_filepath( + ground_truth = benchmark.get_filepath("CollectedData_Mostafizur.h5") + metadata = benchmark.get_filepath( "Documentation_data-CrackingParenting_70shuffle1.pickle" ) num_animals = 2 def compute_pose_map(self, results_objects): - return deeplabcut.benchmark.metrics.calc_map_from_obj( + return benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, @@ -73,7 +73,7 @@ def compute_pose_map(self, results_objects): ) -class MarmosetBenchmark(deeplabcut.benchmark.base.Benchmark): +class MarmosetBenchmark(benchmark.base.Benchmark): """Dataset with two marmosets. All animal procedures are overseen by veterinary staff of the MIT and Broad Institute Department of Comparative Medicine, in compliance with the NIH guide for the care and use of laboratory animals and approved by the MIT and Broad Institute animal care and use committees. Video of common marmosets (Callithrix jacchus) was collected in the laboratory of Guoping Feng at MIT. Marmosets were recorded using Kinect V2 cameras (Microsoft) with a resolution of 1080p and frame rate of 30 Hz. After acquisition, images to be used for training the network were manually cropped to 1000 x 1000 pixels or smaller. The dataset is 7,600 labeled frames from 40 different marmosets collected from 3 different colonies (in different facilities). Each cage contains a pair of marmosets, where one marmoset had light blue dye applied to its tufts. One human annotator labeled the 15 marker points on each animal present in the frame (frames contained either 1 or 2 animals). @@ -98,12 +98,12 @@ class MarmosetBenchmark(deeplabcut.benchmark.base.Benchmark): "Body2", "Body3", ) - ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mackenzie.h5") - metadata = deeplabcut.benchmark.get_filepath("Documentation_data-Marmoset_70shuffle1.pickle") + ground_truth = benchmark.get_filepath("CollectedData_Mackenzie.h5") + metadata = benchmark.get_filepath("Documentation_data-Marmoset_70shuffle1.pickle") num_animals = 2 -class FishBenchmark(deeplabcut.benchmark.base.Benchmark): +class FishBenchmark(benchmark.base.Benchmark): """Dataset with multiple fish, filmed from top-view Schools of inland silversides (Menidia beryllina, n=14 individuals per school) were recorded in the Lauder Lab at Harvard University while swimming at 15 speeds (0.5 to 8 BL/s, body length, at 0.5 BL/s intervals) in a flow tank with a total working section of 28 x 28 x 40 cm as described in previous work, at a constant temperature (18±1°C) and salinity (33 ppt), at a Reynolds number of approximately 10,000 (based on BL). Dorsal views of steady swimming across these speeds were recorded by high-speed video cameras (FASTCAM Mini AX50, Photron USA, San Diego, CA, USA) at 60-125 frames per second (feeding videos at 60 fps, swimming alone 125 fps). The dorsal view was recorded above the swim tunnel and a floating Plexiglas panel at the water surface prevented surface ripples from interfering with dorsal view videos. 
Five keypoints were labeled (tip, gill, peduncle, dorsal fin tip, caudal tip). 100 frames were labeled, making this a real-world sized laboratory dataset. @@ -111,12 +111,12 @@ class FishBenchmark(deeplabcut.benchmark.base.Benchmark): name = "fish" keypoints = ("tip", "gill", "peduncle", "caudaltip", "dfintip") - ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Valentina.h5") - metadata = deeplabcut.benchmark.get_filepath("Documentation_data-Schooling_70shuffle1.pickle") + ground_truth = benchmark.get_filepath("CollectedData_Valentina.h5") + metadata = benchmark.get_filepath("Documentation_data-Schooling_70shuffle1.pickle") num_animals = 14 def compute_pose_rmse(self, results_objects): - return deeplabcut.benchmark.metrics.calc_rmse_from_obj( + return benchmark.metrics.calc_rmse_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, @@ -124,7 +124,7 @@ def compute_pose_rmse(self, results_objects): ) def compute_pose_map(self, results_objects): - return deeplabcut.benchmark.metrics.calc_map_from_obj( + return benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, diff --git a/deeplabcut/benchmark/cli.py b/deeplabcut/benchmark/cli.py index 8bd227ed76..2124bc0235 100644 --- a/deeplabcut/benchmark/cli.py +++ b/deeplabcut/benchmark/cli.py @@ -2,16 +2,16 @@ import argparse -import deeplabcut.benchmark +import benchmark def _parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--include", nargs="+", default=None, required=False) parser.add_argument( - "--onerror", - default="return", - required=False, + "--onerror", + default="return", + required=False, choices=("ignore", "return", "raise") ) parser.add_argument("--nocache", action="store_true") @@ -22,14 +22,17 @@ def main(): """Main CLI entry point for generating benchmark results.""" args = _parse_args() if not args.nocache: - results = deeplabcut.benchmark.loadcache() + results = benchmark.loadcache() else: results = None - results = deeplabcut.benchmark.evaluate( + results = benchmark.evaluate( include_benchmarks=args.include, results=results, on_error=args.onerror, ) if not args.nocache: - deeplabcut.benchmark.savecache(results) - print(results.toframe()) + benchmark.savecache(results) + try: + print(results.toframe()) + except StopIteration: + pass diff --git a/deeplabcut/benchmark/metrics.py b/deeplabcut/benchmark/metrics.py index 68e26f6af6..c9add74ef6 100644 --- a/deeplabcut/benchmark/metrics.py +++ b/deeplabcut/benchmark/metrics.py @@ -14,10 +14,9 @@ import numpy as np import pandas as pd -import deeplabcut.benchmark.utils +import benchmark.utils from deeplabcut.pose_estimation_tensorflow.core import evaluate_multianimal from deeplabcut.pose_estimation_tensorflow.lib import inferenceutils -from deeplabcut.utils import auxiliaryfunctions from deeplabcut.utils.conversioncode import guarantee_multiindex_rows From 60013cf3a6edf2ab93943abde49626e9ab0269ac Mon Sep 17 00:00:00 2001 From: Jessy <30733203+jeylau@users.noreply.github.com> Date: Wed, 18 May 2022 11:08:25 +0200 Subject: [PATCH 4/8] Add __main__ file --- deeplabcut/benchmark/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 deeplabcut/benchmark/__main__.py diff --git a/deeplabcut/benchmark/__main__.py b/deeplabcut/benchmark/__main__.py new file mode 100644 index 0000000000..17029164fd --- /dev/null +++ b/deeplabcut/benchmark/__main__.py @@ -0,0 +1,4 @@ +if __name__ == '__main__': + from .cli import main + + main() \ No newline at end of file From 
6529e72e78d88c8dfa56753b261a240f354e85e6 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Wed, 8 Jun 2022 08:12:58 +0200 Subject: [PATCH 5/8] Fix import errors --- deeplabcut/benchmark/.results | 93 ------------------------------ deeplabcut/benchmark/__init__.py | 6 +- deeplabcut/benchmark/__main__.py | 6 +- deeplabcut/benchmark/_crypt.py | 2 +- deeplabcut/benchmark/base.py | 6 +- deeplabcut/benchmark/benchmarks.py | 38 +++++++----- deeplabcut/benchmark/cli.py | 14 ++--- deeplabcut/benchmark/metrics.py | 21 ++++--- 8 files changed, 52 insertions(+), 134 deletions(-) delete mode 100644 deeplabcut/benchmark/.results diff --git a/deeplabcut/benchmark/.results b/deeplabcut/benchmark/.results deleted file mode 100644 index 8d5d62725d..0000000000 --- a/deeplabcut/benchmark/.results +++ /dev/null @@ -1,93 +0,0 @@ -[ - { - "benchmark": "trimouse", - "method": "DLCRNet_ms5 (30K)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "trimouse", - "method": "EfficientNet B7 (30K)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "trimouse", - "method": "ResNet50 (30K)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "parenting", - "method": "DLCRNet_ms5 (30k)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "parenting", - "method": "EfficientNet B7 (30k) ", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "parenting", - "method": "EfficientNet B7_s4 (30k)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "marmosets", - "method": "DLCRNet_ms5 (200k)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "marmosets", - "method": "DLCRNet_ms5_s4 (200k)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "marmosets", - "method": "EfficientNet B7 (200k)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "marmosets", - "method": "EfficientNet B7_s4 (200k) ", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "fish", - "method": "DLCRNet_ms5 (30k)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "fish", - "method": "EfficientNet B7 (30K)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - }, - { - "benchmark": "fish", - "method": "ResNet50 (30K)", - "version": "2.2.1", - "RMSE": NaN, - "mAP": NaN - } -] \ No newline at end of file diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py index b8c88ce044..1b617c48a2 100644 --- a/deeplabcut/benchmark/__init__.py +++ b/deeplabcut/benchmark/__init__.py @@ -12,10 +12,10 @@ import os from typing import Container -from benchmark.base import Benchmark, Result, ResultCollection +from deeplabcut.benchmark.base import Benchmark, Result, ResultCollection -DATA_ROOT = os.path.join(os.path.dirname(__file__), "data") -CACHE = os.path.join(os.path.dirname(__file__), ".results") +DATA_ROOT = os.path.join(os.getcwd(), "data") +CACHE = os.path.join(os.getcwd(), ".results") __registry = [] diff --git a/deeplabcut/benchmark/__main__.py b/deeplabcut/benchmark/__main__.py index 17029164fd..adcf479949 100644 --- a/deeplabcut/benchmark/__main__.py +++ b/deeplabcut/benchmark/__main__.py @@ -1,4 +1,4 @@ -if __name__ == '__main__': - from .cli import main +from deeplabcut.benchmark.cli import main - main() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/deeplabcut/benchmark/_crypt.py b/deeplabcut/benchmark/_crypt.py index 736ee42c97..f4e811c5ad 100644 --- 
a/deeplabcut/benchmark/_crypt.py +++ b/deeplabcut/benchmark/_crypt.py @@ -2,7 +2,7 @@ # © A. & M. Mathis Labs # https://github.com/AlexEMG/DeepLabCut # Please see AUTHORS for contributors. -# +# # https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS # Licensed under GNU Lesser General Public License v3.0 diff --git a/deeplabcut/benchmark/base.py b/deeplabcut/benchmark/base.py index f5b70b3e14..14c307b8d5 100644 --- a/deeplabcut/benchmark/base.py +++ b/deeplabcut/benchmark/base.py @@ -27,7 +27,7 @@ import pandas as pd -import benchmark.metrics +import deeplabcut.benchmark.metrics from deeplabcut import __version__ @@ -66,12 +66,12 @@ def __init__(self): ) def compute_pose_rmse(self, results_objects): - return benchmark.metrics.calc_rmse_from_obj( + return deeplabcut.benchmark.metrics.calc_rmse_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata ) def compute_pose_map(self, results_objects): - return benchmark.metrics.calc_map_from_obj( + return deeplabcut.benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata ) diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py index b14f4b6675..f31389688f 100644 --- a/deeplabcut/benchmark/benchmarks.py +++ b/deeplabcut/benchmark/benchmarks.py @@ -1,9 +1,9 @@ """The actual benchmark definitions.""" import benchmark -import benchmark.base +import deeplabcut.benchmark.base -class TriMouseBenchmark(benchmark.base.Benchmark): +class TriMouseBenchmark(deeplabcut.benchmark.base.Benchmark): """Datasets with three mice filmed with a top-view camera. Three wild-type (C57BL/6J) male mice ran on a paper spool following odor trails (Mathis et al 2018). These experiments were carried out in the laboratory of Venkatesh N. Murthy at Harvard University. Data were recorded at 30 Hz with 640 x 480 pixel resolution acquired with a Point Grey Firefly FMVU-03MTM-CS. One human annotator was instructed to localize the 12 keypoints (snout, left ear, right ear, shoulder, four spine points, tail base and three tail points). All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 161 frames were labeled, making this a real-world sized laboratory dataset. @@ -24,12 +24,14 @@ class TriMouseBenchmark(benchmark.base.Benchmark): "tail2", "tailend", ) - ground_truth = benchmark.get_filepath("CollectedData_Daniel.h5") - metadata = benchmark.get_filepath("Documentation_data-MultiMouse_70shuffle1.pickle") + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Daniel.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-MultiMouse_70shuffle1.pickle" + ) num_animals = 3 -class ParentingMouseBenchmark(benchmark.base.Benchmark): +class ParentingMouseBenchmark(deeplabcut.benchmark.base.Benchmark): """Datasets with three mice: one parenting adult, two pups. Parenting behavior is a pup-directed behavior observed in adult mice involving complex motor actions directed towards the benefit of the offspring. These experiments were carried out in the laboratory of Catherine Dulac at Harvard University. The behavioral assay was performed in the homecage of singly housed adult female mice in dark/red light conditions. For these videos, the adult mouse was monitored for several minutes in the cage followed by the introduction of a pup (4 days old) in one corner of the cage.
The behavior of the adult and pup was monitored for a duration of 15 minutes. Video was recorded at 30 Hz using a Microsoft LifeCam camera (Part#: 6CH-00001) with a resolution of 1280 x 720 pixels or a Geovision camera (model no.: GV-BX4700-3V), also acquired at 30 frames per second at a resolution of 704 x 480 pixels. On the adult animal, a human annotator labeled the same 12 body points as in the tri-mouse dataset, and five body points on the pup along its spine. Initially only the two ends were labeled, and intermediate points were added by interpolation and their positions were manually adjusted if necessary. All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 542 frames were labeled, making this a real-world sized laboratory dataset. @@ -56,14 +58,14 @@ class ParentingMouseBenchmark(benchmark.base.Benchmark): "tailend", ) - ground_truth = benchmark.get_filepath("CollectedData_Mostafizur.h5") - metadata = benchmark.get_filepath( + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mostafizur.h5") + metadata = deeplabcut.benchmark.get_filepath( "Documentation_data-CrackingParenting_70shuffle1.pickle" ) num_animals = 2 def compute_pose_map(self, results_objects): - return benchmark.metrics.calc_map_from_obj( + return deeplabcut.benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, @@ -73,7 +75,7 @@ def compute_pose_map(self, results_objects): ) -class MarmosetBenchmark(benchmark.base.Benchmark): +class MarmosetBenchmark(deeplabcut.benchmark.base.Benchmark): """Dataset with two marmosets. All animal procedures are overseen by veterinary staff of the MIT and Broad Institute Department of Comparative Medicine, in compliance with the NIH guide for the care and use of laboratory animals and approved by the MIT and Broad Institute animal care and use committees. Video of common marmosets (Callithrix jacchus) was collected in the laboratory of Guoping Feng at MIT. Marmosets were recorded using Kinect V2 cameras (Microsoft) with a resolution of 1080p and a frame rate of 30 Hz. After acquisition, images to be used for training the network were manually cropped to 1000 x 1000 pixels or smaller. The dataset comprises 7,600 labeled frames from 40 different marmosets collected from 3 different colonies (in different facilities). Each cage contained a pair of marmosets, one of which had light blue dye applied to its tufts. One human annotator labeled the 15 marker points on each animal present in the frame (frames contained either 1 or 2 animals).
@@ -98,12 +100,14 @@ class MarmosetBenchmark(benchmark.base.Benchmark): "Body2", "Body3", ) - ground_truth = benchmark.get_filepath("CollectedData_Mackenzie.h5") - metadata = benchmark.get_filepath("Documentation_data-Marmoset_70shuffle1.pickle") + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mackenzie.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-Marmoset_70shuffle1.pickle" + ) num_animals = 2 -class FishBenchmark(benchmark.base.Benchmark): +class FishBenchmark(deeplabcut.benchmark.base.Benchmark): """Dataset with multiple fish, filmed from a top view. Schools of inland silversides (Menidia beryllina, n=14 individuals per school) were recorded in the Lauder Lab at Harvard University while swimming at 15 speeds (0.5 to 8 BL/s, where BL is body length, at 0.5 BL/s intervals) in a flow tank with a total working section of 28 x 28 x 40 cm as described in previous work, at a constant temperature (18±1°C) and salinity (33 ppt), at a Reynolds number of approximately 10,000 (based on BL). Dorsal views of steady swimming across these speeds were recorded by high-speed video cameras (FASTCAM Mini AX50, Photron USA, San Diego, CA, USA) at 60-125 frames per second (feeding videos at 60 fps, swimming alone at 125 fps). The dorsal view was recorded above the swim tunnel, and a floating Plexiglas panel at the water surface prevented surface ripples from interfering with dorsal view videos. Five keypoints were labeled (tip, gill, peduncle, dorsal fin tip, caudal tip). 100 frames were labeled, making this a real-world sized laboratory dataset. @@ -111,12 +115,14 @@ class FishBenchmark(benchmark.base.Benchmark): name = "fish" keypoints = ("tip", "gill", "peduncle", "caudaltip", "dfintip") - ground_truth = benchmark.get_filepath("CollectedData_Valentina.h5") - metadata = benchmark.get_filepath("Documentation_data-Schooling_70shuffle1.pickle") + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Valentina.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-Schooling_70shuffle1.pickle" + ) num_animals = 14 def compute_pose_rmse(self, results_objects): - return benchmark.metrics.calc_rmse_from_obj( + return deeplabcut.benchmark.metrics.calc_rmse_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, @@ -124,7 +130,7 @@ def compute_pose_rmse(self, results_objects): ) def compute_pose_map(self, results_objects): - return benchmark.metrics.calc_map_from_obj( + return deeplabcut.benchmark.metrics.calc_map_from_obj( results_objects, h5_file=self.ground_truth, metadata_file=self.metadata, diff --git a/deeplabcut/benchmark/cli.py b/deeplabcut/benchmark/cli.py index 2124bc0235..55bff6435f 100644 --- a/deeplabcut/benchmark/cli.py +++ b/deeplabcut/benchmark/cli.py @@ -1,8 +1,8 @@ -"""Command line interface for DeepLabCut benchmark.""" +"""Command line interface for the DeepLabCut benchmark.""" import argparse -import benchmark +import deeplabcut.benchmark def _parse_args(): @@ -12,26 +12,26 @@ "--onerror", default="return", required=False, - choices=("ignore", "return", "raise") + choices=("ignore", "return", "raise"), ) parser.add_argument("--nocache", action="store_true") return parser.parse_args() def main(): - """Main CLI entry point for generating benchmark results.""" + """Main CLI entry point for generating DeepLabCut benchmark results.""" args = _parse_args() if not args.nocache: - results = benchmark.loadcache() + results = deeplabcut.benchmark.loadcache() else: results = None - results = 
benchmark.evaluate( + results = deeplabcut.benchmark.evaluate( include_benchmarks=args.include, results=results, on_error=args.onerror, ) if not args.nocache: - benchmark.savecache(results) + deeplabcut.benchmark.savecache(results) try: print(results.toframe()) except StopIteration: diff --git a/deeplabcut/benchmark/metrics.py b/deeplabcut/benchmark/metrics.py index c9add74ef6..050fe7724a 100644 --- a/deeplabcut/benchmark/metrics.py +++ b/deeplabcut/benchmark/metrics.py @@ -14,7 +14,7 @@ import numpy as np import pandas as pd -import benchmark.utils +import deeplabcut.benchmark.utils from deeplabcut.pose_estimation_tensorflow.core import evaluate_multianimal from deeplabcut.pose_estimation_tensorflow.lib import inferenceutils from deeplabcut.utils.conversioncode import guarantee_multiindex_rows @@ -116,13 +116,18 @@ def conv_obj_to_assemblies(eval_results_obj, keypoint_names): assemblies = {} for image_path, results in eval_results_obj.items(): lst = [] - for pose in results: + for dict_ in results: ass = inferenceutils.Assembly(len(keypoint_names)) for i, kpt in enumerate(keypoint_names): - xy = pose[kpt] - joint = inferenceutils.Joint(pos=(xy), label=i) - ass.add_joint(joint) - lst.append(ass) + xy = dict_["pose"][kpt] + if ~np.isnan(xy).all(): + joint = inferenceutils.Joint(pos=(xy), label=i) + ass.add_joint(joint) + # TODO(jeylau) add affinity.setter to Assembly + ass._affinity = dict_["score"] + ass._links = [None] + if len(ass): + lst.append(ass) assemblies[image_path] = lst return assemblies @@ -172,7 +177,7 @@ def calc_map_from_obj( assemblies_pred_ = conv_obj_to_assemblies(eval_results_obj, kpts) assemblies_pred = dict(enumerate(assemblies_pred_.values())) - with benchmark.utils.DisableOutput(): + with deeplabcut.benchmark.utils.DisableOutput(): oks = inferenceutils.evaluate_assembly( assemblies_pred, assemblies_gt_test, @@ -214,6 +219,6 @@ def calc_rmse_from_obj( "confidence": list(np.expand_dims(arr[..., 2], axis=2)), } preds["predictions"][image] = temp - with benchmark.utils.DisableOutput(): + with deeplabcut.benchmark.utils.DisableOutput(): errors = calc_prediction_errors(preds, gt) return np.nanmean(errors[..., 0]) From fb344e843b180b956dfdf5af3cf0c99c1144cb3b Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sat, 29 Oct 2022 13:32:45 +0200 Subject: [PATCH 6/8] Add docs and update license headers --- deeplabcut/benchmark/__init__.py | 17 +++++----- deeplabcut/benchmark/__main__.py | 8 +++++ deeplabcut/benchmark/_crypt.py | 53 ------------------------------ deeplabcut/benchmark/base.py | 8 ++--- deeplabcut/benchmark/benchmarks.py | 18 +++++++++- deeplabcut/benchmark/cli.py | 8 +++++ deeplabcut/benchmark/metrics.py | 10 ++++++ deeplabcut/benchmark/utils.py | 8 +++++ 8 files changed, 62 insertions(+), 68 deletions(-) delete mode 100644 deeplabcut/benchmark/_crypt.py diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py index 1b617c48a2..53e657ad8b 100644 --- a/deeplabcut/benchmark/__init__.py +++ b/deeplabcut/benchmark/__init__.py @@ -1,12 +1,11 @@ -""" -DeepLabCut2.0 Toolbox (deeplabcut.org) -© A. & M. Mathis Labs -https://github.com/AlexEMG/DeepLabCut -Please see AUTHORS for contributors. - -https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS -Licensed under GNU Lesser General Public License v3.0 -""" +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. 
+# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + import json import os diff --git a/deeplabcut/benchmark/__main__.py b/deeplabcut/benchmark/__main__.py index adcf479949..627c685299 100644 --- a/deeplabcut/benchmark/__main__.py +++ b/deeplabcut/benchmark/__main__.py @@ -1,3 +1,11 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + from deeplabcut.benchmark.cli import main if __name__ == "__main__": diff --git a/deeplabcut/benchmark/_crypt.py b/deeplabcut/benchmark/_crypt.py deleted file mode 100644 index f4e811c5ad..0000000000 --- a/deeplabcut/benchmark/_crypt.py +++ /dev/null @@ -1,53 +0,0 @@ -# DeepLabCut2.0 Toolbox (deeplabcut.org) -# © A. & M. Mathis Labs -# https://github.com/AlexEMG/DeepLabCut -# Please see AUTHORS for contributors. -# -# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS -# Licensed under GNU Lesser General Public License v3.0 - -"""Routines for handling (non-public) ground truth labels.""" - -import glob -import io -import pickle -import random - -import cryptography -from cryptography.fernet import Fernet - - -def encrypt(file: str, key: bytes): - """Encrypt the given file (passed as filename).""" - f = Fernet(key) - with open(file, "rb") as fh: - data = fh.read() - encrypted_data = f.encrypt(data) - with open(file + ".secret", "wb") as fh: - fh.write(encrypted_data) - - -class EncryptedFile: - """Contextmanager for opening encrypted files""" - - def __init__(self, filename: str, key: bytes): - if not isinstance(key, bytes): - raise ValueError( - "Pass a bytes object as the key. If key " - "is supplied as a string, make sure to call " - "encode() before passing the key to this " - "function." - ) - self.filename = filename - self.key = key - - def __enter__(self): - crypt = Fernet(self.key) - with open(self.filename, "rb") as fh: - data = crypt.decrypt(fh.read()) - self._stream = io.BytesIO(data) - self._stream.seek(0) - return self._stream - - def __exit__(self, type, value, traceback): - self._stream.close() diff --git a/deeplabcut/benchmark/base.py b/deeplabcut/benchmark/base.py index 14c307b8d5..74c20122fb 100644 --- a/deeplabcut/benchmark/base.py +++ b/deeplabcut/benchmark/base.py @@ -13,11 +13,9 @@ Right now, the metrics to compute and report for each of the multi-animal benchmarks is the root mean-squared-error (RMSE) and the mean average precision (mAP). -Note for contributors: ---------------------- - -If you decide to contribute a benchmark which does not fit into this evaluation framework, -please feel free to extend the base classes (e.g. to support additional metrics). +Note for contributors: If you decide to contribute a benchmark which does not fit +into this evaluation framework, please feel free to extend the base classes +(e.g. to support additional metrics). """ import abc diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py index f31389688f..f25c7c79b9 100644 --- a/deeplabcut/benchmark/benchmarks.py +++ b/deeplabcut/benchmark/benchmarks.py @@ -1,4 +1,20 @@ -"""The actual benchmark definitions.""" +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. 
+# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Definitions for the official DeepLabCut benchmark tasks. + +See benchmark.deeplabcut.org for a current leaderboard with models and metrics +for each of these benchmarks. Submissions can be made by opening a PR in the +benchmark repository: + +https://github.com/DeepLabCut/benchmark +""" + import benchmark import deeplabcut.benchmark.base diff --git a/deeplabcut/benchmark/cli.py b/deeplabcut/benchmark/cli.py index 55bff6435f..250f2ec9e5 100644 --- a/deeplabcut/benchmark/cli.py +++ b/deeplabcut/benchmark/cli.py @@ -1,3 +1,11 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + """Command line interface for the DeepLabCut benchmark.""" import argparse diff --git a/deeplabcut/benchmark/metrics.py b/deeplabcut/benchmark/metrics.py index 050fe7724a..3f0a67c78b 100644 --- a/deeplabcut/benchmark/metrics.py +++ b/deeplabcut/benchmark/metrics.py @@ -1,3 +1,13 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Evaluation metrics for the DeepLabCut benchmark.""" + import sys import unittest.mock diff --git a/deeplabcut/benchmark/utils.py b/deeplabcut/benchmark/utils.py index b2717793fc..40aa2495d0 100644 --- a/deeplabcut/benchmark/utils.py +++ b/deeplabcut/benchmark/utils.py @@ -1,3 +1,11 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + """Helper functions in this file are not affected by the main repository's license. They are independent of the remainder of the benchmarking code. 
""" From 30cae725a45cfb47e5c7c4c45deed2c0d7963844 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sat, 29 Oct 2022 15:09:45 +0200 Subject: [PATCH 7/8] Allow custom path for cache loading --- deeplabcut/benchmark/__init__.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py index 53e657ad8b..5540a8468c 100644 --- a/deeplabcut/benchmark/__init__.py +++ b/deeplabcut/benchmark/__init__.py @@ -10,6 +10,7 @@ import json import os from typing import Container +from typing import Literal from deeplabcut.benchmark.base import Benchmark, Result, ResultCollection @@ -98,12 +99,18 @@ def savecache(results: ResultCollection): json.dump(results.todicts(), fh, indent=2) -def loadcache() -> ResultCollection: - if not os.path.exists(CACHE): +def loadcache( + cache=CACHE, on_missing: Literal["raise", "ignore"] = "ignore" +) -> ResultCollection: + if not os.path.exists(cache): + if on_missing == "raise": + raise FileNotFoundError(cache) return ResultCollection() - with open(CACHE, "r") as fh: + with open(cache, "r") as fh: try: data = json.load(fh) - except json.decoder.JSONDecodeError: + except json.decoder.JSONDecodeError as e: + if on_missing == "raise": + raise e return ResultCollection() return ResultCollection.fromdicts(data) From d514136aec78e1fe2c899209bbb53f44a44989a3 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sat, 29 Oct 2022 16:21:45 +0200 Subject: [PATCH 8/8] Add very basic docs for deeplabcut.benchmark --- _toc.yml | 3 +++ deeplabcut/benchmark/benchmarks.py | 1 - docs/benchmark.md | 35 ++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 docs/benchmark.md diff --git a/_toc.yml b/_toc.yml index 9071294684..e53f806c8c 100644 --- a/_toc.yml +++ b/_toc.yml @@ -27,6 +27,9 @@ parts: chapters: - file: docs/recipes/UsingModelZooPupil - file: docs/recipes/MegaDetectorDLCLive +- caption: DeepLabCut Benchmark + chapters: + - file: docs/benchmark - caption: Hardware chapters: - file: docs/recipes/TechHardware diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py index f25c7c79b9..09dc39db6b 100644 --- a/deeplabcut/benchmark/benchmarks.py +++ b/deeplabcut/benchmark/benchmarks.py @@ -15,7 +15,6 @@ https://github.com/DeepLabCut/benchmark """ -import benchmark import deeplabcut.benchmark.base diff --git a/docs/benchmark.md b/docs/benchmark.md new file mode 100644 index 0000000000..114568decd --- /dev/null +++ b/docs/benchmark.md @@ -0,0 +1,35 @@ +# DeepLabCut benchmark + +For further information and the leaderboard, see [the official homepage](https://benchmark.deeplabcut.org/). + +## High Level API + +When implementing your own benchmarks, the most important functions are directly accessible +under the ``deeplabcut.benchmark`` package. + +```{eval-rst} +.. automodule:: deeplabcut.benchmark + :members: + :show-inheritance: +``` + +## Available benchmark definitions + +See [the official benchmark page](https://benchmark.deeplabcut.org/datasets.html) for a full overview +of the available datasets. A benchmark submission should contain a result for at least one of these +benchmarks. For an example of how to implement a benchmark submission, refer to the baselines in the +[DeepLabCut benchmark repo](https://github.com/DeepLabCut/benchmark/tree/main/benchmark/baselines). + +```{eval-rst} +.. automodule:: deeplabcut.benchmark.benchmarks + :members: + :show-inheritance: +``` + +## Metric calculation + +```{eval-rst} +.. 
automodule:: deeplabcut.benchmark.metrics + :members: + :show-inheritance: +``` \ No newline at end of file
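
To make the high-level API introduced by this patch series concrete, here is a minimal usage sketch. It assumes the benchmark data files resolved by ``get_filepath`` are available locally; the class ``MySubmission`` and the method label ``"my-model (30k)"`` are hypothetical placeholders, and the sketch omits the loading of model predictions that a complete submission must also provide (see the baselines in the benchmark repository), so running it as-is would record errors rather than real metrics.

```python
import deeplabcut.benchmark
from deeplabcut.benchmark.benchmarks import TriMouseBenchmark


@deeplabcut.benchmark.register
class MySubmission(TriMouseBenchmark):
    """Hypothetical submission to the trimouse benchmark."""

    def names(self):
        # One unique key per evaluated method, e.g. the model name.
        yield "my-model (30k)"


# Reuse cached results when available (an empty collection is returned
# if no cache file exists), evaluate only the trimouse benchmark, and
# persist the updated cache.
results = deeplabcut.benchmark.loadcache()
results = deeplabcut.benchmark.evaluate(
    include_benchmarks=["trimouse"],
    results=results,
    on_error="return",
)
deeplabcut.benchmark.savecache(results)
print(results.toframe())
```

The same workflow is exposed on the command line through ``python -m deeplabcut.benchmark``, where ``--include trimouse``, ``--onerror return``, and ``--nocache`` map onto the arguments used above.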