diff --git a/README.md b/README.md index c43062a393..97ad16a85a 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ DeepLabCut is a toolbox for markerless pose estimation of animals performing var :purple_heart: **DeepLabCut supports multi-animal pose estimation!** maDLC is out of beta/rc mode and beta is deprecated, thanks to the testers out there for feedback! Your labeled data will be backwards compatible, but not all other steps. Please see the [new `2.2+` releases](https://github.com/DeepLabCut/DeepLabCut/releases) for what's new & how to install it, please see our new [paper, Lauer et al 2022](https://www.nature.com/articles/s41592-022-01443-0), and the [new docs]( https://deeplabcut.github.io/DeepLabCut) on how to use it! -:purple_heart: We support mulit-animal re-identification, see [Lauer et al 2022](https://www.nature.com/articles/s41592-022-01443-0). +:purple_heart: We support multi-animal re-identification, see [Lauer et al 2022](https://www.nature.com/articles/s41592-022-01443-0). :purple_heart: We have a **real-time** package available! http://DLClive.deeplabcut.org diff --git a/_toc.yml b/_toc.yml index 9071294684..e53f806c8c 100644 --- a/_toc.yml +++ b/_toc.yml @@ -27,6 +27,9 @@ parts: chapters: - file: docs/recipes/UsingModelZooPupil - file: docs/recipes/MegaDetectorDLCLive +- caption: DeepLabCut Benchmark + chapters: + - file: docs/benchmark - caption: Hardware chapters: - file: docs/recipes/TechHardware diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py new file mode 100644 index 0000000000..5540a8468c --- /dev/null +++ b/deeplabcut/benchmark/__init__.py @@ -0,0 +1,116 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. 
+# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + + +import json +import os +from typing import Container +from typing import Literal + +from deeplabcut.benchmark.base import Benchmark, Result, ResultCollection + +DATA_ROOT = os.path.join(os.getcwd(), "data") +CACHE = os.path.join(os.getcwd(), ".results") + +__registry = [] + + +def register(cls): + """Add a benchmark to the list of evaluations to run. + + Apply this function as a decorator to a class. Note that the + class needs to be a subclass of the ``benchmark.base.Benchmark`` + base class. + + In most situations, it will be a subclass of one of the pre-defined + benchmarks in ``benchmark.benchmarks``. + + Throws: + ``ValueError`` if the decorator is applied to a class that is + not a subclass of ``benchmark.base.Benchmark``. + """ + if not issubclass(cls, Benchmark): + raise ValueError( + f"Can only register subclasses of {type(Benchmark)}, " f"but got {cls}." + ) + __registry.append(cls) + + +def evaluate( + include_benchmarks: Container[str] = None, + results: ResultCollection = None, + on_error="return", +) -> ResultCollection: + """Run evaluation for all benchmarks and methods. + + Note that in order for your custom benchmark to be included during + evaluation, the following conditions need to be met: + + - The benchmark subclassed one of the benchmark definitions in + in ``benchmark.benchmarks`` + - The benchmark is registered by applying the ``@benchmark.register`` + decorator to the class + - The benchmark was imported. This is done automatically for all + benchmarks that are defined in submodules or subpackages of the + ``benchmark.submissions`` module. For all other locations, make + sure to manually import the packages **before** calling the + ``evaluate()`` function. + + Args: + include_benchmarks: + If ``None``, run all benchmarks that were discovered. 
If a container + is passed, only include methods that were defined on benchmarks with + the specified names. E.g., ``include_benchmarks = ["trimouse"]`` would + only evaluate methods of the trimouse benchmark dataset. + on_error: + see documentation in ``benchmark.base.Benchmark.evaluate()`` + + Returns: + A collection of all results, which can be printed or exported to + ``pd.DataFrame`` or ``json`` file formats. + """ + if results is None: + results = ResultCollection() + for benchmark_cls in __registry: + if include_benchmarks is not None: + if benchmark_cls.name not in include_benchmarks: + continue + benchmark = benchmark_cls() + for name in benchmark.names(): + if Result(method_name=name, benchmark_name=benchmark_cls.name) in results: + continue + else: + result = benchmark.evaluate(name, on_error=on_error) + results.add(result) + return results + + +def get_filepath(basename: str): + return os.path.join(DATA_ROOT, basename) + + +def savecache(results: ResultCollection): + with open(CACHE, "w") as fh: + json.dump(results.todicts(), fh, indent=2) + + +def loadcache( + cache=CACHE, on_missing: Literal["raise", "ignore"] = "ignore" +) -> ResultCollection: + if not os.path.exists(cache): + if on_missing == "raise": + raise FileNotFoundError(cache) + return ResultCollection() + with open(cache, "r") as fh: + try: + data = json.load(fh) + except json.decoder.JSONDecodeError as e: + if on_missing == "raise": + raise e + return ResultCollection() + return ResultCollection.fromdicts(data) diff --git a/deeplabcut/benchmark/__main__.py b/deeplabcut/benchmark/__main__.py new file mode 100644 index 0000000000..627c685299 --- /dev/null +++ b/deeplabcut/benchmark/__main__.py @@ -0,0 +1,12 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. 
+# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +from deeplabcut.benchmark.cli import main + +if __name__ == "__main__": + main() diff --git a/deeplabcut/benchmark/base.py b/deeplabcut/benchmark/base.py new file mode 100644 index 0000000000..74c20122fb --- /dev/null +++ b/deeplabcut/benchmark/base.py @@ -0,0 +1,211 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Base classes for benchmark and result definition + +Benchmarks subclass the abstract ``Benchmark`` class and are defined by ``name``, their +``keypoints`` names, as well as groundtruth and metadata necessary to run evaluation. +Right now, the metrics to compute and report for each of the multi-animal benchmarks is the +root mean-squared-error (RMSE) and the mean average precision (mAP). + +Note for contributors: If you decide to contribute a benchmark which does not fit +into this evaluation framework, please feel free to extend the base classes +(e.g. to support additional metrics). +""" + +import abc +import dataclasses +from typing import Iterable +from typing import Tuple + +import pandas as pd + +import deeplabcut.benchmark.metrics +from deeplabcut import __version__ + + +class BenchmarkEvaluationError(RuntimeError): + pass + + +class Benchmark(abc.ABC): + """Abstract benchmark baseclass. + + All benchmarks should subclass this class. + """ + + @abc.abstractmethod + def names(self): + """A unique key to describe this submission, e.g. the model name. + + This is also the name that will later appear in the benchmark table. + The name needs to be unique across the whole benchmark. Non-unique names + will raise an error during submission of a PR. 
+ """ + raise NotImplementedError() + + @abc.abstractmethod + def get_predictions(self): + """Return predictions for all images in the benchmark.""" + raise NotImplementedError() + + def __init__(self): + keys = ["name", "keypoints", "ground_truth", "metadata"] + for key in keys: + if not hasattr(self, key): + raise NotImplementedError( + f"Subclass of abstract Benchmark class need " + f"to define the {key} property." + ) + + def compute_pose_rmse(self, results_objects): + return deeplabcut.benchmark.metrics.calc_rmse_from_obj( + results_objects, h5_file=self.ground_truth, metadata_file=self.metadata + ) + + def compute_pose_map(self, results_objects): + return deeplabcut.benchmark.metrics.calc_map_from_obj( + results_objects, h5_file=self.ground_truth, metadata_file=self.metadata + ) + + def evaluate(self, name: str, on_error="raise"): + """Evaluate this benchmark with all registered methods.""" + + if name not in self.names(): + raise ValueError( + f"{name} is not registered. Valid names are {self.names()}" + ) + if on_error not in ("ignore", "return", "raise"): + raise ValueError(f"on_error got an undefined value: {on_error}") + mean_avg_precision = float("nan") + root_mean_squared_error = float("nan") + try: + predictions = self.get_predictions(name) + mean_avg_precision = self.compute_pose_map(predictions) + root_mean_squared_error = self.compute_pose_rmse(predictions) + except Exception as exception: + if on_error == "ignore": + # ignore the exception and continue with the next evaluation, without + # yielding a result value. + return + elif on_error == "return": + # return the result value, with NaN as the result for all metrics that + # could not be computed due to the error. 
+ pass + elif on_error == "raise": + # raise the error and stop evaluation + raise BenchmarkEvaluationError( + f"Error during benchmark evaluation for model {name}" + ) from exception + else: + raise NotImplementedError() from exception + return Result( + method_name=name, + benchmark_name=self.name, + mean_avg_precision=mean_avg_precision, + root_mean_squared_error=root_mean_squared_error, + ) + + +@dataclasses.dataclass +class Result: + """Benchmark result.""" + + method_name: str + benchmark_name: str + root_mean_squared_error: float = float("nan") + mean_avg_precision: float = float("nan") + benchmark_version: str = __version__ + + _export_mapping = dict( + benchmark_name="benchmark", + method_name="method", + benchmark_version="version", + root_mean_squared_error="RMSE", + mean_avg_precision="mAP", + ) + + _primary_key = ("benchmark_name", "method_name", "benchmark_version") + + @property + def primary_key(self) -> Tuple[str]: + """The primary key to uniquely identify this result.""" + return tuple(getattr(self, k) for k in self._primary_key) + + @property + def primary_key_names(self) -> Tuple[str]: + """Names of the primary keys""" + return tuple(self._export_mapping.get(k) for k in self._primary_key) + + def __str__(self): + return ( + f"{self.method_name}, {self.benchmark_name}: " + f"{self.mean_avg_precision} mAP, " + f"{self.root_mean_squared_error} RMSE" + ) + + @classmethod + def fromdict(cls, data: dict): + """Construct result object from dictionary.""" + kwargs = {attr: data[key] for attr, key in cls._export_mapping.items()} + return cls(**kwargs) + + def todict(self) -> dict: + """Export result object to dictionary, with less verbose key names.""" + return {key: getattr(self, attr) for attr, key in self._export_mapping.items()} + + +class ResultCollection: + def __init__(self, *results): + self.results = {result.primary_key: result for result in results} + + @property + def primary_key_names(self): + return 
next(iter(self.results.values())).primary_key_names + + def toframe(self) -> pd.DataFrame: + """Convert results to pandas dataframe""" + return pd.DataFrame( + [result.todict() for result in self.results.values()] + ).set_index(list(self.primary_key_names)) + + def add(self, result: Result): + """Add a result to the collection.""" + if result.primary_key in self.results: + raise ValueError( + "An entry for {result.primary_key} does already " + "exist in this collection. Did you try to add the " + "same result twice?" + ) + if len(self) > 0: + if result.primary_key_names != self.primary_key_names: + raise ValueError("Incompatible result format.") + self.results[result.primary_key] = result + + @classmethod + def fromdicts(cls, data: Iterable[dict]): + return cls(*[Result.fromdict(entry) for entry in data]) + + def todicts(self): + return [result.todict() for result in self.results.values()] + + def __len__(self): + return len(self.results) + + def __contains__(self, other: Result): + if not isinstance(other, Result): + raise ValueError( + f"{type(self)} can only store objects of type Result, " + f"but got {type(other)}." + ) + return other.primary_key in self.results + + def __eq__(self, other): + if not isinstance(other, ResultCollection): + return False + return other.results == self.results diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py new file mode 100644 index 0000000000..a51eac3958 --- /dev/null +++ b/deeplabcut/benchmark/benchmarks.py @@ -0,0 +1,160 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Definition for official DeepLabCut benchmark tasks. + +See benchmark.deeplabcut.org for a current leaderboard with models and metrics +for each of these benchmarks. 
Submissions can be done by opening a PR in the +benchmark repository: + +https://github.com/DeepLabCut/benchmark +""" + +import deeplabcut.benchmark.base + + +class TriMouseBenchmark(deeplabcut.benchmark.base.Benchmark): + """Datasets with three mice with a top-view camera. + + Three wild-type (C57BL/6J) male mice ran on a paper spool following odor trails (Mathis et al 2018). These experiments were carried out in the laboratory of Venkatesh N. Murthy at Harvard University. Data were recorded at 30 Hz with 640 x 480 pixels resolution acquired with a Point Grey Firefly FMVU-03MTM-CS. One human annotator was instructed to localize the 12 keypoints (snout, left ear, right ear, shoulder, four spine points, tail base and three tail points). All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 161 frames were labeled, making this a real-world sized laboratory dataset. + + Introduced in Lauer et al. "Multi-animal pose estimation, identification and tracking with DeepLabCut." Nature Methods 19, no. 4 (2022): 496-504. + """ + + name = "trimouse" + keypoints = ( + "snout", + "leftear", + "rightear", + "shoulder", + "spine1", + "spine2", + "spine3", + "spine4", + "tailbase", + "tail1", + "tail2", + "tailend", + ) + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Daniel.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-MultiMouse_70shuffle1.pickle" + ) + num_animals = 3 + + +class ParentingMouseBenchmark(deeplabcut.benchmark.base.Benchmark): + """Datasets with three mice, one parenting, two pups. + + Parenting behavior is a pup directed behavior observed in adult mice involving complex motor actions directed towards the benefit of the offspring. These experiments were carried out in the laboratory of Catherine Dulac at Harvard University. 
The behavioral assay was performed in the homecage of singly housed adult female mice in dark/red light conditions. For these videos, the adult mice was monitored for several minutes in the cage followed by the introduction of pup (4 days old) in one corner of the cage. The behavior of the adult and pup was monitored for a duration of 15 minutes. Video was recorded at 30Hz using a Microsoft LifeCam camera (Part#: 6CH-00001) with a resolution of 1280 x 720 pixels or a Geovision camera (model no.: GV-BX4700-3V) also acquired at 30 frames per second at a resolution of 704 x 480 pixels. A human annotator labeled on the adult animal the same 12 body points as in the tri-mouse dataset, and five body points on the pup along its spine. Initially only the two ends were labeled, and intermediate points were added by interpolation and their positions was manually adjusted if necessary. All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 542 frames were labeled, making this a real-world sized laboratory dataset. + + Introduced in Lauer et al. "Multi-animal pose estimation, identification and tracking with DeepLabCut." Nature Methods 19, no. 4 (2022): 496-504. 
+ """ + + name = "parenting" + keypoints = ( + "end1", + "interm1", + "interm2", + "interm3", + "end2", + "snout", + "leftear", + "rightear", + "shoulder", + "spine1", + "spine2", + "spine3", + "spine4", + "tailbase", + "tail1", + "tail2", + "tailend", + ) + + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mostafizur.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-CrackingParenting_70shuffle1.pickle" + ) + num_animals = 2 + + def compute_pose_map(self, results_objects): + return deeplabcut.benchmark.metrics.calc_map_from_obj( + results_objects, + h5_file=self.ground_truth, + metadata_file=self.metadata, + oks_sigma=0.15, + margin=10, + symmetric_kpts=[(0, 4), (1, 3)], + ) + + +class MarmosetBenchmark(deeplabcut.benchmark.base.Benchmark): + """Dataset with two marmosets. + + All animal procedures are overseen by veterinary staff of the MIT and Broad Institute Department of Comparative Medicine, in compliance with the NIH guide for the care and use of laboratory animals and approved by the MIT and Broad Institute animal care and use committees. Video of common marmosets (Callithrix jacchus) was collected in the laboratory of Guoping Feng at MIT. Marmosets were recorded using Kinect V2 cameras (Microsoft) with a resolution of 1080p and frame rate of 30 Hz. After acquisition, images to be used for training the network were manually cropped to 1000 x 1000 pixels or smaller. The dataset is 7,600 labeled frames from 40 different marmosets collected from 3 different colonies (in different facilities). Each cage contains a pair of marmosets, where one marmoset had light blue dye applied to its tufts. One human annotator labeled the 15 marker points on each animal present in the frame (frames contained either 1 or 2 animals). + + Introduced in Lauer et al. "Multi-animal pose estimation, identification and tracking with DeepLabCut." Nature Methods 19, no. 4 (2022): 496-504. 
+ """ + + name = "marmosets" + keypoints = ( + "Front", + "Right", + "Middle", + "Left", + "FL1", + "BL1", + "FR1", + "BR1", + "BL2", + "BR2", + "FL2", + "FR2", + "Body1", + "Body2", + "Body3", + ) + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mackenzie.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-Marmoset_70shuffle1.pickle" + ) + num_animals = 2 + + +class FishBenchmark(deeplabcut.benchmark.base.Benchmark): + """Dataset with multiple fish, filmed from top-view + + Schools of inland silversides (Menidia beryllina, n=14 individuals per school) were recorded in the Lauder Lab at Harvard University while swimming at 15 speeds (0.5 to 8 BL/s, body length, at 0.5 BL/s intervals) in a flow tank with a total working section of 28 x 28 x 40 cm as described in previous work, at a constant temperature (18±1°C) and salinity (33 ppt), at a Reynolds number of approximately 10,000 (based on BL). Dorsal views of steady swimming across these speeds were recorded by high-speed video cameras (FASTCAM Mini AX50, Photron USA, San Diego, CA, USA) at 60-125 frames per second (feeding videos at 60 fps, swimming alone 125 fps). The dorsal view was recorded above the swim tunnel and a floating Plexiglas panel at the water surface prevented surface ripples from interfering with dorsal view videos. Five keypoints were labeled (tip, gill, peduncle, dorsal fin tip, caudal tip). 100 frames were labeled, making this a real-world sized laboratory dataset. + + Introduced in Lauer et al. "Multi-animal pose estimation, identification and tracking with DeepLabCut." Nature Methods 19, no. 4 (2022): 496-504. 
+ """ + + name = "fish" + keypoints = ("tip", "gill", "peduncle", "caudaltip", "dfintip") + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Valentina.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-Schooling_70shuffle1.pickle" + ) + num_animals = 14 + + def compute_pose_rmse(self, results_objects): + return deeplabcut.benchmark.metrics.calc_rmse_from_obj( + results_objects, + h5_file=self.ground_truth, + metadata_file=self.metadata, + drop_kpts=[4, 5], + ) + + def compute_pose_map(self, results_objects): + return deeplabcut.benchmark.metrics.calc_map_from_obj( + results_objects, + h5_file=self.ground_truth, + metadata_file=self.metadata, + drop_kpts=[4, 5], + ) diff --git a/deeplabcut/benchmark/cli.py b/deeplabcut/benchmark/cli.py new file mode 100644 index 0000000000..e42be6dfd9 --- /dev/null +++ b/deeplabcut/benchmark/cli.py @@ -0,0 +1,44 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. 
+# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Command line interface for DeepLabCut deeplabcut.benchmark.""" + +import argparse + +import deeplabcut.benchmark + + +def _parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--include", nargs="+", default=None, required=False) + parser.add_argument( + "--onerror", + default="return", + required=False, + choices=("ignore", "return", "raise"), + ) + parser.add_argument("--nocache", action="store_true") + return parser.parse_args() + + +def main(): + """Main CLI entry point for generating deeplabcut.benchmark results.""" + args = _parse_args() + if not args.nocache: + results = deeplabcut.benchmark.loadcache() + else: + results = None + results = deeplabcut.benchmark.evaluate( + include_benchmarks=args.include, results=results, on_error=args.onerror, + ) + if not args.nocache: + deeplabcut.benchmark.savecache(results) + try: + print(results.toframe()) + except StopIteration: + pass diff --git a/deeplabcut/benchmark/metrics.py b/deeplabcut/benchmark/metrics.py new file mode 100644 index 0000000000..3369020b4b --- /dev/null +++ b/deeplabcut/benchmark/metrics.py @@ -0,0 +1,230 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Evaluation metrics for the DeepLabCut benchmark.""" + +import sys +import unittest.mock + +# TODO(stes) mocking a few modules to rely in fewer dependencies, without +# causing import errors when using deeplabcut. 
+MOCK_MODULES = ["statsmodels", "statsmodels.api", "pytables"] +for mod_name in MOCK_MODULES: + sys.modules[mod_name] = unittest.mock.MagicMock() + +import os +import pickle +from collections import defaultdict + +import numpy as np +import pandas as pd + +import deeplabcut.benchmark.utils +from deeplabcut.pose_estimation_tensorflow.core import evaluate_multianimal +from deeplabcut.pose_estimation_tensorflow.lib import inferenceutils +from deeplabcut.utils.conversioncode import guarantee_multiindex_rows + + +def _format_gt_data(h5file): + df = pd.read_hdf(h5file) + + def _get_unique_level_values(header, level): + return header.get_level_values(level).unique().to_list() + + animals = _get_unique_level_values(df.columns, "individuals") + kpts = _get_unique_level_values(df.columns, "bodyparts") + try: + n_unique = len( + _get_unique_level_values( + df.xs("single", level="individuals", axis=1).columns, "bodyparts" + ) + ) + except KeyError: + n_unique = 0 + guarantee_multiindex_rows(df) + file_paths = [os.path.join(*row) for row in df.index.to_list()] + temp = ( + df.stack("individuals", dropna=False) + .reindex(animals, level="individuals") + .reindex(kpts, level="bodyparts", axis=1) + ) + data = temp.to_numpy().reshape((len(file_paths), len(animals), -1, 2)) + meta = {"animals": animals, "keypoints": kpts, "n_unique": n_unique} + return { + "annotations": dict(zip(file_paths, data)), + "metadata": meta, + } + + +def calc_prediction_errors(preds, gt): + kpts_gt = gt["metadata"]["keypoints"] + kpts_pred = preds["metadata"]["keypoints"] + map_ = {kpts_gt.index(kpt): i for i, kpt in enumerate(kpts_pred)} + annot = gt["annotations"] + + # Map image paths from predicted data to GT as the first are typically + # absolute whereas the latter are relative to the project path. 
+ def _map(strings, substrings): + lookup = dict() + strings_ = strings.copy() + substrings_ = substrings.copy() + while strings_: + string = strings_.pop() + for s in substrings_: + if string.endswith(s): + lookup[string] = s + substrings_.remove(s) + break + return lookup + + map_images = _map(list(preds["predictions"]), list(annot)) + + errors = np.full( + ( + len(preds["predictions"]), + len(gt["metadata"]["animals"]), + len(kpts_gt), + 2, # Hold distance to GT and confidence + ), + np.nan, + ) + for n, (path, preds_) in enumerate(preds["predictions"].items()): + if not preds_: + continue + xy_gt = annot[map_images[path]].swapaxes(0, 1) + xy_pred = preds_["coordinates"][0] + conf_pred = preds_["confidence"] + for i, xy_gt_ in enumerate(xy_gt): + visible = np.flatnonzero(np.all(~np.isnan(xy_gt_), axis=1)) + xy_pred_ = xy_pred[map_[i]] + if visible.size and xy_pred_.size: + # Pick the predictions closest to ground truth, + # rather than the ones the model has most confident in. + neighbors = evaluate_multianimal._find_closest_neighbors( + xy_gt_[visible], xy_pred_, k=3 + ) + found = neighbors != -1 + if ~np.any(found): + continue + min_dists = np.linalg.norm( + xy_gt_[visible][found] - xy_pred_[neighbors[found]], axis=1, + ) + conf_pred_ = conf_pred[map_[i]] + errors[n, visible[found], i, 0] = min_dists + errors[n, visible[found], i, 1] = conf_pred_[neighbors[found], 0] + return errors + + +def conv_obj_to_assemblies(eval_results_obj, keypoint_names): + """Convert predictions to deeplabcut assemblies.""" + assemblies = {} + for image_path, results in eval_results_obj.items(): + lst = [] + for dict_ in results: + ass = inferenceutils.Assembly(len(keypoint_names)) + for i, kpt in enumerate(keypoint_names): + xy = dict_["pose"][kpt] + if ~np.isnan(xy).all(): + joint = inferenceutils.Joint(pos=(xy), label=i) + ass.add_joint(joint) + # TODO(jeylau) add affinity.setter to Assembly + ass._affinity = dict_["score"] + ass._links = [None] + if len(ass): + lst.append(ass) + 
assemblies[image_path] = lst + return assemblies + + +def calc_map_from_obj( + eval_results_obj, + h5_file, + metadata_file, + oks_sigma=0.1, + margin=0, + symmetric_kpts=None, + drop_kpts=None, +): + """Calculate mean average precision (mAP) based on predictions.""" + df = pd.read_hdf(h5_file) + try: + df.drop("single", level="individuals", axis=1, inplace=True) + except KeyError: + pass + n_animals = len(df.columns.get_level_values("individuals").unique()) + kpts = list(df.columns.get_level_values("bodyparts").unique()) + image_paths = list(eval_results_obj) + ground_truth = ( + df.loc[image_paths].to_numpy().reshape((len(image_paths), n_animals, -1, 2)) + ) + temp = np.ones((*ground_truth.shape[:3], 3)) + temp[..., :2] = ground_truth + assemblies_gt = inferenceutils._parse_ground_truth_data(temp) + with open(metadata_file, "rb") as f: + inds_test = set(pickle.load(f)[2]) + assemblies_gt_test = {k: v for k, v in assemblies_gt.items() if k in inds_test} + + # TODO(stes): remove/rewrite + if drop_kpts is not None: + temp = {} + for k, v in assemblies_gt_test.items(): + lst = [] + for a in v: + arr = np.delete(a.data[:, :3], drop_kpts, axis=0) + a = inferenceutils.Assembly.from_array(arr) + lst.append(a) + temp[k] = lst + assemblies_gt_test = temp + for ind in sorted(drop_kpts, reverse=True): + kpts.pop(ind) + + assemblies_pred_ = conv_obj_to_assemblies(eval_results_obj, kpts) + assemblies_pred = dict(enumerate(assemblies_pred_.values())) + + with deeplabcut.benchmark.utils.DisableOutput(): + oks = inferenceutils.evaluate_assembly( + assemblies_pred, + assemblies_gt_test, + oks_sigma, + margin=margin, + symmetric_kpts=symmetric_kpts, + ) + return oks["mAP"] + + +def calc_rmse_from_obj( + eval_results_obj, h5_file, metadata_file, drop_kpts=None, +): + """Calc prediction errors for submissions.""" + gt = _format_gt_data(h5_file) + kpts = gt["metadata"]["keypoints"] + if drop_kpts: + for k, v in gt["annotations"].items(): + gt["annotations"][k] = np.delete(v, 
drop_kpts, axis=1) + for ind in sorted(drop_kpts, reverse=True): + kpts.pop(ind) + with open(metadata_file, "rb") as f: + inds_test = set(pickle.load(f)[2]) + test_objects = { + k: v for i, (k, v) in enumerate(eval_results_obj.items()) if i in inds_test + } + assemblies_pred = conv_obj_to_assemblies(test_objects, kpts) + preds = defaultdict(dict) + preds["metadata"]["keypoints"] = kpts + for image, assemblies in assemblies_pred.items(): + if assemblies: + arr = np.stack([a.data for a in assemblies]).swapaxes(0, 1) + data = [xy[~np.isnan(xy).any(axis=1)] for xy in arr[..., :2]] + temp = { + "coordinates": tuple([data]), + "confidence": list(np.expand_dims(arr[..., 2], axis=2)), + } + preds["predictions"][image] = temp + with deeplabcut.benchmark.utils.DisableOutput(): + errors = calc_prediction_errors(preds, gt) + return np.nanmean(errors[..., 0]) diff --git a/deeplabcut/benchmark/utils.py b/deeplabcut/benchmark/utils.py new file mode 100644 index 0000000000..40aa2495d0 --- /dev/null +++ b/deeplabcut/benchmark/utils.py @@ -0,0 +1,67 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Helper functions in this file are not affected by the main repositories +license. They are independent from the remainder of the benchmarking code. 
+""" +import importlib +import os +import pkgutil +import sys + + +class RedirectStdStreams(object): + """Context manager for redirecting stdout and stderr + Reference: + https://stackoverflow.com/a/6796752 + CC BY-SA 3.0, https://stackoverflow.com/users/46690/rob-cowie + """ + + def __init__(self, stdout=None, stderr=None): + self._stdout = stdout or sys.stdout + self._stderr = stderr or sys.stderr + + def __enter__(self): + self.old_stdout, self.old_stderr = sys.stdout, sys.stderr + self.old_stdout.flush() + self.old_stderr.flush() + sys.stdout, sys.stderr = self._stdout, self._stderr + + def __exit__(self, exc_type, exc_value, traceback): + self._stdout.flush() + self._stderr.flush() + sys.stdout = self.old_stdout + sys.stderr = self.old_stderr + + +class DisableOutput(RedirectStdStreams): + def __init__(self): + devnull = open(os.devnull, "w") + super().__init__(stdout=devnull, stderr=devnull) + + +def import_submodules(package, recursive=True): + """Import all submodules of a module, recursively, including subpackages + + :param package: package (name or actual module) + :type package: str | module + :rtype: dict[str, types.ModuleType] + + Reference: + https://stackoverflow.com/a/25562415 + CC BY-SA 3.0, https://stackoverflow.com/users/712522/mr-b + """ + if isinstance(package, str): + package = importlib.import_module(package) + results = {} + for loader, name, is_pkg in pkgutil.walk_packages(package.__path__): + full_name = package.__name__ + "." 
+ name + results[full_name] = importlib.import_module(full_name) + if recursive and is_pkg: + results.update(import_submodules(full_name)) + return results diff --git a/deeplabcut/utils/__init__.py b/deeplabcut/utils/__init__.py index 048505d8ce..44202a1c3f 100644 --- a/deeplabcut/utils/__init__.py +++ b/deeplabcut/utils/__init__.py @@ -6,4 +6,3 @@ from deeplabcut.utils.make_labeled_video import * from deeplabcut.utils.plotting import * from deeplabcut.utils.video_processor import * - diff --git a/deeplabcut/utils/auxfun_models.py b/deeplabcut/utils/auxfun_models.py index fd02e7512d..55e1b83fdb 100644 --- a/deeplabcut/utils/auxfun_models.py +++ b/deeplabcut/utils/auxfun_models.py @@ -21,9 +21,9 @@ "resnet_101": MODEL_BASE_PATH / "resnet_v1_101.ckpt", "resnet_152": MODEL_BASE_PATH / "resnet_v1_152.ckpt", "mobilenet_v2_1.0": MODEL_BASE_PATH / "mobilenet_v2_1.0_224.ckpt", - "mobilenet_v2_0.75":MODEL_BASE_PATH / "mobilenet_v2_0.75_224.ckpt", + "mobilenet_v2_0.75": MODEL_BASE_PATH / "mobilenet_v2_0.75_224.ckpt", "mobilenet_v2_0.5": MODEL_BASE_PATH / "mobilenet_v2_0.5_224.ckpt", - "mobilenet_v2_0.35":MODEL_BASE_PATH / "mobilenet_v2_0.35_224.ckpt", + "mobilenet_v2_0.35": MODEL_BASE_PATH / "mobilenet_v2_0.35_224.ckpt", "efficientnet-b0": MODEL_BASE_PATH / "efficientnet-b0" / "model.ckpt", "efficientnet-b1": MODEL_BASE_PATH / "efficientnet-b1" / "model.ckpt", "efficientnet-b2": MODEL_BASE_PATH / "efficientnet-b2" / "model.ckpt", diff --git a/deeplabcut/utils/auxfun_multianimal.py b/deeplabcut/utils/auxfun_multianimal.py index 603eda17b8..c2fc409c4b 100644 --- a/deeplabcut/utils/auxfun_multianimal.py +++ b/deeplabcut/utils/auxfun_multianimal.py @@ -25,6 +25,7 @@ from deeplabcut.generate_training_dataset import trainingsetmanipulation from deeplabcut.pose_estimation_tensorflow.lib.trackingutils import TRACK_METHODS + def reorder_individuals_in_df(df: pd.DataFrame, order: list) -> pd.DataFrame: """ Reorders data of df to match the order given in a list @@ -49,6 +50,7 @@ def 
reorder_individuals_in_df(df: pd.DataFrame, order: list) -> pd.DataFrame: return df + def extractindividualsandbodyparts(cfg): individuals = cfg["individuals"].copy() if len(cfg["uniquebodyparts"]) > 0: @@ -65,7 +67,7 @@ def get_track_method(cfg, track_method=""): f"Invalid tracking method. Only {', '.join(TRACK_METHODS)} are currently supported." ) return track_method - else: # default + else: # default track_method = cfg.get("default_track_method", "") if not track_method: warnings.warn( @@ -191,7 +193,6 @@ def SaveFullMultiAnimalData(data, metadata, dataname, suffix="_full"): data_path = dataname.split(".h5")[0] + suffix + ".pickle" metadata_path = dataname.split(".h5")[0] + "_meta.pickle" - with open(data_path, "wb") as f: pickle.dump(data, f, pickle.HIGHEST_PROTOCOL) with open(metadata_path, "wb") as f: @@ -199,7 +200,6 @@ def SaveFullMultiAnimalData(data, metadata, dataname, suffix="_full"): return data_path, metadata_path - def LoadFullMultiAnimalData(dataname): """ Save predicted data as h5 file and metadata as pickle file; created by predict_videos.py """ data_file = dataname.split(".h5")[0] + "_full.pickle" diff --git a/deeplabcut/utils/auxfun_videos.py b/deeplabcut/utils/auxfun_videos.py index b3e257c2c6..2918195ad0 100644 --- a/deeplabcut/utils/auxfun_videos.py +++ b/deeplabcut/utils/auxfun_videos.py @@ -21,8 +21,7 @@ # more videos are in principle covered, as OpenCV is used and allows many formats. -SUPPORTED_VIDEOS = 'avi', 'mp4', 'mov', 'mpeg', 'mpg', 'mpv', 'mkv', 'flv', 'qt', 'yuv' - +SUPPORTED_VIDEOS = "avi", "mp4", "mov", "mpeg", "mpg", "mpv", "mkv", "flv", "qt", "yuv" class VideoReader: @@ -354,9 +353,10 @@ def check_video_integrity(video_path): vid.check_integrity() vid.check_integrity_robust() + def imread(image_path, mode="skimage"): - ''' Read image either with skimage or cv2. - Returns frame in uint with 3 color channels. ''' + """ Read image either with skimage or cv2. + Returns frame in uint with 3 color channels. 
""" if mode == "skimage": image = io.imread(image_path) if image.ndim == 2 or image.shape[-1] == 1: @@ -366,8 +366,10 @@ def imread(image_path, mode="skimage"): return img_as_ubyte(image) - elif mode=="cv2": - return cv2.imread(image_path, cv2.IMREAD_UNCHANGED)[..., ::-1] # ~10% faster than using cv2.cvtColor + elif mode == "cv2": + return cv2.imread(image_path, cv2.IMREAD_UNCHANGED)[ + ..., ::-1 + ] # ~10% faster than using cv2.cvtColor # https://docs.opencv.org/3.4.0/da/d54/group__imgproc__transform.html#ga5bb5a1fea74ea38e1a5445ca803ff121 diff --git a/deeplabcut/utils/auxiliaryfunctions.py b/deeplabcut/utils/auxiliaryfunctions.py index 722a7d684f..6a8cf57165 100644 --- a/deeplabcut/utils/auxiliaryfunctions.py +++ b/deeplabcut/utils/auxiliaryfunctions.py @@ -361,7 +361,9 @@ def get_list_of_videos( if in_random_order: from random import shuffle - shuffle(videos) # this is useful so multiple nets can be used to analyze simultaneously + shuffle( + videos + ) # this is useful so multiple nets can be used to analyze simultaneously else: videos.sort() @@ -620,7 +622,9 @@ def get_scorer_name( return scorer, scorer_legacy -def check_if_post_processing(folder, vname, DLCscorer, DLCscorerlegacy, suffix="filtered"): +def check_if_post_processing( + folder, vname, DLCscorer, DLCscorerlegacy, suffix="filtered" +): """ Checks if filtered/bone lengths were already calculated. If not, figures out if data was already analyzed (either with legacy scorer name or new one!) 
""" outdataname = os.path.join(folder, vname + DLCscorer + suffix + ".h5") @@ -727,9 +731,8 @@ def find_analyzed_data(folder, videoname, scorer, filtered=False, track_method=" candidates = [] for file in grab_files_in_folder(folder, "h5"): stem = Path(file).stem.replace("_filtered", "") - starts_by_scorer = ( - file.startswith(videoname + scorer) - or file.startswith(videoname + scorer_legacy) + starts_by_scorer = file.startswith(videoname + scorer) or file.startswith( + videoname + scorer_legacy ) if tracker: matches_tracker = stem.endswith(tracker) @@ -800,9 +803,7 @@ def find_next_unlabeled_folder(config_path, verbose=False): cfg = read_config(config_path) base_folder = Path(os.path.join(cfg["project_path"], "labeled-data")) h5files = sorted( - base_folder.rglob("*.h5"), - key=lambda p: p.lstat().st_mtime, - reverse=True, + base_folder.rglob("*.h5"), key=lambda p: p.lstat().st_mtime, reverse=True, ) folders = sorted(f for f in base_folder.iterdir() if f.is_dir()) most_recent_folder = h5files[0].parent @@ -820,6 +821,7 @@ def find_next_unlabeled_folder(config_path, verbose=False): print(f"{folder.name} | {int(100 * frac)} %") return next_folder + # aliases for backwards-compatibility. 
SaveData = save_data SaveMetadata = save_metadata @@ -827,7 +829,9 @@ def find_next_unlabeled_folder(config_path, verbose=False): GetVideoList = get_video_list GetTrainingSetFolder = get_training_set_folder GetDataandMetaDataFilenames = get_data_and_metadata_filenames -IntersectionofBodyPartsandOnesGivenbyUser = intersection_of_body_parts_and_ones_given_by_user +IntersectionofBodyPartsandOnesGivenbyUser = ( + intersection_of_body_parts_and_ones_given_by_user +) GetScorerName = get_scorer_name CheckifPostProcessing = check_if_post_processing CheckifNotAnalyzed = check_if_not_analyzed diff --git a/deeplabcut/utils/auxiliaryfunctions_3d.py b/deeplabcut/utils/auxiliaryfunctions_3d.py index c0d3b3c3b8..1fe857e4b5 100644 --- a/deeplabcut/utils/auxiliaryfunctions_3d.py +++ b/deeplabcut/utils/auxiliaryfunctions_3d.py @@ -266,6 +266,7 @@ def LoadMetadata3d(metadatafilename): metadata = pickle.load(f) return metadata + def _reconstruct_tracks_as_tracklets(df): """ Parameters: @@ -297,6 +298,7 @@ def _associate_paired_view_tracks(tracklets1, tracklets2, F): Fundamental matrix between cam1 and cam2 """ from scipy.optimize import linear_sum_assignment + # Initialize costs matrix costs = np.zeros([len(tracklets1), len(tracklets2)]) @@ -313,7 +315,7 @@ def _associate_paired_view_tracks(tracklets1, tracklets2, F): # cost for any point in time of t1 being the same # any point in time of t2 cost = np.abs(np.nansum(np.matmul(_t1, F) * _t2, axis=2)) - + # Get average cost of the entire track cost = cost.mean() costs[i, j] = cost @@ -337,4 +339,4 @@ def cross_view_match_dataframes(df1, df2, F): tracks2 = _reconstruct_tracks_as_tracklets(df2) costs, voting = _associate_paired_view_tracks(tracks1, tracks2, F) - return costs, voting \ No newline at end of file + return costs, voting diff --git a/deeplabcut/utils/conversioncode.py b/deeplabcut/utils/conversioncode.py index db7d9b16fb..3a6421cfd6 100644 --- a/deeplabcut/utils/conversioncode.py +++ b/deeplabcut/utils/conversioncode.py @@ 
-82,7 +82,7 @@ def convertcsv2h5(config, userfeedback=True, scorer=None): print("Attention:", folder, "does not appear to have labeled data!") -def analyze_videos_converth5_to_csv(video_folder, videotype=".mp4",listofvideos=False): +def analyze_videos_converth5_to_csv(video_folder, videotype=".mp4", listofvideos=False): """ By default the output poses (when running analyze_videos) are stored as MultiIndex Pandas Array, which contains the name of the network, body part name, (x, y) label position \n in pixels, and the likelihood for each frame per body part. These arrays are stored in an efficient Hierarchical Data Format (HDF) \n @@ -106,11 +106,13 @@ def analyze_videos_converth5_to_csv(video_folder, videotype=".mp4",listofvideos= """ - if listofvideos: # can also be called with a list of videos (from GUI) - videos = video_folder # GUI gives a list of videos - if len(videos)>0: + if listofvideos: # can also be called with a list of videos (from GUI) + videos = video_folder # GUI gives a list of videos + if len(videos) > 0: h5_files = list( - auxiliaryfunctions.grab_files_in_folder(Path(videos[0]).parent, "h5", relative=False) + auxiliaryfunctions.grab_files_in_folder( + Path(videos[0]).parent, "h5", relative=False + ) ) else: h5_files = [] @@ -126,10 +128,7 @@ def analyze_videos_converth5_to_csv(video_folder, videotype=".mp4",listofvideos= def analyze_videos_converth5_to_nwb( - config, - video_folder, - videotype=".mp4", - listofvideos=False, + config, video_folder, videotype=".mp4", listofvideos=False, ): """ Convert all h5 output data files in `video_folder` to NWB format. 
@@ -152,11 +151,13 @@ def analyze_videos_converth5_to_nwb( deeplabcut.analyze_videos_converth5_to_csv('/media/alex/experimentaldata/cheetahvideos','.mp4') """ - if listofvideos: # can also be called with a list of videos (from GUI) - videos = video_folder # GUI gives a list of videos - if len(videos)>0: + if listofvideos: # can also be called with a list of videos (from GUI) + videos = video_folder # GUI gives a list of videos + if len(videos) > 0: h5_files = list( - auxiliaryfunctions.grab_files_in_folder(Path(videos[0]).parent, "h5", relative=False) + auxiliaryfunctions.grab_files_in_folder( + Path(videos[0]).parent, "h5", relative=False + ) ) else: h5_files = [] @@ -240,7 +241,7 @@ def guarantee_multiindex_rows(df): df.index = pd.MultiIndex.from_tuples(splits) except TypeError: # Ignore numerical index of frame indices pass - + # Ensure folder names are strings try: df.index = df.index.set_levels(df.index.levels[1].astype(str), level=1) diff --git a/deeplabcut/utils/make_labeled_video.py b/deeplabcut/utils/make_labeled_video.py index 690d658314..bb8121b0e2 100644 --- a/deeplabcut/utils/make_labeled_video.py +++ b/deeplabcut/utils/make_labeled_video.py @@ -91,7 +91,7 @@ def CreateVideo( fps = clip.fps() if isinstance(fps, float): - if fps*1000 > 65535: + if fps * 1000 > 65535: fps = round(fps) nframes = clip.nframes duration = nframes / fps @@ -124,7 +124,7 @@ def CreateVideo( nindividuals = len(Dataframe.columns.get_level_values("individuals").unique()) map2bp = [bplist.index(bp) for bp in all_bpts] nbpts_per_ind = ( - Dataframe.groupby(level="individuals", axis=1).size().values[0] // 3 + Dataframe.groupby(level="individuals", axis=1).size().values // 3 ) map2id = [] for i, j in enumerate(nbpts_per_ind): @@ -774,14 +774,7 @@ def _create_labeled_video( sw = "" sh = "" - clip = vp( - fname=video, - sname=output_path, - codec=codec, - sw=sw, - sh=sh, - fps=fps, - ) + clip = vp(fname=video, sname=output_path, codec=codec, sw=sw, sh=sh, fps=fps,) df = 
pd.read_hdf(h5file) try: animals = df.columns.get_level_values("individuals").unique().to_list() @@ -967,7 +960,9 @@ def create_video_with_all_detections( print("Creating labeled video for ", str(Path(video).stem)) h5file = full_pickle.replace("_full.pickle", ".h5") data, _ = auxfun_multianimal.LoadFullMultiAnimalData(h5file) - data = dict(data) # Cast to dict (making a copy) so items can safely be popped + data = dict( + data + ) # Cast to dict (making a copy) so items can safely be popped header = data.pop("metadata") all_jointnames = header["all_joints_names"] diff --git a/deeplabcut/utils/plotting.py b/deeplabcut/utils/plotting.py index c3524ade3a..716eda4ee0 100644 --- a/deeplabcut/utils/plotting.py +++ b/deeplabcut/utils/plotting.py @@ -351,21 +351,18 @@ def _plot_paf_performance( bins = np.linspace(0, 1, nbins) if colors is None: - colors = '#EFC9AF', '#1F8AC0' + colors = "#EFC9AF", "#1F8AC0" if ax is None: fig, ax = plt.subplots(tight_layout=True, figsize=(3, 3)) - sns.histplot(within, kde=kde, ax=ax, stat='probability', - color=colors[0], bins=bins) - sns.histplot(between, kde=kde, ax=ax, stat='probability', - color=colors[1], bins=bins) + sns.histplot(within, kde=kde, ax=ax, stat="probability", color=colors[0], bins=bins) + sns.histplot( + between, kde=kde, ax=ax, stat="probability", color=colors[1], bins=bins + ) return ax def plot_edge_affinity_distributions( - eval_pickle_file, - include_bodyparts="all", - output_name="", - figsize=(10, 7), + eval_pickle_file, include_bodyparts="all", output_name="", figsize=(10, 7), ): """ Display the distribution of affinity costs of within- and between-animal edges. 
@@ -388,25 +385,25 @@ def plot_edge_affinity_distributions( """ - with open(eval_pickle_file, 'rb') as file: + with open(eval_pickle_file, "rb") as file: data = pickle.load(file) - meta_pickle_file = eval_pickle_file.replace('_full.', '_meta.') - with open(meta_pickle_file, 'rb') as file: + meta_pickle_file = eval_pickle_file.replace("_full.", "_meta.") + with open(meta_pickle_file, "rb") as file: metadata = pickle.load(file) (w_train, _), (b_train, _) = crossvalutils._calc_within_between_pafs( data, metadata, train_set_only=True, ) - data.pop('metadata', None) + data.pop("metadata", None) nonempty = set(i for i, vals in w_train.items() if vals) - meta = metadata['data']['DLC-model-config file'] - bpts = list(map(str.lower, meta['all_joints_names'])) - inds_multi = set(b for edge in meta['partaffinityfield_graph'] for b in edge) - if include_bodyparts == 'all': + meta = metadata["data"]["DLC-model-config file"] + bpts = list(map(str.lower, meta["all_joints_names"])) + inds_multi = set(b for edge in meta["partaffinityfield_graph"] for b in edge) + if include_bodyparts == "all": include_bodyparts = inds_multi else: include_bodyparts = set(bpts.index(bpt) for bpt in include_bodyparts) edges_to_keep = set() - graph = meta['partaffinityfield_graph'] + graph = meta["partaffinityfield_graph"] for n, edge in enumerate(graph): if not any(i in include_bodyparts for i in edge): continue @@ -419,14 +416,20 @@ def plot_edge_affinity_distributions( ) axes = axes_.flatten() for ax in axes: - ax.axis('off') + ax.axis("off") for n, ind in enumerate(edge_inds): i1, i2 = graph[ind] w_tr = w_train[ind] b_tr = b_train[ind] - sep, _ = crossvalutils._calc_separability(b_tr, w_tr, metric='auc') - axes[n].text(0.5, 0.8, f'{bpts[i1]}–{bpts[i2]}\n{sep:.2f}', size=8, - ha='center', transform=axes[n].transAxes) + sep, _ = crossvalutils._calc_separability(b_tr, w_tr, metric="auc") + axes[n].text( + 0.5, + 0.8, + f"{bpts[i1]}–{bpts[i2]}\n{sep:.2f}", + size=8, + ha="center", + 
transform=axes[n].transAxes, + ) _plot_paf_performance(w_tr, b_tr, ax=axes[n], kde=False) axes[0].set_xticks([]) axes[0].set_yticks([]) diff --git a/docs/benchmark.md b/docs/benchmark.md new file mode 100644 index 0000000000..114568decd --- /dev/null +++ b/docs/benchmark.md @@ -0,0 +1,35 @@ +# DeepLabCut benchmark + +For further information and the leaderboard, see [the official homepage](https://benchmark.deeplabcut.org/). + +## High Level API + +When implementing your own benchmarks, the most important functions are directly accessible +under the ``deeplabcut.benchmark`` package. + +```{eval-rst} +.. automodule:: deeplabcut.benchmark + :members: + :show-inheritance: +``` + +## Available benchmark definitions + +See [the official benchmark page](https://benchmark.deeplabcut.org/datasets.html) for a full overview +of the available datasets. A benchmark submission should contain a result for at least one of these +benchmarks. For an example of how to implement a benchmark submission, refer to the baselines in the +[DeepLabCut benchmark repo](https://github.com/DeepLabCut/benchmark/tree/main/benchmark/baselines). + +```{eval-rst} +.. automodule:: deeplabcut.benchmark.benchmarks + :members: + :show-inheritance: +``` + +## Metric calculation + +```{eval-rst} +.. automodule:: deeplabcut.benchmark.metrics + :members: + :show-inheritance: +``` \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index 05ee5ac224..da04eb6c4a 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,7 +1,7 @@ (how-to-install)= # How To Install DeepLabCut -DeepLabCut can be run on Windows, Linux, or MacOS (see also [technical considerations](tech-considerations-during-install) and if you run into issues also check out [installation ProTips](installTips)). 
+DeepLabCut can be run on Windows, Linux, or MacOS (see also [technical considerations](tech-considerations-during-install) and if you run into issues also check out the [Installation Tips](https://deeplabcut.github.io/DeepLabCut/docs/recipes/installTips.html) page). ## PIP: @@ -96,7 +96,7 @@ The ONLY thing you need to do **first** if you have an NVIDIA GPU and the matchi #### The most common "new user" hurdle is installing and using your GPU, so don't get discouraged! -**CRITICAL:** If you have a GPU, you should FIRST then **install the NVIDIA CUDA package and an appropriate driver for your specific GPU**, then you can use the supplied conda file. Please follow the instructions found here https://www.tensorflow.org/install/gpu, and more tips below, to install the correct version of CUDA and your graphic card driver. The order of operations matters. +**CRITICAL:** If you have a GPU, you should FIRST **install the NVIDIA CUDA package and an appropriate driver for your specific GPU**, then you can use the supplied conda file. Please follow the instructions found here https://www.tensorflow.org/install/gpu, and more tips below, to install the correct version of CUDA and your graphic card driver. The order of operations matters. - Here we provide notes on how to install and check your GPU use with TensorFlow (which is used by DeepLabCut and already installed with the Anaconda files above). Thus, you do not need to independently install tensorflow. diff --git a/docs/recipes/installTips.md b/docs/recipes/installTips.md index 37c1f830b4..ca87741d30 100644 --- a/docs/recipes/installTips.md +++ b/docs/recipes/installTips.md @@ -313,37 +313,56 @@ Activate! `conda activate DEEPLABCUT` and then run: `conda install -c conda-forg Then run `python -m deeplabcut` which launches the DLC GUI. -## DeepLabCut M1 chip installation environment instructions: +## DeepLabCut MacOS M1 and M2 chip installation environment instructions: -This only assumes you have anaconda installed! 
+This only assumes you have anaconda installed. -Use the `DEEPLABCUT_M1.yaml` conda file if you have an Macbok with an M1 chip, and follow these steps: +Use the `DEEPLABCUT_M1.yaml` conda file if you have a MacBook with an M1 or M2 chip, and follow these steps: (1) git clone the deeplabcut cut repo: -`git clone https://github.com/DeepLabCut/DeepLabCut.git` - -(2) in the program terminal, `cd DeepLabCut/conda-environments` +``` +git clone https://github.com/DeepLabCut/DeepLabCut.git +``` -(3) Click here to download the Rosetta wheel for TensorFlow. +(2) in the program terminal run: `cd DeepLabCut/conda-environments` -For instance, for 2.4.1: -https://drive.google.com/file/d/17pSwfoNuyf3YR8vCaVggHeI-pMQ3xL7l/view?usp=sharing -(for different versions see here: https://github.com/tensorflow/tensorflow/issues/46044) +(3) Click [here](https://drive.google.com/file/d/17pSwfoNuyf3YR8vCaVggHeI-pMQ3xL7l/view?usp=sharing) to download the Rosetta wheel for TensorFlow. We assume this goes into your Downloads folder. This downloads TF 2.4.1; https://drive.google.com/file/d/17pSwfoNuyf3YR8vCaVggHeI-pMQ3xL7l/view?usp=sharing +(for different versions see here: https://github.com/tensorflow/tensorflow/issues/46044).
(4) Then, run: -`conda env create -f DEEPLABCUT_M1.yaml` +``` +conda env create -f DEEPLABCUT_M1.yaml +``` + +(5) Please activate the environment and set osx-64; i.e., run: + +``` +conda activate DEEPLABCUT_M1 +conda env config vars set CONDA_SUBDIR=osx-64 +``` +Now, as the print statement says, please deactivate and re-activate to set the change: + +``` +conda deactivate +conda activate DEEPLABCUT_M1 +conda env update -f DEEPLABCUT_M1.yaml +``` + +(6) Next, run: -(5) Next, **activate the environment,** and in the terminal `cd` to Downloads, and then run: + ``` + pip install ~/Downloads/tensorflow-2.4.1-py3-none-any.whl --no-dependencies --force-reinstall + ``` + (again, this assumes the file is in your Downloads folder) -`pip install tensorflow-2.4.1-py3-none-any.whl --no-dependencies --force-reinstall` -(6) Next, launch DLC with `pythonw -m deeplabcut` +(7) Next, launch DLC with `pythonw -m deeplabcut` (or if DLC version 2.3+, please use `python -m deeplabcut`) GUI will open! -Note: Based on issue #1380 thanks! +Note: Based on issues #1380 and #2011, thanks! ## How to confirm that your GPU is being used by DeepLabCut diff --git a/requirements.txt b/requirements.txt index 26c872ea7c..dbcdd18998 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ matplotlib<=3.5.2 napari-deeplabcut>=0.0.6 networkx>=2.6 numpy>=1.18.5 -pandas>=1.0.1 +pandas>=1.0.1,!=1.5.0 pyyaml qdarkstyle==3.1 scikit-image>=0.17 diff --git a/setup.py b/setup.py index 31bfe51f8c..eea5312652 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ "matplotlib>=3.3", "networkx>=2.6", "numpy>=1.18.5", - "pandas>=1.0.1", + "pandas>=1.0.1,!=1.5.0", "scikit-image>=0.17", "scikit-learn>=1.0", "scipy>=1.4",