diff --git a/_toc.yml b/_toc.yml
index 9071294684..e53f806c8c 100644
--- a/_toc.yml
+++ b/_toc.yml
@@ -27,6 +27,9 @@ parts:
   chapters:
   - file: docs/recipes/UsingModelZooPupil
   - file: docs/recipes/MegaDetectorDLCLive
+- caption: DeepLabCut Benchmark
+  chapters:
+  - file: docs/benchmark
 - caption: Hardware
   chapters:
   - file: docs/recipes/TechHardware
diff --git a/deeplabcut/benchmark/__init__.py b/deeplabcut/benchmark/__init__.py
new file mode 100644
index 0000000000..5540a8468c
--- /dev/null
+++ b/deeplabcut/benchmark/__init__.py
@@ -0,0 +1,116 @@
+# DeepLabCut2.0 Toolbox (deeplabcut.org)
+# © A. & M. Mathis Labs
+# https://github.com/AlexEMG/DeepLabCut
+# Please see AUTHORS for contributors.
+#
+# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS
+# Licensed under GNU Lesser General Public License v3.0
+
+
+import json
+import os
+from typing import Container
+from typing import Literal
+from typing import Optional
+
+from deeplabcut.benchmark.base import Benchmark, Result, ResultCollection
+
+DATA_ROOT = os.path.join(os.getcwd(), "data")
+CACHE = os.path.join(os.getcwd(), ".results")
+
+__registry = []
+
+
+def register(cls):
+    """Add a benchmark to the list of evaluations to run.
+
+    Apply this function as a decorator to a class. Note that the
+    class needs to be a subclass of the ``benchmark.base.Benchmark``
+    base class.
+
+    In most situations, it will be a subclass of one of the pre-defined
+    benchmarks in ``benchmark.benchmarks``.
+
+    Returns:
+        The class itself, unchanged, so that this function can be used
+        as a class decorator.
+
+    Raises:
+        ``ValueError`` if the decorator is applied to a class that is
+        not a subclass of ``benchmark.base.Benchmark``.
+    """
+    if not issubclass(cls, Benchmark):
+        raise ValueError(
+            f"Can only register subclasses of {Benchmark}, but got {cls}."
+        )
+    __registry.append(cls)
+    return cls
+
+
+def evaluate(
+    include_benchmarks: Optional[Container[str]] = None,
+    results: Optional[ResultCollection] = None,
+    on_error="return",
+) -> ResultCollection:
+    """Run evaluation for all benchmarks and methods.
+
+    Note that in order for your custom benchmark to be included during
+    evaluation, the following conditions need to be met:
+
+    - The benchmark subclasses one of the benchmark definitions in
+      ``benchmark.benchmarks``
+    - The benchmark is registered by applying the ``@benchmark.register``
+      decorator to the class
+    - The benchmark was imported. This is done automatically for all
+      benchmarks that are defined in submodules or subpackages of the
+      ``benchmark.submissions`` module. For all other locations, make
+      sure to manually import the packages **before** calling the
+      ``evaluate()`` function.
+
+    Args:
+        include_benchmarks:
+            If ``None``, run all benchmarks that were discovered. If a container
+            is passed, only include methods that were defined on benchmarks with
+            the specified names. E.g., ``include_benchmarks = ["trimouse"]`` would
+            only evaluate methods of the trimouse benchmark dataset.
+        results:
+            An existing collection to which new results are added. Results that
+            are already contained in the collection are skipped during
+            evaluation. If ``None``, a new collection is created.
+        on_error:
+            See the documentation of ``benchmark.base.Benchmark.evaluate()``.
+
+    Returns:
+        A collection of all results, which can be printed or exported to
+        ``pd.DataFrame`` or ``json`` file formats.
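+
+    Example:
+        A minimal sketch, assuming the built-in benchmarks and any custom
+        submissions were already imported and registered::
+
+            import deeplabcut.benchmark
+
+            results = deeplabcut.benchmark.evaluate(
+                include_benchmarks=["trimouse"],
+                on_error="return",
+            )
+            print(results.toframe())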
+ """ + if results is None: + results = ResultCollection() + for benchmark_cls in __registry: + if include_benchmarks is not None: + if benchmark_cls.name not in include_benchmarks: + continue + benchmark = benchmark_cls() + for name in benchmark.names(): + if Result(method_name=name, benchmark_name=benchmark_cls.name) in results: + continue + else: + result = benchmark.evaluate(name, on_error=on_error) + results.add(result) + return results + + +def get_filepath(basename: str): + return os.path.join(DATA_ROOT, basename) + + +def savecache(results: ResultCollection): + with open(CACHE, "w") as fh: + json.dump(results.todicts(), fh, indent=2) + + +def loadcache( + cache=CACHE, on_missing: Literal["raise", "ignore"] = "ignore" +) -> ResultCollection: + if not os.path.exists(cache): + if on_missing == "raise": + raise FileNotFoundError(cache) + return ResultCollection() + with open(cache, "r") as fh: + try: + data = json.load(fh) + except json.decoder.JSONDecodeError as e: + if on_missing == "raise": + raise e + return ResultCollection() + return ResultCollection.fromdicts(data) diff --git a/deeplabcut/benchmark/__main__.py b/deeplabcut/benchmark/__main__.py new file mode 100644 index 0000000000..627c685299 --- /dev/null +++ b/deeplabcut/benchmark/__main__.py @@ -0,0 +1,12 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +from deeplabcut.benchmark.cli import main + +if __name__ == "__main__": + main() diff --git a/deeplabcut/benchmark/base.py b/deeplabcut/benchmark/base.py new file mode 100644 index 0000000000..74c20122fb --- /dev/null +++ b/deeplabcut/benchmark/base.py @@ -0,0 +1,211 @@ +# DeepLabCut2.0 Toolbox (deeplabcut.org) +# © A. & M. Mathis Labs +# https://github.com/AlexEMG/DeepLabCut +# Please see AUTHORS for contributors. +# +# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS +# Licensed under GNU Lesser General Public License v3.0 + +"""Base classes for benchmark and result definition + +Benchmarks subclass the abstract ``Benchmark`` class and are defined by ``name``, their +``keypoints`` names, as well as groundtruth and metadata necessary to run evaluation. +Right now, the metrics to compute and report for each of the multi-animal benchmarks is the +root mean-squared-error (RMSE) and the mean average precision (mAP). + +Note for contributors: If you decide to contribute a benchmark which does not fit +into this evaluation framework, please feel free to extend the base classes +(e.g. to support additional metrics). +""" + +import abc +import dataclasses +from typing import Iterable +from typing import Tuple + +import pandas as pd + +import deeplabcut.benchmark.metrics +from deeplabcut import __version__ + + +class BenchmarkEvaluationError(RuntimeError): + pass + + +class Benchmark(abc.ABC): + """Abstract benchmark baseclass. + + All benchmarks should subclass this class. + """ + + @abc.abstractmethod + def names(self): + """A unique key to describe this submission, e.g. the model name. + + This is also the name that will later appear in the benchmark table. + The name needs to be unique across the whole benchmark. Non-unique names + will raise an error during submission of a PR. 
+ """ + raise NotImplementedError() + + @abc.abstractmethod + def get_predictions(self): + """Return predictions for all images in the benchmark.""" + raise NotImplementedError() + + def __init__(self): + keys = ["name", "keypoints", "ground_truth", "metadata"] + for key in keys: + if not hasattr(self, key): + raise NotImplementedError( + f"Subclass of abstract Benchmark class need " + f"to define the {key} property." + ) + + def compute_pose_rmse(self, results_objects): + return deeplabcut.benchmark.metrics.calc_rmse_from_obj( + results_objects, h5_file=self.ground_truth, metadata_file=self.metadata + ) + + def compute_pose_map(self, results_objects): + return deeplabcut.benchmark.metrics.calc_map_from_obj( + results_objects, h5_file=self.ground_truth, metadata_file=self.metadata + ) + + def evaluate(self, name: str, on_error="raise"): + """Evaluate this benchmark with all registered methods.""" + + if name not in self.names(): + raise ValueError( + f"{name} is not registered. Valid names are {self.names()}" + ) + if on_error not in ("ignore", "return", "raise"): + raise ValueError(f"on_error got an undefined value: {on_error}") + mean_avg_precision = float("nan") + root_mean_squared_error = float("nan") + try: + predictions = self.get_predictions(name) + mean_avg_precision = self.compute_pose_map(predictions) + root_mean_squared_error = self.compute_pose_rmse(predictions) + except Exception as exception: + if on_error == "ignore": + # ignore the exception and continue with the next evaluation, without + # yielding a result value. + return + elif on_error == "return": + # return the result value, with NaN as the result for all metrics that + # could not be computed due to the error. + pass + elif on_error == "raise": + # raise the error and stop evaluation + raise BenchmarkEvaluationError( + f"Error during benchmark evaluation for model {name}" + ) from exception + else: + raise NotImplementedError() from exception + return Result( + method_name=name, + benchmark_name=self.name, + mean_avg_precision=mean_avg_precision, + root_mean_squared_error=root_mean_squared_error, + ) + + +@dataclasses.dataclass +class Result: + """Benchmark result.""" + + method_name: str + benchmark_name: str + root_mean_squared_error: float = float("nan") + mean_avg_precision: float = float("nan") + benchmark_version: str = __version__ + + _export_mapping = dict( + benchmark_name="benchmark", + method_name="method", + benchmark_version="version", + root_mean_squared_error="RMSE", + mean_avg_precision="mAP", + ) + + _primary_key = ("benchmark_name", "method_name", "benchmark_version") + + @property + def primary_key(self) -> Tuple[str]: + """The primary key to uniquely identify this result.""" + return tuple(getattr(self, k) for k in self._primary_key) + + @property + def primary_key_names(self) -> Tuple[str]: + """Names of the primary keys""" + return tuple(self._export_mapping.get(k) for k in self._primary_key) + + def __str__(self): + return ( + f"{self.method_name}, {self.benchmark_name}: " + f"{self.mean_avg_precision} mAP, " + f"{self.root_mean_squared_error} RMSE" + ) + + @classmethod + def fromdict(cls, data: dict): + """Construct result object from dictionary.""" + kwargs = {attr: data[key] for attr, key in cls._export_mapping.items()} + return cls(**kwargs) + + def todict(self) -> dict: + """Export result object to dictionary, with less verbose key names.""" + return {key: getattr(self, attr) for attr, key in self._export_mapping.items()} + + +class ResultCollection: + def __init__(self, *results): + 
+    def __init__(self, *results):
+        self.results = {result.primary_key: result for result in results}
+
+    @property
+    def primary_key_names(self):
+        return next(iter(self.results.values())).primary_key_names
+
+    def toframe(self) -> pd.DataFrame:
+        """Convert the results to a pandas dataframe."""
+        return pd.DataFrame(
+            [result.todict() for result in self.results.values()]
+        ).set_index(list(self.primary_key_names))
+
+    def add(self, result: Result):
+        """Add a result to the collection."""
+        if result.primary_key in self.results:
+            raise ValueError(
+                f"An entry for {result.primary_key} already exists "
+                "in this collection. Did you try to add the "
+                "same result twice?"
+            )
+        if len(self) > 0:
+            if result.primary_key_names != self.primary_key_names:
+                raise ValueError("Incompatible result format.")
+        self.results[result.primary_key] = result
+
+    @classmethod
+    def fromdicts(cls, data: Iterable[dict]):
+        return cls(*[Result.fromdict(entry) for entry in data])
+
+    def todicts(self):
+        return [result.todict() for result in self.results.values()]
+
+    def __len__(self):
+        return len(self.results)
+
+    def __contains__(self, other: Result):
+        if not isinstance(other, Result):
+            raise ValueError(
+                f"{type(self)} can only store objects of type Result, "
+                f"but got {type(other)}."
+            )
+        return other.primary_key in self.results
+
+    def __eq__(self, other):
+        if not isinstance(other, ResultCollection):
+            return False
+        return other.results == self.results
diff --git a/deeplabcut/benchmark/benchmarks.py b/deeplabcut/benchmark/benchmarks.py
new file mode 100644
index 0000000000..09dc39db6b
--- /dev/null
+++ b/deeplabcut/benchmark/benchmarks.py
@@ -0,0 +1,153 @@
+# DeepLabCut2.0 Toolbox (deeplabcut.org)
+# © A. & M. Mathis Labs
+# https://github.com/AlexEMG/DeepLabCut
+# Please see AUTHORS for contributors.
+#
+# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS
+# Licensed under GNU Lesser General Public License v3.0
+
+"""Definitions of the official DeepLabCut benchmark tasks.
+
+See benchmark.deeplabcut.org for a current leaderboard with models and metrics
+for each of these benchmarks. Submissions can be made by opening a PR in the
+benchmark repository:
+
+https://github.com/DeepLabCut/benchmark
+"""
+
+import deeplabcut.benchmark.base
+import deeplabcut.benchmark.metrics
+
+
+class TriMouseBenchmark(deeplabcut.benchmark.base.Benchmark):
+    """Dataset with three mice filmed with a top-view camera.
+
+    Three wild-type (C57BL/6J) male mice ran on a paper spool following odor
+    trails (Mathis et al., 2018). These experiments were carried out in the
+    laboratory of Venkatesh N. Murthy at Harvard University. Data were recorded
+    at 30 Hz with a resolution of 640 x 480 pixels, acquired with a Point Grey
+    Firefly FMVU-03MTM-CS camera. One human annotator was instructed to
+    localize the 12 keypoints (snout, left ear, right ear, shoulder, four spine
+    points, tail base and three tail points). All surgical and experimental
+    procedures for mice were in accordance with the National Institutes of
+    Health Guide for the Care and Use of Laboratory Animals and approved by the
+    Harvard Institutional Animal Care and Use Committee. 161 frames were
+    labeled, making this a real-world sized laboratory dataset.
+ """ + + name = "trimouse" + keypoints = ( + "snout", + "leftear", + "rightear", + "shoulder", + "spine1", + "spine2", + "spine3", + "spine4", + "tailbase", + "tail1", + "tail2", + "tailend", + ) + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Daniel.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-MultiMouse_70shuffle1.pickle" + ) + num_animals = 3 + + +class ParentingMouseBenchmark(deeplabcut.benchmark.base.Benchmark): + """Datasets with three mice, one parenting, two pups. + + Parenting behavior is a pup directed behavior observed in adult mice involving complex motor actions directed towards the benefit of the offspring. These experiments were carried out in the laboratory of Catherine Dulac at Harvard University. The behavioral assay was performed in the homecage of singly housed adult female mice in dark/red light conditions. For these videos, the adult mice was monitored for several minutes in the cage followed by the introduction of pup (4 days old) in one corner of the cage. The behavior of the adult and pup was monitored for a duration of 15 minutes. Video was recorded at 30Hz using a Microsoft LifeCam camera (Part#: 6CH-00001) with a resolution of 1280 x 720 pixels or a Geovision camera (model no.: GV-BX4700-3V) also acquired at 30 frames per second at a resolution of 704 x 480 pixels. A human annotator labeled on the adult animal the same 12 body points as in the tri-mouse dataset, and five body points on the pup along its spine. Initially only the two ends were labeled, and intermediate points were added by interpolation and their positions was manually adjusted if necessary. All surgical and experimental procedures for mice were in accordance with the National Institutes of Health Guide for the Care and Use of Laboratory Animals and approved by the Harvard Institutional Animal Care and Use Committee. 542 frames were labeled, making this a real-world sized laboratory dataset. + """ + + name = "parenting" + keypoints = ( + "end1", + "interm1", + "interm2", + "interm3", + "end2", + "snout", + "leftear", + "rightear", + "shoulder", + "spine1", + "spine2", + "spine3", + "spine4", + "tailbase", + "tail1", + "tail2", + "tailend", + ) + + ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mostafizur.h5") + metadata = deeplabcut.benchmark.get_filepath( + "Documentation_data-CrackingParenting_70shuffle1.pickle" + ) + num_animals = 2 + + def compute_pose_map(self, results_objects): + return deeplabcut.benchmark.metrics.calc_map_from_obj( + results_objects, + h5_file=self.ground_truth, + metadata_file=self.metadata, + oks_sigma=0.15, + margin=10, + symmetric_kpts=[(0, 4), (1, 3)], + ) + + +class MarmosetBenchmark(deeplabcut.benchmark.base.Benchmark): + """Dataset with two marmosets. + + All animal procedures are overseen by veterinary staff of the MIT and Broad Institute Department of Comparative Medicine, in compliance with the NIH guide for the care and use of laboratory animals and approved by the MIT and Broad Institute animal care and use committees. Video of common marmosets (Callithrix jacchus) was collected in the laboratory of Guoping Feng at MIT. Marmosets were recorded using Kinect V2 cameras (Microsoft) with a resolution of 1080p and frame rate of 30 Hz. After acquisition, images to be used for training the network were manually cropped to 1000 x 1000 pixels or smaller. The dataset is 7,600 labeled frames from 40 different marmosets collected from 3 different colonies (in different facilities). 
+        return deeplabcut.benchmark.metrics.calc_map_from_obj(
+            results_objects,
+            h5_file=self.ground_truth,
+            metadata_file=self.metadata,
+            oks_sigma=0.15,
+            margin=10,
+            symmetric_kpts=[(0, 4), (1, 3)],
+        )
+
+
+class MarmosetBenchmark(deeplabcut.benchmark.base.Benchmark):
+    """Dataset with two marmosets.
+
+    All animal procedures are overseen by veterinary staff of the MIT and Broad
+    Institute Department of Comparative Medicine, in compliance with the NIH
+    guide for the care and use of laboratory animals and approved by the MIT
+    and Broad Institute animal care and use committees. Video of common
+    marmosets (Callithrix jacchus) was collected in the laboratory of Guoping
+    Feng at MIT. Marmosets were recorded using Kinect V2 cameras (Microsoft)
+    with a resolution of 1080p and a frame rate of 30 Hz. After acquisition,
+    images to be used for training the network were manually cropped to
+    1000 x 1000 pixels or smaller. The dataset consists of 7,600 labeled frames
+    from 40 different marmosets collected from 3 different colonies (in
+    different facilities). Each cage contained a pair of marmosets, where one
+    marmoset had light blue dye applied to its tufts. One human annotator
+    labeled the 15 marker points on each animal present in the frame (frames
+    contained either 1 or 2 animals).
+    """
+
+    name = "marmosets"
+    keypoints = (
+        "Front",
+        "Right",
+        "Middle",
+        "Left",
+        "FL1",
+        "BL1",
+        "FR1",
+        "BR1",
+        "BL2",
+        "BR2",
+        "FL2",
+        "FR2",
+        "Body1",
+        "Body2",
+        "Body3",
+    )
+    ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Mackenzie.h5")
+    metadata = deeplabcut.benchmark.get_filepath(
+        "Documentation_data-Marmoset_70shuffle1.pickle"
+    )
+    num_animals = 2
+
+
+class FishBenchmark(deeplabcut.benchmark.base.Benchmark):
+    """Dataset with multiple fish, filmed from a top view.
+
+    Schools of inland silversides (Menidia beryllina, n=14 individuals per
+    school) were recorded in the Lauder Lab at Harvard University while
+    swimming at 15 speeds (0.5 to 8 BL/s, body lengths per second, at 0.5 BL/s
+    intervals) in a flow tank with a total working section of 28 x 28 x 40 cm,
+    as described in previous work, at a constant temperature (18±1°C) and
+    salinity (33 ppt), and at a Reynolds number of approximately 10,000 (based
+    on BL). Dorsal views of steady swimming across these speeds were recorded
+    by high-speed video cameras (FASTCAM Mini AX50, Photron USA, San Diego, CA,
+    USA) at 60-125 frames per second (feeding videos at 60 fps, swimming alone
+    at 125 fps). The dorsal view was recorded above the swim tunnel, and a
+    floating Plexiglas panel at the water surface prevented surface ripples
+    from interfering with the dorsal view videos. Five keypoints were labeled
+    (tip, gill, peduncle, dorsal fin tip, caudal tip). 100 frames were labeled,
+    making this a real-world sized laboratory dataset.
+    """
+
+    name = "fish"
+    keypoints = ("tip", "gill", "peduncle", "caudaltip", "dfintip")
+    ground_truth = deeplabcut.benchmark.get_filepath("CollectedData_Valentina.h5")
+    metadata = deeplabcut.benchmark.get_filepath(
+        "Documentation_data-Schooling_70shuffle1.pickle"
+    )
+    num_animals = 14
+
+    def compute_pose_rmse(self, results_objects):
+        return deeplabcut.benchmark.metrics.calc_rmse_from_obj(
+            results_objects,
+            h5_file=self.ground_truth,
+            metadata_file=self.metadata,
+            drop_kpts=[4, 5],
+        )
+
+    def compute_pose_map(self, results_objects):
+        return deeplabcut.benchmark.metrics.calc_map_from_obj(
+            results_objects,
+            h5_file=self.ground_truth,
+            metadata_file=self.metadata,
+            drop_kpts=[4, 5],
+        )
diff --git a/deeplabcut/benchmark/cli.py b/deeplabcut/benchmark/cli.py
new file mode 100644
index 0000000000..250f2ec9e5
--- /dev/null
+++ b/deeplabcut/benchmark/cli.py
@@ -0,0 +1,46 @@
+# DeepLabCut2.0 Toolbox (deeplabcut.org)
+# © A. & M. Mathis Labs
+# https://github.com/AlexEMG/DeepLabCut
+# Please see AUTHORS for contributors.
+#
+# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS
+# Licensed under GNU Lesser General Public License v3.0
+
+"""Command line interface for the DeepLabCut benchmark."""
+
+import argparse
+
+import deeplabcut.benchmark
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--include", nargs="+", default=None, required=False)
+    parser.add_argument(
+        "--onerror",
+        default="return",
+        required=False,
+        choices=("ignore", "return", "raise"),
+    )
+    parser.add_argument("--nocache", action="store_true")
+    return parser.parse_args()
+
+
+def main():
+    """Main CLI entry point for generating DeepLabCut benchmark results."""
+    args = _parse_args()
+    if not args.nocache:
+        results = deeplabcut.benchmark.loadcache()
+    else:
+        results = None
+    results = deeplabcut.benchmark.evaluate(
+        include_benchmarks=args.include,
+        results=results,
+        on_error=args.onerror,
+    )
+    if not args.nocache:
+        deeplabcut.benchmark.savecache(results)
+    try:
+        print(results.toframe())
+    except StopIteration:
+        pass
diff --git a/deeplabcut/benchmark/metrics.py b/deeplabcut/benchmark/metrics.py
new file mode 100644
index 0000000000..3f0a67c78b
--- /dev/null
+++ b/deeplabcut/benchmark/metrics.py
@@ -0,0 +1,234 @@
+# DeepLabCut2.0 Toolbox (deeplabcut.org)
+# © A. & M. Mathis Labs
+# https://github.com/AlexEMG/DeepLabCut
+# Please see AUTHORS for contributors.
+#
+# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS
+# Licensed under GNU Lesser General Public License v3.0
+
+"""Evaluation metrics for the DeepLabCut benchmark."""
+
+import sys
+import unittest.mock
+
+# TODO(stes): mock a few modules to reduce the dependencies,
+# without causing import errors when using deeplabcut.
+MOCK_MODULES = ["statsmodels", "statsmodels.api", "pytables"]
+for mod_name in MOCK_MODULES:
+    sys.modules[mod_name] = unittest.mock.MagicMock()
+
+import os
+import pickle
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+
+import deeplabcut.benchmark.utils
+from deeplabcut.pose_estimation_tensorflow.core import evaluate_multianimal
+from deeplabcut.pose_estimation_tensorflow.lib import inferenceutils
+from deeplabcut.utils.conversioncode import guarantee_multiindex_rows
+
+
+def _format_gt_data(h5file):
+    df = pd.read_hdf(h5file)
+
+    def _get_unique_level_values(header, level):
+        return header.get_level_values(level).unique().to_list()
+
+    animals = _get_unique_level_values(df.columns, "individuals")
+    kpts = _get_unique_level_values(df.columns, "bodyparts")
+    try:
+        n_unique = len(
+            _get_unique_level_values(
+                df.xs("single", level="individuals", axis=1).columns, "bodyparts"
+            )
+        )
+    except KeyError:
+        n_unique = 0
+    guarantee_multiindex_rows(df)
+    file_paths = [os.path.join(*row) for row in df.index.to_list()]
+    temp = (
+        df.stack("individuals", dropna=False)
+        .reindex(animals, level="individuals")
+        .reindex(kpts, level="bodyparts", axis=1)
+    )
+    data = temp.to_numpy().reshape((len(file_paths), len(animals), -1, 2))
+    meta = {"animals": animals, "keypoints": kpts, "n_unique": n_unique}
+    return {
+        "annotations": dict(zip(file_paths, data)),
+        "metadata": meta,
+    }
+
+
+def calc_prediction_errors(preds, gt):
+    kpts_gt = gt["metadata"]["keypoints"]
+    kpts_pred = preds["metadata"]["keypoints"]
+    map_ = {kpts_gt.index(kpt): i for i, kpt in enumerate(kpts_pred)}
+    annot = gt["annotations"]
+
+    # Map image paths from the predicted data to the GT, as the former are
+    # typically absolute whereas the latter are relative to the project path.
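+    # For example (hypothetical paths), "/home/user/project/labeled-data/vid/img001.png"
+    # in the predictions would be mapped to "labeled-data/vid/img001.png" in the GT.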
+    def _map(strings, substrings):
+        lookup = dict()
+        strings_ = strings.copy()
+        substrings_ = substrings.copy()
+        while strings_:
+            string = strings_.pop()
+            for s in substrings_:
+                if string.endswith(s):
+                    lookup[string] = s
+                    substrings_.remove(s)
+                    break
+        return lookup
+
+    map_images = _map(list(preds["predictions"]), list(annot))
+
+    errors = np.full(
+        (
+            len(preds["predictions"]),
+            len(gt["metadata"]["animals"]),
+            len(kpts_gt),
+            2,  # Holds the distance to the GT and the confidence
+        ),
+        np.nan,
+    )
+    for n, (path, preds_) in enumerate(preds["predictions"].items()):
+        if not preds_:
+            continue
+        xy_gt = annot[map_images[path]].swapaxes(0, 1)
+        xy_pred = preds_["coordinates"][0]
+        conf_pred = preds_["confidence"]
+        for i, xy_gt_ in enumerate(xy_gt):
+            visible = np.flatnonzero(np.all(~np.isnan(xy_gt_), axis=1))
+            xy_pred_ = xy_pred[map_[i]]
+            if visible.size and xy_pred_.size:
+                # Pick the predictions closest to the ground truth,
+                # rather than the ones the model is most confident in.
+                neighbors = evaluate_multianimal._find_closest_neighbors(
+                    xy_gt_[visible], xy_pred_, k=3
+                )
+                found = neighbors != -1
+                if not np.any(found):
+                    continue
+                min_dists = np.linalg.norm(
+                    xy_gt_[visible][found] - xy_pred_[neighbors[found]],
+                    axis=1,
+                )
+                conf_pred_ = conf_pred[map_[i]]
+                errors[n, visible[found], i, 0] = min_dists
+                errors[n, visible[found], i, 1] = conf_pred_[neighbors[found], 0]
+    return errors
+
+
+def conv_obj_to_assemblies(eval_results_obj, keypoint_names):
+    """Convert predictions to DeepLabCut assemblies."""
+    assemblies = {}
+    for image_path, results in eval_results_obj.items():
+        lst = []
+        for dict_ in results:
+            ass = inferenceutils.Assembly(len(keypoint_names))
+            for i, kpt in enumerate(keypoint_names):
+                xy = dict_["pose"][kpt]
+                if not np.isnan(xy).all():
+                    joint = inferenceutils.Joint(pos=xy, label=i)
+                    ass.add_joint(joint)
+            # TODO(jeylau): add an affinity setter to Assembly
+            ass._affinity = dict_["score"]
+            ass._links = [None]
+            if len(ass):
+                lst.append(ass)
+        assemblies[image_path] = lst
+    return assemblies
+
+
+def calc_map_from_obj(
+    eval_results_obj,
+    h5_file,
+    metadata_file,
+    oks_sigma=0.1,
+    margin=0,
+    symmetric_kpts=None,
+    drop_kpts=None,
+):
+    """Calculate the mean average precision (mAP) of the predictions."""
+    df = pd.read_hdf(h5_file)
+    try:
+        df.drop("single", level="individuals", axis=1, inplace=True)
+    except KeyError:
+        pass
+    n_animals = len(df.columns.get_level_values("individuals").unique())
+    kpts = list(df.columns.get_level_values("bodyparts").unique())
+    image_paths = list(eval_results_obj)
+    ground_truth = (
+        df.loc[image_paths].to_numpy().reshape((len(image_paths), n_animals, -1, 2))
+    )
+    temp = np.ones((*ground_truth.shape[:3], 3))
+    temp[..., :2] = ground_truth
+    assemblies_gt = inferenceutils._parse_ground_truth_data(temp)
+    with open(metadata_file, "rb") as f:
+        inds_test = set(pickle.load(f)[2])
+    assemblies_gt_test = {k: v for k, v in assemblies_gt.items() if k in inds_test}
+
+    # TODO(stes): remove/rewrite
+    if drop_kpts is not None:
+        temp = {}
+        for k, v in assemblies_gt_test.items():
+            lst = []
+            for a in v:
+                arr = np.delete(a.data[:, :3], drop_kpts, axis=0)
+                a = inferenceutils.Assembly.from_array(arr)
+                lst.append(a)
+            temp[k] = lst
+        assemblies_gt_test = temp
+        for ind in sorted(drop_kpts, reverse=True):
+            kpts.pop(ind)
+
+    assemblies_pred_ = conv_obj_to_assemblies(eval_results_obj, kpts)
+    assemblies_pred = dict(enumerate(assemblies_pred_.values()))
+
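+    # evaluate_assembly scores the predicted assemblies against the ground
+    # truth via object keypoint similarity (OKS) and returns COCO-style
+    # precision metrics; its console output is silenced since only the
+    # aggregate mAP value is used here.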
+    with deeplabcut.benchmark.utils.DisableOutput():
+        oks = inferenceutils.evaluate_assembly(
+            assemblies_pred,
+            assemblies_gt_test,
+            oks_sigma,
+            margin=margin,
+            symmetric_kpts=symmetric_kpts,
+        )
+    return oks["mAP"]
+
+
+def calc_rmse_from_obj(
+    eval_results_obj,
+    h5_file,
+    metadata_file,
+    drop_kpts=None,
+):
+    """Calculate the root mean squared prediction error (RMSE) for submissions."""
+    gt = _format_gt_data(h5_file)
+    kpts = gt["metadata"]["keypoints"]
+    if drop_kpts:
+        for k, v in gt["annotations"].items():
+            gt["annotations"][k] = np.delete(v, drop_kpts, axis=1)
+        for ind in sorted(drop_kpts, reverse=True):
+            kpts.pop(ind)
+    with open(metadata_file, "rb") as f:
+        inds_test = set(pickle.load(f)[2])
+    test_objects = {
+        k: v for i, (k, v) in enumerate(eval_results_obj.items()) if i in inds_test
+    }
+    assemblies_pred = conv_obj_to_assemblies(test_objects, kpts)
+    preds = defaultdict(dict)
+    preds["metadata"]["keypoints"] = kpts
+    for image, assemblies in assemblies_pred.items():
+        if assemblies:
+            arr = np.stack([a.data for a in assemblies]).swapaxes(0, 1)
+            data = [xy[~np.isnan(xy).any(axis=1)] for xy in arr[..., :2]]
+            temp = {
+                "coordinates": tuple([data]),
+                "confidence": list(np.expand_dims(arr[..., 2], axis=2)),
+            }
+            preds["predictions"][image] = temp
+    with deeplabcut.benchmark.utils.DisableOutput():
+        errors = calc_prediction_errors(preds, gt)
+    return np.nanmean(errors[..., 0])
diff --git a/deeplabcut/benchmark/utils.py b/deeplabcut/benchmark/utils.py
new file mode 100644
index 0000000000..40aa2495d0
--- /dev/null
+++ b/deeplabcut/benchmark/utils.py
@@ -0,0 +1,67 @@
+# DeepLabCut2.0 Toolbox (deeplabcut.org)
+# © A. & M. Mathis Labs
+# https://github.com/AlexEMG/DeepLabCut
+# Please see AUTHORS for contributors.
+#
+# https://github.com/AlexEMG/DeepLabCut/blob/master/AUTHORS
+# Licensed under GNU Lesser General Public License v3.0
+
+"""Helper functions in this file are not affected by the main repository's
+license. They are independent from the remainder of the benchmarking code.
+"""
+import importlib
+import os
+import pkgutil
+import sys
+
+
+class RedirectStdStreams(object):
+    """Context manager for redirecting stdout and stderr.
+
+    Reference:
+        https://stackoverflow.com/a/6796752
+        CC BY-SA 3.0, https://stackoverflow.com/users/46690/rob-cowie
+    """
+
+    def __init__(self, stdout=None, stderr=None):
+        self._stdout = stdout or sys.stdout
+        self._stderr = stderr or sys.stderr
+
+    def __enter__(self):
+        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
+        self.old_stdout.flush()
+        self.old_stderr.flush()
+        sys.stdout, sys.stderr = self._stdout, self._stderr
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._stdout.flush()
+        self._stderr.flush()
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+
+class DisableOutput(RedirectStdStreams):
+    def __init__(self):
+        devnull = open(os.devnull, "w")
+        super().__init__(stdout=devnull, stderr=devnull)
+
+
+def import_submodules(package, recursive=True):
+    """Import all submodules of a module, recursively, including subpackages.
+
+    :param package: package (name or actual module)
+    :type package: str | module
+    :rtype: dict[str, types.ModuleType]
+
+    Reference:
+        https://stackoverflow.com/a/25562415
+        CC BY-SA 3.0, https://stackoverflow.com/users/712522/mr-b
+    """
+    if isinstance(package, str):
+        package = importlib.import_module(package)
+    results = {}
+    for loader, name, is_pkg in pkgutil.walk_packages(package.__path__):
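+        # Build the fully qualified module name and import it; recurse into
+        # subpackages so that nested submission modules are discovered too.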
+        full_name = package.__name__ + "." + name
+        results[full_name] = importlib.import_module(full_name)
+        if recursive and is_pkg:
+            results.update(import_submodules(full_name))
+    return results
diff --git a/docs/benchmark.md b/docs/benchmark.md
new file mode 100644
index 0000000000..114568decd
--- /dev/null
+++ b/docs/benchmark.md
@@ -0,0 +1,35 @@
+# DeepLabCut benchmark
+
+For further information and the leaderboard, see [the official homepage](https://benchmark.deeplabcut.org/).
+
+## High-level API
+
+When implementing your own benchmarks, the most important functions are directly accessible
+under the ``deeplabcut.benchmark`` package.
+
+```{eval-rst}
+.. automodule:: deeplabcut.benchmark
+    :members:
+    :show-inheritance:
+```
+
+## Available benchmark definitions
+
+See [the official benchmark page](https://benchmark.deeplabcut.org/datasets.html) for a full overview
+of the available datasets. A benchmark submission should contain a result for at least one of these
+benchmarks. For an example of how to implement a benchmark submission, refer to the baselines in the
+[DeepLabCut benchmark repo](https://github.com/DeepLabCut/benchmark/tree/main/benchmark/baselines).
+
+```{eval-rst}
+.. automodule:: deeplabcut.benchmark.benchmarks
+    :members:
+    :show-inheritance:
+```
+
+## Metric calculation
+
+```{eval-rst}
+.. automodule:: deeplabcut.benchmark.metrics
+    :members:
+    :show-inheritance:
+```
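+
+## Example: registering a submission
+
+The following is a minimal sketch of a benchmark submission, assuming model
+predictions are available locally. The class name, the submission name, and
+the body of ``get_predictions`` are placeholders that a real submission would
+replace with its own prediction-loading logic.
+
+```python
+import deeplabcut.benchmark
+from deeplabcut.benchmark.benchmarks import TriMouseBenchmark
+
+
+@deeplabcut.benchmark.register
+class MySubmission(TriMouseBenchmark):
+    """Hypothetical submission to the trimouse benchmark."""
+
+    def names(self):
+        return ("my-model",)
+
+    def get_predictions(self, name):
+        # Return predictions keyed by image path; each entry is a list of
+        # per-animal dictionaries with "pose" (keypoint name -> x/y) and
+        # "score" entries, as consumed by the metric functions.
+        raise NotImplementedError("Load your model predictions here.")
+
+
+results = deeplabcut.benchmark.evaluate(include_benchmarks=["trimouse"])
+print(results.toframe())
+```
+
+The same evaluation can also be run from the command line, assuming the
+submission modules are importable, e.g.
+`python -m deeplabcut.benchmark --include trimouse --onerror return`.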