-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathbenchmark.py
More file actions
167 lines (141 loc) · 5.34 KB
/
benchmark.py
File metadata and controls
167 lines (141 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""Benchmark resource class for synchronous operations."""
from __future__ import annotations
from typing import List
from typing_extensions import Unpack, override
from ..types import BenchmarkView
from ._types import (
BaseRequestOptions,
LongRequestOptions,
SDKBenchmarkUpdateParams,
SDKBenchmarkListRunsParams,
SDKBenchmarkStartRunParams,
)
from .._types import SequenceNotStr
from .._client import Runloop
from .benchmark_run import BenchmarkRun
class Benchmark:
    """A benchmark for evaluating agent performance across scenarios.

    Provides methods for retrieving benchmark details, updating the benchmark,
    managing scenarios, and starting benchmark runs. Obtain instances via
    ``runloop.benchmark.from_id()`` or ``runloop.benchmark.list()``.

    Example:
        >>> benchmark = runloop.benchmark.from_id("bmd_xxx")
        >>> info = benchmark.get_info()
        >>> run = benchmark.start_run(run_name="evaluation-v1")
        >>> for scenario_id in info.scenario_ids:
        ...     scenario = runloop.scenario.from_id(scenario_id)
        ...     scenario_run = scenario.run(benchmark_run_id=run.id, run_name="evaluation-v1")
    """

    def __init__(self, client: Runloop, benchmark_id: str) -> None:
        """Create a Benchmark instance.

        No request is made here; the client and ID are only stored.

        :param client: Runloop client instance
        :type client: Runloop
        :param benchmark_id: Benchmark ID
        :type benchmark_id: str
        """
        self._client = client
        self._id = benchmark_id

    @override
    def __repr__(self) -> str:
        return f"<Benchmark id={self._id!r}>"

    @property
    def id(self) -> str:
        """Return the benchmark ID.

        :return: Unique benchmark ID
        :rtype: str
        """
        return self._id

    def get_info(
        self,
        **options: Unpack[BaseRequestOptions],
    ) -> BenchmarkView:
        """Retrieve current benchmark details.

        Delegates to ``client.benchmarks.retrieve`` with this benchmark's ID.

        :param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
        :return: Current benchmark info
        :rtype: BenchmarkView
        """
        return self._client.benchmarks.retrieve(
            self._id,
            **options,
        )

    def update(
        self,
        **params: Unpack[SDKBenchmarkUpdateParams],
    ) -> BenchmarkView:
        """Update the benchmark.

        Only provided fields will be updated.

        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkUpdateParams` for available parameters
        :return: Updated benchmark info
        :rtype: BenchmarkView
        """
        return self._client.benchmarks.update(
            self._id,
            **params,
        )

    def start_run(
        self,
        **params: Unpack[SDKBenchmarkStartRunParams],
    ) -> BenchmarkRun:
        """Start a new benchmark run.

        Creates a new benchmark run and returns a BenchmarkRun instance for
        managing the run lifecycle.

        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkStartRunParams` for available parameters
        :return: BenchmarkRun instance for managing the run
        :rtype: BenchmarkRun
        :raises AssertionError: If the created run comes back without a ``benchmark_id``
        """
        run_view = self._client.benchmarks.start_run(
            benchmark_id=self._id,
            **params,
        )
        # Raise explicitly instead of using ``assert``: assert statements are
        # stripped under ``python -O``, which would let a ``None`` benchmark ID
        # flow into BenchmarkRun. The exception type and message are unchanged.
        if run_view.benchmark_id is None:
            raise AssertionError("benchmark_id should be set for runs created from a benchmark")
        return BenchmarkRun(self._client, run_view.id, run_view.benchmark_id)

    def add_scenarios(
        self,
        scenario_ids: SequenceNotStr[str],
        **options: Unpack[LongRequestOptions],
    ) -> BenchmarkView:
        """Add scenarios to the benchmark.

        Delegates to ``client.benchmarks.update_scenarios`` with
        ``scenarios_to_add`` set to ``scenario_ids``.

        :param scenario_ids: List of scenario IDs to add
        :type scenario_ids: SequenceNotStr[str]
        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
        :return: Updated benchmark info
        :rtype: BenchmarkView
        """
        return self._client.benchmarks.update_scenarios(
            self._id,
            scenarios_to_add=scenario_ids,
            **options,
        )

    def remove_scenarios(
        self,
        scenario_ids: SequenceNotStr[str],
        **options: Unpack[LongRequestOptions],
    ) -> BenchmarkView:
        """Remove scenarios from the benchmark.

        Delegates to ``client.benchmarks.update_scenarios`` with
        ``scenarios_to_remove`` set to ``scenario_ids``.

        :param scenario_ids: List of scenario IDs to remove
        :type scenario_ids: SequenceNotStr[str]
        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
        :return: Updated benchmark info
        :rtype: BenchmarkView
        """
        return self._client.benchmarks.update_scenarios(
            self._id,
            scenarios_to_remove=scenario_ids,
            **options,
        )

    def list_runs(
        self,
        **params: Unpack[SDKBenchmarkListRunsParams],
    ) -> List[BenchmarkRun]:
        """List runs for this benchmark.

        Makes a single ``client.benchmark_runs.list`` call filtered by this
        benchmark's ID and wraps each entry of the returned page's ``runs`` in
        a BenchmarkRun. Entries whose ``benchmark_id`` is ``None`` are skipped.

        NOTE(review): only ``page.runs`` of the one returned page is read; if
        the endpoint paginates, later pages are presumably not fetched here --
        confirm against the client's pagination behavior.

        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkListRunsParams` for available parameters
        :return: List of benchmark runs
        :rtype: List[BenchmarkRun]
        """
        page = self._client.benchmark_runs.list(
            benchmark_id=self._id,
            **params,
        )
        return [
            BenchmarkRun(self._client, run.id, run.benchmark_id)
            for run in page.runs
            if run.benchmark_id is not None
        ]