Skip to content

Commit 513d4d6

Browse files
jsondai and copybara-github
authored and committed
chore: GenAI Client(evals) - Add class methods for parsing raw Agent Session history into the new AgentData structure. Add agent_resource_name attribute to AgentConfig and loading methods.
feat: GenAI Client(evals) - Add 3 new multi-turn predefined metrics for agent evaluation (`MULTI_TURN_TOOL_USE_QUALITY`, `MULTI_TURN_TRAJECTORY_QUALITY`, `MULTI_TURN_TASK_SUCCESS`). chore: GenAI Client(evals) - Update evaluation data converters and metric handlers to natively support `AgentData` in `EvaluationDataset` and `EvalCase`. chore: GenAI Client(evals) - Map `agent_data` to `agent_eval_data` in Vertex REST payload generation. PiperOrigin-RevId: 878061362
1 parent e9f1c88 commit 513d4d6

9 files changed

Lines changed: 595 additions & 86 deletions

File tree

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from tests.unit.vertexai.genai.replays import pytest_helper
1818
from vertexai._genai import types
19+
from google.genai import types as genai_types
1920
import pandas as pd
2021

2122

@@ -96,6 +97,262 @@ def test_evaluation_byor(client):
9697
assert case_result.response_candidate_results is not None
9798

9899

100+
def test_evaluation_agent_data(client):
    """Tests evaluate method with AgentData."""
    # Replay test targets the autopush sandbox endpoint on v1beta1.
    client._api_client._http_options.base_url = (
        "https://autopush-aiplatform.sandbox.googleapis.com/"
    )
    client._api_client._http_options.api_version = "v1beta1"

    def single_function_tool(name, description):
        # A Tool exposing exactly one FunctionDeclaration.
        return genai_types.Tool(
            function_declarations=[
                genai_types.FunctionDeclaration(name=name, description=description)
            ]
        )

    def user_event(text):
        # A user-authored text event.
        return types.evals.AgentEvent(
            author="user",
            content=genai_types.Content(
                role="user", parts=[genai_types.Part(text=text)]
            ),
        )

    def call_event(author, fn_name, fn_args):
        # A model-role event carrying a single function call.
        return types.evals.AgentEvent(
            author=author,
            content=genai_types.Content(
                role="model",
                parts=[
                    genai_types.Part(
                        function_call=genai_types.FunctionCall(
                            name=fn_name, args=fn_args
                        )
                    )
                ],
            ),
        )

    def tool_event(author, fn_name, fn_response):
        # A tool-role event carrying a single function response.
        return types.evals.AgentEvent(
            author=author,
            content=genai_types.Content(
                role="tool",
                parts=[
                    genai_types.Part(
                        function_response=genai_types.FunctionResponse(
                            name=fn_name, response=fn_response
                        )
                    )
                ],
            ),
        )

    def text_event(author, text):
        # A model-role event carrying plain text.
        return types.evals.AgentEvent(
            author=author,
            content=genai_types.Content(
                role="model", parts=[genai_types.Part(text=text)]
            ),
        )

    # Multi-agent travel scenario: a router delegates to two specialists.
    agent_data = types.evals.AgentData(
        agents={
            "coordinator": types.evals.AgentConfig(
                agent_id="coordinator",
                agent_type="RouterAgent",
                description="Root agent that delegates to specialists.",
                instruction=(
                    "You are a travel coordinator. Delegate flight tasks to"
                    " 'flight_bot' and hotel tasks to 'hotel_bot'."
                ),
                sub_agents=["flight_bot", "hotel_bot"],
                tools=[
                    single_function_tool(
                        "delegate_to_agent",
                        "Delegates conversation to a sub-agent.",
                    )
                ],
            ),
            "flight_bot": types.evals.AgentConfig(
                agent_id="flight_bot",
                agent_type="SpecialistAgent",
                description="Handles flight searches.",
                instruction="Search for flights using the available tools.",
                tools=[
                    single_function_tool(
                        "search_flights",
                        "Finds flights based on origin and destination.",
                    )
                ],
            ),
            "hotel_bot": types.evals.AgentConfig(
                agent_id="hotel_bot",
                agent_type="SpecialistAgent",
                description="Handles hotel searches.",
                instruction="Search for hotels using the available tools.",
                tools=[
                    single_function_tool(
                        "search_hotels",
                        "Finds hotels in a given location.",
                    )
                ],
            ),
        },
        turns=[
            # Turn 0: user asks for a flight; coordinator delegates,
            # flight_bot calls its tool and summarizes the result.
            types.evals.ConversationTurn(
                turn_index=0,
                events=[
                    user_event(
                        "I need to book a flight to NYC for next Monday."
                    ),
                    call_event(
                        "coordinator",
                        "delegate_to_agent",
                        {"agent_name": "flight_bot"},
                    ),
                    call_event(
                        "flight_bot",
                        "search_flights",
                        {"destination": "NYC", "date": "next Monday"},
                    ),
                    tool_event(
                        "flight_bot",
                        "search_flights",
                        {"flights": [{"id": "UA100", "price": "$300"}]},
                    ),
                    text_event(
                        "flight_bot", "I found flight UA100 to NYC for $300."
                    ),
                ],
            ),
            # Turn 1: user adds a hotel request; same delegation pattern
            # through hotel_bot.
            types.evals.ConversationTurn(
                turn_index=1,
                events=[
                    user_event(
                        "Great, book that. I also need a hotel there."
                    ),
                    call_event(
                        "coordinator",
                        "delegate_to_agent",
                        {"agent_name": "hotel_bot"},
                    ),
                    call_event(
                        "hotel_bot", "search_hotels", {"location": "NYC"}
                    ),
                    tool_event(
                        "hotel_bot",
                        "search_hotels",
                        {
                            "hotels": [
                                {"name": "Central Park Hotel", "rating": 4.5}
                            ]
                        },
                    ),
                    text_event(
                        "hotel_bot", "I recommend the Central Park Hotel."
                    ),
                ],
            ),
        ],
    )

    # Create the EvalCase and wrap it in an EvaluationDataset
    eval_case = types.EvalCase(agent_data=agent_data)
    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])

    metrics = [
        types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
    ]

    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

    assert isinstance(evaluation_result, types.EvaluationResult)

    # Summary metrics: one aggregated result per requested metric.
    assert evaluation_result.summary_metrics is not None
    assert len(evaluation_result.summary_metrics) > 0
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None
        assert summary.mean_score is not None

    # Per-case results: one entry per EvalCase in the dataset.
    assert evaluation_result.eval_case_results is not None
    assert len(evaluation_result.eval_case_results) > 0
    for case_result in evaluation_result.eval_case_results:
        assert isinstance(case_result, types.EvalCaseResult)
        assert case_result.eval_case_index is not None
        assert case_result.response_candidate_results is not None
354+
355+
99356
pytestmark = pytest_helper.setup(
100357
file=__file__,
101358
globals_for_file=globals(),

tests/unit/vertexai/genai/replays/test_evaluate_instances.py

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -105,39 +105,6 @@ def test_pointwise_metric(client):
105105
assert response.pointwise_metric_result.score is not None
106106

107107

108-
# def test_predefined_metric_with_agent_data(client):
109-
# """Tests the _evaluate_instances method with predefined metric and agent_data."""
110-
# agent_data = types.evals.AgentData(
111-
# agent_config=types.evals.AgentConfig(
112-
# tools=[
113-
# genai_types.Tool(
114-
# function_declarations=[
115-
# genai_types.FunctionDeclaration(name="search")
116-
# ]
117-
# )
118-
# ],
119-
# developer_instruction=types.evals.InstanceData(text="instruction"),
120-
# ),
121-
# events=types.evals.Events(
122-
# event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
123-
# ),
124-
# )
125-
# instance = types.EvaluationInstance(
126-
# prompt=types.evals.InstanceData(text="What is the capital of France?"),
127-
# response=types.evals.InstanceData(text="Paris"),
128-
# reference=types.evals.InstanceData(text="Paris"),
129-
# agent_data=agent_data,
130-
# )
131-
132-
# response = client.evals.evaluate_instances(
133-
# metric_config=types._EvaluateInstancesRequestParameters(
134-
# metrics=[types.Metric(name="general_quality_v1")],
135-
# instance=instance,
136-
# )
137-
# )
138-
# assert response.metric_results[0].score is not None
139-
140-
141108
def test_pairwise_metric_with_autorater(client):
142109
"""Tests the _evaluate_instances method with PairwiseMetricInput and AutoraterConfig."""
143110

vertexai/_genai/_evals_common.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,8 +1184,10 @@ def _resolve_dataset_inputs(
11841184
datasets_to_process = dataset
11851185
logger.info("Processing %s dataset(s).", num_response_candidates)
11861186

1187-
loaded_raw_datasets: list[list[dict[str, Any]]] = []
1188-
schemas_for_merge: list[str] = []
1187+
if len(datasets_to_process) == 1 and datasets_to_process[0].eval_cases:
1188+
return datasets_to_process[0], 1
1189+
1190+
parsed_evaluation_datasets: list[types.EvaluationDataset] = []
11891191

11901192
for i, ds_item in enumerate(datasets_to_process):
11911193
if not isinstance(ds_item, types.EvaluationDataset):
@@ -1199,17 +1201,20 @@ def _resolve_dataset_inputs(
11991201
f"Item at index {i} is not an EvaluationDataset: {type(ds_item)}"
12001202
)
12011203

1204+
if ds_item.eval_cases:
1205+
logger.info("Dataset %d already contains eval_cases.", i)
1206+
parsed_evaluation_datasets.append(ds_item)
1207+
continue
1208+
12021209
ds_source_for_loader = _get_dataset_source(ds_item)
12031210
current_loaded_data = loader.load(ds_source_for_loader)
1204-
loaded_raw_datasets.append(current_loaded_data)
12051211

12061212
if dataset_schema:
12071213
current_schema = _evals_data_converters.EvalDatasetSchema(dataset_schema)
12081214
else:
12091215
current_schema = _evals_data_converters.auto_detect_dataset_schema( # type: ignore[assignment]
12101216
current_loaded_data
12111217
)
1212-
schemas_for_merge.append(current_schema)
12131218

12141219
logger.info(
12151220
"Dataset %d: Schema: %s. Using %s converter.",
@@ -1219,13 +1224,12 @@ def _resolve_dataset_inputs(
12191224
current_schema
12201225
).__class__.__name__,
12211226
)
1227+
converter = _evals_data_converters.get_dataset_converter(current_schema)
1228+
parsed_evaluation_datasets.append(converter.convert(current_loaded_data))
12221229

1223-
processed_eval_dataset = (
1224-
_evals_data_converters.merge_response_datasets_into_canonical_format(
1225-
raw_datasets=loaded_raw_datasets,
1226-
schemas=schemas_for_merge,
1227-
agent_info=agent_info,
1228-
)
1230+
processed_eval_dataset = _evals_data_converters.merge_evaluation_datasets(
1231+
datasets=parsed_evaluation_datasets,
1232+
agent_info=agent_info,
12291233
)
12301234

12311235
if not processed_eval_dataset.eval_cases:

vertexai/_genai/_evals_constant.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323
"safety_v1",
2424
"multi_turn_general_quality_v1",
2525
"multi_turn_text_quality_v1",
26+
"multi_turn_tool_use_quality_v1",
27+
"multi_turn_trajectory_quality_v1",
28+
"multi_turn_task_success_v1",
2629
"final_response_match_v2",
2730
"final_response_reference_free_v1",
2831
"final_response_quality_v1",

0 commit comments

Comments
 (0)