Skip to content

Commit 5aab168

Browse files
feat (Gen AI): Add new sample for Evaluation - Pairwise summarization metric (GoogleCloudPlatform#12608)
* Add sample for Gen AI Evaluation: Pairwise summarization quality * refresh CI builds * refresh CI builds * Update comments to follow the runnable-snippets-plan standard * move pairwise_evaluation sample and test to a dedicated directory * refresh CI checks * specify exact model version * update region tag names to follow standard schema * add example response
1 parent 48f5cc7 commit 5aab168

File tree

2 files changed

+113
-0
lines changed

2 files changed

+113
-0
lines changed
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import os
15+
16+
from vertexai.preview.evaluation import EvalResult
17+
18+
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
19+
20+
21+
def evaluate_output() -> EvalResult:
    """Run a pairwise summarization-quality evaluation and return its result.

    Uses the Vertex AI evaluation service to compare a candidate Gemini model
    against a baseline model on a single summarization prompt, prints both
    responses plus the judge's verdict and explanation, and returns the full
    evaluation result.
    """
    # [START generativeaionvertexai_evaluation_pairwise_summarization_quality]
    import pandas as pd

    import vertexai
    from vertexai.generative_models import GenerativeModel
    from vertexai.evaluation import (
        EvalTask,
        PairwiseMetric,
        MetricPromptTemplateExamples,
    )

    # TODO(developer): Update & uncomment line below
    # PROJECT_ID = "your-project-id"
    vertexai.init(project=PROJECT_ID, location="us-central1")

    prompt = """
    Summarize the text such that a five-year-old can understand.

    # Text

    As part of a comprehensive initiative to tackle urban congestion and foster
    sustainable urban living, a major city has revealed ambitious plans for an
    extensive overhaul of its public transportation system. The project aims not
    only to improve the efficiency and reliability of public transit but also to
    reduce the city\'s carbon footprint and promote eco-friendly commuting options.
    City officials anticipate that this strategic investment will enhance
    accessibility for residents and visitors alike, ushering in a new era of
    efficient, environmentally conscious urban transportation.
    """

    # One-row dataset: each row is a prompt both models will answer.
    dataset = pd.DataFrame({"prompt": [prompt]})

    # Baseline model that every candidate answer is judged against.
    baseline = GenerativeModel("gemini-1.5-pro-001")

    # Candidate model under evaluation.
    candidate = GenerativeModel(
        "gemini-1.5-pro-002", generation_config={"temperature": 0.4}
    )

    # Canned judge-prompt template for pairwise summarization quality.
    judge_prompt_template = MetricPromptTemplateExamples.get_prompt_template(
        "pairwise_summarization_quality"
    )

    pairwise_metric = PairwiseMetric(
        metric="pairwise_summarization_quality",
        metric_prompt_template=judge_prompt_template,
        baseline_model=baseline,
    )

    task = EvalTask(
        dataset=dataset,
        metrics=[pairwise_metric],
        experiment="pairwise-experiment",
    )
    eval_result = task.evaluate(model=candidate)

    # Pull the single row of results out of the metrics table.
    table = eval_result.metrics_table
    baseline_response = table["baseline_model_response"].iloc[0]
    candidate_response = table["response"].iloc[0]
    winner = table["pairwise_summarization_quality/pairwise_choice"].iloc[0]
    explanation = table["pairwise_summarization_quality/explanation"].iloc[0]

    print(f"Baseline's story:\n{baseline_response}")
    print(f"Candidate's story:\n{candidate_response}")
    print(f"Winner: {winner}")
    print(f"Explanation: {explanation}")
    # Example response:
    # Baseline's story:
    # A big city wants to make it easier for people to get around without using cars! They're going to make buses and trains ...
    #
    # Candidate's story:
    # A big city wants to make it easier for people to get around without using cars! ... This will help keep the air clean ...
    #
    # Winner: CANDIDATE
    # Explanation: Both responses adhere to the prompt's constraints, are grounded in the provided text, and ... However, Response B ...

    # [END generativeaionvertexai_evaluation_pairwise_summarization_quality]
    return eval_result

generative_ai/evaluation/test_evaluation.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,14 @@
1313
# limitations under the License.
1414

1515
import get_rouge_score
16+
import pairwise_summarization_quality
1617

1718

1819
def test_create_evaluation_task() -> None:
    # Smoke test: the ROUGE-score sample should return a truthy result.
    assert get_rouge_score.get_rouge_score()
22+
23+
24+
def test_pairwise_evaluation_summarization_quality() -> None:
    # Smoke test: the pairwise-evaluation sample should return a truthy result.
    assert pairwise_summarization_quality.evaluate_output()

0 commit comments

Comments
 (0)