Skip to content

Commit 13dac96

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add support for custom result parsing in LLM-based evaluation metrics
FUTURE_COPYBARA_INTEGRATE_REVIEW=#6486 from googleapis:release-please--branches--main fb49c58 PiperOrigin-RevId: 892593906
1 parent 1fba45b commit 13dac96

3 files changed

Lines changed: 56 additions & 58 deletions

File tree

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -361,41 +361,41 @@ def test_evaluation_metric_resource_name(client):
361361
)
362362
tone_check_metric = types.LLMMetric(
363363
name="tone_check",
364-
prompt_template="""
365-
# Instruction
366-
You are a professional writing evaluator. Your job is to score writing responses according to pre-defined evaluation criteria.
367-
368-
# Criteria
369-
Analyze the tone of the response based on these two criteria:
370-
1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.
371-
2. Empathy: The response should acknowledge the user's feelings and show understanding.
372-
373-
# Input
374-
Prompt: {agent_data.turns[0].events[0]}
375-
Response: {agent_data.turns[0].events[1]}
376-
377-
# Output Format
378-
Respond in a JSON format with the following schema:
379-
{
380-
"type": "OBJECT",
381-
"properties": {
382-
"score": {"type": "NUMBER"},
383-
"explanation": {"type": "STRING"},
384-
},
385-
"required": ["score", "explanation"],
386-
}
387-
Return the JSON format output in a string representation of a Python dictionary directly, without strings like '```json' or '```'.
388-
389-
The output would include the following fields:
390-
score: based on your evaluation, the score should be a number based on the rating rubrics.
391-
explanation: your explanation for the score rating, in one line.
392-
393-
## Example Output Format:
394-
{"score" : -1, "explanation": "Here is the reason that the response is given a score of -1 based on the rating rubric."}
395-
{"score" : 3, "explanation": "Here is the reason that the response is given a score of 3 based on the rating rubric."}
396-
{"score" : 0, "explanation": "Here is the reason that the response is given a score of 0 based on the rating rubric."}
397-
{"score" : 5, "explanation": "Here is the reason that the response is given a score of 5 based on the rating rubric."}
398-
""",
364+
prompt_template="""Analyze the tone of the response based on these two criteria:\n
365+
1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.\n
366+
2. Empathy: The response should acknowledge the user's feelings and show understanding.\n\n
367+
Prompt: {agent_data.turns[0].events[0]}
368+
Response: {agent_data.turns[0].events[1]}
369+
Return ONLY a JSON list of objects for these two properties:
370+
'[{"property": "Professionalism", "verdict": true, "reasoning": "..."}, '
371+
'{"property": "Empathy", "verdict": true, "reasoning": "..."}]'
372+
""",
373+
result_parsing_function="""
374+
import json, re
375+
def parse_results(responses):
376+
text = responses[0]
377+
# Use robust regex to find the JSON list block
378+
match = re.search("[\\[].*[]]", text, re.DOTALL)
379+
if not match: return {"score": 0.0, "explanation": "No valid JSON found"}
380+
381+
try:
382+
data = json.loads(match.group(0))
383+
# Calculate an overall score (e.g., average of verdicts)
384+
passed_count = sum(1 for r in data if r.get("verdict", False))
385+
total_count = len(data)
386+
score = passed_count / total_count if total_count > 0 else 0.0
387+
388+
# Consolidate reasoning into a single explanation string
389+
explanation = "\\n".join([f"{r.get('property')}: {r.get('reasoning')}" for r in data])
390+
391+
# IMPORTANT: Return a dictionary, not a list
392+
return {
393+
"score": float(score),
394+
"explanation": explanation
395+
}
396+
except Exception as e:
397+
return {"score": 0.0, "explanation": f"Parsing failed: {str(e)}"}
398+
""",
399399
)
400400
metric_resource_name = client.evals.create_evaluation_metric(
401401
metric=tone_check_metric,

tests/unit/vertexai/genai/replays/test_evaluation_metric.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,12 @@
2424

2525
def test_create_and_get_evaluation_metric(client):
2626
client._api_client._http_options.api_version = "v1beta1"
27-
client._api_client._http_options.base_url = (
28-
"https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
29-
)
3027
result = client.evals.create_evaluation_metric(
3128
display_name="test_metric",
3229
description="test_description",
33-
metric=types.RubricMetric.GENERAL_QUALITY,
30+
metric=types.LLMMetric(
31+
name="custom_llm_metric", prompt_template="test_prompt_template"
32+
),
3433
)
3534
assert isinstance(result, str)
3635
assert re.match(
@@ -44,9 +43,6 @@ def test_create_and_get_evaluation_metric(client):
4443

4544
def test_list_evaluation_metrics(client):
4645
client._api_client._http_options.api_version = "v1beta1"
47-
client._api_client._http_options.base_url = (
48-
"https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
49-
)
5046
response = client.evals.list_evaluation_metrics()
5147
assert isinstance(response, types.ListEvaluationMetricsResponse)
5248
assert len(response.evaluation_metrics) >= 0

vertexai/_genai/_transformers.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ def t_metrics(
119119
if autorater_config:
120120
llm_based_spec["judge_autorater_config"] = autorater_config
121121

122+
result_parsing_function = getv(metric, ["result_parsing_function"])
123+
if result_parsing_function:
124+
llm_based_spec["result_parser_config"] = {
125+
"custom_code_parser_config": {
126+
"parsing_function": result_parsing_function
127+
}
128+
}
129+
122130
metric_payload_item["llm_based_metric_spec"] = llm_based_spec
123131
elif getattr(metric, "metric_resource_name", None) is not None:
124132
# Safe pass
@@ -187,22 +195,8 @@ def t_metric_for_registry(
187195
if metric_name:
188196
metric_name = metric_name.lower()
189197

190-
# Handle standard computation metrics
191-
if metric_name == "exact_match":
192-
metric_payload_item["exact_match_spec"] = {}
193-
elif metric_name == "bleu":
194-
metric_payload_item["bleu_spec"] = {}
195-
elif metric_name and metric_name.startswith("rouge"):
196-
rouge_type = metric_name.replace("_", "")
197-
metric_payload_item["rouge_spec"] = {"rouge_type": rouge_type}
198-
# API Pre-defined metrics
199-
elif metric_name and metric_name in _evals_constant.SUPPORTED_PREDEFINED_METRICS:
200-
metric_payload_item["predefined_metric_spec"] = {
201-
"metric_spec_name": metric_name,
202-
"metric_spec_parameters": metric.metric_spec_parameters,
203-
}
204198
# Custom Code Execution Metric
205-
elif hasattr(metric, "remote_custom_function") and metric.remote_custom_function:
199+
if hasattr(metric, "remote_custom_function") and metric.remote_custom_function:
206200
metric_payload_item["custom_code_execution_spec"] = {
207201
"evaluation_function": metric.remote_custom_function
208202
}
@@ -217,7 +211,7 @@ def t_metric_for_registry(
217211
"evaluation_function": metric.custom_function
218212
}
219213

220-
# Map LLM-based metrics to the new llm_based_metric_spec
214+
# LLM-based metric
221215
elif (hasattr(metric, "prompt_template") and metric.prompt_template) or (
222216
hasattr(metric, "rubric_group_name") and metric.rubric_group_name
223217
):
@@ -249,6 +243,14 @@ def t_metric_for_registry(
249243
if autorater_config:
250244
llm_based_spec["judge_autorater_config"] = autorater_config
251245

246+
result_parsing_function = getv(metric, ["result_parsing_function"])
247+
if result_parsing_function:
248+
llm_based_spec["result_parser_config"] = {
249+
"custom_code_parser_config": {
250+
"parsing_function": result_parsing_function
251+
}
252+
}
253+
252254
metric_payload_item["llm_based_metric_spec"] = llm_based_spec
253255

254256
else:

0 commit comments

Comments (0)