@@ -361,41 +361,41 @@ def test_evaluation_metric_resource_name(client):
361361 )
# Define a custom LLM-judged metric. The judge model scores two tone
# properties and a user-supplied parsing function reduces its raw text
# response to the {"score": float, "explanation": str} dict the evals
# service expects.
tone_check_metric = types.LLMMetric(
    name="tone_check",
    # The template placeholders {agent_data.turns[0].events[0]} /
    # [1] are filled in by the evals runtime with the prompt/response pair.
    prompt_template="""Analyze the tone of the response based on these two criteria:\n
1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.\n
2. Empathy: The response should acknowledge the user's feelings and show understanding.\n\n
Prompt: {agent_data.turns[0].events[0]}
Response: {agent_data.turns[0].events[1]}
Return ONLY a JSON list of objects for these two properties:
'[{"property": "Professionalism", "verdict": true, "reasoning": "..."}, '
'{"property": "Empathy", "verdict": true, "reasoning": "..."}]'
""",
    # Source code shipped to the service as a string; it must define
    # parse_results(responses) -> dict. Backslashes are doubled here so the
    # inner code receives single backslashes (r"\[.*\]" and "\n").
    result_parsing_function="""
import json, re

def parse_results(responses):
    text = responses[0]
    # Locate the outermost JSON list block anywhere in the response text.
    # re.DOTALL lets '.' span newlines inside the list.
    match = re.search(r"\\[.*\\]", text, re.DOTALL)
    if not match:
        return {"score": 0.0, "explanation": "No valid JSON found"}

    try:
        data = json.loads(match.group(0))
        # Overall score = fraction of properties whose verdict is true.
        passed_count = sum(1 for r in data if r.get("verdict", False))
        total_count = len(data)
        score = passed_count / total_count if total_count > 0 else 0.0

        # Consolidate per-property reasoning into one explanation string.
        explanation = "\\n".join(
            f"{r.get('property')}: {r.get('reasoning')}" for r in data
        )

        # IMPORTANT: return a dictionary, not a list.
        return {
            "score": float(score),
            "explanation": explanation,
        }
    except Exception as e:
        return {"score": 0.0, "explanation": f"Parsing failed: {str(e)}"}
""",
)
400400 metric_resource_name = client .evals .create_evaluation_metric (
401401 metric = tone_check_metric ,
0 commit comments