-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjudge.py
More file actions
82 lines (65 loc) · 2.69 KB
/
judge.py
File metadata and controls
82 lines (65 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""LLM judge for semantic-similarity evaluation — provider-agnostic.
The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire
your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the
eval harness stays decoupled. The default behaviour when no client is wired
is `(None, "no LLM client configured")`, which the runner treats as
inconclusive rather than a hard failure.
"""
from __future__ import annotations

import json
import logging
import math
from typing import Protocol
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prompt template sent to the judge model. It is filled in with ``str.format``
# using the ``question``, ``expected_answer`` and ``actual_answer`` fields.
# The doubled braces around the JSON example are ``str.format`` escapes that
# render as literal ``{`` and ``}`` in the final prompt.
_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected
answer, and an actual answer from an AI agent, score how well the actual
answer matches the expected answer.
Question: {question}
Expected answer: {expected_answer}
Actual answer: {actual_answer}
Respond with JSON only:
{{
"score": <float 0.0 to 1.0>,
"explanation": "<one line explaining the score>"
}}
Scoring guide:
- 1.0: Semantically identical, same information conveyed
- 0.8-0.9: Correct answer with minor wording or formatting differences
- 0.5-0.7: Partially correct, missing key details or minor inaccuracies
- 0.1-0.4: Substantially wrong but shows some relevant information
- 0.0: Completely wrong or unrelated"""
class LLMClient(Protocol):
    """Minimum surface the judge needs from any LLM SDK.

    This is a structural interface: any object exposing a compatible
    ``complete_json`` method satisfies it, no inheritance required. Concrete
    adapters live alongside the agent code. The Protocol form lets a
    downstream consumer wire OpenAI's SDK, Anthropic's, Azure OpenAI, or a
    self-hosted vLLM endpoint without the eval harness importing any
    vendor-specific module.
    """

    def complete_json(self, *, model: str, prompt: str) -> str:
        """Send *prompt* to *model* and return the raw JSON response body.

        Both arguments are keyword-only. The return value is the response
        text itself (a JSON document as a string), not a parsed object.
        """
        ...
def evaluate_semantic_similarity(
    question: str,
    expected: str,
    actual: str,
    client: LLMClient | None,
    model: str,
) -> tuple[float | None, str]:
    """Score semantic similarity between expected and actual answers.

    The judge prompt is filled in with *question*, *expected* and *actual*,
    sent to *model* through ``client.complete_json``, and the reply is parsed
    as a JSON object carrying ``"score"`` and (optionally) ``"explanation"``.

    Returns ``(score in [0.0, 1.0], explanation)``. Out-of-range scores are
    clamped into ``[0.0, 1.0]``. On failure returns ``(None, error)`` — the
    caller treats this as inconclusive, not a fail. Failure covers: no client
    configured, the client raising, a reply that is not valid JSON, a reply
    that is not a JSON object, a missing or non-numeric ``"score"``, and a
    ``NaN`` score. This function does not raise for any of those cases.
    """
    if client is None:
        return (None, "no LLM client configured")

    prompt = _JUDGE_PROMPT.format(
        question=question,
        expected_answer=expected,
        actual_answer=actual,
    )

    try:
        body = client.complete_json(model=model, prompt=prompt)
        parsed = json.loads(body)

        if not isinstance(parsed, dict):
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        # A reply without a score is a malformed judge response, not a
        # verdict of "completely wrong": report it as inconclusive instead
        # of silently scoring 0.0.
        if "score" not in parsed:
            raise ValueError("judge response has no 'score' field")

        score = float(parsed["score"])
        # json.loads accepts the non-standard NaN literal, and
        # max(0.0, min(1.0, nan)) evaluates to 1.0 — so NaN must be rejected
        # before clamping or it would be reported as a perfect match.
        if math.isnan(score):
            raise ValueError("judge returned a NaN score")

        explanation = str(parsed.get("explanation", "No explanation"))
        return (max(0.0, min(1.0, score)), explanation)
    except Exception as exc:
        # Deliberately broad: any provider or parsing error is downgraded to
        # an inconclusive result so a flaky judge cannot fail the eval run.
        logger.exception("LLM judge call failed")
        return (None, f"Judge call failed: {exc}")