harness-python-react/src/eval/judge.py at develop · constk/harness-python-react

82 lines (65 loc) · 2.69 KB
"""LLM judge for semantic-similarity evaluation — provider-agnostic.

The judge calls an `LLMClient` Protocol, NOT a specific provider SDK. Wire
your concrete client (OpenAI, Anthropic, Azure, vLLM, …) at runtime; the
eval harness stays decoupled. The default behaviour when no client is wired
is `(None, "no LLM client configured")`, which the runner treats as
inconclusive rather than a hard failure.
"""
from __future__ import annotations
import json
import logging
import math
from typing import Protocol
logger = logging.getLogger(__name__)

# Prompt template rendered with ``str.format``. The three placeholders are
# ``question``, ``expected_answer`` and ``actual_answer``; the doubled braces
# around the JSON example are escapes that render as literal ``{`` / ``}``.
_JUDGE_PROMPT = """You are an evaluation judge. Given a question, an expected
answer, and an actual answer from an AI agent, score how well the actual
answer matches the expected answer.
Question: {question}
Expected answer: {expected_answer}
Actual answer: {actual_answer}
Respond with JSON only:
{{
  "score": <float 0.0 to 1.0>,
  "explanation": "<one line explaining the score>"
}}
Scoring guide:
- 1.0: Semantically identical, same information conveyed
- 0.8-0.9: Correct answer with minor wording or formatting differences
- 0.5-0.7: Partially correct, missing key details or minor inaccuracies
- 0.1-0.4: Substantially wrong but shows some relevant information
- 0.0: Completely wrong or unrelated"""
class LLMClient(Protocol):
    """Minimum surface the judge needs from any LLM SDK.

    Concrete adapters live alongside the agent code. The Protocol form lets
    a downstream consumer wire OpenAI's SDK, Anthropic's, Azure OpenAI, or a
    self-hosted vLLM endpoint without the eval harness importing any
    vendor-specific module.
    """

    def complete_json(self, *, model: str, prompt: str) -> str:
        """Send *prompt* to *model* and return the raw JSON response body."""
        ...
def evaluate_semantic_similarity(
    question: str,
    expected: str,
    actual: str,
    client: LLMClient | None,
    model: str,
) -> tuple[float | None, str]:
    """Score semantic similarity between expected and actual answers.

    Renders the judge prompt with the three texts, sends it through
    ``client.complete_json`` and reads ``score`` and ``explanation`` from the
    JSON reply.

    Args:
        question: The question that was put to the agent.
        expected: The reference answer.
        actual: The answer the agent produced.
        client: Judge backend, or ``None`` when no LLM client is wired.
        model: Model identifier forwarded unchanged to the client.

    Returns:
        ``(score in [0.0, 1.0], explanation)``. On failure returns
        ``(None, error)`` — the caller treats this as inconclusive, not a
        fail. Failure covers a missing client, an exception from the client,
        a reply that is not the expected JSON, and a NaN score.
    """
    if client is None:
        return (None, "no LLM client configured")

    prompt = _JUDGE_PROMPT.format(
        question=question,
        expected_answer=expected,
        actual_answer=actual,
    )
    # Deliberately broad boundary: any problem with the judge call or its
    # reply is logged and reported as inconclusive instead of propagating.
    try:
        body = client.complete_json(model=model, prompt=prompt)
        parsed = json.loads(body)
        score = float(parsed.get("score", 0.0))
        # json.loads accepts the literal NaN, and max/min clamping would turn
        # it into 1.0; reject it so it is reported as inconclusive instead.
        if math.isnan(score):
            raise ValueError("judge returned a NaN score")
        explanation = str(parsed.get("explanation", "No explanation"))
        return (max(0.0, min(1.0, score)), explanation)
    except Exception as exc:
        logger.exception("LLM judge call failed")
        return (None, f"Judge call failed: {exc}")
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

judge.py

Latest commit

History

judge.py

File metadata and controls