harness-python-react/src/eval/models.py at develop · constk/harness-python-react

57 lines (48 loc) · 2.03 KB

"""Pydantic models for the evaluation harness."""
from __future__ import annotations
from typing import Literal
from pydantic import BaseModel, Field
class EvalCase(BaseModel):
    """A single test case from the golden QA dataset."""

    # Required identification and input; neither field declares a default.
    id: str = Field(description="Unique test case identifier")
    question: str = Field(description="Natural language question / input")

    # Optional grouping label; falls back to "general" when omitted.
    category: str = Field(default="general", description="Test category")

    # Required reference answer that the actual answer is compared against.
    expected_answer: str = Field(description="Expected answer text")

    # default_factory gives every instance its own fresh list.
    expected_tools: list[str] = Field(
        default_factory=list,
        description="Tools the agent should call (informational; not asserted)",
    )

    # Required comparison strategy; only these three literal values validate.
    tolerance: Literal["exact_match", "numeric_close", "semantic_similar"] = Field(
        description="How to compare actual vs expected"
    )

    # Optional difficulty tag restricted to three levels; defaults to "easy".
    difficulty: Literal["easy", "medium", "hard"] = Field(
        default="easy",
        description="Difficulty level",
    )

    # Optional free-form rationale; defaults to the empty string.
    notes: str = Field(default="", description="Why this test case exists")
class EvalResult(BaseModel):
    """Result of evaluating a single test case."""

    # Required metadata describing the evaluated case. Note that `difficulty`
    # is declared as a plain `str` here, so it is not restricted to a fixed
    # set of values by this model.
    case_id: str = Field(description="Test case ID")
    question: str = Field(description="The question asked")
    category: str = Field(description="Test category")
    difficulty: str = Field(description="Difficulty level")

    # Required expected/actual answer pair.
    expected_answer: str = Field(description="Expected answer")
    actual_answer: str = Field(description="Agent's actual answer")

    # Optional lists; default_factory gives every instance its own fresh list.
    tools_called: list[str] = Field(
        default_factory=list,
        description="Tools the agent invoked",
    )
    reasoning_trace: list[str] = Field(
        default_factory=list,
        description="Chain of thought steps",
    )

    # Required timing and verdict.
    latency_ms: int = Field(description="Wall clock time in ms")
    pass_result: bool = Field(description="Whether the test passed")

    # Optional details; both default to None when not supplied.
    score: float | None = Field(
        default=None,
        description="LLM judge score for semantic_similar",
    )
    failure_reason: str | None = Field(
        default=None,
        description="Why the test failed",
    )

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

models.py

Latest commit

History

models.py

File metadata and controls