-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels.py
More file actions
57 lines (48 loc) · 2.03 KB
/
models.py
File metadata and controls
57 lines (48 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Pydantic models for the evaluation harness."""
from __future__ import annotations
from typing import Literal
from pydantic import BaseModel, Field
class EvalCase(BaseModel):
    """A single test case from the golden QA dataset."""

    # NOTE(review): the class docstring above is kept verbatim on purpose;
    # pydantic can surface it in generated schemas -- confirm before rewording.

    # --- What is asked -----------------------------------------------------
    # `id`, `question`, `expected_answer` and `tolerance` declare no default,
    # so they are required when a case is constructed.
    id: str = Field(
        description="Unique test case identifier",
    )
    question: str = Field(
        description="Natural language question / input",
    )
    category: str = Field(
        "general",
        description="Test category",
    )

    # --- What a correct run looks like -------------------------------------
    expected_answer: str = Field(
        description="Expected answer text",
    )
    # default_factory gives each instance its own fresh list.
    expected_tools: list[str] = Field(
        description="Tools the agent should call (informational; not asserted)",
        default_factory=list,
    )
    # Restricted to exactly these three comparison modes.
    tolerance: Literal["exact_match", "numeric_close", "semantic_similar"] = Field(
        description="How to compare actual vs expected",
    )

    # --- Bookkeeping ---------------------------------------------------------
    difficulty: Literal["easy", "medium", "hard"] = Field(
        "easy",
        description="Difficulty level",
    )
    notes: str = Field(
        "",
        description="Why this test case exists",
    )
class EvalResult(BaseModel):
    """Result of evaluating a single test case."""

    # NOTE(review): the class docstring above is kept verbatim on purpose;
    # pydantic can surface it in generated schemas -- confirm before rewording.

    # --- Which case was run --------------------------------------------------
    # These fields declare no default, so they are required.
    case_id: str = Field(
        description="Test case ID",
    )
    question: str = Field(
        description="The question asked",
    )
    category: str = Field(
        description="Test category",
    )
    # Typed as a free-form string here (not restricted to a fixed set of
    # values), so any string is accepted.
    difficulty: str = Field(
        description="Difficulty level",
    )

    # --- Expected vs. observed output ----------------------------------------
    expected_answer: str = Field(
        description="Expected answer",
    )
    actual_answer: str = Field(
        description="Agent's actual answer",
    )

    # --- Execution trace -----------------------------------------------------
    # Both lists use default_factory, so each instance gets its own empty list
    # when the value is omitted.
    tools_called: list[str] = Field(
        description="Tools the agent invoked",
        default_factory=list,
    )
    reasoning_trace: list[str] = Field(
        description="Chain of thought steps",
        default_factory=list,
    )
    latency_ms: int = Field(
        description="Wall clock time in ms",
    )

    # --- Verdict ---------------------------------------------------------------
    pass_result: bool = Field(
        description="Whether the test passed",
    )
    # Optional: defaults to None when omitted.
    score: float | None = Field(
        None,
        description="LLM judge score for semantic_similar",
    )
    # Optional: defaults to None when omitted.
    failure_reason: str | None = Field(
        None,
        description="Why the test failed",
    )