Skip to content

Commit af738d8

Browse files
committed
Add logging to judge
1 parent 5e5dfd3 commit af738d8

1 file changed

Lines changed: 4 additions & 0 deletions

File tree

eval/judge_system.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def prepare_agent_steps(complete_history: list[dict]) -> list[str]:
182182
return last_part[::-1]
183183

184184

185+
@observe_debug()
185186
def are_images_identical(img_path1: str, img_path2: str) -> bool:
186187
"""Check if two images are identical by comparing their content."""
187188
try:
@@ -202,6 +203,7 @@ def are_images_identical(img_path1: str, img_path2: str) -> bool:
202203
return False
203204

204205

206+
@observe_debug()
205207
def filter_images(screenshot_paths: list[str], max_images: int) -> list[str]:
206208
"""
207209
Filter screenshot paths to:
@@ -469,6 +471,7 @@ def parse_judge_response(result_dict: dict, task: str) -> JudgeResult:
469471
return create_fallback_result(task, 'Failed to parse structured response')
470472

471473

474+
@observe_debug()
472475
def create_fallback_result(task: str, error_msg: str) -> JudgeResult:
473476
"""Create a fallback result when evaluation fails."""
474477
return JudgeResult(
@@ -480,6 +483,7 @@ def create_fallback_result(task: str, error_msg: str) -> JudgeResult:
480483
)
481484

482485

486+
@observe_debug()
483487
async def judge_with_retry(
484488
task: str,
485489
complete_history: list[dict],

0 commit comments

Comments
 (0)