-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate.py
More file actions
123 lines (91 loc) · 3.08 KB
/
Copy pathevaluate.py
File metadata and controls
123 lines (91 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
"""
Evaluation script for information retrieval results.
This script evaluates search results against qrels (relevance judgments) using
standard IR metrics including MRR, Recall, NDCG, and MAP.
"""
from dataclasses import dataclass
from pathlib import Path
import tyro
from python.metrics import (
compute_mrr_at_k,
compute_recall_at_k,
compute_ndcg_at_k,
compute_map,
)
from python.metrics.utils import load_qrels, load_results
@dataclass
class EvalConfig:
    """Configuration for evaluation.

    Bundles the two input paths with the metric cutoffs and output options
    that ``evaluate`` and ``main`` read.
    """

    # NOTE(review): the string literal under each field is that field's
    # description; presumably tyro surfaces these as CLI help text — confirm
    # before rewording or converting them to comments.
    qrels: Path
    """Path to qrels file (relevance judgments)"""
    results: Path
    """Path to results file (search output)"""
    use_map: bool = False
    """Use MAP instead of NDCG (for binary relevance judgments)"""
    mrr_k: int = 10
    """Cutoff k for MRR@k metric"""
    recall_k: int = 100
    """Cutoff k for Recall@k metric"""
    ndcg_k: tuple[int, ...] = (10, 100)
    """Cutoff k values for NDCG@k metric"""
    verbose: bool = True
    """Print detailed evaluation information"""
def evaluate(config: EvalConfig) -> dict:
    """
    Score a results file against a qrels file.

    MRR@k and Recall@k are always computed. Depending on ``config.use_map``,
    either MAP or NDCG at every cutoff in ``config.ndcg_k`` is added. When
    ``config.verbose`` is set, a report is printed to stdout as each value
    becomes available.

    Args:
        config: Evaluation configuration

    Returns:
        Dictionary keyed by metric name (``mrr@<k>``, ``recall@<k>``, then
        ``map`` or one ``ndcg@<k>`` per cutoff) holding the computed values
    """
    chatty = config.verbose

    if chatty:
        print(f"\n{'=' * 70}")
        print(f" Evaluating: {config.results.name}")
        print(f" Qrels: {config.qrels.name}")
        print(f"{'=' * 70}")

    # Read both inputs; the paths are handed to the loaders as strings.
    judgments = load_qrels(str(config.qrels))
    rankings = load_results(str(config.results))

    if chatty:
        print(f"Loaded {len(judgments)} queries from qrels")
        print(f"Loaded results for {len(rankings)} queries")

    # The two metrics that are reported regardless of the relevance scale.
    mrr_value = compute_mrr_at_k(judgments, rankings, k=config.mrr_k)
    recall_value = compute_recall_at_k(judgments, rankings, k=config.recall_k)
    scores = {
        f"mrr@{config.mrr_k}": mrr_value,
        f"recall@{config.recall_k}": recall_value,
    }

    if chatty:
        print("\nMetrics:")
        print(f" MRR@{config.mrr_k}: {mrr_value:.4f}")
        print(f" Recall@{config.recall_k}: {recall_value:.4f}")

    if not config.use_map:
        # Graded relevance (qrels.eval.one/two.tsv): one NDCG per cutoff.
        for cutoff in config.ndcg_k:
            ndcg_value = compute_ndcg_at_k(judgments, rankings, k=cutoff)
            scores[f"ndcg@{cutoff}"] = ndcg_value
            if chatty:
                print(f" NDCG@{cutoff}: {ndcg_value:.4f}")
    else:
        # Binary relevance (qrels.dev.tsv): a single MAP value.
        map_value = compute_map(judgments, rankings)
        scores["map"] = map_value
        if chatty:
            print(f" MAP: {map_value:.4f}")

    if chatty:
        print(f"{'=' * 70}\n")

    return scores
def main(config: EvalConfig) -> None:
    """Validate the input paths, then run the evaluation.

    Args:
        config: Evaluation configuration

    Raises:
        FileNotFoundError: If ``config.qrels`` or ``config.results`` does not
            exist.
        IsADirectoryError: If either path exists but is a directory.
    """
    for label, path in (("Qrels", config.qrels), ("Results", config.results)):
        if not path.exists():
            raise FileNotFoundError(f"{label} file not found: {path}")
        # exists() is also true for directories; reject them here with a clear
        # message. is_file() is deliberately not used, so that non-regular
        # readable paths (e.g. pipes) are still accepted as before.
        if path.is_dir():
            raise IsADirectoryError(
                f"{label} path is a directory, not a file: {path}"
            )

    # Run evaluation
    evaluate(config)
# Script entry point: when run directly (not imported), hand `main` to tyro.cli.
# NOTE(review): presumably tyro builds the command-line flags from the
# EvalConfig fields in `main`'s signature — confirm against the tyro docs.
if __name__ == "__main__":
    tyro.cli(main)