Skip to content

Commit 001d787

Browse files
committed
update leaderboard
1 parent 967a55b commit 001d787

3 files changed

Lines changed: 104 additions & 84 deletions

File tree

docs/render_submissions.py

Lines changed: 92 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,19 @@
22
import os
33
import glob
44
import ast
5-
from datasets import load_dataset
5+
import subprocess
66
import json
77
import shutil
88
import argparse
9+
import pypdf
10+
import tqdm
11+
12+
from datasets import load_dataset
913
from transformers import AutoTokenizer
14+
1015
from commit0.harness.constants import SPLIT
1116
from commit0.harness.utils import clone_repo
1217
from commit0.cli import write_commit0_dot_file
13-
import pypdf
1418

1519
import logging
1620

@@ -162,129 +166,133 @@ def get_blank_repo_metrics(
162166
return blank_repo_metrics
163167

164168

165-
def render_mds(subfolder="docs"):
166-
all_submissions = {}
169+
# def render_leaderboard(split):
170+
171+
# for branch_name, branch_info in all_submissions.items():
172+
# if branch_info['split'] != split: continue
173+
# repos_resolved = 0
174+
# cum_passed = 0
175+
# total_duration = 0.0
176+
# for repo_name, repo_test_info in branch_info.items():
177+
# for testname, test_info in repo_test_info.items():
178+
# if "failed_to_run" not in test_info:
179+
# total_duration += test_info["duration"]
180+
# if ('failed' not in test_info['summary']) or (test_info['summary']['failed'] == 0):
181+
# repos_resolved += 1
182+
# # f"{test_info['summary']['collected']} ; duration: {test_info['duration']:.2f}s"
183+
# cum_passed += test_info["summary"]["passed"]
184+
# break # assume we ran all tests. will add functionality for checking diff tests later, as we need it.
185+
# analysis_link = f"[Analysis]({f'analysis_{branch_name}'})"
186+
# leaderboard += f"\n||[{branch_info['display_name']}]({branch_info['project_page']})|" \
187+
# f"{repos_resolved}|" \
188+
# f"{cum_passed}|" \
189+
# f"{total_duration:.2f}" \
190+
# f"{branch_info['submission_date']}|" \
191+
# f"{analysis_link}||"
192+
# return leaderboard
193+
167194

168-
method_repo_pytests = {}
169-
for branch_name in glob.glob(os.path.join(analysis_files_path, "*")):
195+
def render_mds(subfolder="docs"):
196+
leaderboard = {}
197+
leaderboard["lite"] = f"## Leaderboard (Lite)\n\n"
198+
leaderboard["all"] = f"## Leaderboard (All)\n\n"
199+
200+
for split in tqdm.tqdm(["lite", "all"]):
201+
total_num_repos = 0
202+
total_num_tests = 0
203+
for repo_name in SPLIT[split]:
204+
total_num_repos += 1
205+
all_tests = subprocess.run(["commit0", "get-tests", repo_name], capture_output=True, text=True)
206+
total_num_tests += len(all_tests.stdout.strip().splitlines())
207+
leaderboard[split] += f"""
208+
| | Name | Repos Resolved /{total_num_repos} | Net Pass Rate /{total_num_tests} | Test Duration (s) | Date | Analysis | |
209+
|--|------|-----------|------|----------|------|---|--------| |"""
210+
211+
for branch_name in tqdm.tqdm(glob.glob(os.path.join(analysis_files_path, "*"))):
170212
branch_name = os.path.basename(branch_name)
171213
if branch_name in {"blank", "repos", "submission_repos"}:
172214
continue
173-
all_submissions[branch_name] = {}
215+
submission_page = """# Submission Name: REPLACE_NAME_HERE (REPLACE_SPLIT_HERE)
216+
217+
| | Repository | Resolved | Pass Rate | Test Duration (s) | Analysis | |
218+
|-|------------|---------|-----| -----|-----||"""
219+
repos_resolved = 0
220+
cum_passed = 0
221+
total_duration = 0.
174222
for repo_file in glob.glob(
175223
os.path.join(analysis_files_path, branch_name, "*.json")
176224
):
177-
178225
repo_metrics_output_file = os.path.join(
179226
analysis_files_path, branch_name, repo_file
180227
)
181228
repo_metrics = json.load(open(repo_metrics_output_file))
182229
repo_name = os.path.basename(repo_file[: -len(".json")])
183-
184-
all_submissions[branch_name][repo_name] = {}
185-
186-
method_repo_pytests[
187-
f"{branch_name}_{repo_name}"
188-
] = f"# Submission Name: {branch_name}\n# Repository: {repo_name}"
189-
if "pytest_results" in repo_metrics:
190-
repo_metrics = repo_metrics["pytest_results"]
230+
submission_repo_page = f"# Submission Name: {branch_name}\n# Repository: {repo_name}"
231+
if "split" not in locals():
232+
split = repo_metrics["submission_info"]["split"]
233+
project_page_link = repo_metrics["submission_info"]["project_page"]
234+
display_name = repo_metrics["submission_info"]["display_name"]
235+
submission_date = repo_metrics["submission_info"]['submission_date']
236+
submission_page = submission_page.replace("REPLACE_NAME_HERE", display_name).replace("REPLACE_SPLIT_HERE", split)
191237
for pytest_group, pytest_info in repo_metrics.items():
238+
if pytest_group == "submission_info": continue
192239
pytest_group = os.path.basename(pytest_group.strip("/"))
193240
patch_diff = (
194241
f"""\n\n### Patch diff\n```diff\n{pytest_info['patch_diff']}```"""
195242
)
196243
if "failed_to_run" in pytest_info:
197-
all_submissions[branch_name][repo_name][pytest_group] = {
198-
"failed_to_run": pytest_info["failed_to_run"]
199-
}
200-
method_repo_pytests[
201-
f"{branch_name}_{repo_name}"
202-
] += f"""\n## Failed to run pytests\n```\n{pytest_info['failed_to_run']}\n```"""
244+
submission_repo_page += f"""\n## Failed to run pytests\n```\n{pytest_info['failed_to_run']}\n```"""
203245
else:
204-
all_submissions[branch_name][repo_name][pytest_group] = {
205-
"summary": pytest_info["summary"],
206-
"duration": pytest_info["duration"],
207-
}
208-
method_repo_pytests[
209-
f"{branch_name}_{repo_name}"
210-
] += f"""\n## Pytest Summary: {pytest_group}
246+
submission_repo_page += f"""\n## Pytest Summary: {pytest_group}
211247
| status | count |
212248
|:---------|:-----:|
213249
"""
250+
total_duration += pytest_info["duration"]
251+
cum_passed += pytest_info["summary"]["passed"]
214252
for category, count in pytest_info["summary"].items():
215253
if category not in {"duration"}:
216-
method_repo_pytests[
217-
f"{branch_name}_{repo_name}"
218-
] += f"""| {category} | {count} |\n"""
254+
submission_repo_page += f"""| {category} | {count} |\n"""
219255
else:
220-
method_repo_pytests[
221-
f"{branch_name}_{repo_name}"
222-
] += f"""| {category} | {float(count):.2f}s |\n"""
256+
submission_repo_page += f"""| {category} | {float(count):.2f}s |\n"""
223257

224-
method_repo_pytests[
225-
f"{branch_name}_{repo_name}"
226-
] += f"\n## Failed pytest outputs: {pytest_group}\n\n"
258+
submission_repo_page += f"\n## Failed pytest outputs: {pytest_group}\n\n"
227259
for testname, failure in pytest_info["failures"].items():
228260
shortened_testname = os.path.basename(testname)
229-
method_repo_pytests[f"{branch_name}_{repo_name}"] += (
261+
submission_repo_page += (
230262
f"### {shortened_testname}\n\n<details><summary> <pre>{shortened_testname}"
231263
f"</pre></summary><pre>\n{failure['failure_string']}\n</pre>\n</details>\n"
232264
)
233-
234265
back_button = f"[back to {branch_name} summary]({f'analysis_{branch_name}'})\n\n"
235266
with open(
236267
os.path.join(subfolder, f"analysis_{branch_name}_{repo_name}.md"), "w"
237268
) as wf:
238269
wf.write(
239270
back_button
240-
+ method_repo_pytests[f"{branch_name}_{repo_name}"]
271+
+ submission_repo_page
241272
+ patch_diff
242273
)
274+
resolved = ('summary' in pytest_info) and (('failed' not in pytest_info['summary']) or (pytest_info['summary']['failed'] == 0))
275+
if resolved: repos_resolved += 1
276+
pytest_details = "Pytest failed" if "failed_to_run" in pytest_info else f"{pytest_info['summary']['passed']} / {pytest_info['summary']['collected']}"
277+
duration = "Failed."
278+
if 'duration' in pytest_info: duration = f"{pytest_info['duration']:.2f}"
279+
submission_page +=f"""
280+
| | {repo_name} | {'Yes' if resolved else 'No'} | {pytest_details} | {duration} | {f'analysis_{branch_name}_{repo_name}'} | |"""
281+
analysis_link = f"[Analysis]({f'analysis_{branch_name}'})"
282+
leaderboard[split] += f"\n||[{display_name}]({project_page_link})|" \
283+
f"{repos_resolved}|" \
284+
f"{cum_passed}|" \
285+
f"{total_duration:.2f}" \
286+
f"{submission_date}|" \
287+
f"{analysis_link}||"
288+
243289

244-
# Render general page. Has buttons to all methods
245-
leaderboard = """
246-
| | Name | Summary | |
247-
|--|--------|----------|--|"""
248-
# Render method page. Per method, buttons to all repos.
249-
method_to_repos = {}
250-
# Render method & repo page. Has "back" button.
251-
for branch_name, branch_info in all_submissions.items():
252-
cum_pytests = {"passed": 0}
253-
method_to_repos[branch_name] = """
254-
| | Repository | Summary | |
255-
|-|------------|---------|-|"""
256-
total_duration = 0.0
257-
for repo_name, repo_test_info in branch_info.items():
258-
for testname, test_info in repo_test_info.items():
259-
if "failed_to_run" in test_info:
260-
summary_pytests_string = "failure"
261-
else:
262-
total_duration += test_info["duration"]
263-
summary_pytests_string = (
264-
f"`{testname}`: {test_info['summary']['passed']} / "
265-
f"{test_info['summary']['collected']} ; duration: {test_info['duration']:.2f}s"
266-
)
267-
for category, count in test_info["summary"].items():
268-
if category not in cum_pytests:
269-
cum_pytests[category] = 0
270-
if isinstance(count, int):
271-
cum_pytests[category] += int(count)
272-
elif isinstance(count, float):
273-
cum_pytests[category] += float(count)
274-
method_to_repos[branch_name] += (
275-
f"\n||[{repo_name}]({f'analysis_{branch_name}_{repo_name}'})|"
276-
f"{summary_pytests_string}||"
277-
)
278-
break # assume we ran all tests. will add functionality for checking diff tests later, as we need it.
279-
summary_pytests_string = (
280-
f"{cum_pytests['passed']} / {cum_pytests['collected']} ; duration: {total_duration:.2f}s"
281-
)
282-
leaderboard += f"\n||[{branch_name}]({f'analysis_{branch_name}'})|{summary_pytests_string}||"
283290
back_button = f"[back to all submissions]({f'analysis'})\n\n"
284291
with open(os.path.join(subfolder, f"analysis_{branch_name}.md"), "w") as wf:
285-
wf.write(back_button + "\n" + method_to_repos[branch_name])
292+
wf.write(back_button + "\n" + submission_page)
293+
286294
with open(os.path.join(subfolder, "analysis.md"), "w") as wf:
287-
wf.write(leaderboard)
295+
wf.write(leaderboard["lite"] + leaderboard["all"])
288296

289297

290298
def get_args():
@@ -378,6 +386,7 @@ def main(args):
378386

379387
path_to_logs = f"{os.getcwd()}/logs/pytest/{repo_name}/{branch_name}"
380388
pytest_results = get_pytest_info(path_to_logs, repo_name, branch_name)
389+
pytest_results["submission_info"] = example
381390
json.dump(pytest_results, open(repo_metrics_output_file, "w"), indent=4)
382391

383392
if args.analyze_submissions:
@@ -394,7 +403,7 @@ def main(args):
394403
print(f"{e}: when removing {subfolder}")
395404

396405
for submission in submission_dataset:
397-
branch_name = submission["name"]
406+
branch_name = submission["branch"]
398407
os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True)
399408
if not args.keep_previous_eval:
400409
for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"):

docs/update_submissions_dataset.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from datasets import Dataset
2+
3+
submissions = {
4+
"branch": ["baseline"],
5+
"display_name": ["Claude Sonnet - Base"],
6+
"submission_date": ["09/25/2024"],
7+
"split": ["lite"],
8+
"project_page": ["commit-0.github.io"]
9+
}
10+
11+
Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")

mkdocs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ nav:
66
- Commit0: setupdist.md
77
- Agent: agent.md
88
- API: api.md
9-
- Submission Analysis: analysis.md
9+
- Leaderboard: analysis.md
1010
theme:
1111
name: material
1212
logo: "logo2.webp"

0 commit comments

Comments
 (0)