Generate a comparison summary between two SWE-bench runs.
(with_ot: SWEBenchReport, without_ot: SWEBenchReport)
| 478 | |
| 479 | |
| 480 | def compare_reports(with_ot: SWEBenchReport, without_ot: SWEBenchReport) -> str: |
| 481 | """Generate a comparison summary between two SWE-bench runs.""" |
| 482 | lines = [ |
| 483 | "SWE-bench Comparison: OpenTrace Impact", |
| 484 | "=" * 45, |
| 485 | "", |
| 486 | f"{'Metric':<25} {'With OT':>10} {'Without OT':>12} {'Delta':>8}", |
| 487 | "-" * 55, |
| 488 | f"{'Instances':<25} {with_ot.total:>10} {without_ot.total:>12}", |
| 489 | f"{'Resolved':<25} {with_ot.resolved:>10} {without_ot.resolved:>12}" |
| 490 | f" {with_ot.resolved - without_ot.resolved:>+8}", |
| 491 | f"{'Resolve rate':<25} {with_ot.resolve_rate:>9.1%} {without_ot.resolve_rate:>11.1%}" |
| 492 | f" {(with_ot.resolve_rate - without_ot.resolve_rate):>+7.1%}", |
| 493 | f"{'Errors':<25} {with_ot.errors:>10} {without_ot.errors:>12}", |
| 494 | f"{'Total duration (s)':<25} {with_ot.duration_s:>10.1f} {without_ot.duration_s:>12.1f}", |
| 495 | ] |
| 496 | if with_ot.results and without_ot.results: |
| 497 | avg_dur_with = sum(r.duration_s for r in with_ot.results) / len(with_ot.results) |
| 498 | avg_dur_without = sum(r.duration_s for r in without_ot.results) / len(without_ot.results) |
| 499 | lines.append(f"{'Avg duration (s)':<25} {avg_dur_with:>10.1f} {avg_dur_without:>12.1f}") |
| 500 | |
| 501 | avg_index = sum(r.index_duration_s for r in with_ot.results) / len(with_ot.results) |
| 502 | lines.append(f"{'Avg index time (s)':<25} {avg_index:>10.1f} {'N/A':>12}") |
| 503 | |
| 504 | # Turns |
| 505 | avg_turns_with = sum(r.num_turns for r in with_ot.results) / len(with_ot.results) |
| 506 | avg_turns_without = sum(r.num_turns for r in without_ot.results) / len(without_ot.results) |
| 507 | delta_turns = avg_turns_with - avg_turns_without |
| 508 | lines.append(f"{'Avg turns':<25} {avg_turns_with:>10.1f} {avg_turns_without:>12.1f} {delta_turns:>+8.1f}") |
| 509 | |
| 510 | # Cost |
| 511 | total_cost_with = sum(r.cost_usd for r in with_ot.results) |
| 512 | total_cost_without = sum(r.cost_usd for r in without_ot.results) |
| 513 | lines.append( |
| 514 | f"{'Total cost ($)':<25} {total_cost_with:>10.4f} {total_cost_without:>12.4f}" |
| 515 | f" {total_cost_with - total_cost_without:>+8.4f}" |
| 516 | ) |
| 517 | avg_cost_with = total_cost_with / len(with_ot.results) |
| 518 | avg_cost_without = total_cost_without / len(without_ot.results) |
| 519 | lines.append( |
| 520 | f"{'Avg cost/instance ($)':<25} {avg_cost_with:>10.4f} {avg_cost_without:>12.4f}" |
| 521 | f" {avg_cost_with - avg_cost_without:>+8.4f}" |
| 522 | ) |
| 523 | |
| 524 | # Per-instance comparison |
| 525 | with_map = {r.instance_id: r for r in with_ot.results} |
| 526 | without_map = {r.instance_id: r for r in without_ot.results} |
| 527 | shared = set(with_map) & set(without_map) |
| 528 | if shared: |
| 529 | only_with = [iid for iid in shared if with_map[iid].success and not without_map[iid].success] |
| 530 | only_without = [iid for iid in shared if not with_map[iid].success and without_map[iid].success] |
| 531 | if only_with: |
| 532 | lines.append(f"\nResolved ONLY with OpenTrace ({len(only_with)}):") |
| 533 | for iid in only_with: |
| 534 | lines.append(f" + {iid}") |
| 535 | if only_without: |
| 536 | lines.append(f"\nResolved ONLY without OpenTrace ({len(only_without)}):") |
| 537 | for iid in only_without: |