MCPcopy
hub / github.com/opentrace/opentrace / compare_reports

Function compare_reports

agent/src/opentrace_agent/benchmarks/swe_bench.py:480–540  ·  view source on GitHub ↗

Generate a comparison summary between two SWE-bench runs.

(with_ot: SWEBenchReport, without_ot: SWEBenchReport)

Source from the content-addressed store, hash-verified

478
479
480def compare_reports(with_ot: SWEBenchReport, without_ot: SWEBenchReport) -> str:
481 """Generate a comparison summary between two SWE-bench runs."""
482 lines = [
483 "SWE-bench Comparison: OpenTrace Impact",
484 "=" * 45,
485 "",
486 f"{'Metric':<25} {'With OT':>10} {'Without OT':>12} {'Delta':>8}",
487 "-" * 55,
488 f"{'Instances':<25} {with_ot.total:>10} {without_ot.total:>12}",
489 f"{'Resolved':<25} {with_ot.resolved:>10} {without_ot.resolved:>12}"
490 f" {with_ot.resolved - without_ot.resolved:>+8}",
491 f"{'Resolve rate':<25} {with_ot.resolve_rate:>9.1%} {without_ot.resolve_rate:>11.1%}"
492 f" {(with_ot.resolve_rate - without_ot.resolve_rate):>+7.1%}",
493 f"{'Errors':<25} {with_ot.errors:>10} {without_ot.errors:>12}",
494 f"{'Total duration (s)':<25} {with_ot.duration_s:>10.1f} {without_ot.duration_s:>12.1f}",
495 ]
496 if with_ot.results and without_ot.results:
497 avg_dur_with = sum(r.duration_s for r in with_ot.results) / len(with_ot.results)
498 avg_dur_without = sum(r.duration_s for r in without_ot.results) / len(without_ot.results)
499 lines.append(f"{'Avg duration (s)':<25} {avg_dur_with:>10.1f} {avg_dur_without:>12.1f}")
500
501 avg_index = sum(r.index_duration_s for r in with_ot.results) / len(with_ot.results)
502 lines.append(f"{'Avg index time (s)':<25} {avg_index:>10.1f} {'N/A':>12}")
503
504 # Turns
505 avg_turns_with = sum(r.num_turns for r in with_ot.results) / len(with_ot.results)
506 avg_turns_without = sum(r.num_turns for r in without_ot.results) / len(without_ot.results)
507 delta_turns = avg_turns_with - avg_turns_without
508 lines.append(f"{'Avg turns':<25} {avg_turns_with:>10.1f} {avg_turns_without:>12.1f} {delta_turns:>+8.1f}")
509
510 # Cost
511 total_cost_with = sum(r.cost_usd for r in with_ot.results)
512 total_cost_without = sum(r.cost_usd for r in without_ot.results)
513 lines.append(
514 f"{'Total cost ($)':<25} {total_cost_with:>10.4f} {total_cost_without:>12.4f}"
515 f" {total_cost_with - total_cost_without:>+8.4f}"
516 )
517 avg_cost_with = total_cost_with / len(with_ot.results)
518 avg_cost_without = total_cost_without / len(without_ot.results)
519 lines.append(
520 f"{'Avg cost/instance ($)':<25} {avg_cost_with:>10.4f} {avg_cost_without:>12.4f}"
521 f" {avg_cost_with - avg_cost_without:>+8.4f}"
522 )
523
524 # Per-instance comparison
525 with_map = {r.instance_id: r for r in with_ot.results}
526 without_map = {r.instance_id: r for r in without_ot.results}
527 shared = set(with_map) & set(without_map)
528 if shared:
529 only_with = [iid for iid in shared if with_map[iid].success and not without_map[iid].success]
530 only_without = [iid for iid in shared if not with_map[iid].success and without_map[iid].success]
531 if only_with:
532 lines.append(f"\nResolved ONLY with OpenTrace ({len(only_with)}):")
533 for iid in only_with:
534 lines.append(f" + {iid}")
535 if only_without:
536 lines.append(f"\nResolved ONLY without OpenTrace ({len(only_without)}):")
537 for iid in only_without:

Callers 2

run_swe_bench_cli · Function · 0.90

Calls 1

set · Function · 0.85

Tested by 1