hub / github.com/opentrace/opentrace / compare_reports

Function compare_reports

agent/src/opentrace_agent/benchmarks/swe_bench.py:480–540 · view source on GitHub ↗

Generate a comparison summary between two SWE-bench runs.

(with_ot: SWEBenchReport, without_ot: SWEBenchReport)

Source from the content-addressed store, hash-verified

478
479
480	def compare_reports(with_ot: SWEBenchReport, without_ot: SWEBenchReport) -> str:
481	"""Generate a comparison summary between two SWE-bench runs."""
482	lines = [
483	"SWE-bench Comparison: OpenTrace Impact",
484	"=" * 45,
485	"",
486	f"{'Metric':<25} {'With OT':>10} {'Without OT':>12} {'Delta':>8}",
487	"-" * 55,
488	f"{'Instances':<25} {with_ot.total:>10} {without_ot.total:>12}",
489	f"{'Resolved':<25} {with_ot.resolved:>10} {without_ot.resolved:>12}"
490	f" {with_ot.resolved - without_ot.resolved:>+8}",
491	f"{'Resolve rate':<25} {with_ot.resolve_rate:>9.1%} {without_ot.resolve_rate:>11.1%}"
492	f" {(with_ot.resolve_rate - without_ot.resolve_rate):>+7.1%}",
493	f"{'Errors':<25} {with_ot.errors:>10} {without_ot.errors:>12}",
494	f"{'Total duration (s)':<25} {with_ot.duration_s:>10.1f} {without_ot.duration_s:>12.1f}",
495	]
496	if with_ot.results and without_ot.results:
497	avg_dur_with = sum(r.duration_s for r in with_ot.results) / len(with_ot.results)
498	avg_dur_without = sum(r.duration_s for r in without_ot.results) / len(without_ot.results)
499	lines.append(f"{'Avg duration (s)':<25} {avg_dur_with:>10.1f} {avg_dur_without:>12.1f}")
500
501	avg_index = sum(r.index_duration_s for r in with_ot.results) / len(with_ot.results)
502	lines.append(f"{'Avg index time (s)':<25} {avg_index:>10.1f} {'N/A':>12}")
503
504	# Turns
505	avg_turns_with = sum(r.num_turns for r in with_ot.results) / len(with_ot.results)
506	avg_turns_without = sum(r.num_turns for r in without_ot.results) / len(without_ot.results)
507	delta_turns = avg_turns_with - avg_turns_without
508	lines.append(f"{'Avg turns':<25} {avg_turns_with:>10.1f} {avg_turns_without:>12.1f} {delta_turns:>+8.1f}")
509
510	# Cost
511	total_cost_with = sum(r.cost_usd for r in with_ot.results)
512	total_cost_without = sum(r.cost_usd for r in without_ot.results)
513	lines.append(
514	f"{'Total cost ($)':<25} {total_cost_with:>10.4f} {total_cost_without:>12.4f}"
515	f" {total_cost_with - total_cost_without:>+8.4f}"
516	)
517	avg_cost_with = total_cost_with / len(with_ot.results)
518	avg_cost_without = total_cost_without / len(without_ot.results)
519	lines.append(
520	f"{'Avg cost/instance ($)':<25} {avg_cost_with:>10.4f} {avg_cost_without:>12.4f}"
521	f" {avg_cost_with - avg_cost_without:>+8.4f}"
522	)
523
524	# Per-instance comparison
525	with_map = {r.instance_id: r for r in with_ot.results}
526	without_map = {r.instance_id: r for r in without_ot.results}
527	shared = set(with_map) & set(without_map)
528	if shared:
529	only_with = [iid for iid in shared if with_map[iid].success and not without_map[iid].success]
530	only_without = [iid for iid in shared if not with_map[iid].success and without_map[iid].success]
531	if only_with:
532	lines.append(f"\nResolved ONLY with OpenTrace ({len(only_with)}):")
533	for iid in only_with:
534	lines.append(f" + {iid}")
535	if only_without:
536	lines.append(f"\nResolved ONLY without OpenTrace ({len(only_without)}):")
537	for iid in only_without:

Callers 2

run_swe_bench_cli · Function · 0.90

test_comparison_output · Method · 0.90

Calls 1

set · Function · 0.85

Tested by 1

test_comparison_output · Method · 0.72