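Run the full SWE-bench evaluation. Parameters ---------- instances_path : str or Path Path to SWE-bench instances JSON file. agent_fn : AgentFn The agent function to test. use_opentrace : bool Whether to provide OpenTrace tools. limit : int or None Max instances to run (for quick testing). on_progress : callable or None Called after each instance with ``(completed, total, result)``. workers : int Number of instances to run in parallel (default 1 = sequential).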
(
self,
instances_path: str | Path,
agent_fn: AgentFn,
*,
use_opentrace: bool = True,
limit: int | None = None,
on_progress: Callable[[int, int, SWEBenchResult], None] | None = None,
workers: int = 1,
)
| 382 | return result |
| 383 | |
| 384 | def run( |
| 385 | self, |
| 386 | instances_path: str | Path, |
| 387 | agent_fn: AgentFn, |
| 388 | *, |
| 389 | use_opentrace: bool = True, |
| 390 | limit: int | None = None, |
| 391 | on_progress: Callable[[int, int, SWEBenchResult], None] | None = None, |
| 392 | workers: int = 1, |
| 393 | ) -> SWEBenchReport: |
| 394 | """Run the full SWE-bench evaluation. |
| 395 | |
| 396 | Parameters |
| 397 | ---------- |
| 398 | instances_path : str or Path |
| 399 | Path to SWE-bench instances JSON file. |
| 400 | agent_fn : AgentFn |
| 401 | The agent function to test. |
| 402 | use_opentrace : bool |
| 403 | Whether to provide OpenTrace tools. |
| 404 | limit : int or None |
| 405 | Max instances to run (for quick testing). |
| 406 | on_progress : callable or None |
| 407 | Called after each instance with ``(completed, total, result)``. |
| 408 | workers : int |
| 409 | Number of instances to run in parallel (default 1 = sequential). |
| 410 | """ |
| 411 | instances = self.load_instances(instances_path) |
| 412 | if limit: |
| 413 | instances = instances[:limit] |
| 414 | |
| 415 | t0 = time.monotonic() |
| 416 | total = len(instances) |
| 417 | |
| 418 | if workers <= 1: |
| 419 | # Sequential |
| 420 | results = [] |
| 421 | for i, instance in enumerate(instances): |
| 422 | logger.info( |
| 423 | "[%d/%d] Running %s (opentrace=%s)", |
| 424 | i + 1, |
| 425 | total, |
| 426 | instance.instance_id, |
| 427 | use_opentrace, |
| 428 | ) |
| 429 | result = self.run_instance(instance, agent_fn, use_opentrace=use_opentrace) |
| 430 | results.append(result) |
| 431 | if on_progress is not None: |
| 432 | on_progress(i + 1, total, result) |
| 433 | else: |
| 434 | # Parallel with thread pool |
| 435 | from concurrent.futures import ThreadPoolExecutor, as_completed |
| 436 | |
| 437 | results = [None] * total # type: ignore[list-item] |
| 438 | completed = 0 |
| 439 | |
| 440 | def _run(idx: int, inst: Any) -> tuple[int, SWEBenchResult]: |
| 441 | logger.info( |