Configuration for a single benchmark scenario.
| 52 | |
| 53 | |
| 54 | class BenchmarkConfig: |
| 55 | """Configuration for a single benchmark scenario.""" |
| 56 | |
| 57 | all_attn_implementations = ["flash_attention_2", "eager", "sdpa", "flex_attention"] |
| 58 | all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"] |
| 59 | |
def __init__(
    self,
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    gpu_monitoring: bool = True,
    continuous_batching: bool = False,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    attn_implementation: str = "eager",
    compile_kwargs: dict[str, Any] | None = None,
    kernelize: bool = False,
    tp_plan: str | dict[str, str] | None = None,
    name: str | None = None,
    skip_validity_check: bool = False,
) -> None:
    """Store the parameters of one benchmark scenario and validate them.

    Args:
        warmup_iterations: Stored as ``self.warmup_iterations``.
        measurement_iterations: Stored as ``self.measurement_iterations``.
        gpu_monitoring: Stored as ``self.gpu_monitoring``. NOTE: you may want
            to disable this at times, as we have observed it could heavily
            slow down benchmarks on AMD.
        continuous_batching: Stored as ``self.continuous_batching``.
        batch_size: Stored as ``self.batch_size``.
        sequence_length: Stored as ``self.sequence_length``.
        num_tokens_to_generate: Stored as ``self.num_tokens_to_generate``.
        attn_implementation: Stored as ``self.attn_implementation``; may be
            rewritten by ``check_validity``.
        compile_kwargs: Keyword arguments forwarded to ``CompileConfig``.
            ``"fullgraph"`` defaults to ``True`` when the key is absent.
            ``None`` sets ``self.compile_config`` to ``None``. The caller's
            dict is never modified.
        kernelize: Stored as ``self.kernelize``.
        tp_plan: Stored as ``self.tp_plan``.
        name: Explicit scenario name; when ``None`` the name comes from
            ``self.infer_name()``.
        skip_validity_check: Forwarded to ``check_validity``.
    """
    # Benchmark parameters
    self.warmup_iterations = warmup_iterations
    self.measurement_iterations = measurement_iterations
    self.gpu_monitoring = gpu_monitoring
    self.continuous_batching = continuous_batching
    # Input parameters
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.num_tokens_to_generate = num_tokens_to_generate
    # Generation parameters
    self.attn_implementation = attn_implementation
    self.tp_plan = tp_plan
    # Optimization parameters
    if compile_kwargs is None:
        self.compile_config = None
    else:
        # Merge into a fresh dict so the caller's mapping is left untouched;
        # an explicit "fullgraph" entry from the caller overrides the default.
        merged_compile_kwargs = {"fullgraph": True, **compile_kwargs}
        self.compile_config = CompileConfig(**merged_compile_kwargs)
    self.kernelize = kernelize
    # Constant parameters
    self.dtype = "torch.bfloat16"
    self.device = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"

    # Validation runs before the name is resolved because it may change
    # attn_implementation (presumably infer_name reads it -- confirm).
    self.check_validity(skip_validity_check)
    self.name = name if name is not None else self.infer_name()
| 101 | |
| 102 | def check_validity(self, skip_validity_check: bool = False) -> None: |
| 103 | if skip_validity_check: |
| 104 | return |
| 105 | # If flash_attention_2 is selected but not available, default to SDPA |
| 106 | if self.attn_implementation == "flash_attention_2" and not is_fa2_or_kernel_available(): |
| 107 | logger.error("Flash attention is not available. Defaulting to SDPA.") |
| 108 | self.attn_implementation = "sdpa" |
| 109 | |
| 110 | # The combination of flash_attention_2, compile and generate is not supported # FIXME: support it |
| 111 | if ( |
no outgoing calls
no test coverage detected