MCPcopy
hub / github.com/huggingface/transformers / BenchmarkConfig

Class BenchmarkConfig

benchmark_v2/framework/benchmark_config.py:54–214  ·  view source on GitHub ↗

Configuration for a single benchmark scenario.

Source from the content-addressed store, hash-verified

52
53
54class BenchmarkConfig:
55 """Configuration for a single benchmark scenario."""
56
57 all_attn_implementations = ["flash_attention_2", "eager", "sdpa", "flex_attention"]
58 all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
59
def __init__(
    self,
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have observed it could heavily slow down benchmarks on AMD
    continuous_batching: bool = False,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    attn_implementation: str = "eager",
    compile_kwargs: dict[str, Any] | None = None,
    kernelize: bool = False,
    tp_plan: str | dict[str, str] | None = None,
    name: str | None = None,
    skip_validity_check: bool = False,
) -> None:
    """Record the settings of one benchmark scenario on the instance.

    Every argument except ``compile_kwargs``, ``name`` and
    ``skip_validity_check`` is stored unchanged under an attribute of the
    same name.

    Args:
        warmup_iterations: Stored as ``self.warmup_iterations``.
        measurement_iterations: Stored as ``self.measurement_iterations``.
        gpu_monitoring: Stored as ``self.gpu_monitoring``.
        continuous_batching: Stored as ``self.continuous_batching``.
        batch_size: Stored as ``self.batch_size``.
        sequence_length: Stored as ``self.sequence_length``.
        num_tokens_to_generate: Stored as ``self.num_tokens_to_generate``.
        attn_implementation: Stored as ``self.attn_implementation``;
            ``check_validity`` may replace it afterwards.
        compile_kwargs: Keyword arguments for ``CompileConfig``. ``None``
            leaves ``self.compile_config`` as ``None``. Otherwise
            ``fullgraph`` defaults to ``True`` unless the key is already
            present. The dict passed in is not modified.
        kernelize: Stored as ``self.kernelize``.
        tp_plan: Stored as ``self.tp_plan``.
        name: Explicit scenario name. When ``None``, the name comes from
            ``self.infer_name()``.
        skip_validity_check: Forwarded to ``self.check_validity``.
    """
    # Benchmark parameters
    self.warmup_iterations = warmup_iterations
    self.measurement_iterations = measurement_iterations
    self.gpu_monitoring = gpu_monitoring
    self.continuous_batching = continuous_batching
    # Input parameters
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.num_tokens_to_generate = num_tokens_to_generate
    # Generation parameters
    self.attn_implementation = attn_implementation
    self.tp_plan = tp_plan
    # Optimization parameters
    if compile_kwargs is None:
        self.compile_config = None
    else:
        # Merge into a fresh dict so the caller's kwargs are never mutated;
        # an explicit "fullgraph" entry still overrides the True default.
        self.compile_config = CompileConfig(**{"fullgraph": True, **compile_kwargs})
    self.kernelize = kernelize
    # Constant parameters
    self.dtype = "torch.bfloat16"
    self.device = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"

    # Validate before resolving the name: check_validity can rewrite
    # attn_implementation (presumably infer_name reads it - confirm).
    self.check_validity(skip_validity_check)
    self.name = name if name is not None else self.infer_name()
101
102 def check_validity(self, skip_validity_check: bool = False) -> None:
103 if skip_validity_check:
104 return
105 # If flash_attention_2 is selected but not available, default to SDPA
106 if self.attn_implementation == "flash_attention_2" and not is_fa2_or_kernel_available():
107 logger.error("Flash attention is not available. Defaulting to SDPA.")
108 self.attn_implementation = "sdpa"
109
110 # The combination of flash_attention_2, compile and generate is not supported # FIXME: support it
111 if (

Callers 1

get_config_by_level · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected