hub / github.com/huggingface/transformers / BenchmarkConfig

Class BenchmarkConfig

benchmark_v2/framework/benchmark_config.py:54–214 · view source on GitHub ↗

Configuration for a single benchmark scenario.

Source from the content-addressed store, hash-verified

52
53
54	class BenchmarkConfig:
55	"""Configuration for a single benchmark scenario."""
56
57	all_attn_implementations = ["flash_attention_2", "eager", "sdpa", "flex_attention"]
58	all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
59
def __init__(
    self,
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have observed it could heavily slow down benchmarks on AMD
    continuous_batching: bool = False,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
    attn_implementation: str = "eager",
    compile_kwargs: dict[str, Any] | None = None,
    kernelize: bool = False,
    tp_plan: str | dict[str, str] | None = None,
    name: str | None = None,
    skip_validity_check: bool = False,
) -> None:
    """Store the parameters of one benchmark scenario and validate them.

    Args:
        warmup_iterations: Benchmark parameter, stored as given.
        measurement_iterations: Benchmark parameter, stored as given.
        gpu_monitoring: Benchmark parameter, stored as given.
        continuous_batching: Benchmark parameter, stored as given.
        batch_size: Input parameter, stored as given.
        sequence_length: Input parameter, stored as given.
        num_tokens_to_generate: Input parameter, stored as given.
        attn_implementation: Attention implementation name. It may be
            replaced by ``check_validity`` unless the check is skipped.
        compile_kwargs: Keyword arguments forwarded to ``CompileConfig``.
            ``None`` leaves ``self.compile_config`` as ``None``. When the
            ``"fullgraph"`` key is absent it defaults to ``True``. The
            caller's dict is never modified.
        kernelize: Optimization parameter, stored as given.
        tp_plan: Generation parameter, stored as given.
        name: Explicit scenario name. When ``None``, the name is taken
            from ``self.infer_name()``.
        skip_validity_check: Forwarded to ``check_validity``.
    """
    # Benchmark parameters
    self.warmup_iterations = warmup_iterations
    self.measurement_iterations = measurement_iterations
    self.gpu_monitoring = gpu_monitoring
    self.continuous_batching = continuous_batching
    # Input parameters
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.num_tokens_to_generate = num_tokens_to_generate
    # Generation parameters
    self.attn_implementation = attn_implementation
    self.tp_plan = tp_plan
    # Optimization parameters
    if compile_kwargs is None:
        self.compile_config = None
    else:
        # Merge into a fresh dict instead of writing into ``compile_kwargs``:
        # the caller may reuse the same dict for several configs, and it must
        # not gain a "fullgraph" key as a side effect. An explicit
        # "fullgraph" entry from the caller still overrides the default.
        self.compile_config = CompileConfig(**{"fullgraph": True, **compile_kwargs})
    self.kernelize = kernelize
    # Constant parameters
    self.dtype = "torch.bfloat16"
    self.device = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"

    # Validation runs before the name is resolved, so ``infer_name`` is
    # called only after ``check_validity`` has had the chance to adjust
    # ``attn_implementation``.
    self.check_validity(skip_validity_check)
    self.name = name if name is not None else self.infer_name()
101
102	def check_validity(self, skip_validity_check: bool = False) -> None:
103	if skip_validity_check:
104	return
105	# If flash_attention_2 is selected but not available, default to SDPA
106	if self.attn_implementation == "flash_attention_2" and not is_fa2_or_kernel_available():
107	logger.error("Flash attention is not available. Defaulting to SDPA.")
108	self.attn_implementation = "sdpa"
109
110	# The combination of flash_attention_2, compile and generate is not supported # FIXME: support it
111	if (

Callers 1

get_config_by_level — Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected