Method forward

tensorrt_llm/quantization/layers.py:2434–2453 · view source on GitHub ↗

(self, hidden_states, lora_layer_params=None)

Source from the content-addressed store, hash-verified

2432	self.register_parameter('quantization_scaling_factor', None)
2433
def forward(self, hidden_states, lora_layer_params=None):
    """Run the gated MLP and hand the smoothed activation to ``proj``.

    Steps, in order: ``fc`` followed by the ``hidden_act`` activation, the
    ``gate`` projection, their element-wise product, division by the
    ``proj`` smoother (cast to ``self.dtype``), optional activation
    quantization selected by ``self.quant_mode``, and finally ``proj``.

    Args:
        hidden_states: Input fed to both ``self.fc`` and ``self.gate``.
        lora_layer_params: Must be ``None``; any other value trips the
            assertion below because LoRA is not handled here.

    Returns:
        Whatever ``self.proj`` returns for the prepared activation.
    """
    assert lora_layer_params is None, (
        f"lora is not supported on {self.__class__.__name__} now")

    # Activation branch: fc, then the configured activation function.
    fc_out = self.fc(hidden_states)
    activation_fn = ACT2FN[self.hidden_act]
    activated = activation_fn(fc_out)

    # Gate branch, combined element-wise with the activation branch.
    gated = activated * self.gate(hidden_states)

    # Divide by proj's smoother before any quantization takes place.
    proj_input = gated / cast(self.proj.smoother.value, self.dtype)

    mode = self.quant_mode
    if not mode.has_act_and_weight_quant():
        # No activation quantization requested: pass the tensor straight on.
        return self.proj(proj_input)

    if mode.has_act_static_scaling():
        # Static scaling: call quantize_tensor directly; quantization
        # layers are avoided because they break the int8 plugins.
        proj_input = quantize_tensor(
            proj_input, self.quantization_scaling_factor.value)
    else:
        # Dynamic scaling: per the original note, quantize_per_token yields
        # a tuple of (quantized tensor, per-token scaling factors).
        proj_input = quantize_per_token(proj_input)

    return self.proj(proj_input)
2454
2455
2456	class SmoothQuantAttention(Module):

nothing calls this directly

cast — Function · 0.85

quantize_tensor — Function · 0.85

quantize_per_token — Function · 0.85

has_act_and_weight_quant — Method · 0.80

has_act_static_scaling — Method · 0.80

no test coverage detected