MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / forward

Method forward

tensorrt_llm/quantization/layers.py:2434–2453  ·  view source on GitHub ↗
(self, hidden_states, lora_layer_params=None)

Source from the content-addressed store, hash-verified

2432 self.register_parameter('quantization_scaling_factor', None)
2433
def forward(self, hidden_states, lora_layer_params=None):
    """Apply the gated MLP and return the output of ``self.proj``.

    The input is sent through ``self.fc`` followed by the activation
    selected by ``self.hidden_act``, multiplied by ``self.gate`` applied to
    the same input, divided by the smoother of ``self.proj``, optionally
    quantized according to ``self.quant_mode``, and finally projected with
    ``self.proj``.

    Args:
        hidden_states: Input fed to both ``self.fc`` and ``self.gate``.
        lora_layer_params: Must be ``None``; LoRA is not supported here.

    Returns:
        Whatever ``self.proj`` returns for the (possibly quantized)
        gated activations.

    Raises:
        AssertionError: If ``lora_layer_params`` is not ``None``.
    """
    assert lora_layer_params is None, f"lora is not supported on {self.__class__.__name__} now"

    # Up-projection branch: linear layer, then the configured activation.
    activated = self.fc(hidden_states)
    activation_fn = ACT2FN[self.hidden_act]
    activated = activation_fn(activated)

    # Gate branch, combined element-wise with the activated branch.
    gated = activated * self.gate(hidden_states)

    # Divide by the projection's smoother, cast to this module's dtype.
    smoothing = cast(self.proj.smoother.value, self.dtype)
    gated = gated / smoothing

    # The static-scaling predicate is only consulted when activation and
    # weight quantization is enabled.
    needs_quant = self.quant_mode.has_act_and_weight_quant()
    if needs_quant and self.quant_mode.has_act_static_scaling():
        # Avoid quantization layers as it breaks int8 plugins
        gated = quantize_tensor(gated,
                                self.quantization_scaling_factor.value)
    elif needs_quant:
        # Quantize per token outputs tuple:
        # quantized tensor and scaling factors per token
        gated = quantize_per_token(gated)

    return self.proj(gated)
2454
2455
2456class SmoothQuantAttention(Module):

Callers

nothing calls this directly

Calls 5

cast · Function · 0.85
quantize_tensor · Function · 0.85
quantize_per_token · Function · 0.85

Tested by

no test coverage detected