MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / forward

Method forward

tensorrt_llm/quantization/layers.py:215–243  ·  view source on GitHub ↗
(self, x, lora_runtime_params=None, all_reduce_params=None)

Source from the content-addressed store, hash-verified

213 self.quant_mode = quant_mode
214
def forward(self, x, lora_runtime_params=None, all_reduce_params=None):
    """Apply the smooth-quant GEMM, then the optional all-reduce and bias.

    Args:
        x: With static activation scaling enabled in ``self.quant_mode``
            this is passed to ``smooth_quant_gemm`` as-is and the scale is
            read from ``self.act_scale``. Otherwise it is unpacked as an
            ``(input, per_token_scale)`` pair.
        lora_runtime_params: Must be ``None``; any other value trips the
            assertion below.
        all_reduce_params: Optional parameters forwarded to ``allreduce``.
            When the layer has a bias and ``fusion_op`` is
            ``AllReduceFusionOp.RESIDUAL_RMS_NORM``, this object is mutated:
            its ``bias`` attribute is set to the layer's bias value.

    Returns:
        The GEMM output, all-reduced over ``self.tp_group`` when
        ``self.tp_size > 1`` and a group is set, with the bias added here
        unless it was handed to the all-reduce.
    """
    assert lora_runtime_params is None, "lora is not supported on SmoothQuantRowLinear now"

    # Pick the activation scale: stored on the layer, or shipped with x.
    if self.quant_mode.has_act_static_scaling():
        activations, act_scale = x, self.act_scale.value
    else:
        activations, act_scale = x

    out = smooth_quant_gemm(
        activations,
        self.weight.value,
        act_scale,
        self.per_channel_scale.value,
        self.quant_mode.has_per_token_dynamic_scaling(),
        self.quant_mode.has_per_channel_scaling(),
        self.dtype,
    )

    use_tensor_parallel = self.tp_size > 1 and self.tp_group is not None
    # Tracks whether this method still owes the output a bias add.
    bias_pending = self.bias is not None

    if use_tensor_parallel:
        if (bias_pending and all_reduce_params is not None
                and all_reduce_params.fusion_op
                == AllReduceFusionOp.RESIDUAL_RMS_NORM):
            # Hand the bias to the all-reduce instead of adding it
            # separately afterwards.
            all_reduce_params.bias = self.bias.value
            bias_pending = False
        out = allreduce(out,
                        self.tp_group,
                        all_reduce_params=all_reduce_params)

    if bias_pending:
        out = out + self.bias.value

    return out
244
245
246class SmoothQuantLayerNorm(Module):

Callers

nothing calls this directly

Calls 5

smooth_quant_gemm — Function · 0.85
allreduce — Function · 0.50

Tested by

no test coverage detected