MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / from_description

Method from_description

tensorrt_llm/quantization/mode.py:239–341  ·  view source on GitHub ↗
(quantize_weights=False,
                         quantize_activations=False,
                         per_token=False,
                         per_channel=False,
                         per_group=False,
                         use_int4_weights=False,
                         use_int8_kv_cache=False,
                         use_fp8_kv_cache=False,
                         use_fp8_qdq=False,
                         use_fp8_block_scales=False,
                         use_fp8_rowwise=False,
                         use_nvfp4=False,
                         use_w4a8_nvfp4_fp8=False,
                         use_w4a8_qserve=False,
                         use_w4a8_mxfp4_fp8=False,
                         use_w4a8_mxfp4_mxfp8=False,
                         use_w4a16_mxfp4=False)

Source from the content-addressed store, hash-verified

237
238 @staticmethod
239 def from_description(quantize_weights=False,
240 quantize_activations=False,
241 per_token=False,
242 per_channel=False,
243 per_group=False,
244 use_int4_weights=False,
245 use_int8_kv_cache=False,
246 use_fp8_kv_cache=False,
247 use_fp8_qdq=False,
248 use_fp8_block_scales=False,
249 use_fp8_rowwise=False,
250 use_nvfp4=False,
251 use_w4a8_nvfp4_fp8=False,
252 use_w4a8_qserve=False,
253 use_w4a8_mxfp4_fp8=False,
254 use_w4a8_mxfp4_mxfp8=False,
255 use_w4a16_mxfp4=False):
256
def raise_error():
    """Raise ``ValueError`` listing every ``from_description`` argument.

    The message echoes each flag of the enclosing ``from_description``
    call as ``name=value``, in signature order, so that a rejected
    combination can be diagnosed from the exception text alone.

    Raises:
        ValueError: always.
    """
    # Keep this list in sync with the from_description signature: a flag
    # missing here is invisible in the error report even when it is the
    # one that made the combination invalid.
    raise ValueError(f"Unsupported combination of QuantMode args: "
                     f"{quantize_weights=}, "
                     f"{quantize_activations=}, "
                     f"{per_token=}, "
                     f"{per_channel=}, "
                     f"{per_group=}, "
                     f"{use_int4_weights=}, "
                     f"{use_int8_kv_cache=}, "
                     f"{use_fp8_kv_cache=}, "
                     f"{use_fp8_qdq=}, "
                     f"{use_fp8_block_scales=}, "
                     f"{use_fp8_rowwise=}, "
                     f"{use_nvfp4=}, "
                     f"{use_w4a8_nvfp4_fp8=}, "
                     f"{use_w4a8_qserve=}, "
                     f"{use_w4a8_mxfp4_fp8=}, "
                     f"{use_w4a8_mxfp4_mxfp8=}, "
                     f"{use_w4a16_mxfp4=}")
275
276 # We must quantize weights when we quantize activations.
277 if quantize_activations and not quantize_weights:
278 raise_error()
279
280 # If we set per_token or per_channel, we must quantize both weights and activations.
281 if (per_token or per_channel) and not (quantize_weights
282 and quantize_activations):
283 raise_error()
284
285 mode = QuantMode(0)
286
287 # Do we quantize the weights - if so, do we use INT4 or INT8?
288 if quantize_weights and use_int4_weights:
289 mode = mode | QuantMode.INT4_WEIGHTS
290 elif quantize_weights:
291 mode = mode | QuantMode.INT8_WEIGHTS
292
293 # Do we quantize the activations?
294 if quantize_activations:
295 mode = mode | QuantMode.ACTIVATIONS
296

Calls 1

QuantModeClass · 0.85