From 5d9c27223f720669234268abf18acad4e7b933c9 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Tue, 17 Mar 2026 22:27:12 +0000 Subject: [PATCH 01/32] quant_cfg as a list Right now the quant_cfg is a dict, but we are using the quant_cfg as if it is a list. When we apply the quant_cfg, we enumerate the items in the dict and apply the config one by one in modelopt/torch/quantization/conversion.py. This implementation actually has the semantic that the latter configs has higher precedence than the former configs. However, dicts do not have reliable ordering. Therefore, we make quant_cfg a list of patterns: 1. The latter config patterns have higher precedence. A latter config in the list overrides a fomer config if they target the same module. 2. A config to each module is atomic, each config provides the full information. We do not compose a quant module config from multiple config lines Signed-off-by: Shengliang Xu --- examples/llm_autodeploy/run_auto_quantize.py | 4 +- examples/llm_eval/quantization_utils.py | 10 +- examples/llm_ptq/hf_ptq.py | 23 +- .../llm_export_utils/quantization_utils.py | 39 +- modelopt/torch/export/unified_export_hf.py | 6 +- modelopt/torch/quantization/algorithms.py | 18 +- .../backends/fp8_per_tensor_gemm.py | 12 +- .../torch/quantization/backends/nvfp4_gemm.py | 12 +- modelopt/torch/quantization/config.py | 610 ++++++++++-------- modelopt/torch/quantization/conversion.py | 27 +- modelopt/torch/quantization/model_calib.py | 5 +- modelopt/torch/quantization/model_quant.py | 4 +- .../torch/quantization/utils/core_utils.py | 4 +- .../general/ptq/fp8_default-fp8_kv.yml | 79 +-- .../general/ptq/nvfp4_default-fp8_kv.yml | 95 +-- .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 123 ++-- .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 151 ++--- tests/_test_utils/torch/export/utils.py | 236 ++++--- .../torch/quantization/onnx_export.py | 10 +- .../torch/quantization/quantize_common.py | 5 +- tests/unit/recipe/test_loader.py | 7 +- 
.../plugins/test_attention_quant.py | 8 +- .../quantization/plugins/test_huggingface.py | 8 +- .../unit/torch/quantization/test_autoquant.py | 31 +- .../test_compute_quantization_mse.py | 8 +- .../torch/quantization/test_custom_backend.py | 28 +- .../torch/quantization/test_quantize_cpu.py | 113 ++-- .../quantization/test_tensor_quant_cpu.py | 24 +- 28 files changed, 917 insertions(+), 783 deletions(-) diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index e9ecb0731..570eca3d8 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -100,11 +100,11 @@ def loss_func(output, data): if enable_kv_cache_quantization: mtq.set_quantizer_by_cfg( model, - quant_cfg={"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, + quant_cfg=[{"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}], ) # Lets calibrate only the output quantizer this time. Let's disable all other quantizers. with mtq.set_quantizer_by_cfg_context( - model, {"*": {"enable": False}, "*output_quantizer": {"enable": True}} + model, [{"*": {"enable": False}}, {"*output_quantizer": {"enable": True}}] ): mtq.calibrate(model, algorithm="max", forward_loop=calibrate_loop) return model diff --git a/examples/llm_eval/quantization_utils.py b/examples/llm_eval/quantization_utils.py index 3df44115a..9d132a818 100644 --- a/examples/llm_eval/quantization_utils.py +++ b/examples/llm_eval/quantization_utils.py @@ -33,12 +33,12 @@ # Modify your custom config for debugging or research purposes. 
CUSTOM_CONFIG = { "MY_QUANT_CONFIG": { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}, - "*input_quantizer": {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}}, + {"*input_quantizer": {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}}, # Disable sensitive layers such as `lm_head`, gate layers in MoE etc. - **mtq.config._default_disabled_quantizer_cfg, - }, + *mtq.config._default_disabled_quantizer_cfg, + ], "algorithm": "max", }, } diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 5620ddf6a..dbccce7f9 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -77,16 +77,17 @@ RAND_SEED = 1234 -def _set_kv_cache_constant_amax(quant_cfg: dict) -> None: +def _set_kv_cache_constant_amax(quant_cfg: list) -> None: """Set use_constant_amax on KV cache quantizers. Creates a new dict for the KV bmm quantizer config to avoid mutating shared references. """ - if "*[kv]_bmm_quantizer" in quant_cfg: - quant_cfg["*[kv]_bmm_quantizer"] = { - **quant_cfg["*[kv]_bmm_quantizer"], - "use_constant_amax": True, - } + for i, entry in enumerate(quant_cfg): + if "*[kv]_bmm_quantizer" in entry: + quant_cfg[i] = { + "*[kv]_bmm_quantizer": {**entry["*[kv]_bmm_quantizer"], "use_constant_amax": True} + } + break QUANT_CFG_CHOICES: dict[str, dict[str, Any]] = { @@ -318,7 +319,7 @@ def forward_step(model, batch): ), verbose=True, # Disable all default disabled layers such as lm_head, mlp.gate, router etc. 
- disabled_layers=list(_default_disabled_quantizer_cfg.keys()), + disabled_layers=[next(iter(entry)) for entry in _default_disabled_quantizer_cfg], method=auto_quantize_method, checkpoint=auto_quantize_checkpoint, ) @@ -331,7 +332,9 @@ def forward_step(model, batch): kv_cache_quant_cfg = copy.deepcopy( getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"] ) - kv_cache_quant_cfg.pop("default", None) # keep other quantizers from auto_quantize + kv_cache_quant_cfg = [ + e for e in kv_cache_quant_cfg if "default" not in e + ] # keep other quantizers from auto_quantize if args.kv_cache_qformat in _KV_CAST_FORMATS: _set_kv_cache_constant_amax(kv_cache_quant_cfg) @@ -340,7 +343,7 @@ def forward_step(model, batch): if args.kv_cache_qformat not in _KV_CAST_FORMATS: # Calibrate only the KV cache quantizers; disable all others. with mtq.set_quantizer_by_cfg_context( - language_model, {"*": {"enable": False}, **kv_cache_quant_cfg} + language_model, [{"*": {"enable": False}}, *kv_cache_quant_cfg] ): mtq.calibrate(language_model, algorithm="max", forward_loop=calibrate_loop) return language_model @@ -968,7 +971,7 @@ def quantize_main( for prefix in mtp_layer_prefixes: # Add exclusion pattern for this MTP layer (e.g., "*layers.92*") pattern = f"*{prefix.split('.')[-2]}.{prefix.split('.')[-1]}*" - quant_cfg["quant_cfg"][pattern] = {"enable": False} + quant_cfg["quant_cfg"].append({pattern: {"enable": False}}) print(f"Excluding MTP layer from quantization: {pattern}") # Use constant amax for KV quantizers when a cast format is selected. 
diff --git a/modelopt/onnx/llm_export_utils/quantization_utils.py b/modelopt/onnx/llm_export_utils/quantization_utils.py index 61f551b63..0e2c3ed62 100644 --- a/modelopt/onnx/llm_export_utils/quantization_utils.py +++ b/modelopt/onnx/llm_export_utils/quantization_utils.py @@ -68,24 +68,33 @@ def get_quant_config(precision, lm_head_precision="fp16"): else: raise ValueError(f"Unsupported precision: {precision}") - config_dict = quant_cfg["quant_cfg"] # type: dict + quant_cfg_list: list[dict] = list(quant_cfg["quant_cfg"]) # type: ignore[arg-type] if lm_head_precision == "fp8": - config_dict["*lm_head.input_quantizer"] = {"num_bits": (4, 3), "axis": None} - config_dict["*lm_head.weight_quantizer"] = {"num_bits": (4, 3), "axis": None} + quant_cfg_list.append({"*lm_head.input_quantizer": {"num_bits": (4, 3), "axis": None}}) + quant_cfg_list.append({"*lm_head.weight_quantizer": {"num_bits": (4, 3), "axis": None}}) elif lm_head_precision == "nvfp4": - config_dict["*lm_head.input_quantizer"] = { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - } - config_dict["*lm_head.weight_quantizer"] = { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - } + quant_cfg_list.append( + { + "*lm_head.input_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } + } + ) + quant_cfg_list.append( + { + "*lm_head.weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } + } + ) + quant_cfg["quant_cfg"] = quant_cfg_list return quant_cfg diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 9d7b75eb1..6f7cde466 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ 
-52,6 +52,7 @@ from torch.distributed.fsdp import FSDPModule from modelopt.torch.quantization import set_quantizer_by_cfg_context +from modelopt.torch.quantization.config import QuantizerAttributeConfig from modelopt.torch.quantization.nn import ( NVFP4StaticQuantizer, SequentialQuantizer, @@ -218,7 +219,10 @@ def _output_hook(module, input, output): # Run dummy forward pass to collect modules sharing same input try: - with torch.no_grad(), set_quantizer_by_cfg_context(model, {"*": {"enable": False}}): + with ( + torch.no_grad(), + set_quantizer_by_cfg_context(model, [{"*": QuantizerAttributeConfig(enable=False)}]), + ): dummy_forward_fn() finally: # Always remove hooks diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 339e9d0bb..11e75f680 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -80,7 +80,9 @@ def estimate_quant_compression_for_quantizer(quantizer_attr_cfg): raise ValueError(f"Unknown type {type(quantizer_attr_cfg)}, {quantizer_attr_cfg}") - return estimate_quant_compression_for_quantizer(list(quant_cfg.quant_cfg.values())) + return estimate_quant_compression_for_quantizer( + [v for entry in quant_cfg.quant_cfg for v in entry.values()] + ) class QuantRecipe(CustomHPType): @@ -97,7 +99,7 @@ def __init__(self, quant_cfg: str | dict[str, Any] | None = None, name: str | No name = self.get_auto_name_for_config(quant_cfg) or name if quant_cfg is None: - quant_cfg = {"quant_cfg": {"*": {"enable": False}}} + quant_cfg = {"quant_cfg": [{"*": {"enable": False}}]} elif isinstance(quant_cfg, str): assert hasattr(mtq_config, quant_cfg), f"Unknown quantization format {quant_cfg}" quant_cfg = getattr(mtq_config, quant_cfg) @@ -109,8 +111,8 @@ def __init__(self, quant_cfg: str | dict[str, Any] | None = None, name: str | No # Disable KV Cache quantization # Currently KV Cache quantization is enabled for some quantization formats and disabled for others # This 
breaks the monotonicity of the quantization formats in terms of weight compression Vs accuracy - self.config.quant_cfg["*output_quantizer"] = mtq_config.QuantizerAttributeConfig( - enable=False + self.config.quant_cfg.append( + {"*output_quantizer": mtq_config.QuantizerAttributeConfig(enable=False)} ) self.compression = estimate_quant_compression(self.config) @@ -1299,7 +1301,7 @@ def get_auto_quantize_config(search_state, constraints=None, verbose=False): else: best_recipe = search_state["best"]["recipe"] - quant_cfg: dict[str, Any] = {"*": {"enable": False}} + quant_cfg_dict: dict[str, Any] = {"*": {"enable": False}} for hparam_name, recipe in best_recipe.items(): if recipe == QuantRecipe(quant_cfg=None): continue @@ -1308,7 +1310,7 @@ def get_auto_quantize_config(search_state, constraints=None, verbose=False): for quantizer_attr in ("input_quantizer", "weight_quantizer"): matched_cfg = _match_quantizer_cfg(recipe.config.quant_cfg, quantizer_attr) if matched_cfg is not None: - quant_cfg[f"{module_name}.{quantizer_attr}"] = matched_cfg + quant_cfg_dict[f"{module_name}.{quantizer_attr}"] = matched_cfg def _cfg_to_dict(v): if isinstance(v, mtq_config.QuantizerAttributeConfig): @@ -1321,7 +1323,7 @@ def _cfg_to_dict(v): return [_cfg_to_dict(c) for c in v] return v - quant_cfg = {k: _cfg_to_dict(v) for k, v in quant_cfg.items()} + quant_cfg = [{k: _cfg_to_dict(v)} for k, v in quant_cfg_dict.items()] warnings.warn( "get_auto_quantize_config: returned config uses algorithm='max'. " "Per-recipe calibration algorithms (e.g. smoothquant, awq) are not preserved. 
" @@ -1363,7 +1365,7 @@ def _resolve_best_recipe(search_state, constraints, verbose=False): def _match_quantizer_cfg(quant_cfg, quantizer_attr): # Last-match-wins to mirror set_quantizer_by_cfg behavior matched = None - for pattern, cfg in quant_cfg.items(): + for pattern, cfg in (item for entry in quant_cfg for item in entry.items()): if fnmatch.fnmatch(quantizer_attr, pattern): matched = cfg return matched diff --git a/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py b/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py index cc5be9d56..b854215f2 100644 --- a/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py +++ b/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py @@ -15,8 +15,6 @@ """This module provides a GEMM function for fp8 per tensor quantization.""" -from typing import Any - import torch from torch.autograd import Function @@ -99,9 +97,13 @@ def fp8_per_tensor_gemm(quant_module, input, bias=None): def _fp8_availability_check(module, input, args, kwargs): """Comprehensive check for FP8 GEMM availability.""" # Quantizer configs - quant_cfg: dict[str, Any] = FP8_DEFAULT_CFG["quant_cfg"] - input_cfg = quant_cfg["*input_quantizer"] - weight_cfg = quant_cfg["*weight_quantizer"] + quant_cfg_list: list[dict] = FP8_DEFAULT_CFG["quant_cfg"] + input_cfg = next( + v for entry in quant_cfg_list for k, v in entry.items() if k == "*input_quantizer" + ) + weight_cfg = next( + v for entry in quant_cfg_list for k, v in entry.items() if k == "*weight_quantizer" + ) # Check hardware support if not torch.cuda.is_available() or not fp8_compatible(): diff --git a/modelopt/torch/quantization/backends/nvfp4_gemm.py b/modelopt/torch/quantization/backends/nvfp4_gemm.py index ffc18fea3..047d9c37a 100644 --- a/modelopt/torch/quantization/backends/nvfp4_gemm.py +++ b/modelopt/torch/quantization/backends/nvfp4_gemm.py @@ -15,8 +15,6 @@ """This module provides a GEMM function for nvfp4 quantization.""" -from typing import Any - import torch from 
torch.autograd import Function @@ -213,10 +211,14 @@ def _nvfp4_availability_check(module, input, args, kwargs): if not hasattr(module, "input_quantizer") or not hasattr(module, "weight_quantizer"): return False - quant_cfg: dict[str, Any] = mtq.NVFP4_DEFAULT_CFG["quant_cfg"] + quant_cfg_list: list[dict] = mtq.NVFP4_DEFAULT_CFG["quant_cfg"] # Quantizer configs - input_cfg = quant_cfg["*input_quantizer"] - weight_cfg = quant_cfg["*weight_quantizer"] + input_cfg = next( + v for entry in quant_cfg_list for k, v in entry.items() if k == "*input_quantizer" + ) + weight_cfg = next( + v for entry in quant_cfg_list for k, v in entry.items() if k == "*weight_quantizer" + ) # Check input quantizer config for key, value in input_cfg.items(): diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index cf2336bf4..3471fa562 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -97,15 +97,14 @@ .. code-block:: MY_QUANT_CFG = { - "quant_cfg": { + "quant_cfg": [ # Quantizer wildcard strings mapping to quantizer attributes - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"num_bits": 8, "axis": None}}, # Module class names mapping to quantizer configurations - "nn.LeakyReLU": {"*input_quantizer": {"enable": False}}, - - } + {"nn.LeakyReLU": {"*input_quantizer": {"enable": False}}}, + ] } .. 
_example-quantization-configs: @@ -137,149 +136,157 @@ """ from collections.abc import Callable -from typing import Literal +from typing import Any, Literal from pydantic import ValidationInfo, field_validator, model_validator from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.utils.network import ConstructorLike -_default_disabled_quantizer_cfg = { - "nn.BatchNorm1d": {"*": {"enable": False}}, - "nn.BatchNorm2d": {"*": {"enable": False}}, - "nn.BatchNorm3d": {"*": {"enable": False}}, - "nn.LeakyReLU": {"*": {"enable": False}}, - "*lm_head*": {"enable": False}, - "*proj_out.*": {"enable": False}, # In Whisper model, lm_head has key name proj_out - "*block_sparse_moe.gate*": {"enable": False}, # Skip the MOE router - "*router*": {"enable": False}, # Skip the MOE router - "*mlp.gate.*": {"enable": False}, # Skip the MOE router - "*mlp.shared_expert_gate.*": {"enable": False}, # Skip the MOE router - "*linear_attn.conv1d*": {"enable": False}, - "*mixer.conv1d*": {"enable": False}, # Skip mamba conv1d - "*output_layer*": {"enable": False}, - "output.*": {"enable": False}, - "default": {"enable": False}, -} +_default_disabled_quantizer_cfg: list[dict] = [ + {"nn.BatchNorm1d": {"*": {"enable": False}}}, + {"nn.BatchNorm2d": {"*": {"enable": False}}}, + {"nn.BatchNorm3d": {"*": {"enable": False}}}, + {"nn.LeakyReLU": {"*": {"enable": False}}}, + {"*lm_head*": {"enable": False}}, + {"*proj_out.*": {"enable": False}}, # In Whisper model, lm_head has key name proj_out + {"*block_sparse_moe.gate*": {"enable": False}}, # Skip the MOE router + {"*router*": {"enable": False}}, # Skip the MOE router + {"*mlp.gate.*": {"enable": False}}, # Skip the MOE router + {"*mlp.shared_expert_gate.*": {"enable": False}}, # Skip the MOE router + {"*linear_attn.conv1d*": {"enable": False}}, + {"*mixer.conv1d*": {"enable": False}}, # Skip mamba conv1d + {"*output_layer*": {"enable": False}}, + {"output.*": {"enable": False}}, + {"default": {"enable": 
False}}, +] -_mamba_moe_disabled_quantizer_cfg = { - "*fc1_latent_proj*": {"enable": False}, # Skip Latent MOE - "*fc2_latent_proj*": {"enable": False}, # Skip Latent MOE - "*q_proj*": {"enable": False}, # Skip QKV Linear - "*k_proj*": {"enable": False}, # Skip QKV Linear - "*v_proj*": {"enable": False}, # Skip QKV Linear - "*o_proj*": {"enable": False}, # Skip QKV Output Projection -} +_mamba_moe_disabled_quantizer_cfg: list[dict] = [ + {"*fc1_latent_proj*": {"enable": False}}, # Skip Latent MOE + {"*fc2_latent_proj*": {"enable": False}}, # Skip Latent MOE + {"*q_proj*": {"enable": False}}, # Skip QKV Linear + {"*k_proj*": {"enable": False}}, # Skip QKV Linear + {"*v_proj*": {"enable": False}}, # Skip QKV Linear + {"*o_proj*": {"enable": False}}, # Skip QKV Output Projection +] INT8_DEFAULT_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, - **_default_disabled_quantizer_cfg, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"num_bits": 8, "axis": None}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": "max", } INT8_SMOOTHQUANT_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, - **_default_disabled_quantizer_cfg, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"num_bits": 8, "axis": None}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": "smoothquant", } INT8_WEIGHT_ONLY_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"enable": False}, - **_default_disabled_quantizer_cfg, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"enable": False}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": "max", } FP8_DEFAULT_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": None}, - 
"*input_quantizer": {"num_bits": (4, 3), "axis": None}, - **_default_disabled_quantizer_cfg, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": "max", } MAMBA_MOE_FP8_AGGRESSIVE_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": None}, - "*input_quantizer": {"num_bits": (4, 3), "axis": None}, - **_default_disabled_quantizer_cfg, - **_mamba_moe_disabled_quantizer_cfg, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + *_default_disabled_quantizer_cfg, + *_mamba_moe_disabled_quantizer_cfg, + ], "algorithm": "max", } MAMBA_MOE_FP8_CONSERVATIVE_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": None}, - "*input_quantizer": {"num_bits": (4, 3), "axis": None}, - **_default_disabled_quantizer_cfg, - **_mamba_moe_disabled_quantizer_cfg, - "*mixer.in_proj*": {"enable": False}, # Skip mamba linear - "*mixer.out_proj*": {"enable": False}, # Skip mamba linear - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + *_default_disabled_quantizer_cfg, + *_mamba_moe_disabled_quantizer_cfg, + {"*mixer.in_proj*": {"enable": False}}, # Skip mamba linear + {"*mixer.out_proj*": {"enable": False}}, # Skip mamba linear + ], "algorithm": "max", } FP8_PER_CHANNEL_PER_TOKEN_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": 0}, - "*input_quantizer": { - "num_bits": (4, 3), - "type": "dynamic", - "block_sizes": {-1: None}, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": (4, 3), "axis": 0}}, + { + "*input_quantizer": { + "num_bits": (4, 3), + "type": "dynamic", + "block_sizes": {-1: None}, + } }, - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, + ], 
"algorithm": "max", } # FP8 2D blockwise fake quantization config for deepseek models FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (4, 3), - "block_sizes": {-1: 128, -2: 128}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (4, 3), + "block_sizes": {-1: 128, -2: 128}, + "enable": True, + } }, - "*input_quantizer": {"enable": False}, - **_default_disabled_quantizer_cfg, - }, + {"*input_quantizer": {"enable": False}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": "max", } INT4_BLOCKWISE_WEIGHT_ONLY_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": 4, - "block_sizes": {-1: 128}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": 4, + "block_sizes": {-1: 128}, + "enable": True, + } }, - "*input_quantizer": {"enable": False}, - **_default_disabled_quantizer_cfg, - }, + {"*input_quantizer": {"enable": False}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": "max", } INT4_AWQ_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": 4, - "block_sizes": {-1: 128, "type": "static"}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": 4, + "block_sizes": {-1: 128, "type": "static"}, + "enable": True, + } }, - "*input_quantizer": {"enable": False}, - **_default_disabled_quantizer_cfg, - }, + {"*input_quantizer": {"enable": False}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": {"method": "awq_lite", "alpha_step": 0.1}, # "algorithm": {"method": "awq_full", "alpha_step": 0.1, "max_co_batch_size": 1024}, # "algorithm": {"method": "awq_clip", "max_co_batch_size": 2048}, @@ -288,127 +295,153 @@ # W4A8 currently uses INT4 blockwise quantization (block size = 128) followed by FP8 quantization # for weights. 
This could change in the future W4A8_AWQ_BETA_CFG = { - "quant_cfg": { - "*weight_quantizer": [ - { - "num_bits": 4, - "block_sizes": {-1: 128, "type": "static"}, - "enable": True, - }, - { + "quant_cfg": [ + { + "*weight_quantizer": [ + { + "num_bits": 4, + "block_sizes": {-1: 128, "type": "static"}, + "enable": True, + }, + { + "num_bits": (4, 3), + "enable": True, + }, + ] + }, + { + "*input_quantizer": { "num_bits": (4, 3), "enable": True, - }, - ], - "*input_quantizer": { - "num_bits": (4, 3), - "enable": True, + } }, - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, + ], "algorithm": "awq_lite", } MXFP8_DEFAULT_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (4, 3), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (4, 3), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - "*input_quantizer": { - "num_bits": (4, 3), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + { + "*input_quantizer": { + "num_bits": (4, 3), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, + ], "algorithm": None, } MXFP6_DEFAULT_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (3, 2), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (3, 2), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - "*input_quantizer": { - "num_bits": (3, 2), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + { + "*input_quantizer": { + "num_bits": (3, 2), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - **_default_disabled_quantizer_cfg, - 
}, + *_default_disabled_quantizer_cfg, + ], "algorithm": None, } MXFP4_DEFAULT_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + { + "*input_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, + ], "algorithm": None, } W4A8_MXFP4_FP8_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - "*input_quantizer": {"num_bits": (4, 3), "axis": None}, - **_default_disabled_quantizer_cfg, - }, + {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": None, } MXINT8_DEFAULT_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": 8, - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": 8, + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - "*input_quantizer": { - "num_bits": 8, - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + { + "*input_quantizer": { + "num_bits": 8, + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, 
+ ], "algorithm": None, } FP8_KV_CFG = { - "quant_cfg": { - "*[kv]_bmm_quantizer": { - "num_bits": (4, 3), - "enable": True, + "quant_cfg": [ + { + "*[kv]_bmm_quantizer": { + "num_bits": (4, 3), + "enable": True, + } }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } FP8_AFFINE_KV_CFG = { - "quant_cfg": { - "*[kv]_bmm_quantizer": { - "num_bits": (4, 3), - "bias": {-2: None, -4: None, "type": "static"}, + "quant_cfg": [ + { + "*[kv]_bmm_quantizer": { + "num_bits": (4, 3), + "bias": {-2: None, -4: None, "type": "static"}, + } }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } @@ -433,27 +466,29 @@ def _nvfp4_selective_quant_cfg( algorithm: str | dict = "max", ) -> dict: """Build an NVFP4 config that quantizes only the specified layer patterns.""" - quant_cfg: dict[str, object] = {} + quant_cfg: dict[str, object] = [] for pattern in layer_patterns: - quant_cfg[f"{pattern}weight_quantizer"] = quantizer + quant_cfg.append({f"{pattern}weight_quantizer": quantizer}) if not weight_only: - quant_cfg[f"{pattern}input_quantizer"] = quantizer - quant_cfg.update(_default_disabled_quantizer_cfg) + quant_cfg.append({f"{pattern}input_quantizer": quantizer}) + quant_cfg.extend(_default_disabled_quantizer_cfg) return {"quant_cfg": quant_cfg, "algorithm": algorithm} NVFP4_DEFAULT_CFG = _nvfp4_selective_quant_cfg(["*"]) NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "enable": True, + } }, - "*input_quantizer": _nvfp4_quantizer, - **_default_disabled_quantizer_cfg, - }, + {"*input_quantizer": _nvfp4_quantizer}, + *_default_disabled_quantizer_cfg, + ], "algorithm": { "method": "mse", "fp8_scale_sweep": True, @@ 
-461,15 +496,17 @@ def _nvfp4_selective_quant_cfg( } NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "enable": True, + } }, - "*input_quantizer": _nvfp4_quantizer, - **_default_disabled_quantizer_cfg, - }, + {"*input_quantizer": _nvfp4_quantizer}, + *_default_disabled_quantizer_cfg, + ], "algorithm": { "method": "local_hessian", "fp8_scale_sweep": True, @@ -477,27 +514,26 @@ def _nvfp4_selective_quant_cfg( } MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { - "quant_cfg": { - "*weight_quantizer": _nvfp4_quantizer, - "*input_quantizer": _nvfp4_quantizer, - **_default_disabled_quantizer_cfg, - **_mamba_moe_disabled_quantizer_cfg, - }, + "quant_cfg": [ + {"*weight_quantizer": _nvfp4_quantizer}, + {"*input_quantizer": _nvfp4_quantizer}, + *_default_disabled_quantizer_cfg, + *_mamba_moe_disabled_quantizer_cfg, + ], "algorithm": "max", } MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { - "quant_cfg": { - "*weight_quantizer": _nvfp4_quantizer, - "*input_quantizer": _nvfp4_quantizer, - **_default_disabled_quantizer_cfg, - **_mamba_moe_disabled_quantizer_cfg, - "*mixer.in_proj*": {"enable": False}, # Skip mamba linear - "*mixer.out_proj*": {"enable": False}, # Skip mamba linear - }, + "quant_cfg": [ + {"*weight_quantizer": _nvfp4_quantizer}, + {"*input_quantizer": _nvfp4_quantizer}, + *_default_disabled_quantizer_cfg, + *_mamba_moe_disabled_quantizer_cfg, + {"*mixer.in_proj*": {"enable": False}}, # Skip mamba linear + {"*mixer.out_proj*": {"enable": False}}, # Skip mamba linear + ], "algorithm": "max", } - NVFP4_AWQ_LITE_CFG = _nvfp4_selective_quant_cfg(["*"], algorithm="awq_lite") NVFP4_AWQ_CLIP_CFG = _nvfp4_selective_quant_cfg(["*"], algorithm={"method": "awq_clip"}) @@ -506,64 +542,79 @@ def _nvfp4_selective_quant_cfg( 
["*"], algorithm={"method": "awq_full", "alpha_step": 0.1} ) - NVFP4_AFFINE_KV_CFG = { - "quant_cfg": { - "*[kv]_bmm_quantizer": { - **_nvfp4_quantizer, - "bias": {-2: None, -4: None, "type": "static"}, + "quant_cfg": [ + { + "*[kv]_bmm_quantizer": { + **_nvfp4_quantizer, + "bias": {-2: None, -4: None, "type": "static"}, + } }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } NVFP4_KV_CFG = { - "quant_cfg": { - "*[kv]_bmm_quantizer": _nvfp4_quantizer, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*[kv]_bmm_quantizer": _nvfp4_quantizer}, + {"default": {"enable": False}}, + ], "algorithm": "max", } # Moved from examples/diffusers/quantization/config.py to here NVFP4_FP8_MHA_CONFIG = { - "quant_cfg": { - "*weight_quantizer": _nvfp4_quantizer, - "*input_quantizer": _nvfp4_quantizer, - "*output_quantizer": {"enable": False}, - "*q_bmm_quantizer": { - "num_bits": (4, 3), + "quant_cfg": [ + {"*weight_quantizer": _nvfp4_quantizer}, + {"*input_quantizer": _nvfp4_quantizer}, + {"*output_quantizer": {"enable": False}}, + { + "*q_bmm_quantizer": { + "num_bits": (4, 3), + } }, - "*k_bmm_quantizer": { - "num_bits": (4, 3), + { + "*k_bmm_quantizer": { + "num_bits": (4, 3), + } }, - "*v_bmm_quantizer": { - "num_bits": (4, 3), + { + "*v_bmm_quantizer": { + "num_bits": (4, 3), + } }, - "*softmax_quantizer": { - "num_bits": (4, 3), + { + "*softmax_quantizer": { + "num_bits": (4, 3), + } }, - "transformer_blocks*bmm2_output_quantizer": { - "num_bits": (4, 3), + { + "transformer_blocks*bmm2_output_quantizer": { + "num_bits": (4, 3), + } }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } NVFP4_KV_ROTATE_CFG = { - "quant_cfg": { - "*q_bmm_quantizer": { - "enable": False, - "rotate": True, + "quant_cfg": [ + { + "*q_bmm_quantizer": { + "enable": False, + "rotate": True, + } }, - "*k_bmm_quantizer": { - **_nvfp4_quantizer, - "rotate": True, + { + "*k_bmm_quantizer": { + 
**_nvfp4_quantizer, + "rotate": True, + } }, - "*v_bmm_quantizer": _nvfp4_quantizer, - }, + {"*v_bmm_quantizer": _nvfp4_quantizer}, + ], "algorithm": "max", } @@ -572,35 +623,43 @@ def _nvfp4_selective_quant_cfg( ) W4A8_NVFP4_FP8_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, - "enable": True, + "quant_cfg": [ + { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, + "enable": True, + } }, - "*input_quantizer": { - "num_bits": (4, 3), - "enable": True, + { + "*input_quantizer": { + "num_bits": (4, 3), + "enable": True, + } }, - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, + ], "algorithm": "max", } MXFP4_MLP_WEIGHT_ONLY_CFG = { - "quant_cfg": { - "*mlp*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + "quant_cfg": [ + { + "*mlp*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - "*block_sparse_moe*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, + { + "*block_sparse_moe*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + } }, - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, + ], "algorithm": None, } @@ -611,6 +670,7 @@ def _nvfp4_selective_quant_cfg( NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"]) NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) + # DO NOT ADD NEW CONFIGS HERE. 
If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file choices: set[str] = { @@ -1346,13 +1406,16 @@ class GPTQLiteConfig(QuantizeAlgorithmConfig): ) -QuantizeQuantCfgType = dict[ +_QuantizeQuantCfgEntryType = dict[ str | Callable, QuantizerAttributeConfig | list[QuantizerAttributeConfig] - | dict[str | Callable, QuantizerAttributeConfig | list[QuantizerAttributeConfig]], + | dict[str | Callable, QuantizerAttributeConfig | list[QuantizerAttributeConfig]] + | dict[str, Any], ] +QuantizeQuantCfgType = list[_QuantizeQuantCfgEntryType] + _QuantizeAlgoCfgType = str | dict | QuantizeAlgorithmConfig | None QuantizeAlgoCfgType = _QuantizeAlgoCfgType | list[_QuantizeAlgoCfgType] | None @@ -1362,7 +1425,7 @@ class QuantizeConfig(ModeloptBaseConfig): """Default configuration for ``quantize`` mode.""" quant_cfg: QuantizeQuantCfgType = ModeloptField( - default={"default": {"num_bits": 8, "axis": None}}, + default=[{"default": {"num_bits": 8, "axis": None}}], title="Quantization configuration", validate_default=True, ) @@ -1410,7 +1473,8 @@ def _not_dynamic(cfg): and cfg.get("*", {}).get("enable", True) ) - for name, cfg in config.get("quant_cfg", {}).items(): + quant_cfg: list = config.get("quant_cfg") or [] + for name, cfg in (item for entry in quant_cfg for item in entry.items()): if "weight_quantizer" in name: # We don't calibrate weight quantizer continue diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index f7ef704ee..7f95d5dde 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -211,10 +211,10 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe _replace_quant_module(getattr(model, name), version=version, registry=registry) -def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType | dict): +def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): 
"""Update the quantizer attributes based on the specified `quant_cfg`. - `quant_cfg` is a dictionary mapping wildcards or filter functions + `quant_cfg` is a list of single-key dicts mapping wildcards or filter functions to its quantizer attributes which are defined in :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. The wildcards or filter functions are matched against the quantizer module names. @@ -228,12 +228,15 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType See :meth:`set_quantizer_attribute ` for more details. """ - quant_cfg = quant_cfg.copy() - if "default" in quant_cfg: - set_quantizer_attribute(quant_model, "*", quant_cfg["default"]) - quant_cfg.pop("default") - - for pattern, cfg in quant_cfg.items(): + items = [(k, v) for entry in quant_cfg for k, v in entry.items()] + for pattern, cfg in items: + if str(pattern) == "default": + set_quantizer_attribute(quant_model, "*", cfg) + break + + for pattern, cfg in items: + if str(pattern) == "default": + continue if str(pattern) in QuantModuleRegistry: parent_class = QuantModuleRegistry[str(pattern)] assert isinstance(cfg, dict), ( @@ -309,7 +312,7 @@ def set_quantizer_attribute( @contextmanager -def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType | dict): +def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): """Context manager for setting quantizer attributes using `quant_cfg`. The set attributes will be reset to the original attributes after exiting the context manager. @@ -318,9 +321,9 @@ def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuan Use this context manager with caution. Changing certain attributes of the quantizer such as `calibrator` can lead to unexpected behavior. """ - assert not any(cfg for cfg in quant_cfg.values() if isinstance(cfg, (list, tuple))), ( - "list of config not support." 
- ) + assert not any( + cfg for entry in quant_cfg for cfg in entry.values() if isinstance(cfg, (list, tuple)) + ), "list of config not support." original_attributes = {} for name, module in quant_model.named_modules(): diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index ed57ea3fc..1efd497b3 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -35,6 +35,7 @@ from modelopt.torch.utils.perf import get_used_gpu_mem_fraction from .calib import MseCalibrator, NVFP4MSECalibrator +from .config import QuantizerAttributeConfig from .conversion import create_and_replace_svdquant_linear_on_the_fly, set_quantizer_by_cfg_context from .nn import NVFP4StaticQuantizer, QuantModule, SequentialQuantizer, TensorQuantizer from .utils import ( @@ -1101,7 +1102,9 @@ def forward(self, input, *args, **kwargs): self.awq_lite.num_cache_steps += 1 self.awq_lite.num_tokens += input.numel() / input.shape[-1] if self.awq_lite.is_input_quantized: - with set_quantizer_by_cfg_context(self.input_quantizer, {"*": {"enable": True}}): + with set_quantizer_by_cfg_context( + self.input_quantizer, [{"*": QuantizerAttributeConfig(enable=True)}] + ): max_calibrate(self.input_quantizer, lambda quantizer: quantizer(input), False) return out_actual diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 4aa1ff46b..eed0f251f 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -35,7 +35,7 @@ from .algorithms import AutoQuantizeGradientSearcher, AutoQuantizeKLDivSearcher, QuantRecipe from .algorithms import get_auto_quantize_config as _get_auto_quantize_config -from .config import QuantizeAlgoCfgType +from .config import QuantizeAlgoCfgType, QuantizerAttributeConfig from .conversion import set_quantizer_attribute from .mode import QuantizeModeRegistry, get_modelike_from_algo_cfg from .nn import 
QuantModule, TensorQuantizer @@ -527,7 +527,7 @@ def forward_backward_step(model, batch) -> None: "checkpoint": checkpoint, } # Disable all quantizers; AutoQuantize will enable the needed ones - set_quantizer_by_cfg(model, {"*": {"enable": False}}) + set_quantizer_by_cfg(model, [{"*": QuantizerAttributeConfig(enable=False)}]) searcher.search(model, constraints, config=search_config) # type: ignore[arg-type] return model, searcher.state_dict() diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 4340b8dc1..ab05bec13 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -828,8 +828,8 @@ def update_quant_cfg_with_kv_cache_quant( """Update the quant_cfg with the kv cache quant_cfg.""" # If quant_cfg["quant_cfg"] is None, it corresponds to only kv cache quantization case quant_cfg = copy.deepcopy(quant_cfg) - quant_cfg["quant_cfg"] = quant_cfg.get("quant_cfg") or {"default": {"enable": False}} - quant_cfg["quant_cfg"].update(kv_cache_quant_cfg) + inner: list = quant_cfg.get("quant_cfg") or [{"default": {"enable": False}}] + quant_cfg["quant_cfg"] = inner + [{k: v} for k, v in kv_cache_quant_cfg.items()] # Set default algorithm for kv cache quantization if not provided. 
if not quant_cfg.get("algorithm"): diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index 72630965b..d8b6adbac 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -19,46 +19,47 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - '*input_quantizer': - num_bits: e4m3 - axis: - '*weight_quantizer': - num_bits: e4m3 - axis: - default: - enable: false - '*block_sparse_moe.gate*': - enable: false - '*linear_attn.conv1d*': - enable: false - '*lm_head*': - enable: false - '*mixer.conv1d*': - enable: false - '*mlp.gate.*': - enable: false - '*mlp.shared_expert_gate.*': - enable: false - '*output_layer*': - enable: false - '*proj_out.*': - enable: false - '*router*': - enable: false - output.*: - enable: false - nn.BatchNorm1d: - '*': + - '*input_quantizer': + num_bits: e4m3 + axis: + - '*weight_quantizer': + num_bits: e4m3 + axis: + - '*[kv]_bmm_quantizer': + num_bits: e4m3 + enable: true + + - default: + enable: false + - '*block_sparse_moe.gate*': + enable: false + - '*linear_attn.conv1d*': + enable: false + - '*lm_head*': + enable: false + - '*mixer.conv1d*': + enable: false + - '*mlp.gate.*': + enable: false + - '*mlp.shared_expert_gate.*': + enable: false + - '*output_layer*': enable: false - nn.BatchNorm2d: - '*': + - '*proj_out.*': enable: false - nn.BatchNorm3d: - '*': + - '*router*': enable: false - nn.LeakyReLU: - '*': + - output.*: enable: false - '*[kv]_bmm_quantizer': - num_bits: e4m3 - enable: true + - nn.BatchNorm1d: + '*': + enable: false + - nn.BatchNorm2d: + '*': + enable: false + - nn.BatchNorm3d: + '*': + enable: false + - nn.LeakyReLU: + '*': + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index 73e84b1bc..7f79bd47b 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ 
b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ -19,54 +19,55 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - '*weight_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*input_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - default: - enable: false - '*block_sparse_moe.gate*': - enable: false - '*linear_attn.conv1d*': - enable: false - '*lm_head*': - enable: false - '*mixer.conv1d*': - enable: false - '*mlp.gate.*': - enable: false - '*mlp.shared_expert_gate.*': - enable: false - '*output_layer*': - enable: false - '*proj_out.*': - enable: false - '*router*': - enable: false - output.*: - enable: false - nn.BatchNorm1d: - '*': + - '*weight_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*input_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*[kv]_bmm_quantizer': + num_bits: e4m3 + enable: true + + - default: + enable: false + - '*block_sparse_moe.gate*': + enable: false + - '*linear_attn.conv1d*': + enable: false + - '*lm_head*': + enable: false + - '*mixer.conv1d*': + enable: false + - '*mlp.gate.*': + enable: false + - '*mlp.shared_expert_gate.*': + enable: false + - '*output_layer*': enable: false - nn.BatchNorm2d: - '*': + - '*proj_out.*': enable: false - nn.BatchNorm3d: - '*': + - '*router*': enable: false - nn.LeakyReLU: - '*': + - output.*: enable: false - '*[kv]_bmm_quantizer': - num_bits: e4m3 - enable: true + - nn.BatchNorm1d: + '*': + enable: false + - nn.BatchNorm2d: + '*': + enable: false + - nn.BatchNorm3d: + '*': + enable: false + - nn.LeakyReLU: + '*': + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index fd502e2c3..46cac283d 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ 
b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -19,68 +19,69 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - '*mlp*weight_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*mlp*input_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*block_sparse_moe*weight_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*block_sparse_moe*input_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - default: - enable: false - '*block_sparse_moe.gate*': - enable: false - '*linear_attn.conv1d*': - enable: false - '*lm_head*': - enable: false - '*mixer.conv1d*': - enable: false - '*mlp.gate.*': - enable: false - '*mlp.shared_expert_gate.*': - enable: false - '*output_layer*': - enable: false - '*proj_out.*': - enable: false - '*router*': - enable: false - output.*: - enable: false - nn.BatchNorm1d: - '*': + - '*mlp*weight_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*mlp*input_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*block_sparse_moe*weight_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*block_sparse_moe*input_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*[kv]_bmm_quantizer': + num_bits: e4m3 + enable: true + + - default: + enable: false + - '*block_sparse_moe.gate*': + enable: false + - '*linear_attn.conv1d*': + enable: false + - '*lm_head*': + enable: false + - '*mixer.conv1d*': + enable: false + - '*mlp.gate.*': + enable: false + - '*mlp.shared_expert_gate.*': + enable: false + - '*output_layer*': enable: false - nn.BatchNorm2d: - '*': + - '*proj_out.*': enable: false - nn.BatchNorm3d: - '*': + - 
'*router*': enable: false - nn.LeakyReLU: - '*': + - output.*: enable: false - '*[kv]_bmm_quantizer': - num_bits: e4m3 - enable: true + - nn.BatchNorm1d: + '*': + enable: false + - nn.BatchNorm2d: + '*': + enable: false + - nn.BatchNorm3d: + '*': + enable: false + - nn.LeakyReLU: + '*': + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index 4a19f874a..57d5ecd2c 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -19,82 +19,83 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - '*mlp*weight_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*mlp*input_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*block_sparse_moe*weight_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*block_sparse_moe*input_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*o_proj*weight_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - '*o_proj*input_quantizer': - block_sizes: - -1: 16 - type: dynamic - scale_bits: e4m3 - num_bits: e2m1 - enable: true - default: - enable: false - '*block_sparse_moe.gate*': - enable: false - '*linear_attn.conv1d*': - enable: false - '*lm_head*': - enable: false - '*mixer.conv1d*': - enable: false - '*mlp.gate.*': - enable: false - '*mlp.shared_expert_gate.*': - enable: false - '*output_layer*': - enable: false - '*proj_out.*': - enable: false - '*router*': - enable: false - output.*: - enable: false - nn.BatchNorm1d: - '*': + - '*mlp*weight_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*mlp*input_quantizer': + block_sizes: + -1: 16 + type: dynamic + 
scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*block_sparse_moe*weight_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*block_sparse_moe*input_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*o_proj*weight_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*o_proj*input_quantizer': + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + enable: true + - '*[kv]_bmm_quantizer': + num_bits: e4m3 + enable: true + + - default: + enable: false + - '*block_sparse_moe.gate*': + enable: false + - '*linear_attn.conv1d*': + enable: false + - '*lm_head*': + enable: false + - '*mixer.conv1d*': + enable: false + - '*mlp.gate.*': + enable: false + - '*mlp.shared_expert_gate.*': + enable: false + - '*output_layer*': enable: false - nn.BatchNorm2d: - '*': + - '*proj_out.*': enable: false - nn.BatchNorm3d: - '*': + - '*router*': enable: false - nn.LeakyReLU: - '*': + - output.*: enable: false - '*[kv]_bmm_quantizer': - num_bits: e4m3 - enable: true + - nn.BatchNorm1d: + '*': + enable: false + - nn.BatchNorm2d: + '*': + enable: false + - nn.BatchNorm3d: + '*': + enable: false + - nn.LeakyReLU: + '*': + enable: false diff --git a/tests/_test_utils/torch/export/utils.py b/tests/_test_utils/torch/export/utils.py index 8011eb72e..c8514769a 100644 --- a/tests/_test_utils/torch/export/utils.py +++ b/tests/_test_utils/torch/export/utils.py @@ -85,162 +85,184 @@ def forward(self, x): # Quantization configs partial_fp8_config = { - "quant_cfg": { - "*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}, - "*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}, - "default": {"num_bits": 8, "enable": False}, - }, + "quant_cfg": [ + {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"default": {"num_bits": 8, 
"enable": False}}, + ], "algorithm": "max", } partial_w4a8_config = { - "quant_cfg": { - "*.2.weight_quantizer": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": (4, 3), "axis": None, "enable": True}, - ], - "*.2.input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}, - "default": {"num_bits": 8, "enable": False}, - }, + "quant_cfg": [ + { + "*.2.weight_quantizer": [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, + {"num_bits": (4, 3), "axis": None, "enable": True}, + ] + }, + {"*.2.input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, + {"default": {"num_bits": 8, "enable": False}}, + ], "algorithm": "awq_lite", } partial_nvfp4_config = { - "quant_cfg": { - "*.1.weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, + "quant_cfg": [ + { + "*.1.weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } }, - "*.1.input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, + { + "*.1.input_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } }, - "*.2.weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, + { + "*.2.weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } }, - "*.2.input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, + { + "*.2.input_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": 
(4, 3)}, + "axis": None, + "enable": True, + } }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } partial_nvfp4_awq_config = { - "quant_cfg": { - "*.2.weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, + "quant_cfg": [ + { + "*.2.weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } }, - "*.2.input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, + { + "*.2.input_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } }, - "*.1.weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": False, + { + "*.1.weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": False, + } }, - "*.1.input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": False, + { + "*.1.input_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": False, + } }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "awq_lite", } partial_int4_awq_config = { - "quant_cfg": { - "*.2.weight_quantizer": { - "num_bits": 4, - "block_sizes": {-1: 128, "type": "static"}, - "enable": True, + "quant_cfg": [ + { + "*.2.weight_quantizer": { + "num_bits": 4, + "block_sizes": {-1: 128, "type": "static"}, + "enable": True, + } }, - "*.2.input_quantizer": {"enable": False}, - "default": {"enable": False}, - }, + {"*.2.input_quantizer": {"enable": False}}, + 
{"default": {"enable": False}}, + ], "algorithm": {"method": "awq_lite", "alpha_step": 0.1}, # "algorithm": {"method": "awq_full", "alpha_step": 0.1, "max_co_batch_size": 1024}, # "algorithm": {"method": "awq_clip", "max_co_batch_size": 2048}, } partial_fp8_kv_cache_config = { - "quant_cfg": { - "*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}, - "*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}, - "*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } partial_int8_kv_cache_config = { - "quant_cfg": { - "*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}, - "*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}, - "*output_quantizer": {"num_bits": 8, "axis": None, "enable": True}, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*output_quantizer": {"num_bits": 8, "axis": None, "enable": True}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } partial_nvfp4_kv_cache_config = { - "quant_cfg": { - "*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}, - "*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}, - "*[kv]_bmm_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, + "quant_cfg": [ + {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, + {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, + { + "*[kv]_bmm_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + } }, 
- "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } only_weight_quantizer_fp8_config = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}, - "*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}, - "*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, + {"*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, + {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } only_input_quantizer_fp8_config = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}, - "*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}, - "*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, + {"*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, + {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } only_output_quantizer_fp8_config = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}, - "*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}, - "*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, + {"*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, + {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } diff --git 
a/tests/_test_utils/torch/quantization/onnx_export.py b/tests/_test_utils/torch/quantization/onnx_export.py index 5c74e656c..c340f2695 100644 --- a/tests/_test_utils/torch/quantization/onnx_export.py +++ b/tests/_test_utils/torch/quantization/onnx_export.py @@ -29,11 +29,11 @@ def onnx_export_tester(model, device, num_bits, per_channel_quantization, constant_folding, dtype): axis = 0 if per_channel_quantization else None config = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": num_bits, "axis": axis}, - "*input_quantizer": {"num_bits": num_bits}, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": num_bits, "axis": axis}}, + {"*input_quantizer": {"num_bits": num_bits}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } diff --git a/tests/_test_utils/torch/quantization/quantize_common.py b/tests/_test_utils/torch/quantization/quantize_common.py index ae56dd299..eefb9013d 100644 --- a/tests/_test_utils/torch/quantization/quantize_common.py +++ b/tests/_test_utils/torch/quantization/quantize_common.py @@ -47,7 +47,10 @@ def get_awq_config(algorithm="awq_lite", block_size=8): config = copy.deepcopy(mtq.INT4_AWQ_CFG) - config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: block_size} + for entry in config["quant_cfg"]: + if "*weight_quantizer" in entry: + entry["*weight_quantizer"]["block_sizes"] = {-1: block_size} + break if "algorithm" not in config or not isinstance(config["algorithm"], dict): config["algorithm"] = {} diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index e52617861..446a82e0f 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -207,6 +207,11 @@ def test_general_ptq_yaml_matches_config_dicts(yaml_path, model_cfg_name, kv_cfg kv_cfg = getattr(qcfg, kv_cfg_name) yaml_data = load_config(yaml_path) + def _as_dict(qc): + return {k: v for entry in qc for k, v in entry.items()} + ptq = yaml_data["ptq_cfg"] - assert 
{**model_cfg["quant_cfg"], **kv_cfg["quant_cfg"]} == ptq["quant_cfg"] + assert {**_as_dict(model_cfg["quant_cfg"]), **_as_dict(kv_cfg["quant_cfg"])} == _as_dict( + ptq["quant_cfg"] + ) assert model_cfg["algorithm"] == ptq["algorithm"] diff --git a/tests/unit/torch/quantization/plugins/test_attention_quant.py b/tests/unit/torch/quantization/plugins/test_attention_quant.py index 9526f80ac..0c376b69e 100644 --- a/tests/unit/torch/quantization/plugins/test_attention_quant.py +++ b/tests/unit/torch/quantization/plugins/test_attention_quant.py @@ -61,10 +61,10 @@ def forward(self, hidden_states, **kwargs): kv_cache_config = { - "quant_cfg": { - "*[kv]_bmm_quantizer": {"num_bits": 4, "enable": True}, - "*softmax_quantizer": {"enable": False}, - }, + "quant_cfg": [ + {"*[kv]_bmm_quantizer": {"num_bits": 4, "enable": True}}, + {"*softmax_quantizer": {"enable": False}}, + ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 33730409a..2bc2aedc4 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -193,7 +193,13 @@ def test_quantized_transformers_save_restore(tmp_path, model_cls, quant_config): tiny_llama_dir = create_tiny_llama_dir(tmp_path) # update config to fit test cases if quant_config == mtq.INT4_AWQ_CFG: - quant_config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 16} + import copy + + quant_config = copy.deepcopy(quant_config) + for entry in quant_config["quant_cfg"]: + if "*weight_quantizer" in entry: + entry["*weight_quantizer"]["block_sizes"] = {-1: 16} + break else: raise ValueError(f"Unsupported quant_config: {quant_config}") diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py index c0f049174..bf3f0cae8 100644 --- a/tests/unit/torch/quantization/test_autoquant.py +++ 
b/tests/unit/torch/quantization/test_autoquant.py @@ -110,11 +110,11 @@ def test_quant_recipe_hparam(): # use this config to test custom quantization config INT8_CUSTOM_QUANT_TEST_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, - **_default_disabled_quantizer_cfg, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"num_bits": 8, "axis": None}}, + *_default_disabled_quantizer_cfg, + ], "algorithm": "smoothquant", } @@ -230,14 +230,16 @@ def test_auto_quantize_disabled_layers_no_poison(): INT4INT8_AWQ_CFG = { - "quant_cfg": { - "*weight_quantizer": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": 8, "axis": None, "enable": True}, - ], - "*input_quantizer": {"num_bits": 8, "axis": None, "enable": True}, - "default": {"enable": False}, - }, + "quant_cfg": [ + { + "*weight_quantizer": [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, + {"num_bits": 8, "axis": None, "enable": True}, + ] + }, + {"*input_quantizer": {"num_bits": 8, "axis": None, "enable": True}}, + {"default": {"enable": False}}, + ], "algorithm": "awq_lite", } @@ -480,7 +482,8 @@ def test_get_auto_quantize_config(method): # Use stored best recipe config = mtq.get_auto_quantize_config(search_state) assert "quant_cfg" in config - assert config["quant_cfg"]["*"] == {"enable": False} + assert isinstance(config["quant_cfg"], list) + assert any("*" in entry and entry["*"] == {"enable": False} for entry in config["quant_cfg"]) assert config["algorithm"] == "max" # Re-solve with different constraints diff --git a/tests/unit/torch/quantization/test_compute_quantization_mse.py b/tests/unit/torch/quantization/test_compute_quantization_mse.py index 9a9a81a61..2cce0b28d 100644 --- a/tests/unit/torch/quantization/test_compute_quantization_mse.py +++ b/tests/unit/torch/quantization/test_compute_quantization_mse.py @@ 
-22,10 +22,10 @@ from modelopt.torch.quantization.nn import TensorQuantizer INT8_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_custom_backend.py b/tests/unit/torch/quantization/test_custom_backend.py index f42d6a5f9..5af6c249c 100644 --- a/tests/unit/torch/quantization/test_custom_backend.py +++ b/tests/unit/torch/quantization/test_custom_backend.py @@ -42,16 +42,18 @@ def dummy_backend(inputs: torch.Tensor, tq) -> torch.Tensor: model = torch.nn.Linear(16, 16, bias=False) cfg = { - "quant_cfg": { - "*weight_quantizer": { - "enable": True, - "num_bits": 8, - "axis": None, - "backend": "dummy_backend", - "backend_extra_args": {"offset": 2.5}, + "quant_cfg": [ + { + "*weight_quantizer": { + "enable": True, + "num_bits": 8, + "axis": None, + "backend": "dummy_backend", + "backend_extra_args": {"offset": 2.5}, + } }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } @@ -88,10 +90,10 @@ def cached_backend(inputs: torch.Tensor, tq: TensorQuantizer) -> torch.Tensor: model = torch.nn.Linear(16, 16, bias=False) cfg = { - "quant_cfg": { - "*weight_quantizer": {"enable": True, "backend": "cached_backend"}, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"enable": True, "backend": "cached_backend"}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } inputs = torch.randn(1, 16) diff --git a/tests/unit/torch/quantization/test_quantize_cpu.py b/tests/unit/torch/quantization/test_quantize_cpu.py index 641eafd2f..3f51f8f54 100644 --- a/tests/unit/torch/quantization/test_quantize_cpu.py +++ b/tests/unit/torch/quantization/test_quantize_cpu.py @@ -35,38 +35,39 @@ # A test config with double-quant (using 
`SequentialQuantizers`) WINT4INT8_CFG = { - "quant_cfg": { - "*weight_quantizer": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": 8, "axis": 0, "enable": True}, - ], - "*input_quantizer": {"num_bits": 8, "axis": None, "enable": True}, - }, + "quant_cfg": [ + { + "*weight_quantizer": [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, + {"num_bits": 8, "axis": 0, "enable": True}, + ] + }, + {"*input_quantizer": {"num_bits": 8, "axis": None, "enable": True}}, + ], "algorithm": "awq_lite", } # Test configs for per channel MSE calibration INT8_MSE_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ], "algorithm": "mse", } STATIC_WEIGHT_DYNAMIC_ACTIVATION_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": 8, - "axis": 0, - }, # Per-channel quantization - "*input_quantizer": { - "num_bits": 8, - "axis": (0, 1), - "type": "dynamic", + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, # Per-channel quantization + { + "*input_quantizer": { + "num_bits": 8, + "axis": (0, 1), + "type": "dynamic", + } }, # Dynamic per-token quantization - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } @@ -77,14 +78,16 @@ def compute_amax(self): quant_cfg_custom_calib = { - "quant_cfg": { - "*": { - "num_bits": 4, - "axis": None, - "enable": True, - "calibrator": (NewMaxCalibrator, (4, None, False)), + "quant_cfg": [ + { + "*": { + "num_bits": 4, + "axis": None, + "enable": True, + "calibrator": (NewMaxCalibrator, (4, None, False)), + } } - }, + ], "algorithm": "max", } @@ -131,7 +134,7 @@ def test_save_restore(model_cls, quant_config): def test_quantize_invalid_cfg(): model = SimpleLinear() config_invalid = { - "quant_cfg": {"*": 
{"num_bits": 4, "axis": 0, "block_sizes": {-1: 128}}}, + "quant_cfg": [{"*": {"num_bits": 4, "axis": 0, "block_sizes": {-1: 128}}}], "algorithm": "max", } with pytest.raises(ValidationError, match="axis must be None when block_sizes is not None."): @@ -170,12 +173,12 @@ def test_custom_calib_config(): def test_class_wise_config(): model = SimpleConvLinear() config = { - "quant_cfg": { - "nn.Linear": {"*": {"num_bits": 4, "axis": -1, "enable": True}}, - "nn.Conv2d": {"*": {"num_bits": 8, "enable": True}}, - "nn.BatchNorm2d": {"*": {"enable": False}}, - "*output_quantizer": {"num_bits": 8, "enable": True}, - }, + "quant_cfg": [ + {"nn.Linear": {"*": {"num_bits": 4, "axis": -1, "enable": True}}}, + {"nn.Conv2d": {"*": {"num_bits": 8, "enable": True}}}, + {"nn.BatchNorm2d": {"*": {"enable": False}}}, + {"*output_quantizer": {"num_bits": 8, "enable": True}}, + ], "algorithm": "max", } @@ -222,33 +225,25 @@ def test_static_weight_dynamic_activations(): def test_block_sizes_axis_model(): REF_QUANT_CFG = { # noqa: N806 - "quant_cfg": { - "*weight_quantizer": { - "num_bits": 8, - "axis": 0, - }, - "*input_quantizer": { - "num_bits": 8, - "axis": None, - "type": "dynamic", - }, - "default": {"enable": False}, - }, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, + {"*input_quantizer": {"num_bits": 8, "axis": None, "type": "dynamic"}}, + {"default": {"enable": False}}, + ], "algorithm": "max", } QUANT_CFG = { # noqa: N806 - "quant_cfg": { - "*weight_quantizer": { - "num_bits": 8, - "block_sizes": {1: None}, + "quant_cfg": [ + {"*weight_quantizer": {"num_bits": 8, "block_sizes": {1: None}}}, + { + "*input_quantizer": { + "num_bits": 8, + "block_sizes": {0: None, 1: None}, + "type": "dynamic", + } }, - "*input_quantizer": { - "num_bits": 8, - "block_sizes": {0: None, 1: None}, - "type": "dynamic", - }, - "default": {"enable": False}, - }, + {"default": {"enable": False}}, + ], "algorithm": "max", } model_ref = SimpleLinear() diff --git 
a/tests/unit/torch/quantization/test_tensor_quant_cpu.py b/tests/unit/torch/quantization/test_tensor_quant_cpu.py index d5c6479cd..725f9eb7c 100644 --- a/tests/unit/torch/quantization/test_tensor_quant_cpu.py +++ b/tests/unit/torch/quantization/test_tensor_quant_cpu.py @@ -89,14 +89,16 @@ def test_num_bits(self): WINT4INT8_CFG = { - "quant_cfg": { - "*weight_quantizer": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": 8, "axis": 0, "enable": True}, - ], - "*input_quantizer": {"num_bits": 8, "enable": True}, - "default": {"enable": False}, - }, + "quant_cfg": [ + { + "*weight_quantizer": [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, + {"num_bits": 8, "axis": 0, "enable": True}, + ] + }, + {"*input_quantizer": {"num_bits": 8, "enable": True}}, + {"default": {"enable": False}}, + ], "algorithm": "awq_full", } @@ -109,10 +111,10 @@ def test_set_quantizer_cxt(): state_dict = model.state_dict() output_ref = model(inputs) - mtq.set_quantizer_by_cfg(model, {"*output_quantizer": {"enable": True}}) + mtq.set_quantizer_by_cfg(model, [{"*output_quantizer": {"enable": True}}]) with mtq.set_quantizer_by_cfg_context( - model, {"*": {"enable": False}, "*output_quantizer": {"enable": True}} + model, [{"*": {"enable": False}}, {"*output_quantizer": {"enable": True}}] ): for name, module in model.named_modules(): if not isinstance(module, TensorQuantizer): @@ -123,7 +125,7 @@ def test_set_quantizer_cxt(): assert not module.is_enabled mtq.calibrate(model, "max", lambda model: model(inputs * 10)) - mtq.set_quantizer_by_cfg(model, {"*output_quantizer": {"enable": False}}) + mtq.set_quantizer_by_cfg(model, [{"*output_quantizer": {"enable": False}}]) output_test = model(inputs) assert torch.allclose(output_ref, output_test) From d99e4aeea8957e5b2fae04531e9e3e90810f4bd5 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Wed, 18 Mar 2026 02:11:41 +0000 Subject: [PATCH 02/32] Make quant_cfg a list of tuples, 
dict is too much Signed-off-by: Shengliang Xu --- examples/llm_autodeploy/run_auto_quantize.py | 4 +- examples/llm_ptq/hf_ptq.py | 30 +- .../llm_export_utils/quantization_utils.py | 24 +- modelopt/torch/export/unified_export_hf.py | 2 +- modelopt/torch/quantization/algorithms.py | 12 +- .../backends/fp8_per_tensor_gemm.py | 10 +- .../torch/quantization/backends/nvfp4_gemm.py | 10 +- modelopt/torch/quantization/config.py | 449 ++++++++++-------- modelopt/torch/quantization/conversion.py | 12 +- modelopt/torch/quantization/model_calib.py | 5 +- modelopt/torch/quantization/model_quant.py | 2 +- .../torch/quantization/utils/core_utils.py | 12 +- tests/_test_utils/torch/export/utils.py | 163 ++++--- .../torch/quantization/onnx_export.py | 8 +- .../torch/quantization/quantize_common.py | 2 +- tests/unit/recipe/test_loader.py | 8 +- .../plugins/test_attention_quant.py | 4 +- .../quantization/plugins/test_huggingface.py | 10 +- .../unit/torch/quantization/test_autoquant.py | 35 +- .../test_compute_quantization_mse.py | 4 +- .../torch/quantization/test_custom_backend.py | 15 +- .../torch/quantization/test_quantize_cpu.py | 66 +-- .../quantization/test_tensor_quant_cpu.py | 19 +- 23 files changed, 490 insertions(+), 416 deletions(-) diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index 570eca3d8..6e49de5ad 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -100,11 +100,11 @@ def loss_func(output, data): if enable_kv_cache_quantization: mtq.set_quantizer_by_cfg( model, - quant_cfg=[{"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}], + quant_cfg=[("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True})], ) # Lets calibrate only the output quantizer this time. Let's disable all other quantizers. 
with mtq.set_quantizer_by_cfg_context( - model, [{"*": {"enable": False}}, {"*output_quantizer": {"enable": True}}] + model, [("*", {"enable": False}), ("*output_quantizer", {"enable": True})] ): mtq.calibrate(model, algorithm="max", forward_loop=calibrate_loop) return model diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index dbccce7f9..24421598c 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -82,11 +82,9 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None: Creates a new dict for the KV bmm quantizer config to avoid mutating shared references. """ - for i, entry in enumerate(quant_cfg): - if "*[kv]_bmm_quantizer" in entry: - quant_cfg[i] = { - "*[kv]_bmm_quantizer": {**entry["*[kv]_bmm_quantizer"], "use_constant_amax": True} - } + for i, (pattern, cfg) in enumerate(quant_cfg): + if pattern == "*[kv]_bmm_quantizer": + quant_cfg[i] = ("*[kv]_bmm_quantizer", {**cfg, "use_constant_amax": True}) break @@ -145,7 +143,7 @@ def extract_and_prepare_language_model_from_vl(full_model): # Apply disabled quant to all modules that are not part of language_model # This excludes them during HF export disabled_quant_cfg = { - "quant_cfg": {"default": {"enable": False}}, + "quant_cfg": ("default", {"enable": False}), "algorithm": "max", } @@ -333,7 +331,7 @@ def forward_step(model, batch): getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"] ) kv_cache_quant_cfg = [ - e for e in kv_cache_quant_cfg if "default" not in e + e for e in kv_cache_quant_cfg if e[0] != "default" ] # keep other quantizers from auto_quantize if args.kv_cache_qformat in _KV_CAST_FORMATS: @@ -343,7 +341,7 @@ def forward_step(model, batch): if args.kv_cache_qformat not in _KV_CAST_FORMATS: # Calibrate only the KV cache quantizers; disable all others. 
with mtq.set_quantizer_by_cfg_context( - language_model, [{"*": {"enable": False}}, *kv_cache_quant_cfg] + language_model, [("*", {"enable": False}), *kv_cache_quant_cfg] ): mtq.calibrate(language_model, algorithm="max", forward_loop=calibrate_loop) return language_model @@ -546,13 +544,15 @@ def mono_quantize( # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") - quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} - quant_cfg["quant_cfg"]["*image*"] = {"enable": False} + quant_cfg["quant_cfg"].append(("*vision*", {"enable": False})) + quant_cfg["quant_cfg"].append(("*image*", {"enable": False})) # Also disable radio model components specifically (for Nemotron-Parse) - quant_cfg["quant_cfg"]["*radio*"] = {"enable": False} - quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} - quant_cfg["quant_cfg"]["*encoder*"] = {"enable": False} # Disable encoder - quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific + quant_cfg["quant_cfg"].append(("*radio*", {"enable": False})) + quant_cfg["quant_cfg"].append(("*visual*", {"enable": False})) + quant_cfg["quant_cfg"].append(("*encoder*", {"enable": False})) # Disable encoder + quant_cfg["quant_cfg"].append( + ("*model_encoder*", {"enable": False}) + ) # Nemotron-Parse specific print("Quantization will only be applied to the decoder (text generation) component") if not model_is_already_quantized or calibration_only: @@ -971,7 +971,7 @@ def quantize_main( for prefix in mtp_layer_prefixes: # Add exclusion pattern for this MTP layer (e.g., "*layers.92*") pattern = f"*{prefix.split('.')[-2]}.{prefix.split('.')[-1]}*" - quant_cfg["quant_cfg"].append({pattern: {"enable": False}}) + quant_cfg["quant_cfg"].append((pattern, {"enable": False})) print(f"Excluding MTP layer from quantization: {pattern}") # Use constant amax for KV quantizers when a cast format is selected. 
diff --git a/modelopt/onnx/llm_export_utils/quantization_utils.py b/modelopt/onnx/llm_export_utils/quantization_utils.py index 0e2c3ed62..4df393b70 100644 --- a/modelopt/onnx/llm_export_utils/quantization_utils.py +++ b/modelopt/onnx/llm_export_utils/quantization_utils.py @@ -68,31 +68,33 @@ def get_quant_config(precision, lm_head_precision="fp16"): else: raise ValueError(f"Unsupported precision: {precision}") - quant_cfg_list: list[dict] = list(quant_cfg["quant_cfg"]) # type: ignore[arg-type] + quant_cfg_list: list[tuple] = list(quant_cfg["quant_cfg"]) # type: ignore[arg-type] if lm_head_precision == "fp8": - quant_cfg_list.append({"*lm_head.input_quantizer": {"num_bits": (4, 3), "axis": None}}) - quant_cfg_list.append({"*lm_head.weight_quantizer": {"num_bits": (4, 3), "axis": None}}) + quant_cfg_list.append(("*lm_head.input_quantizer", {"num_bits": (4, 3), "axis": None})) + quant_cfg_list.append(("*lm_head.weight_quantizer", {"num_bits": (4, 3), "axis": None})) elif lm_head_precision == "nvfp4": quant_cfg_list.append( - { - "*lm_head.input_quantizer": { + ( + "*lm_head.input_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - } + }, + ) ) quant_cfg_list.append( - { - "*lm_head.weight_quantizer": { + ( + "*lm_head.weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - } + }, + ) ) quant_cfg["quant_cfg"] = quant_cfg_list return quant_cfg diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 6f7cde466..55b6be56d 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -221,7 +221,7 @@ def _output_hook(module, input, output): try: with ( torch.no_grad(), - set_quantizer_by_cfg_context(model, [{"*": QuantizerAttributeConfig(enable=False)}]), + set_quantizer_by_cfg_context(model, [("*", 
QuantizerAttributeConfig(enable=False))]), ): dummy_forward_fn() finally: diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 11e75f680..7b607012b 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -80,9 +80,7 @@ def estimate_quant_compression_for_quantizer(quantizer_attr_cfg): raise ValueError(f"Unknown type {type(quantizer_attr_cfg)}, {quantizer_attr_cfg}") - return estimate_quant_compression_for_quantizer( - [v for entry in quant_cfg.quant_cfg for v in entry.values()] - ) + return estimate_quant_compression_for_quantizer([v for _, v in quant_cfg.quant_cfg]) class QuantRecipe(CustomHPType): @@ -99,7 +97,7 @@ def __init__(self, quant_cfg: str | dict[str, Any] | None = None, name: str | No name = self.get_auto_name_for_config(quant_cfg) or name if quant_cfg is None: - quant_cfg = {"quant_cfg": [{"*": {"enable": False}}]} + quant_cfg = {"quant_cfg": [("*", {"enable": False})]} elif isinstance(quant_cfg, str): assert hasattr(mtq_config, quant_cfg), f"Unknown quantization format {quant_cfg}" quant_cfg = getattr(mtq_config, quant_cfg) @@ -112,7 +110,7 @@ def __init__(self, quant_cfg: str | dict[str, Any] | None = None, name: str | No # Currently KV Cache quantization is enabled for some quantization formats and disabled for others # This breaks the monotonicity of the quantization formats in terms of weight compression Vs accuracy self.config.quant_cfg.append( - {"*output_quantizer": mtq_config.QuantizerAttributeConfig(enable=False)} + ("*output_quantizer", mtq_config.QuantizerAttributeConfig(enable=False)) ) self.compression = estimate_quant_compression(self.config) @@ -1323,7 +1321,7 @@ def _cfg_to_dict(v): return [_cfg_to_dict(c) for c in v] return v - quant_cfg = [{k: _cfg_to_dict(v)} for k, v in quant_cfg_dict.items()] + quant_cfg = [(k, _cfg_to_dict(v)) for k, v in quant_cfg_dict.items()] warnings.warn( "get_auto_quantize_config: returned config uses 
algorithm='max'. " "Per-recipe calibration algorithms (e.g. smoothquant, awq) are not preserved. " @@ -1365,7 +1363,7 @@ def _resolve_best_recipe(search_state, constraints, verbose=False): def _match_quantizer_cfg(quant_cfg, quantizer_attr): # Last-match-wins to mirror set_quantizer_by_cfg behavior matched = None - for pattern, cfg in (item for entry in quant_cfg for item in entry.items()): + for pattern, cfg in quant_cfg: if fnmatch.fnmatch(quantizer_attr, pattern): matched = cfg return matched diff --git a/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py b/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py index b854215f2..c77097299 100644 --- a/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py +++ b/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py @@ -97,13 +97,9 @@ def fp8_per_tensor_gemm(quant_module, input, bias=None): def _fp8_availability_check(module, input, args, kwargs): """Comprehensive check for FP8 GEMM availability.""" # Quantizer configs - quant_cfg_list: list[dict] = FP8_DEFAULT_CFG["quant_cfg"] - input_cfg = next( - v for entry in quant_cfg_list for k, v in entry.items() if k == "*input_quantizer" - ) - weight_cfg = next( - v for entry in quant_cfg_list for k, v in entry.items() if k == "*weight_quantizer" - ) + quant_cfg_list: list[tuple] = FP8_DEFAULT_CFG["quant_cfg"] + input_cfg = next(v for k, v in quant_cfg_list if k == "*input_quantizer") + weight_cfg = next(v for k, v in quant_cfg_list if k == "*weight_quantizer") # Check hardware support if not torch.cuda.is_available() or not fp8_compatible(): diff --git a/modelopt/torch/quantization/backends/nvfp4_gemm.py b/modelopt/torch/quantization/backends/nvfp4_gemm.py index 047d9c37a..ed7352800 100644 --- a/modelopt/torch/quantization/backends/nvfp4_gemm.py +++ b/modelopt/torch/quantization/backends/nvfp4_gemm.py @@ -211,14 +211,10 @@ def _nvfp4_availability_check(module, input, args, kwargs): if not hasattr(module, "input_quantizer") or not hasattr(module, 
"weight_quantizer"): return False - quant_cfg_list: list[dict] = mtq.NVFP4_DEFAULT_CFG["quant_cfg"] + quant_cfg_list: list[tuple] = mtq.NVFP4_DEFAULT_CFG["quant_cfg"] # Quantizer configs - input_cfg = next( - v for entry in quant_cfg_list for k, v in entry.items() if k == "*input_quantizer" - ) - weight_cfg = next( - v for entry in quant_cfg_list for k, v in entry.items() if k == "*weight_quantizer" - ) + input_cfg = next(v for k, v in quant_cfg_list if k == "*input_quantizer") + weight_cfg = next(v for k, v in quant_cfg_list if k == "*weight_quantizer") # Check input quantizer config for key, value in input_cfg.items(): diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 3471fa562..de423bbda 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -143,37 +143,37 @@ from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.utils.network import ConstructorLike -_default_disabled_quantizer_cfg: list[dict] = [ - {"nn.BatchNorm1d": {"*": {"enable": False}}}, - {"nn.BatchNorm2d": {"*": {"enable": False}}}, - {"nn.BatchNorm3d": {"*": {"enable": False}}}, - {"nn.LeakyReLU": {"*": {"enable": False}}}, - {"*lm_head*": {"enable": False}}, - {"*proj_out.*": {"enable": False}}, # In Whisper model, lm_head has key name proj_out - {"*block_sparse_moe.gate*": {"enable": False}}, # Skip the MOE router - {"*router*": {"enable": False}}, # Skip the MOE router - {"*mlp.gate.*": {"enable": False}}, # Skip the MOE router - {"*mlp.shared_expert_gate.*": {"enable": False}}, # Skip the MOE router - {"*linear_attn.conv1d*": {"enable": False}}, - {"*mixer.conv1d*": {"enable": False}}, # Skip mamba conv1d - {"*output_layer*": {"enable": False}}, - {"output.*": {"enable": False}}, - {"default": {"enable": False}}, +_default_disabled_quantizer_cfg: list[tuple] = [ + ("nn.BatchNorm1d", {"*": {"enable": False}}), + ("nn.BatchNorm2d", {"*": {"enable": False}}), + 
("nn.BatchNorm3d", {"*": {"enable": False}}), + ("nn.LeakyReLU", {"*": {"enable": False}}), + ("*lm_head*", {"enable": False}), + ("*proj_out.*", {"enable": False}), # In Whisper model, lm_head has key name proj_out + ("*block_sparse_moe.gate*", {"enable": False}), # Skip the MOE router + ("*router*", {"enable": False}), # Skip the MOE router + ("*mlp.gate.*", {"enable": False}), # Skip the MOE router + ("*mlp.shared_expert_gate.*", {"enable": False}), # Skip the MOE router + ("*linear_attn.conv1d*", {"enable": False}), + ("*mixer.conv1d*", {"enable": False}), # Skip mamba conv1d + ("*output_layer*", {"enable": False}), + ("output.*", {"enable": False}), + ("default", {"enable": False}), ] -_mamba_moe_disabled_quantizer_cfg: list[dict] = [ - {"*fc1_latent_proj*": {"enable": False}}, # Skip Latent MOE - {"*fc2_latent_proj*": {"enable": False}}, # Skip Latent MOE - {"*q_proj*": {"enable": False}}, # Skip QKV Linear - {"*k_proj*": {"enable": False}}, # Skip QKV Linear - {"*v_proj*": {"enable": False}}, # Skip QKV Linear - {"*o_proj*": {"enable": False}}, # Skip QKV Output Projection +_mamba_moe_disabled_quantizer_cfg: list[tuple] = [ + ("*fc1_latent_proj*", {"enable": False}), # Skip Latent MOE + ("*fc2_latent_proj*", {"enable": False}), # Skip Latent MOE + ("*q_proj*", {"enable": False}), # Skip QKV Linear + ("*k_proj*", {"enable": False}), # Skip QKV Linear + ("*v_proj*", {"enable": False}), # Skip QKV Linear + ("*o_proj*", {"enable": False}), # Skip QKV Output Projection ] INT8_DEFAULT_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -181,8 +181,8 @@ INT8_SMOOTHQUANT_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ("*weight_quantizer", 
{"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), *_default_disabled_quantizer_cfg, ], "algorithm": "smoothquant", @@ -190,8 +190,8 @@ INT8_WEIGHT_ONLY_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"enable": False}}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"enable": False}), *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -199,8 +199,8 @@ FP8_DEFAULT_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -208,8 +208,8 @@ MAMBA_MOE_FP8_AGGRESSIVE_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, ], @@ -218,26 +218,27 @@ MAMBA_MOE_FP8_CONSERVATIVE_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, - {"*mixer.in_proj*": {"enable": False}}, # Skip mamba linear - {"*mixer.out_proj*": {"enable": False}}, # Skip mamba linear + ("*mixer.in_proj*", {"enable": False}), # Skip mamba linear + ("*mixer.out_proj*", {"enable": False}), # Skip mamba linear ], "algorithm": "max", } FP8_PER_CHANNEL_PER_TOKEN_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": (4, 3), "axis": 0}}, - { - "*input_quantizer": { + 
("*weight_quantizer", {"num_bits": (4, 3), "axis": 0}), + ( + "*input_quantizer", + { "num_bits": (4, 3), "type": "dynamic", "block_sizes": {-1: None}, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -246,14 +247,15 @@ # FP8 2D blockwise fake quantization config for deepseek models FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": (4, 3), "block_sizes": {-1: 128, -2: 128}, "enable": True, - } - }, - {"*input_quantizer": {"enable": False}}, + }, + ), + ("*input_quantizer", {"enable": False}), *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -261,14 +263,15 @@ INT4_BLOCKWISE_WEIGHT_ONLY_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": 4, "block_sizes": {-1: 128}, "enable": True, - } - }, - {"*input_quantizer": {"enable": False}}, + }, + ), + ("*input_quantizer", {"enable": False}), *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -277,14 +280,15 @@ INT4_AWQ_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True, - } - }, - {"*input_quantizer": {"enable": False}}, + }, + ), + ("*input_quantizer", {"enable": False}), *_default_disabled_quantizer_cfg, ], "algorithm": {"method": "awq_lite", "alpha_step": 0.1}, @@ -296,8 +300,9 @@ # for weights. 
This could change in the future W4A8_AWQ_BETA_CFG = { "quant_cfg": [ - { - "*weight_quantizer": [ + ( + "*weight_quantizer", + [ { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, @@ -307,14 +312,15 @@ "num_bits": (4, 3), "enable": True, }, - ] - }, - { - "*input_quantizer": { + ], + ), + ( + "*input_quantizer", + { "num_bits": (4, 3), "enable": True, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": "awq_lite", @@ -322,20 +328,22 @@ MXFP8_DEFAULT_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, - { - "*input_quantizer": { + }, + ), + ( + "*input_quantizer", + { "num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -343,20 +351,22 @@ MXFP6_DEFAULT_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": (3, 2), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, - { - "*input_quantizer": { + }, + ), + ( + "*input_quantizer", + { "num_bits": (3, 2), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -364,20 +374,22 @@ MXFP4_DEFAULT_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, - { - "*input_quantizer": { + }, + ), + ( + "*input_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -385,14 +397,15 @@ W4A8_MXFP4_FP8_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { 
"num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, - {"*input_quantizer": {"num_bits": (4, 3), "axis": None}}, + }, + ), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -400,20 +413,22 @@ MXINT8_DEFAULT_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, - { - "*input_quantizer": { + }, + ), + ( + "*input_quantizer", + { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -421,26 +436,28 @@ FP8_KV_CFG = { "quant_cfg": [ - { - "*[kv]_bmm_quantizer": { + ( + "*[kv]_bmm_quantizer", + { "num_bits": (4, 3), "enable": True, - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "max", } FP8_AFFINE_KV_CFG = { "quant_cfg": [ - { - "*[kv]_bmm_quantizer": { + ( + "*[kv]_bmm_quantizer", + { "num_bits": (4, 3), "bias": {-2: None, -4: None, "type": "static"}, - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "max", } @@ -468,9 +485,9 @@ def _nvfp4_selective_quant_cfg( """Build an NVFP4 config that quantizes only the specified layer patterns.""" quant_cfg: dict[str, object] = [] for pattern in layer_patterns: - quant_cfg.append({f"{pattern}weight_quantizer": quantizer}) + quant_cfg.append((f"{pattern}weight_quantizer", quantizer)) if not weight_only: - quant_cfg.append({f"{pattern}input_quantizer": quantizer}) + quant_cfg.append((f"{pattern}input_quantizer", quantizer)) quant_cfg.extend(_default_disabled_quantizer_cfg) return {"quant_cfg": quant_cfg, "algorithm": algorithm} @@ -479,14 +496,15 @@ def _nvfp4_selective_quant_cfg( NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = { "quant_cfg": [ - { - 
"*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, "enable": True, - } - }, - {"*input_quantizer": _nvfp4_quantizer}, + }, + ), + ("*input_quantizer", _nvfp4_quantizer), *_default_disabled_quantizer_cfg, ], "algorithm": { @@ -497,14 +515,15 @@ def _nvfp4_selective_quant_cfg( NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, "enable": True, - } - }, - {"*input_quantizer": _nvfp4_quantizer}, + }, + ), + ("*input_quantizer", _nvfp4_quantizer), *_default_disabled_quantizer_cfg, ], "algorithm": { @@ -515,8 +534,8 @@ def _nvfp4_selective_quant_cfg( MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { "quant_cfg": [ - {"*weight_quantizer": _nvfp4_quantizer}, - {"*input_quantizer": _nvfp4_quantizer}, + ("*weight_quantizer", _nvfp4_quantizer), + ("*input_quantizer", _nvfp4_quantizer), *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, ], @@ -524,12 +543,12 @@ def _nvfp4_selective_quant_cfg( } MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { "quant_cfg": [ - {"*weight_quantizer": _nvfp4_quantizer}, - {"*input_quantizer": _nvfp4_quantizer}, + ("*weight_quantizer", _nvfp4_quantizer), + ("*input_quantizer", _nvfp4_quantizer), *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, - {"*mixer.in_proj*": {"enable": False}}, # Skip mamba linear - {"*mixer.out_proj*": {"enable": False}}, # Skip mamba linear + ("*mixer.in_proj*", {"enable": False}), # Skip mamba linear + ("*mixer.out_proj*", {"enable": False}), # Skip mamba linear ], "algorithm": "max", } @@ -544,21 +563,22 @@ def _nvfp4_selective_quant_cfg( NVFP4_AFFINE_KV_CFG = { "quant_cfg": [ - { - "*[kv]_bmm_quantizer": { + ( + "*[kv]_bmm_quantizer", + { **_nvfp4_quantizer, "bias": {-2: None, -4: None, "type": "static"}, - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", 
{"enable": False}), ], "algorithm": "max", } NVFP4_KV_CFG = { "quant_cfg": [ - {"*[kv]_bmm_quantizer": _nvfp4_quantizer}, - {"default": {"enable": False}}, + ("*[kv]_bmm_quantizer", _nvfp4_quantizer), + ("default", {"enable": False}), ], "algorithm": "max", } @@ -566,54 +586,61 @@ def _nvfp4_selective_quant_cfg( # Moved from examples/diffusers/quantization/config.py to here NVFP4_FP8_MHA_CONFIG = { "quant_cfg": [ - {"*weight_quantizer": _nvfp4_quantizer}, - {"*input_quantizer": _nvfp4_quantizer}, - {"*output_quantizer": {"enable": False}}, - { - "*q_bmm_quantizer": { + ("*weight_quantizer", _nvfp4_quantizer), + ("*input_quantizer", _nvfp4_quantizer), + ("*output_quantizer", {"enable": False}), + ( + "*q_bmm_quantizer", + { "num_bits": (4, 3), - } - }, - { - "*k_bmm_quantizer": { + }, + ), + ( + "*k_bmm_quantizer", + { "num_bits": (4, 3), - } - }, - { - "*v_bmm_quantizer": { + }, + ), + ( + "*v_bmm_quantizer", + { "num_bits": (4, 3), - } - }, - { - "*softmax_quantizer": { + }, + ), + ( + "*softmax_quantizer", + { "num_bits": (4, 3), - } - }, - { - "transformer_blocks*bmm2_output_quantizer": { + }, + ), + ( + "transformer_blocks*bmm2_output_quantizer", + { "num_bits": (4, 3), - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "max", } NVFP4_KV_ROTATE_CFG = { "quant_cfg": [ - { - "*q_bmm_quantizer": { + ( + "*q_bmm_quantizer", + { "enable": False, "rotate": True, - } - }, - { - "*k_bmm_quantizer": { + }, + ), + ( + "*k_bmm_quantizer", + { **_nvfp4_quantizer, "rotate": True, - } - }, - {"*v_bmm_quantizer": _nvfp4_quantizer}, + }, + ), + ("*v_bmm_quantizer", _nvfp4_quantizer), ], "algorithm": "max", } @@ -624,19 +651,21 @@ def _nvfp4_selective_quant_cfg( W4A8_NVFP4_FP8_CFG = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, "enable": True, - } - }, - { - "*input_quantizer": { + }, + ), + ( + 
"*input_quantizer", + { "num_bits": (4, 3), "enable": True, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -644,20 +673,22 @@ def _nvfp4_selective_quant_cfg( MXFP4_MLP_WEIGHT_ONLY_CFG = { "quant_cfg": [ - { - "*mlp*weight_quantizer": { + ( + "*mlp*weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, - { - "*block_sparse_moe*weight_quantizer": { + }, + ), + ( + "*block_sparse_moe*weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - } - }, + }, + ), *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -670,7 +701,6 @@ def _nvfp4_selective_quant_cfg( NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"]) NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) - # DO NOT ADD NEW CONFIGS HERE. If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file choices: set[str] = { @@ -1406,13 +1436,14 @@ class GPTQLiteConfig(QuantizeAlgorithmConfig): ) -_QuantizeQuantCfgEntryType = dict[ - str | Callable, +_QuantizeQuantCfgEntryValueType = ( QuantizerAttributeConfig | list[QuantizerAttributeConfig] | dict[str | Callable, QuantizerAttributeConfig | list[QuantizerAttributeConfig]] - | dict[str, Any], -] + | dict[str, Any] +) + +_QuantizeQuantCfgEntryType = tuple[str | Callable, _QuantizeQuantCfgEntryValueType] QuantizeQuantCfgType = list[_QuantizeQuantCfgEntryType] @@ -1425,7 +1456,7 @@ class QuantizeConfig(ModeloptBaseConfig): """Default configuration for ``quantize`` mode.""" quant_cfg: QuantizeQuantCfgType = ModeloptField( - default=[{"default": {"num_bits": 8, "axis": None}}], + default=[("default", {"num_bits": 8, "axis": None})], title="Quantization configuration", validate_default=True, ) @@ -1437,6 +1468,38 @@ class QuantizeConfig(ModeloptBaseConfig): validate_default=True, ) + 
@field_validator("quant_cfg", mode="before") + @classmethod + def normalize_quant_cfg(cls, v): + """Normalize quant_cfg entries: convert single-key dicts to (key, value) tuples. + + This allows loading from YAML/JSON (which produces dicts) while the internal + representation is always a list of tuples. + """ + if not isinstance(v, list): + return v + result = [] + for entry in v: + if isinstance(entry, dict) and len(entry) == 1: + result.append(next(iter(entry.items()))) + else: + result.append(entry) + return result + + @field_validator("quant_cfg", mode="after") + @classmethod + def validate_quant_cfg_entries(cls, v): + """Validate quantizer attribute configs to surface errors (e.g. invalid axis/block_sizes). + + When a tuple's value contains keys that are QuantizerAttributeConfig fields, validate it + as a QuantizerAttributeConfig to catch invalid configurations early. + """ + qac_fields = set(QuantizerAttributeConfig.model_fields.keys()) + for _pattern, cfg in v: + if isinstance(cfg, dict) and qac_fields & set(cfg.keys()): + QuantizerAttributeConfig.model_validate(cfg) + return v + class CompressConfig(ModeloptBaseConfig): """Default configuration for ``compress`` mode.""" @@ -1474,7 +1537,7 @@ def _not_dynamic(cfg): ) quant_cfg: list = config.get("quant_cfg") or [] - for name, cfg in (item for entry in quant_cfg for item in entry.items()): + for name, cfg in quant_cfg: if "weight_quantizer" in name: # We don't calibrate weight quantizer continue diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 7f95d5dde..705d9686a 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -60,7 +60,7 @@ def convert_to_quantized_model(model: ModelLikeModule, config: QuantizeConfig) - model = model.init_modellike() if isinstance(model, ModelLikeModule) else model replace_quant_module(model, version=ModeloptStateManager(model).state_version) - set_quantizer_by_cfg(model, 
config.get("quant_cfg", {})) + set_quantizer_by_cfg(model, config.get("quant_cfg", [])) metadata = {} update_quantize_metadata(model, config, metadata) @@ -76,7 +76,7 @@ def convert_to_quantized_model_svdquant( model = model.init_modellike() if isinstance(model, ModelLikeModule) else model create_and_replace_svdquant_linear_on_the_fly(model) - set_quantizer_by_cfg(model, config.get("quant_cfg", {})) + set_quantizer_by_cfg(model, config.get("quant_cfg", [])) metadata = {} update_quantize_metadata(model, config, metadata) @@ -214,7 +214,7 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): """Update the quantizer attributes based on the specified `quant_cfg`. - `quant_cfg` is a list of single-key dicts mapping wildcards or filter functions + `quant_cfg` is a list of ``(pattern, attrs)`` tuples mapping wildcards or filter functions to its quantizer attributes which are defined in :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. The wildcards or filter functions are matched against the quantizer module names. @@ -228,7 +228,7 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType See :meth:`set_quantizer_attribute ` for more details. """ - items = [(k, v) for entry in quant_cfg for k, v in entry.items()] + items = list(quant_cfg) for pattern, cfg in items: if str(pattern) == "default": set_quantizer_attribute(quant_model, "*", cfg) @@ -321,9 +321,7 @@ def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuan Use this context manager with caution. Changing certain attributes of the quantizer such as `calibrator` can lead to unexpected behavior. """ - assert not any( - cfg for entry in quant_cfg for cfg in entry.values() if isinstance(cfg, (list, tuple)) - ), "list of config not support." + assert not any(isinstance(v, list) for _, v in quant_cfg), "list of config not support." 
original_attributes = {} for name, module in quant_model.named_modules(): diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 1efd497b3..fc47e55fa 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -35,7 +35,6 @@ from modelopt.torch.utils.perf import get_used_gpu_mem_fraction from .calib import MseCalibrator, NVFP4MSECalibrator -from .config import QuantizerAttributeConfig from .conversion import create_and_replace_svdquant_linear_on_the_fly, set_quantizer_by_cfg_context from .nn import NVFP4StaticQuantizer, QuantModule, SequentialQuantizer, TensorQuantizer from .utils import ( @@ -1102,9 +1101,7 @@ def forward(self, input, *args, **kwargs): self.awq_lite.num_cache_steps += 1 self.awq_lite.num_tokens += input.numel() / input.shape[-1] if self.awq_lite.is_input_quantized: - with set_quantizer_by_cfg_context( - self.input_quantizer, [{"*": QuantizerAttributeConfig(enable=True)}] - ): + with set_quantizer_by_cfg_context(self.input_quantizer, [("*", {"enable": True})]): max_calibrate(self.input_quantizer, lambda quantizer: quantizer(input), False) return out_actual diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index eed0f251f..e637641d9 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -527,7 +527,7 @@ def forward_backward_step(model, batch) -> None: "checkpoint": checkpoint, } # Disable all quantizers; AutoQuantize will enable the needed ones - set_quantizer_by_cfg(model, [{"*": QuantizerAttributeConfig(enable=False)}]) + set_quantizer_by_cfg(model, [("*", QuantizerAttributeConfig(enable=False))]) searcher.search(model, constraints, config=search_config) # type: ignore[arg-type] return model, searcher.state_dict() diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 
ab05bec13..0be7736da 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -310,11 +310,11 @@ def calibrate_with_adapters(model, args): def disable_lora_quantizers_in_config(config, layers): """Turns off input, weight, and output quantizers for LoRA weights and LoRALinear layers in config.""" - config["quant_cfg"]["*lora*"] = {"enable": False} + config["quant_cfg"]["*lora*"] = ("enable", False) for layer in layers: - config["quant_cfg"][f"*{layer}.input_quantizer"] = {"enable": False} - config["quant_cfg"][f"*{layer}.weight_quantizer"] = {"enable": False} - config["quant_cfg"][f"*{layer}.output_quantizer"] = {"enable": False} + config["quant_cfg"][f"*{layer}.input_quantizer"] = ("enable", False) + config["quant_cfg"][f"*{layer}.weight_quantizer"] = ("enable", False) + config["quant_cfg"][f"*{layer}.output_quantizer"] = ("enable", False) return config @@ -828,8 +828,8 @@ def update_quant_cfg_with_kv_cache_quant( """Update the quant_cfg with the kv cache quant_cfg.""" # If quant_cfg["quant_cfg"] is None, it corresponds to only kv cache quantization case quant_cfg = copy.deepcopy(quant_cfg) - inner: list = quant_cfg.get("quant_cfg") or [{"default": {"enable": False}}] - quant_cfg["quant_cfg"] = inner + [{k: v} for k, v in kv_cache_quant_cfg.items()] + inner: list = quant_cfg.get("quant_cfg") or [("default", {"enable": False})] + quant_cfg["quant_cfg"] = inner + list(kv_cache_quant_cfg.items()) # Set default algorithm for kv cache quantization if not provided. 
if not quant_cfg.get("algorithm"): diff --git a/tests/_test_utils/torch/export/utils.py b/tests/_test_utils/torch/export/utils.py index c8514769a..36618de18 100644 --- a/tests/_test_utils/torch/export/utils.py +++ b/tests/_test_utils/torch/export/utils.py @@ -86,116 +86,126 @@ def forward(self, x): # Quantization configs partial_fp8_config = { "quant_cfg": [ - {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"default": {"num_bits": 8, "enable": False}}, + ("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), + ("default", {"num_bits": 8, "enable": False}), ], "algorithm": "max", } partial_w4a8_config = { "quant_cfg": [ - { - "*.2.weight_quantizer": [ + ( + "*.2.weight_quantizer", + [ {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, {"num_bits": (4, 3), "axis": None, "enable": True}, - ] - }, - {"*.2.input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, - {"default": {"num_bits": 8, "enable": False}}, + ], + ), + ("*.2.input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), + ("default", {"num_bits": 8, "enable": False}), ], "algorithm": "awq_lite", } partial_nvfp4_config = { "quant_cfg": [ - { - "*.1.weight_quantizer": { + ( + "*.1.weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - }, - { - "*.1.input_quantizer": { + }, + ), + ( + "*.1.input_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - }, - { - "*.2.weight_quantizer": { + }, + ), + ( + "*.2.weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - }, - { - "*.2.input_quantizer": { + }, + ), + ( + "*.2.input_quantizer", + { "num_bits": (2, 
1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "max", } partial_nvfp4_awq_config = { "quant_cfg": [ - { - "*.2.weight_quantizer": { + ( + "*.2.weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - }, - { - "*.2.input_quantizer": { + }, + ), + ( + "*.2.input_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - }, - { - "*.1.weight_quantizer": { + }, + ), + ( + "*.1.weight_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": False, - } - }, - { - "*.1.input_quantizer": { + }, + ), + ( + "*.1.input_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": False, - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "awq_lite", } partial_int4_awq_config = { "quant_cfg": [ - { - "*.2.weight_quantizer": { + ( + "*.2.weight_quantizer", + { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True, - } - }, - {"*.2.input_quantizer": {"enable": False}}, - {"default": {"enable": False}}, + }, + ), + ("*.2.input_quantizer", {"enable": False}), + ("default", {"enable": False}), ], "algorithm": {"method": "awq_lite", "alpha_step": 0.1}, # "algorithm": {"method": "awq_full", "alpha_step": 0.1, "max_co_batch_size": 1024}, @@ -204,65 +214,66 @@ def forward(self, x): partial_fp8_kv_cache_config = { "quant_cfg": [ - {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, - {"default": {"enable": False}}, + 
("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), + ("default", {"enable": False}), ], "algorithm": "max", } partial_int8_kv_cache_config = { "quant_cfg": [ - {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*output_quantizer": {"num_bits": 8, "axis": None, "enable": True}}, - {"default": {"enable": False}}, + ("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*output_quantizer", {"num_bits": 8, "axis": None, "enable": True}), + ("default", {"enable": False}), ], "algorithm": "max", } partial_nvfp4_kv_cache_config = { "quant_cfg": [ - {"*.1.weight_quantizer": {"num_bits": (4, 3), "axis": None}}, - {"*.1.input_quantizer": {"num_bits": (4, 3), "axis": None}}, - { - "*[kv]_bmm_quantizer": { + ("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), + ( + "*[kv]_bmm_quantizer", + { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, "enable": True, - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "max", } only_weight_quantizer_fp8_config = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, - {"*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, - {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, - {"default": {"enable": False}}, + ("*weight_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), + ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), + ("default", {"enable": False}), ], "algorithm": 
"max", } only_input_quantizer_fp8_config = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, - {"*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, - {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, - {"default": {"enable": False}}, + ("*weight_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), + ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), + ("default", {"enable": False}), ], "algorithm": "max", } only_output_quantizer_fp8_config = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, - {"*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": False}}, - {"*output_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}}, - {"default": {"enable": False}}, + ("*weight_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), + ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), + ("default", {"enable": False}), ], "algorithm": "max", } diff --git a/tests/_test_utils/torch/quantization/onnx_export.py b/tests/_test_utils/torch/quantization/onnx_export.py index c340f2695..757e5dbea 100644 --- a/tests/_test_utils/torch/quantization/onnx_export.py +++ b/tests/_test_utils/torch/quantization/onnx_export.py @@ -30,9 +30,9 @@ def onnx_export_tester(model, device, num_bits, per_channel_quantization, consta axis = 0 if per_channel_quantization else None config = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": num_bits, "axis": axis}}, - {"*input_quantizer": {"num_bits": num_bits}}, - {"default": {"enable": False}}, + ("*weight_quantizer", {"num_bits": num_bits, "axis": axis}), + ("*input_quantizer", {"num_bits": num_bits}), + ("default", {"enable": False}), ], "algorithm": "max", } @@ -76,7 +76,7 @@ 
def forward_loop(model): buffer.seek(0) providers = ["CUDAExecutionProvider"] if device != "cpu" else ["CPUExecutionProvider"] ort_session = onnxruntime.InferenceSession(buffer.read(), providers=providers) - ort_result = ort_session.run([], {"input": dummy_input.cpu().numpy()}) + ort_result = ort_session.run([], ("input", dummy_input.cpu().numpy())) ort_result = torch.tensor(ort_result[0]).to(device) torch_result = model(dummy_input) print(ort_result, torch_result) diff --git a/tests/_test_utils/torch/quantization/quantize_common.py b/tests/_test_utils/torch/quantization/quantize_common.py index eefb9013d..b52a3e204 100644 --- a/tests/_test_utils/torch/quantization/quantize_common.py +++ b/tests/_test_utils/torch/quantization/quantize_common.py @@ -252,7 +252,7 @@ def forward_loop(model): def auto_quantize_helper(model): model, search_state = mtq.auto_quantize( model, - constraints={"effective_bits": 8.0}, + constraints=("effective_bits", 8.0), quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model.get_dummy_input().cuda() for _ in range(2)], forward_step=lambda model, batch: model(batch), diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index 446a82e0f..a72205bbd 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -208,7 +208,13 @@ def test_general_ptq_yaml_matches_config_dicts(yaml_path, model_cfg_name, kv_cfg yaml_data = load_config(yaml_path) def _as_dict(qc): - return {k: v for entry in qc for k, v in entry.items()} + result = {} + for entry in qc: + if isinstance(entry, dict): + result.update(entry) + else: + result[entry[0]] = entry[1] + return result ptq = yaml_data["ptq_cfg"] assert {**_as_dict(model_cfg["quant_cfg"]), **_as_dict(kv_cfg["quant_cfg"])} == _as_dict( diff --git a/tests/unit/torch/quantization/plugins/test_attention_quant.py b/tests/unit/torch/quantization/plugins/test_attention_quant.py index 0c376b69e..560533eaf 100644 --- 
a/tests/unit/torch/quantization/plugins/test_attention_quant.py +++ b/tests/unit/torch/quantization/plugins/test_attention_quant.py @@ -62,8 +62,8 @@ def forward(self, hidden_states, **kwargs): kv_cache_config = { "quant_cfg": [ - {"*[kv]_bmm_quantizer": {"num_bits": 4, "enable": True}}, - {"*softmax_quantizer": {"enable": False}}, + ("*[kv]_bmm_quantizer", {"num_bits": 4, "enable": True}), + ("*softmax_quantizer", {"enable": False}), ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 2bc2aedc4..a68510fad 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -87,7 +87,7 @@ def test_convert_conv1d(): assert hasattr(module, "weight_quantizer") assert hasattr(module, "output_quantizer") - mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) + mtq.set_quantizer_attribute(model_test, "*", ("enable", False)) x = torch.randn(2, 3) out_1 = model_ref(x) @@ -95,8 +95,8 @@ def test_convert_conv1d(): assert torch.allclose(out_1, out_2) - mtq.set_quantizer_attribute(model_test, "*input_quantizer", {"enable": True}) - mtq.set_quantizer_attribute(model_test, "*weight_quantizer", {"enable": True}) + mtq.set_quantizer_attribute(model_test, "*input_quantizer", ("enable", True)) + mtq.set_quantizer_attribute(model_test, "*weight_quantizer", ("enable", True)) model_ref = PytorchModel() model_ref.load_state_dict(model_test.state_dict()) @@ -136,7 +136,7 @@ def test_dbrx(): expertglu_ref.w1, ) - mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) + mtq.set_quantizer_attribute(model_test, "*", ("enable", False)) x = torch.randn(1, 4, 32) out_1 = model_ref(x) @@ -170,7 +170,7 @@ def forward_step(model, batch): with context: best_model, search_history = mtq.auto_quantize( model, - constraints={"effective_bits": 11.0}, + constraints=("effective_bits", 11.0), 
quantization_formats=[mtq.INT8_DEFAULT_CFG], data_loader=[{"input_ids": input_ids, "labels": input_ids} for _ in range(2)], forward_step=forward_step, diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py index bf3f0cae8..52fce49d4 100644 --- a/tests/unit/torch/quantization/test_autoquant.py +++ b/tests/unit/torch/quantization/test_autoquant.py @@ -111,8 +111,8 @@ def test_quant_recipe_hparam(): # use this config to test custom quantization config INT8_CUSTOM_QUANT_TEST_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), *_default_disabled_quantizer_cfg, ], "algorithm": "smoothquant", @@ -145,7 +145,7 @@ def loss_func(output): best_model, search_history = mtq.auto_quantize( model, - constraints={"effective_bits": search_bits}, + constraints=("effective_bits", search_bits), quantization_formats=search_formats, data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -191,7 +191,7 @@ def loss_func(output): best_model, search_history = mtq.auto_quantize( model, - constraints={"effective_bits": 5.0}, + constraints=("effective_bits", 5.0), quantization_formats=[ mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG, @@ -214,7 +214,7 @@ def test_auto_quantize_disabled_layers_no_poison(): best_model, _ = mtq.auto_quantize( model, - constraints={"effective_bits": 5.0}, + constraints=("effective_bits", 5.0), quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -231,14 +231,15 @@ def test_auto_quantize_disabled_layers_no_poison(): INT4INT8_AWQ_CFG = { "quant_cfg": [ - { - "*weight_quantizer": [ + ( + "*weight_quantizer", + [ {"num_bits": 4, "block_sizes": {-1: 
128, "type": "static"}, "enable": True}, {"num_bits": 8, "axis": None, "enable": True}, - ] - }, - {"*input_quantizer": {"num_bits": 8, "axis": None, "enable": True}}, - {"default": {"enable": False}}, + ], + ), + ("*input_quantizer", {"num_bits": 8, "axis": None, "enable": True}), + ("default", {"enable": False}), ], "algorithm": "awq_lite", } @@ -267,7 +268,7 @@ def _test_data_parallel_auto_quantize(rank, size): model, search_history = mtq.auto_quantize( model, - constraints={"effective_bits": 11.0}, + constraints=("effective_bits", 11.0), quantization_formats=[mtq.INT8_SMOOTHQUANT_CFG], data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -376,7 +377,7 @@ def test_auto_quantize_checkpoint_resume(method, tmp_path, capsys): # First run: save checkpoint model_1, state_dict_1 = mtq.auto_quantize( model, - constraints={"effective_bits": 6.0}, + constraints=("effective_bits", 6.0), quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -395,7 +396,7 @@ def test_auto_quantize_checkpoint_resume(method, tmp_path, capsys): model_2 = SimpleLinear() model_2, state_dict_2 = mtq.auto_quantize( model_2, - constraints={"effective_bits": 6.0}, # Same constraint + constraints=("effective_bits", 6.0), # Same constraint quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model_2.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -463,7 +464,7 @@ def test_get_auto_quantize_config(method): _, search_state = mtq.auto_quantize( model, - constraints={"effective_bits": 6.0}, + constraints=("effective_bits", 6.0), quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model.get_input() for _ in range(4)], forward_step=lambda model, batch: model(batch), @@ -483,12 +484,12 @@ def 
test_get_auto_quantize_config(method): config = mtq.get_auto_quantize_config(search_state) assert "quant_cfg" in config assert isinstance(config["quant_cfg"], list) - assert any("*" in entry and entry["*"] == {"enable": False} for entry in config["quant_cfg"]) + assert any(pattern == "*" and cfg == {"enable": False} for pattern, cfg in config["quant_cfg"]) assert config["algorithm"] == "max" # Re-solve with different constraints config_resoled = mtq.get_auto_quantize_config( - search_state, constraints={"effective_bits": 12.0} + search_state, constraints=("effective_bits", 12.0) ) assert "quant_cfg" in config_resoled diff --git a/tests/unit/torch/quantization/test_compute_quantization_mse.py b/tests/unit/torch/quantization/test_compute_quantization_mse.py index 2cce0b28d..3c28a42e1 100644 --- a/tests/unit/torch/quantization/test_compute_quantization_mse.py +++ b/tests/unit/torch/quantization/test_compute_quantization_mse.py @@ -23,8 +23,8 @@ INT8_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_custom_backend.py b/tests/unit/torch/quantization/test_custom_backend.py index 5af6c249c..2a5643677 100644 --- a/tests/unit/torch/quantization/test_custom_backend.py +++ b/tests/unit/torch/quantization/test_custom_backend.py @@ -43,16 +43,17 @@ def dummy_backend(inputs: torch.Tensor, tq) -> torch.Tensor: cfg = { "quant_cfg": [ - { - "*weight_quantizer": { + ( + "*weight_quantizer", + { "enable": True, "num_bits": 8, "axis": None, "backend": "dummy_backend", "backend_extra_args": {"offset": 2.5}, - } - }, - {"default": {"enable": False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "max", } @@ -91,8 +92,8 @@ def cached_backend(inputs: torch.Tensor, tq: TensorQuantizer) -> torch.Tensor: model = 
torch.nn.Linear(16, 16, bias=False) cfg = { "quant_cfg": [ - {"*weight_quantizer": {"enable": True, "backend": "cached_backend"}}, - {"default": {"enable": False}}, + ("*weight_quantizer", {"enable": True, "backend": "cached_backend"}), + ("default", {"enable": False}), ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_quantize_cpu.py b/tests/unit/torch/quantization/test_quantize_cpu.py index 3f51f8f54..8bf652d81 100644 --- a/tests/unit/torch/quantization/test_quantize_cpu.py +++ b/tests/unit/torch/quantization/test_quantize_cpu.py @@ -36,13 +36,14 @@ # A test config with double-quant (using `SequentialQuantizers`) WINT4INT8_CFG = { "quant_cfg": [ - { - "*weight_quantizer": [ + ( + "*weight_quantizer", + [ {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, {"num_bits": 8, "axis": 0, "enable": True}, - ] - }, - {"*input_quantizer": {"num_bits": 8, "axis": None, "enable": True}}, + ], + ), + ("*input_quantizer", {"num_bits": 8, "axis": None, "enable": True}), ], "algorithm": "awq_lite", } @@ -50,23 +51,24 @@ # Test configs for per channel MSE calibration INT8_MSE_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), ], "algorithm": "mse", } STATIC_WEIGHT_DYNAMIC_ACTIVATION_CFG = { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, # Per-channel quantization - { - "*input_quantizer": { + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), # Per-channel quantization + ( + "*input_quantizer", + { "num_bits": 8, "axis": (0, 1), "type": "dynamic", - } - }, # Dynamic per-token quantization - {"default": {"enable": False}}, + }, + ), # Dynamic per-token quantization + ("default", {"enable": False}), ], "algorithm": "max", } @@ -79,14 +81,15 @@ def compute_amax(self): quant_cfg_custom_calib = { "quant_cfg": [ - { - "*": { + ( + 
"*", + { "num_bits": 4, "axis": None, "enable": True, "calibrator": (NewMaxCalibrator, (4, None, False)), - } - } + }, + ) ], "algorithm": "max", } @@ -134,7 +137,7 @@ def test_save_restore(model_cls, quant_config): def test_quantize_invalid_cfg(): model = SimpleLinear() config_invalid = { - "quant_cfg": [{"*": {"num_bits": 4, "axis": 0, "block_sizes": {-1: 128}}}], + "quant_cfg": [("*", {"num_bits": 4, "axis": 0, "block_sizes": {-1: 128}})], "algorithm": "max", } with pytest.raises(ValidationError, match="axis must be None when block_sizes is not None."): @@ -174,10 +177,10 @@ def test_class_wise_config(): model = SimpleConvLinear() config = { "quant_cfg": [ - {"nn.Linear": {"*": {"num_bits": 4, "axis": -1, "enable": True}}}, - {"nn.Conv2d": {"*": {"num_bits": 8, "enable": True}}}, - {"nn.BatchNorm2d": {"*": {"enable": False}}}, - {"*output_quantizer": {"num_bits": 8, "enable": True}}, + ("nn.Linear", {"*": {"num_bits": 4, "axis": -1, "enable": True}}), + ("nn.Conv2d", {"*": {"num_bits": 8, "enable": True}}), + ("nn.BatchNorm2d", {"*": {"enable": False}}), + ("*output_quantizer", {"num_bits": 8, "enable": True}), ], "algorithm": "max", } @@ -226,23 +229,24 @@ def test_static_weight_dynamic_activations(): def test_block_sizes_axis_model(): REF_QUANT_CFG = { # noqa: N806 "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"num_bits": 8, "axis": None, "type": "dynamic"}}, - {"default": {"enable": False}}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None, "type": "dynamic"}), + ("default", {"enable": False}), ], "algorithm": "max", } QUANT_CFG = { # noqa: N806 "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 8, "block_sizes": {1: None}}}, - { - "*input_quantizer": { + ("*weight_quantizer", {"num_bits": 8, "block_sizes": {1: None}}), + ( + "*input_quantizer", + { "num_bits": 8, "block_sizes": {0: None, 1: None}, "type": "dynamic", - } - }, - {"default": {"enable": 
False}}, + }, + ), + ("default", {"enable": False}), ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_tensor_quant_cpu.py b/tests/unit/torch/quantization/test_tensor_quant_cpu.py index 725f9eb7c..f560fcac6 100644 --- a/tests/unit/torch/quantization/test_tensor_quant_cpu.py +++ b/tests/unit/torch/quantization/test_tensor_quant_cpu.py @@ -90,14 +90,15 @@ def test_num_bits(self): WINT4INT8_CFG = { "quant_cfg": [ - { - "*weight_quantizer": [ + ( + "*weight_quantizer", + [ {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, {"num_bits": 8, "axis": 0, "enable": True}, - ] - }, - {"*input_quantizer": {"num_bits": 8, "enable": True}}, - {"default": {"enable": False}}, + ], + ), + ("*input_quantizer", {"num_bits": 8, "enable": True}), + ("default", {"enable": False}), ], "algorithm": "awq_full", } @@ -111,10 +112,10 @@ def test_set_quantizer_cxt(): state_dict = model.state_dict() output_ref = model(inputs) - mtq.set_quantizer_by_cfg(model, [{"*output_quantizer": {"enable": True}}]) + mtq.set_quantizer_by_cfg(model, [("*output_quantizer", {"enable": True})]) with mtq.set_quantizer_by_cfg_context( - model, [{"*": {"enable": False}}, {"*output_quantizer": {"enable": True}}] + model, [("*", {"enable": False}), ("*output_quantizer", {"enable": True})] ): for name, module in model.named_modules(): if not isinstance(module, TensorQuantizer): @@ -125,7 +126,7 @@ def test_set_quantizer_cxt(): assert not module.is_enabled mtq.calibrate(model, "max", lambda model: model(inputs * 10)) - mtq.set_quantizer_by_cfg(model, [{"*output_quantizer": {"enable": False}}]) + mtq.set_quantizer_by_cfg(model, [("*output_quantizer", {"enable": False})]) output_test = model(inputs) assert torch.allclose(output_ref, output_test) From b5bea214674ea520cc7d41762c185f37ea87ca3d Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Wed, 18 Mar 2026 02:36:31 +0000 Subject: [PATCH 03/32] yaml config format update Signed-off-by: Shengliang Xu --- 
modelopt/torch/quantization/config.py | 29 +++++- .../general/ptq/fp8_default-fp8_kv.yml | 67 ++++++++------ .../general/ptq/nvfp4_default-fp8_kv.yml | 71 ++++++++------- .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 81 +++++++++-------- .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 91 +++++++++++-------- tests/unit/recipe/test_loader.py | 8 +- .../quantization/plugins/test_huggingface.py | 8 +- 7 files changed, 207 insertions(+), 148 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index de423bbda..b439d1aa2 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -1471,17 +1471,36 @@ class QuantizeConfig(ModeloptBaseConfig): @field_validator("quant_cfg", mode="before") @classmethod def normalize_quant_cfg(cls, v): - """Normalize quant_cfg entries: convert single-key dicts to (key, value) tuples. + """Normalize quant_cfg entries: convert dict forms to (key, value) tuples. - This allows loading from YAML/JSON (which produces dicts) while the internal - representation is always a list of tuples. + Supports these dict forms for YAML/JSON compatibility: + + - ``{"pattern": ..., "enable": ..., "format": ...}`` — explicit object with top-level enable + - ``{"pattern": ..., "enable": ...}`` — enable-only (no format fields) + - ``{"pattern": ..., "format": ...}`` — explicit pattern/format object (legacy) + - ``{"": ...}`` — single-key dict (legacy) + + The internal representation is always a list of ``(pattern, cfg)`` tuples where + ``enable`` (if present at the top level) is merged into ``cfg``. 
""" if not isinstance(v, list): return v result = [] for entry in v: - if isinstance(entry, dict) and len(entry) == 1: - result.append(next(iter(entry.items()))) + if isinstance(entry, dict): + if "pattern" in entry: + pattern = entry["pattern"] + fmt = dict(entry.get("format") or {}) + if "enable" in entry: + fmt["enable"] = entry["enable"] + result.append((pattern, fmt)) + elif len(entry) == 1: + result.append(next(iter(entry.items()))) + else: + raise ValueError( + f"Invalid quant_cfg entry: {entry!r}. " + "Expected a single-key dict or an object with a 'pattern' key." + ) else: result.append(entry) return result diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index d8b6adbac..1d891c595 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -19,47 +19,54 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - '*input_quantizer': + - pattern: '*input_quantizer' + format: num_bits: e4m3 axis: - - '*weight_quantizer': + - pattern: '*weight_quantizer' + format: num_bits: e4m3 axis: - - '*[kv]_bmm_quantizer': + - pattern: '*[kv]_bmm_quantizer' + enable: true + format: num_bits: e4m3 - enable: true - - default: - enable: false - - '*block_sparse_moe.gate*': - enable: false - - '*linear_attn.conv1d*': - enable: false - - '*lm_head*': - enable: false - - '*mixer.conv1d*': - enable: false - - '*mlp.gate.*': - enable: false - - '*mlp.shared_expert_gate.*': - enable: false - - '*output_layer*': - enable: false - - '*proj_out.*': - enable: false - - '*router*': - enable: false - - output.*: - enable: false - - nn.BatchNorm1d: + - pattern: default + enable: false + - pattern: '*block_sparse_moe.gate*' + enable: false + - pattern: '*linear_attn.conv1d*' + enable: false + - pattern: '*lm_head*' + enable: false + - pattern: '*mixer.conv1d*' + enable: false + - pattern: '*mlp.gate.*' + enable: false + - pattern: '*mlp.shared_expert_gate.*' + 
enable: false + - pattern: '*output_layer*' + enable: false + - pattern: '*proj_out.*' + enable: false + - pattern: '*router*' + enable: false + - pattern: output.* + enable: false + - pattern: nn.BatchNorm1d + format: '*': enable: false - - nn.BatchNorm2d: + - pattern: nn.BatchNorm2d + format: '*': enable: false - - nn.BatchNorm3d: + - pattern: nn.BatchNorm3d + format: '*': enable: false - - nn.LeakyReLU: + - pattern: nn.LeakyReLU + format: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index 7f79bd47b..2ea22c87a 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ -19,55 +19,62 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - '*weight_quantizer': + - pattern: '*weight_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*input_quantizer': + - pattern: '*input_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*[kv]_bmm_quantizer': + - pattern: '*[kv]_bmm_quantizer' + enable: true + format: num_bits: e4m3 - enable: true - - default: - enable: false - - '*block_sparse_moe.gate*': - enable: false - - '*linear_attn.conv1d*': - enable: false - - '*lm_head*': - enable: false - - '*mixer.conv1d*': - enable: false - - '*mlp.gate.*': - enable: false - - '*mlp.shared_expert_gate.*': - enable: false - - '*output_layer*': - enable: false - - '*proj_out.*': - enable: false - - '*router*': - enable: false - - output.*: - enable: false - - nn.BatchNorm1d: + - pattern: default + enable: false + - pattern: '*block_sparse_moe.gate*' + enable: false + - pattern: '*linear_attn.conv1d*' + enable: false + - pattern: '*lm_head*' + enable: false + - pattern: '*mixer.conv1d*' + enable: false + - pattern: '*mlp.gate.*' + enable: false + - pattern: '*mlp.shared_expert_gate.*' + 
enable: false + - pattern: '*output_layer*' + enable: false + - pattern: '*proj_out.*' + enable: false + - pattern: '*router*' + enable: false + - pattern: output.* + enable: false + - pattern: nn.BatchNorm1d + format: '*': enable: false - - nn.BatchNorm2d: + - pattern: nn.BatchNorm2d + format: '*': enable: false - - nn.BatchNorm3d: + - pattern: nn.BatchNorm3d + format: '*': enable: false - - nn.LeakyReLU: + - pattern: nn.LeakyReLU + format: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index 46cac283d..8ebdd7391 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -19,69 +19,78 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - '*mlp*weight_quantizer': + - pattern: '*mlp*weight_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*mlp*input_quantizer': + - pattern: '*mlp*input_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*block_sparse_moe*weight_quantizer': + - pattern: '*block_sparse_moe*weight_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*block_sparse_moe*input_quantizer': + - pattern: '*block_sparse_moe*input_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*[kv]_bmm_quantizer': + - pattern: '*[kv]_bmm_quantizer' + enable: true + format: num_bits: e4m3 - enable: true - - default: - enable: false - - '*block_sparse_moe.gate*': - enable: false - - '*linear_attn.conv1d*': - enable: false - - '*lm_head*': - enable: false - - '*mixer.conv1d*': - enable: false - - '*mlp.gate.*': - enable: false - - '*mlp.shared_expert_gate.*': - enable: false - - '*output_layer*': - enable: false - - '*proj_out.*': - 
enable: false - - '*router*': - enable: false - - output.*: - enable: false - - nn.BatchNorm1d: + - pattern: default + enable: false + - pattern: '*block_sparse_moe.gate*' + enable: false + - pattern: '*linear_attn.conv1d*' + enable: false + - pattern: '*lm_head*' + enable: false + - pattern: '*mixer.conv1d*' + enable: false + - pattern: '*mlp.gate.*' + enable: false + - pattern: '*mlp.shared_expert_gate.*' + enable: false + - pattern: '*output_layer*' + enable: false + - pattern: '*proj_out.*' + enable: false + - pattern: '*router*' + enable: false + - pattern: output.* + enable: false + - pattern: nn.BatchNorm1d + format: '*': enable: false - - nn.BatchNorm2d: + - pattern: nn.BatchNorm2d + format: '*': enable: false - - nn.BatchNorm3d: + - pattern: nn.BatchNorm3d + format: '*': enable: false - - nn.LeakyReLU: + - pattern: nn.LeakyReLU + format: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index 57d5ecd2c..777599135 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -19,83 +19,94 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - '*mlp*weight_quantizer': + - pattern: '*mlp*weight_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*mlp*input_quantizer': + - pattern: '*mlp*input_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*block_sparse_moe*weight_quantizer': + - pattern: '*block_sparse_moe*weight_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*block_sparse_moe*input_quantizer': + - pattern: '*block_sparse_moe*input_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*o_proj*weight_quantizer': + - 
pattern: '*o_proj*weight_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*o_proj*input_quantizer': + - pattern: '*o_proj*input_quantizer' + enable: true + format: block_sizes: -1: 16 type: dynamic scale_bits: e4m3 num_bits: e2m1 - enable: true - - '*[kv]_bmm_quantizer': + - pattern: '*[kv]_bmm_quantizer' + enable: true + format: num_bits: e4m3 - enable: true - - default: - enable: false - - '*block_sparse_moe.gate*': - enable: false - - '*linear_attn.conv1d*': - enable: false - - '*lm_head*': - enable: false - - '*mixer.conv1d*': - enable: false - - '*mlp.gate.*': - enable: false - - '*mlp.shared_expert_gate.*': - enable: false - - '*output_layer*': - enable: false - - '*proj_out.*': - enable: false - - '*router*': - enable: false - - output.*: - enable: false - - nn.BatchNorm1d: + - pattern: default + enable: false + - pattern: '*block_sparse_moe.gate*' + enable: false + - pattern: '*linear_attn.conv1d*' + enable: false + - pattern: '*lm_head*' + enable: false + - pattern: '*mixer.conv1d*' + enable: false + - pattern: '*mlp.gate.*' + enable: false + - pattern: '*mlp.shared_expert_gate.*' + enable: false + - pattern: '*output_layer*' + enable: false + - pattern: '*proj_out.*' + enable: false + - pattern: '*router*' + enable: false + - pattern: output.* + enable: false + - pattern: nn.BatchNorm1d + format: '*': enable: false - - nn.BatchNorm2d: + - pattern: nn.BatchNorm2d + format: '*': enable: false - - nn.BatchNorm3d: + - pattern: nn.BatchNorm3d + format: '*': enable: false - - nn.LeakyReLU: + - pattern: nn.LeakyReLU + format: '*': enable: false diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index a72205bbd..af80dd78c 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -211,7 +211,13 @@ def _as_dict(qc): result = {} for entry in qc: if isinstance(entry, dict): - result.update(entry) + if "pattern" in entry: + fmt = 
dict(entry.get("format") or {}) + if "enable" in entry: + fmt["enable"] = entry["enable"] + result[entry["pattern"]] = fmt + else: + result.update(entry) else: result[entry[0]] = entry[1] return result diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index a68510fad..d672c355a 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -87,7 +87,7 @@ def test_convert_conv1d(): assert hasattr(module, "weight_quantizer") assert hasattr(module, "output_quantizer") - mtq.set_quantizer_attribute(model_test, "*", ("enable", False)) + mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) x = torch.randn(2, 3) out_1 = model_ref(x) @@ -95,8 +95,8 @@ def test_convert_conv1d(): assert torch.allclose(out_1, out_2) - mtq.set_quantizer_attribute(model_test, "*input_quantizer", ("enable", True)) - mtq.set_quantizer_attribute(model_test, "*weight_quantizer", ("enable", True)) + mtq.set_quantizer_attribute(model_test, "*input_quantizer", {"enable": True}) + mtq.set_quantizer_attribute(model_test, "*weight_quantizer", {"enable": True}) model_ref = PytorchModel() model_ref.load_state_dict(model_test.state_dict()) @@ -136,7 +136,7 @@ def test_dbrx(): expertglu_ref.w1, ) - mtq.set_quantizer_attribute(model_test, "*", ("enable", False)) + mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) x = torch.randn(1, 4, 32) out_1 = model_ref(x) From 1b8c4bfbccad8d4009a277612862f50aad90e711 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Wed, 18 Mar 2026 21:10:41 +0000 Subject: [PATCH 04/32] fix some extra quant_cfg Signed-off-by: Shengliang Xu --- docs/source/guides/_pytorch_quantization.rst | 10 +- examples/diffusers/quantization/config.py | 154 +++++++++--------- examples/llm_eval/quantization_utils.py | 4 +- examples/llm_ptq/example_utils.py | 12 +- .../notebooks/2_PTQ_AWQ_Calibration.ipynb | 14 +- 
examples/llm_qat/main.py | 12 +- examples/vllm_serve/fakequant_worker.py | 9 +- .../sample_example_qad_diffusers.py | 30 ++-- modelopt/torch/quantization/algorithms.py | 22 ++- modelopt/torch/quantization/config.py | 8 +- modelopt/torch/quantization/model_quant.py | 18 +- .../torch/quantization/utils/core_utils.py | 8 +- .../torch/quantization/test_quantize_cuda.py | 57 ++++--- .../torch/peft/plugins/test_megatron_peft.py | 40 +++-- 14 files changed, 198 insertions(+), 200 deletions(-) diff --git a/docs/source/guides/_pytorch_quantization.rst b/docs/source/guides/_pytorch_quantization.rst index 7539d72fc..0f7720523 100644 --- a/docs/source/guides/_pytorch_quantization.rst +++ b/docs/source/guides/_pytorch_quantization.rst @@ -255,16 +255,16 @@ For exploring new quantization recipes, you can compose a completely new configu # Custom configuration for INT4 block-wise weights and INT8 dynamic activations MY_CUSTOM_CONFIG = { - "quant_cfg": { + "quant_cfg": [ # Configure weight quantizers with 4-bit precision and 128-element blocks - "*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}, + ("*weight_quantizer", {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}), # Configure input quantizers with 8-bit dynamic quantization - "*input_quantizer": {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}, + ("*input_quantizer", {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}), # Include default disabled quantizer configurations - **_default_disabled_quantizer_cfg, - }, + *_default_disabled_quantizer_cfg, + ], "algorithm": "max", } diff --git a/examples/diffusers/quantization/config.py b/examples/diffusers/quantization/config.py index 94063ffd9..3e2dbcc2e 100644 --- a/examples/diffusers/quantization/config.py +++ b/examples/diffusers/quantization/config.py @@ -17,82 +17,79 @@ from calib.plugin_calib import PercentileCalibrator FP8_DEFAULT_CONFIG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": (4, 3), "axis": None}, 
- "*input_quantizer": {"num_bits": (4, 3), "axis": None}, - "*output_quantizer": {"enable": False}, - "*softmax_quantizer": { - "num_bits": (4, 3), - "axis": None, - }, - "default": {"enable": False}, - }, + "quant_cfg": [ + ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*output_quantizer", {"enable": False}), + ("*softmax_quantizer", {"num_bits": (4, 3), "axis": None}), + ("default", {"enable": False}), + ], "algorithm": "max", } INT8_DEFAULT_CONFIG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, - "*output_quantizer": {"enable": False}, - "default": {"enable": False}, - }, + "quant_cfg": [ + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), + ("*output_quantizer", {"enable": False}), + ("default", {"enable": False}), + ], "algorithm": "max", } NVFP4_DEFAULT_CONFIG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*output_quantizer": {"enable": False}, - "*softmax_quantizer": { - "num_bits": (4, 3), - "axis": None, - }, - "default": {"enable": False}, - }, + "quant_cfg": [ + ( + "*weight_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ( + "*input_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ("*output_quantizer", {"enable": False}), + ("*softmax_quantizer", {"num_bits": (4, 3), "axis": None}), + ("default", {"enable": False}), + ], "algorithm": "max", } NVFP4_FP8_MHA_CONFIG = { 
- "quant_cfg": { - "**weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "**input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*output_quantizer": {"enable": False}, - "*[qkv]_bmm_quantizer": { - "num_bits": (4, 3), - "axis": None, - }, - "*softmax_quantizer": { - "num_bits": (4, 3), - "axis": None, - }, - "*bmm2_output_quantizer": { - "num_bits": (4, 3), - "axis": None, - }, - "default": {"enable": False}, - }, + "quant_cfg": [ + ( + "**weight_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ( + "**input_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ("*output_quantizer", {"enable": False}), + ("*[qkv]_bmm_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*softmax_quantizer", {"num_bits": (4, 3), "axis": None}), + ("*bmm2_output_quantizer", {"num_bits": (4, 3), "axis": None}), + ("default", {"enable": False}), + ], "algorithm": {"method": "svdquant", "lowrank": 32}, } @@ -106,7 +103,7 @@ def set_quant_config_attr(quant_config, trt_high_precision_dtype, quant_algo, ** algo_cfg["lowrank"] = kwargs["lowrank"] quant_config["algorithm"] = algo_cfg - for p in quant_config["quant_cfg"].values(): + for _pattern, p in quant_config["quant_cfg"]: if "num_bits" in p and "trt_high_precision_dtype" not in p: p["trt_high_precision_dtype"] = trt_high_precision_dtype @@ -127,18 +124,23 @@ def reset_set_int8_config(quant_config, percentile, n_steps, collect_method, bac for name, module in backbone.named_modules(): if isinstance(module, nn.Conv2d): aq_name = f"*{name}*input_quantizer*" - quant_config["quant_cfg"][aq_name] = { - "num_bits": 8, - "axis": None, - "calibrator": ( - 
PercentileCalibrator, - (), + quant_config["quant_cfg"].append( + ( + aq_name, { "num_bits": 8, "axis": None, - "percentile": percentile, - "total_step": n_steps, - "collect_method": collect_method, + "calibrator": ( + PercentileCalibrator, + (), + { + "num_bits": 8, + "axis": None, + "percentile": percentile, + "total_step": n_steps, + "collect_method": collect_method, + }, + ), }, - ), - } + ) + ) diff --git a/examples/llm_eval/quantization_utils.py b/examples/llm_eval/quantization_utils.py index 9d132a818..03b7039fa 100644 --- a/examples/llm_eval/quantization_utils.py +++ b/examples/llm_eval/quantization_utils.py @@ -34,8 +34,8 @@ CUSTOM_CONFIG = { "MY_QUANT_CONFIG": { "quant_cfg": [ - {"*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}}, - {"*input_quantizer": {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}}, + ("*weight_quantizer", {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}), + ("*input_quantizer", {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}), # Disable sensitive layers such as `lm_head`, gate layers in MoE etc. 
*mtq.config._default_disabled_quantizer_cfg, ], diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 459bee77b..ca6a3ea09 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -205,7 +205,9 @@ def build_quant_cfg( ) -> dict[str, Any]: quant_cfg = copy.deepcopy(quant_cfg) if "awq" in str(quant_cfg.get("algorithm")): - weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] + weight_quantizer = next( + cfg for pat, cfg in quant_cfg["quant_cfg"] if pat == "*weight_quantizer" + ) if isinstance(weight_quantizer, list): weight_quantizer = weight_quantizer[0] # If awq_block_size argument is provided, update weight_quantizer @@ -236,10 +238,10 @@ def build_quant_cfg( if model_type == "phi4mm": # Only quantize the language model - quant_cfg["quant_cfg"]["*speech*"] = {"enable": False} - quant_cfg["quant_cfg"]["*audio*"] = {"enable": False} - quant_cfg["quant_cfg"]["*image*"] = {"enable": False} - quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} + quant_cfg["quant_cfg"].append(("*speech*", {"enable": False})) + quant_cfg["quant_cfg"].append(("*audio*", {"enable": False})) + quant_cfg["quant_cfg"].append(("*image*", {"enable": False})) + quant_cfg["quant_cfg"].append(("*vision*", {"enable": False})) return quant_cfg diff --git a/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb b/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb index fc055cf84..096e80272 100644 --- a/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb +++ b/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb @@ -189,17 +189,7 @@ "id": "a3ce3b47-48ac-4a27-a5ed-351a10c104a9", "metadata": {}, "outputs": [], - "source": [ - "# Get default AWQ config and optionally adjust block size\n", - "quant_cfg = mtq.INT4_AWQ_CFG\n", - "weight_quantizer = quant_cfg[\"quant_cfg\"][\"*weight_quantizer\"]\n", - "if isinstance(weight_quantizer, list):\n", - " weight_quantizer = weight_quantizer[0]\n", - 
"weight_quantizer[\"block_sizes\"][-1] = 128 # Optional: override block size\n", - "\n", - "# Apply AWQ quantization\n", - "model = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)" - ] + "source": "# Get default AWQ config and optionally adjust block size\nquant_cfg = mtq.INT4_AWQ_CFG\nweight_quantizer = next(cfg for pat, cfg in quant_cfg[\"quant_cfg\"] if pat == \"*weight_quantizer\")\nif isinstance(weight_quantizer, list):\n weight_quantizer = weight_quantizer[0]\nweight_quantizer[\"block_sizes\"][-1] = 128 # Optional: override block size\n\n# Apply AWQ quantization\nmodel = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)" }, { "cell_type": "markdown", @@ -308,4 +298,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index 943515725..5312c2ad9 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -54,12 +54,12 @@ CUSTOM_QUANT_CFG = { "INT4_WEIGHT_INT8_ACTIVATIONS": { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}, - "*input_quantizer": {"num_bits": 8, "axis": None, "enable": True}, - "*lm_head*": {"enable": False}, - "default": {"enable": False}, - }, + "quant_cfg": [ + ("*weight_quantizer", {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}), + ("*input_quantizer", {"num_bits": 8, "axis": None, "enable": True}), + ("*lm_head*", {"enable": False}), + ("default", {"enable": False}), + ], "algorithm": "max", } } diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py index 772c6fe66..4a4bde1d3 100644 --- a/examples/vllm_serve/fakequant_worker.py +++ b/examples/vllm_serve/fakequant_worker.py @@ -155,7 +155,7 @@ def disable_compilation(model): } -def update_kv_cfg_for_mla(model: torch.nn.Module, kv_quant_cfg: dict[str, Any]) -> dict[str, Any]: +def update_kv_cfg_for_mla(model: torch.nn.Module, kv_quant_cfg: list) -> list: """Update KV cache 
quantization config for MLA models. MLA uses `kv_c_bmm_quantizer` (compressed KV) instead of separate @@ -170,9 +170,10 @@ def update_kv_cfg_for_mla(model: torch.nn.Module, kv_quant_cfg: dict[str, Any]) if not any(isinstance(m, MLAAttention) for m in model.modules()): return kv_quant_cfg - if kv_config := kv_quant_cfg.get("*[kv]_bmm_quantizer"): - kv_quant_cfg["*kv_c_bmm_quantizer"] = kv_config - kv_quant_cfg["*k_pe_bmm_quantizer"] = kv_config + kv_config = next((cfg for pat, cfg in kv_quant_cfg if pat == "*[kv]_bmm_quantizer"), None) + if kv_config is not None: + kv_quant_cfg.append(("*kv_c_bmm_quantizer", kv_config)) + kv_quant_cfg.append(("*k_pe_bmm_quantizer", kv_config)) print("MLA detected: added *kv_c_bmm_quantizer and k_pe_bmm_quantizer config") return kv_quant_cfg diff --git a/examples/windows/torch_onnx/diffusers/qad_example/sample_example_qad_diffusers.py b/examples/windows/torch_onnx/diffusers/qad_example/sample_example_qad_diffusers.py index a861493b3..4c66de1d4 100644 --- a/examples/windows/torch_onnx/diffusers/qad_example/sample_example_qad_diffusers.py +++ b/examples/windows/torch_onnx/diffusers/qad_example/sample_example_qad_diffusers.py @@ -257,26 +257,18 @@ def build_quant_config( if exclude_blocks is None: exclude_blocks = [0, 1, 46, 47] - quant_cfg = { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, + _nvfp4_cfg = { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, } - - for pattern in SENSITIVE_LAYER_PATTERNS: - quant_cfg[pattern] = {"enable": False} - - for block_idx in exclude_blocks: - quant_cfg[f"*transformer_blocks.{block_idx}.*"] = {"enable": False} + quant_cfg = [ + ("*weight_quantizer", _nvfp4_cfg), + 
("*input_quantizer", _nvfp4_cfg), + *[(pattern, {"enable": False}) for pattern in SENSITIVE_LAYER_PATTERNS], + *[(f"*transformer_blocks.{i}.*", {"enable": False}) for i in exclude_blocks], + ] return { "quant_cfg": quant_cfg, diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 7b607012b..76a294718 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -1299,17 +1299,6 @@ def get_auto_quantize_config(search_state, constraints=None, verbose=False): else: best_recipe = search_state["best"]["recipe"] - quant_cfg_dict: dict[str, Any] = {"*": {"enable": False}} - for hparam_name, recipe in best_recipe.items(): - if recipe == QuantRecipe(quant_cfg=None): - continue - module_names = search_state["candidate_stats"][hparam_name]["module_names"] - for module_name in module_names: - for quantizer_attr in ("input_quantizer", "weight_quantizer"): - matched_cfg = _match_quantizer_cfg(recipe.config.quant_cfg, quantizer_attr) - if matched_cfg is not None: - quant_cfg_dict[f"{module_name}.{quantizer_attr}"] = matched_cfg - def _cfg_to_dict(v): if isinstance(v, mtq_config.QuantizerAttributeConfig): return { @@ -1321,7 +1310,16 @@ def _cfg_to_dict(v): return [_cfg_to_dict(c) for c in v] return v - quant_cfg = [(k, _cfg_to_dict(v)) for k, v in quant_cfg_dict.items()] + quant_cfg: list[tuple] = [("*", {"enable": False})] + for hparam_name, recipe in best_recipe.items(): + if recipe == QuantRecipe(quant_cfg=None): + continue + module_names = search_state["candidate_stats"][hparam_name]["module_names"] + for module_name in module_names: + for quantizer_attr in ("input_quantizer", "weight_quantizer"): + matched_cfg = _match_quantizer_cfg(recipe.config.quant_cfg, quantizer_attr) + if matched_cfg is not None: + quant_cfg.append((f"{module_name}.{quantizer_attr}", _cfg_to_dict(matched_cfg))) warnings.warn( "get_auto_quantize_config: returned config uses algorithm='max'. 
" "Per-recipe calibration algorithms (e.g. smoothquant, awq) are not preserved. " diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index b439d1aa2..4fa9b27a9 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -99,11 +99,11 @@ MY_QUANT_CFG = { "quant_cfg": [ # Quantizer wildcard strings mapping to quantizer attributes - {"*weight_quantizer": {"num_bits": 8, "axis": 0}}, - {"*input_quantizer": {"num_bits": 8, "axis": None}}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), # Module class names mapping to quantizer configurations - {"nn.LeakyReLU": {"*input_quantizer": {"enable": False}}}, + ("nn.LeakyReLU", {"*input_quantizer": {"enable": False}}), ] } @@ -128,7 +128,7 @@ # Create custom config CUSTOM_INT4_AWQ_CFG = copy.deepcopy(mtq.INT4_AWQ_CFG) - CUSTOM_INT4_AWQ_CFG["quant_cfg"]["*lm_head*"] = {"enable": False} + CUSTOM_INT4_AWQ_CFG["quant_cfg"].append(("*lm_head*", {"enable": False})) # quantize model model = mtq.quantize(model, CUSTOM_INT4_AWQ_CFG, forward_loop) diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index e637641d9..bb85723e3 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -179,15 +179,15 @@ def quantize( config = { - "quant_cfg": { + "quant_cfg": [ # "num_bits" specifies the number of bits for quantization # "axis" specifies the axis for quantization - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": -1}, + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": -1}), # Default quantization settings - "default": {"num_bits": 8, "axis": None}, - } + ("default", {"num_bits": 8, "axis": None}), + ] "algorithm": "max" } @@ -323,10 +323,10 @@ def auto_quantize( .. 
code-block:: python INT8_CUSTOM_QUANT_CFG = { - "quant_cfg": { - "*weight_quantizer": {"num_bits": 8, "axis": 0}, - "*input_quantizer": {"num_bits": 8, "axis": None}, - }, + "quant_cfg": [ + ("*weight_quantizer", {"num_bits": 8, "axis": 0}), + ("*input_quantizer", {"num_bits": 8, "axis": None}), + ], "algorithm": "smoothquant", } diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 0be7736da..c201869ed 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -310,11 +310,11 @@ def calibrate_with_adapters(model, args): def disable_lora_quantizers_in_config(config, layers): """Turns off input, weight, and output quantizers for LoRA weights and LoRALinear layers in config.""" - config["quant_cfg"]["*lora*"] = ("enable", False) + config["quant_cfg"].append(("*lora*", {"enable": False})) for layer in layers: - config["quant_cfg"][f"*{layer}.input_quantizer"] = ("enable", False) - config["quant_cfg"][f"*{layer}.weight_quantizer"] = ("enable", False) - config["quant_cfg"][f"*{layer}.output_quantizer"] = ("enable", False) + config["quant_cfg"].append((f"*{layer}.input_quantizer", {"enable": False})) + config["quant_cfg"].append((f"*{layer}.weight_quantizer", {"enable": False})) + config["quant_cfg"].append((f"*{layer}.output_quantizer", {"enable": False})) return config diff --git a/tests/gpu/torch/quantization/test_quantize_cuda.py b/tests/gpu/torch/quantization/test_quantize_cuda.py index 3e9ff4256..097b28a48 100644 --- a/tests/gpu/torch/quantization/test_quantize_cuda.py +++ b/tests/gpu/torch/quantization/test_quantize_cuda.py @@ -29,20 +29,26 @@ from modelopt.torch.quantization.extensions import get_cuda_ext_mx NVFP4_WEIGHT_ACT_MSE_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*input_quantizer": { - "num_bits": 
(2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - }, + "quant_cfg": [ + ( + "*weight_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ( + "*input_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ], "algorithm": { "method": "mse", "step_size": 0.25, @@ -52,17 +58,18 @@ } NVFP4_WEIGHT_MSE_FP8_SWEEP_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*input_quantizer": { - "enable": False, - }, - }, + "quant_cfg": [ + ( + "*weight_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ("*input_quantizer", {"enable": False}), + ], "algorithm": { "method": "mse", "fp8_scale_sweep": True, diff --git a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py index b71eaeb21..d9c2d4dfd 100644 --- a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py +++ b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py @@ -33,23 +33,29 @@ from modelopt.torch.utils.plugins import megatron_prefill NVFP4_DEFAULT_CONFIG = { - "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - }, - "*output_quantizer": {"enable": False}, - "*output_layer*": {"enable": False}, # Note: only output_layer is disabled. 
- "default": {"enable": False}, - }, + "quant_cfg": [ + ( + "*weight_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ( + "*input_quantizer", + { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + ), + ("*output_quantizer", {"enable": False}), + ("*output_layer*", {"enable": False}), # Note: only output_layer is disabled. + ("default", {"enable": False}), + ], "algorithm": "max", } From ab4daec42c39b7c217ca9bc7c826f21d2ed5c5fd Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 19 Mar 2026 00:02:21 +0000 Subject: [PATCH 05/32] fix tests Signed-off-by: Shengliang Xu --- .../torch/quantization/onnx_export.py | 2 +- .../torch/quantization/quantize_common.py | 8 ++++---- .../quantization/plugins/test_huggingface.py | 8 ++++---- tests/unit/torch/quantization/test_autoquant.py | 16 ++++++++-------- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/_test_utils/torch/quantization/onnx_export.py b/tests/_test_utils/torch/quantization/onnx_export.py index 757e5dbea..cf7b5bc40 100644 --- a/tests/_test_utils/torch/quantization/onnx_export.py +++ b/tests/_test_utils/torch/quantization/onnx_export.py @@ -76,7 +76,7 @@ def forward_loop(model): buffer.seek(0) providers = ["CUDAExecutionProvider"] if device != "cpu" else ["CPUExecutionProvider"] ort_session = onnxruntime.InferenceSession(buffer.read(), providers=providers) - ort_result = ort_session.run([], ("input", dummy_input.cpu().numpy())) + ort_result = ort_session.run([], {"input": dummy_input.cpu().numpy()}) ort_result = torch.tensor(ort_result[0]).to(device) torch_result = model(dummy_input) print(ort_result, torch_result) diff --git a/tests/_test_utils/torch/quantization/quantize_common.py b/tests/_test_utils/torch/quantization/quantize_common.py index b52a3e204..ba0660ac2 100644 --- 
a/tests/_test_utils/torch/quantization/quantize_common.py +++ b/tests/_test_utils/torch/quantization/quantize_common.py @@ -47,9 +47,9 @@ def get_awq_config(algorithm="awq_lite", block_size=8): config = copy.deepcopy(mtq.INT4_AWQ_CFG) - for entry in config["quant_cfg"]: - if "*weight_quantizer" in entry: - entry["*weight_quantizer"]["block_sizes"] = {-1: block_size} + for pat, cfg in config["quant_cfg"]: + if pat == "*weight_quantizer": + cfg["block_sizes"] = {-1: block_size} break if "algorithm" not in config or not isinstance(config["algorithm"], dict): config["algorithm"] = {} @@ -252,7 +252,7 @@ def forward_loop(model): def auto_quantize_helper(model): model, search_state = mtq.auto_quantize( model, - constraints=("effective_bits", 8.0), + constraints={"effective_bits": 8.0}, quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model.get_dummy_input().cuda() for _ in range(2)], forward_step=lambda model, batch: model(batch), diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index d672c355a..0cd34da79 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -170,7 +170,7 @@ def forward_step(model, batch): with context: best_model, search_history = mtq.auto_quantize( model, - constraints=("effective_bits", 11.0), + constraints={"effective_bits": 11.0}, quantization_formats=[mtq.INT8_DEFAULT_CFG], data_loader=[{"input_ids": input_ids, "labels": input_ids} for _ in range(2)], forward_step=forward_step, @@ -196,9 +196,9 @@ def test_quantized_transformers_save_restore(tmp_path, model_cls, quant_config): import copy quant_config = copy.deepcopy(quant_config) - for entry in quant_config["quant_cfg"]: - if "*weight_quantizer" in entry: - entry["*weight_quantizer"]["block_sizes"] = {-1: 16} + for pat, cfg in quant_config["quant_cfg"]: + if pat == "*weight_quantizer": + 
cfg["block_sizes"] = {-1: 16} break else: raise ValueError(f"Unsupported quant_config: {quant_config}") diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py index 52fce49d4..6277fdc7f 100644 --- a/tests/unit/torch/quantization/test_autoquant.py +++ b/tests/unit/torch/quantization/test_autoquant.py @@ -145,7 +145,7 @@ def loss_func(output): best_model, search_history = mtq.auto_quantize( model, - constraints=("effective_bits", search_bits), + constraints={"effective_bits": search_bits}, quantization_formats=search_formats, data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -191,7 +191,7 @@ def loss_func(output): best_model, search_history = mtq.auto_quantize( model, - constraints=("effective_bits", 5.0), + constraints={"effective_bits": 5.0}, quantization_formats=[ mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG, @@ -214,7 +214,7 @@ def test_auto_quantize_disabled_layers_no_poison(): best_model, _ = mtq.auto_quantize( model, - constraints=("effective_bits", 5.0), + constraints={"effective_bits": 5.0}, quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -268,7 +268,7 @@ def _test_data_parallel_auto_quantize(rank, size): model, search_history = mtq.auto_quantize( model, - constraints=("effective_bits", 11.0), + constraints={"effective_bits": 11.0}, quantization_formats=[mtq.INT8_SMOOTHQUANT_CFG], data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -377,7 +377,7 @@ def test_auto_quantize_checkpoint_resume(method, tmp_path, capsys): # First run: save checkpoint model_1, state_dict_1 = mtq.auto_quantize( model, - constraints=("effective_bits", 6.0), + constraints={"effective_bits": 6.0}, quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], 
data_loader=[model.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -396,7 +396,7 @@ def test_auto_quantize_checkpoint_resume(method, tmp_path, capsys): model_2 = SimpleLinear() model_2, state_dict_2 = mtq.auto_quantize( model_2, - constraints=("effective_bits", 6.0), # Same constraint + constraints={"effective_bits": 6.0}, # Same constraint quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model_2.get_input() for _ in range(2)], forward_step=lambda model, batch: model(batch), @@ -464,7 +464,7 @@ def test_get_auto_quantize_config(method): _, search_state = mtq.auto_quantize( model, - constraints=("effective_bits", 6.0), + constraints={"effective_bits": 6.0}, quantization_formats=[mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT8_DEFAULT_CFG], data_loader=[model.get_input() for _ in range(4)], forward_step=lambda model, batch: model(batch), @@ -489,7 +489,7 @@ def test_get_auto_quantize_config(method): # Re-solve with different constraints config_resoled = mtq.get_auto_quantize_config( - search_state, constraints=("effective_bits", 12.0) + search_state, constraints={"effective_bits": 12.0} ) assert "quant_cfg" in config_resoled From 4ffd2fa4ff3d3ef9abeb2ad8da0e43678f4904c4 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 19 Mar 2026 17:02:13 +0000 Subject: [PATCH 06/32] rename from format to cfg Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 4 ++-- .../general/ptq/fp8_default-fp8_kv.yml | 14 ++++++------ .../general/ptq/nvfp4_default-fp8_kv.yml | 14 ++++++------ .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 18 +++++++-------- .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 22 +++++++++---------- tests/unit/recipe/test_loader.py | 2 +- 6 files changed, 37 insertions(+), 37 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 4fa9b27a9..553370bad 100644 --- a/modelopt/torch/quantization/config.py +++ 
b/modelopt/torch/quantization/config.py @@ -1477,7 +1477,7 @@ def normalize_quant_cfg(cls, v): - ``{"pattern": ..., "enable": ..., "format": ...}`` — explicit object with top-level enable - ``{"pattern": ..., "enable": ...}`` — enable-only (no format fields) - - ``{"pattern": ..., "format": ...}`` — explicit pattern/format object (legacy) + - ``{"pattern": ..., "cfg": ...}`` — explicit pattern/cfg object - ``{"": ...}`` — single-key dict (legacy) The internal representation is always a list of ``(pattern, cfg)`` tuples where @@ -1490,7 +1490,7 @@ def normalize_quant_cfg(cls, v): if isinstance(entry, dict): if "pattern" in entry: pattern = entry["pattern"] - fmt = dict(entry.get("format") or {}) + fmt = dict(entry.get("cfg") or entry.get("format") or {}) if "enable" in entry: fmt["enable"] = entry["enable"] result.append((pattern, fmt)) diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index 1d891c595..a3287a0e6 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -20,16 +20,16 @@ ptq_cfg: algorithm: max quant_cfg: - pattern: '*input_quantizer' - format: + cfg: num_bits: e4m3 axis: - pattern: '*weight_quantizer' - format: + cfg: num_bits: e4m3 axis: - pattern: '*[kv]_bmm_quantizer' enable: true - format: + cfg: num_bits: e4m3 - pattern: default @@ -55,18 +55,18 @@ ptq_cfg: - pattern: output.* enable: false - pattern: nn.BatchNorm1d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm2d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm3d - format: + cfg: '*': enable: false - pattern: nn.LeakyReLU - format: + cfg: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index 2ea22c87a..8b98c53fe 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ 
-21,7 +21,7 @@ ptq_cfg: quant_cfg: - pattern: '*weight_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -29,7 +29,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*input_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -37,7 +37,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*[kv]_bmm_quantizer' enable: true - format: + cfg: num_bits: e4m3 - pattern: default @@ -63,18 +63,18 @@ ptq_cfg: - pattern: output.* enable: false - pattern: nn.BatchNorm1d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm2d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm3d - format: + cfg: '*': enable: false - pattern: nn.LeakyReLU - format: + cfg: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index 8ebdd7391..64eeb1ecf 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -21,7 +21,7 @@ ptq_cfg: quant_cfg: - pattern: '*mlp*weight_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -29,7 +29,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*mlp*input_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -37,7 +37,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*block_sparse_moe*weight_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -45,7 +45,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*block_sparse_moe*input_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -53,7 +53,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*[kv]_bmm_quantizer' enable: true - format: + cfg: num_bits: e4m3 - pattern: default @@ -79,18 +79,18 @@ ptq_cfg: - pattern: output.* enable: false - pattern: nn.BatchNorm1d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm2d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm3d - format: + cfg: '*': enable: false - pattern: nn.LeakyReLU - 
format: + cfg: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index 777599135..e55dc42e2 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -21,7 +21,7 @@ ptq_cfg: quant_cfg: - pattern: '*mlp*weight_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -29,7 +29,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*mlp*input_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -37,7 +37,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*block_sparse_moe*weight_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -45,7 +45,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*block_sparse_moe*input_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -53,7 +53,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*o_proj*weight_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -61,7 +61,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*o_proj*input_quantizer' enable: true - format: + cfg: block_sizes: -1: 16 type: dynamic @@ -69,7 +69,7 @@ ptq_cfg: num_bits: e2m1 - pattern: '*[kv]_bmm_quantizer' enable: true - format: + cfg: num_bits: e4m3 - pattern: default @@ -95,18 +95,18 @@ ptq_cfg: - pattern: output.* enable: false - pattern: nn.BatchNorm1d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm2d - format: + cfg: '*': enable: false - pattern: nn.BatchNorm3d - format: + cfg: '*': enable: false - pattern: nn.LeakyReLU - format: + cfg: '*': enable: false diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index af80dd78c..a4a9a08d4 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -212,7 +212,7 @@ def _as_dict(qc): for entry in qc: if isinstance(entry, dict): if "pattern" in entry: - fmt = dict(entry.get("format") or {}) + fmt = 
dict(entry.get("cfg") or entry.get("format") or {}) if "enable" in entry: fmt["enable"] = entry["enable"] result[entry["pattern"]] = fmt From d599103a64689c5857ddcb55ec5eba6e390eef53 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 19 Mar 2026 20:47:57 +0000 Subject: [PATCH 07/32] pattern to path Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 17 ++++--- .../general/ptq/fp8_default-fp8_kv.yml | 36 +++++++-------- .../general/ptq/nvfp4_default-fp8_kv.yml | 36 +++++++-------- .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 40 ++++++++--------- .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 44 +++++++++---------- tests/unit/recipe/test_loader.py | 6 +-- 6 files changed, 89 insertions(+), 90 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 553370bad..d63f048b8 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -1475,12 +1475,11 @@ def normalize_quant_cfg(cls, v): Supports these dict forms for YAML/JSON compatibility: - - ``{"pattern": ..., "enable": ..., "format": ...}`` — explicit object with top-level enable - - ``{"pattern": ..., "enable": ...}`` — enable-only (no format fields) - - ``{"pattern": ..., "cfg": ...}`` — explicit pattern/cfg object - - ``{"": ...}`` — single-key dict (legacy) + - ``{"path": ..., "enable": ..., "cfg": ...}`` — explicit object with top-level enable + - ``{"path": ..., "enable": ...}`` — enable-only (no cfg fields) + - ``{"": ...}`` — single-key dict (legacy) - The internal representation is always a list of ``(pattern, cfg)`` tuples where + The internal representation is always a list of ``(path, cfg)`` tuples where ``enable`` (if present at the top level) is merged into ``cfg``. 
""" if not isinstance(v, list): @@ -1488,9 +1487,9 @@ def normalize_quant_cfg(cls, v): result = [] for entry in v: if isinstance(entry, dict): - if "pattern" in entry: - pattern = entry["pattern"] - fmt = dict(entry.get("cfg") or entry.get("format") or {}) + if "path" in entry: + pattern = entry["path"] + fmt = dict(entry.get("cfg") or {}) if "enable" in entry: fmt["enable"] = entry["enable"] result.append((pattern, fmt)) @@ -1499,7 +1498,7 @@ def normalize_quant_cfg(cls, v): else: raise ValueError( f"Invalid quant_cfg entry: {entry!r}. " - "Expected a single-key dict or an object with a 'pattern' key." + "Expected a single-key dict or an object with a 'path' key." ) else: result.append(entry) diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index a3287a0e6..1d0ef7f68 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -19,54 +19,54 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - pattern: '*input_quantizer' + - path: '*input_quantizer' cfg: num_bits: e4m3 axis: - - pattern: '*weight_quantizer' + - path: '*weight_quantizer' cfg: num_bits: e4m3 axis: - - pattern: '*[kv]_bmm_quantizer' + - path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - pattern: default + - path: 'default' enable: false - - pattern: '*block_sparse_moe.gate*' + - path: '*block_sparse_moe.gate*' enable: false - - pattern: '*linear_attn.conv1d*' + - path: '*linear_attn.conv1d*' enable: false - - pattern: '*lm_head*' + - path: '*lm_head*' enable: false - - pattern: '*mixer.conv1d*' + - path: '*mixer.conv1d*' enable: false - - pattern: '*mlp.gate.*' + - path: '*mlp.gate.*' enable: false - - pattern: '*mlp.shared_expert_gate.*' + - path: '*mlp.shared_expert_gate.*' enable: false - - pattern: '*output_layer*' + - path: '*output_layer*' enable: false - - pattern: '*proj_out.*' + - path: '*proj_out.*' enable: false - - pattern: '*router*' + - path: 
'*router*' enable: false - - pattern: output.* + - path: 'output.*' enable: false - - pattern: nn.BatchNorm1d + - path: 'nn.BatchNorm1d' cfg: '*': enable: false - - pattern: nn.BatchNorm2d + - path: 'nn.BatchNorm2d' cfg: '*': enable: false - - pattern: nn.BatchNorm3d + - path: 'nn.BatchNorm3d' cfg: '*': enable: false - - pattern: nn.LeakyReLU + - path: 'nn.LeakyReLU' cfg: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index 8b98c53fe..c1ef593bc 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ -19,7 +19,7 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - pattern: '*weight_quantizer' + - path: '*weight_quantizer' enable: true cfg: block_sizes: @@ -27,7 +27,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*input_quantizer' + - path: '*input_quantizer' enable: true cfg: block_sizes: @@ -35,46 +35,46 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*[kv]_bmm_quantizer' + - path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - pattern: default + - path: 'default' enable: false - - pattern: '*block_sparse_moe.gate*' + - path: '*block_sparse_moe.gate*' enable: false - - pattern: '*linear_attn.conv1d*' + - path: '*linear_attn.conv1d*' enable: false - - pattern: '*lm_head*' + - path: '*lm_head*' enable: false - - pattern: '*mixer.conv1d*' + - path: '*mixer.conv1d*' enable: false - - pattern: '*mlp.gate.*' + - path: '*mlp.gate.*' enable: false - - pattern: '*mlp.shared_expert_gate.*' + - path: '*mlp.shared_expert_gate.*' enable: false - - pattern: '*output_layer*' + - path: '*output_layer*' enable: false - - pattern: '*proj_out.*' + - path: '*proj_out.*' enable: false - - pattern: '*router*' + - path: '*router*' enable: false - - pattern: output.* + - path: 'output.*' enable: false - - pattern: nn.BatchNorm1d + - path: 'nn.BatchNorm1d' cfg: 
'*': enable: false - - pattern: nn.BatchNorm2d + - path: 'nn.BatchNorm2d' cfg: '*': enable: false - - pattern: nn.BatchNorm3d + - path: 'nn.BatchNorm3d' cfg: '*': enable: false - - pattern: nn.LeakyReLU + - path: 'nn.LeakyReLU' cfg: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index 64eeb1ecf..2a0dedf84 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -19,7 +19,7 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - pattern: '*mlp*weight_quantizer' + - path: '*mlp*weight_quantizer' enable: true cfg: block_sizes: @@ -27,7 +27,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*mlp*input_quantizer' + - path: '*mlp*input_quantizer' enable: true cfg: block_sizes: @@ -35,7 +35,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*block_sparse_moe*weight_quantizer' + - path: '*block_sparse_moe*weight_quantizer' enable: true cfg: block_sizes: @@ -43,7 +43,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*block_sparse_moe*input_quantizer' + - path: '*block_sparse_moe*input_quantizer' enable: true cfg: block_sizes: @@ -51,46 +51,46 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*[kv]_bmm_quantizer' + - path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - pattern: default + - path: 'default' enable: false - - pattern: '*block_sparse_moe.gate*' + - path: '*block_sparse_moe.gate*' enable: false - - pattern: '*linear_attn.conv1d*' + - path: '*linear_attn.conv1d*' enable: false - - pattern: '*lm_head*' + - path: '*lm_head*' enable: false - - pattern: '*mixer.conv1d*' + - path: '*mixer.conv1d*' enable: false - - pattern: '*mlp.gate.*' + - path: '*mlp.gate.*' enable: false - - pattern: '*mlp.shared_expert_gate.*' + - path: '*mlp.shared_expert_gate.*' enable: false - - pattern: '*output_layer*' + - 
path: '*output_layer*' enable: false - - pattern: '*proj_out.*' + - path: '*proj_out.*' enable: false - - pattern: '*router*' + - path: '*router*' enable: false - - pattern: output.* + - path: 'output.*' enable: false - - pattern: nn.BatchNorm1d + - path: 'nn.BatchNorm1d' cfg: '*': enable: false - - pattern: nn.BatchNorm2d + - path: 'nn.BatchNorm2d' cfg: '*': enable: false - - pattern: nn.BatchNorm3d + - path: 'nn.BatchNorm3d' cfg: '*': enable: false - - pattern: nn.LeakyReLU + - path: 'nn.LeakyReLU' cfg: '*': enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index e55dc42e2..d6b51e64a 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -19,7 +19,7 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - pattern: '*mlp*weight_quantizer' + - path: '*mlp*weight_quantizer' enable: true cfg: block_sizes: @@ -27,7 +27,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*mlp*input_quantizer' + - path: '*mlp*input_quantizer' enable: true cfg: block_sizes: @@ -35,7 +35,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*block_sparse_moe*weight_quantizer' + - path: '*block_sparse_moe*weight_quantizer' enable: true cfg: block_sizes: @@ -43,7 +43,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*block_sparse_moe*input_quantizer' + - path: '*block_sparse_moe*input_quantizer' enable: true cfg: block_sizes: @@ -51,7 +51,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*o_proj*weight_quantizer' + - path: '*o_proj*weight_quantizer' enable: true cfg: block_sizes: @@ -59,7 +59,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - pattern: '*o_proj*input_quantizer' + - path: '*o_proj*input_quantizer' enable: true cfg: block_sizes: @@ -67,46 +67,46 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - 
pattern: '*[kv]_bmm_quantizer' + - path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - pattern: default + - path: 'default' enable: false - - pattern: '*block_sparse_moe.gate*' + - path: '*block_sparse_moe.gate*' enable: false - - pattern: '*linear_attn.conv1d*' + - path: '*linear_attn.conv1d*' enable: false - - pattern: '*lm_head*' + - path: '*lm_head*' enable: false - - pattern: '*mixer.conv1d*' + - path: '*mixer.conv1d*' enable: false - - pattern: '*mlp.gate.*' + - path: '*mlp.gate.*' enable: false - - pattern: '*mlp.shared_expert_gate.*' + - path: '*mlp.shared_expert_gate.*' enable: false - - pattern: '*output_layer*' + - path: '*output_layer*' enable: false - - pattern: '*proj_out.*' + - path: '*proj_out.*' enable: false - - pattern: '*router*' + - path: '*router*' enable: false - - pattern: output.* + - path: 'output.*' enable: false - - pattern: nn.BatchNorm1d + - path: 'nn.BatchNorm1d' cfg: '*': enable: false - - pattern: nn.BatchNorm2d + - path: 'nn.BatchNorm2d' cfg: '*': enable: false - - pattern: nn.BatchNorm3d + - path: 'nn.BatchNorm3d' cfg: '*': enable: false - - pattern: nn.LeakyReLU + - path: 'nn.LeakyReLU' cfg: '*': enable: false diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index a4a9a08d4..67f587ddc 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -211,11 +211,11 @@ def _as_dict(qc): result = {} for entry in qc: if isinstance(entry, dict): - if "pattern" in entry: - fmt = dict(entry.get("cfg") or entry.get("format") or {}) + if "path" in entry: + fmt = dict(entry.get("cfg") or {}) if "enable" in entry: fmt["enable"] = entry["enable"] - result[entry["pattern"]] = fmt + result[entry["path"]] = fmt else: result.update(entry) else: From fc5387759d31f4b90617dca7498f050e986bfb08 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 19 Mar 2026 21:04:33 +0000 Subject: [PATCH 08/32] flatten the inner configs Signed-off-by: Shengliang Xu --- 
modelopt/torch/quantization/config.py | 24 ++++++++++------ .../general/ptq/fp8_default-fp8_kv.yml | 28 ++++++++----------- .../general/ptq/nvfp4_default-fp8_kv.yml | 28 ++++++++----------- .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 28 ++++++++----------- .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 28 ++++++++----------- tests/unit/recipe/test_loader.py | 7 ++++- 6 files changed, 70 insertions(+), 73 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index d63f048b8..38e7f951a 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -1475,30 +1475,38 @@ def normalize_quant_cfg(cls, v): Supports these dict forms for YAML/JSON compatibility: - - ``{"path": ..., "enable": ..., "cfg": ...}`` — explicit object with top-level enable - - ``{"path": ..., "enable": ...}`` — enable-only (no cfg fields) + - ``{"path": ..., "enable": ..., "cfg": ...}`` — glob path match with top-level enable + - ``{"path": ..., "enable": ...}`` — glob path match, enable-only + - ``{"type": ..., "path": ..., "enable": ...}`` — type match with per-path-glob enable - ``{"": ...}`` — single-key dict (legacy) - The internal representation is always a list of ``(path, cfg)`` tuples where - ``enable`` (if present at the top level) is merged into ``cfg``. + The internal representation is always a list of ``(key, cfg)`` tuples. + For ``type`` entries the key is the type name and cfg is ``{path: {enable: ...}}``. + For ``path`` entries the key is the path glob and ``enable`` is merged into cfg. 
""" if not isinstance(v, list): return v result = [] for entry in v: if isinstance(entry, dict): - if "path" in entry: - pattern = entry["path"] + if "type" in entry: + type_val = entry["type"] + path_val = entry["path"] + sub_cfg = {} + if "enable" in entry: + sub_cfg["enable"] = entry["enable"] + result.append((type_val, {path_val: sub_cfg})) + elif "path" in entry: fmt = dict(entry.get("cfg") or {}) if "enable" in entry: fmt["enable"] = entry["enable"] - result.append((pattern, fmt)) + result.append((entry["path"], fmt)) elif len(entry) == 1: result.append(next(iter(entry.items()))) else: raise ValueError( f"Invalid quant_cfg entry: {entry!r}. " - "Expected a single-key dict or an object with a 'path' key." + "Expected a single-key dict or an object with a 'path' or 'type' key." ) else: result.append(entry) diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index 1d0ef7f68..1c172e518 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -54,19 +54,15 @@ ptq_cfg: enable: false - path: 'output.*' enable: false - - path: 'nn.BatchNorm1d' - cfg: - '*': - enable: false - - path: 'nn.BatchNorm2d' - cfg: - '*': - enable: false - - path: 'nn.BatchNorm3d' - cfg: - '*': - enable: false - - path: 'nn.LeakyReLU' - cfg: - '*': - enable: false + - type: 'nn.BatchNorm1d' + path: '*' + enable: false + - type: 'nn.BatchNorm2d' + path: '*' + enable: false + - type: 'nn.BatchNorm3d' + path: '*' + enable: false + - type: 'nn.LeakyReLU' + path: '*' + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index c1ef593bc..38ca1b024 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ -62,19 +62,15 @@ ptq_cfg: enable: false - path: 'output.*' enable: false - - path: 'nn.BatchNorm1d' - cfg: - 
'*': - enable: false - - path: 'nn.BatchNorm2d' - cfg: - '*': - enable: false - - path: 'nn.BatchNorm3d' - cfg: - '*': - enable: false - - path: 'nn.LeakyReLU' - cfg: - '*': - enable: false + - type: 'nn.BatchNorm1d' + path: '*' + enable: false + - type: 'nn.BatchNorm2d' + path: '*' + enable: false + - type: 'nn.BatchNorm3d' + path: '*' + enable: false + - type: 'nn.LeakyReLU' + path: '*' + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index 2a0dedf84..f95c1aa46 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -78,19 +78,15 @@ ptq_cfg: enable: false - path: 'output.*' enable: false - - path: 'nn.BatchNorm1d' - cfg: - '*': - enable: false - - path: 'nn.BatchNorm2d' - cfg: - '*': - enable: false - - path: 'nn.BatchNorm3d' - cfg: - '*': - enable: false - - path: 'nn.LeakyReLU' - cfg: - '*': - enable: false + - type: 'nn.BatchNorm1d' + path: '*' + enable: false + - type: 'nn.BatchNorm2d' + path: '*' + enable: false + - type: 'nn.BatchNorm3d' + path: '*' + enable: false + - type: 'nn.LeakyReLU' + path: '*' + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index d6b51e64a..7d6885f70 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -94,19 +94,15 @@ ptq_cfg: enable: false - path: 'output.*' enable: false - - path: 'nn.BatchNorm1d' - cfg: - '*': - enable: false - - path: 'nn.BatchNorm2d' - cfg: - '*': - enable: false - - path: 'nn.BatchNorm3d' - cfg: - '*': - enable: false - - path: 'nn.LeakyReLU' - cfg: - '*': - enable: false + - type: 'nn.BatchNorm1d' + path: '*' + enable: false + - type: 'nn.BatchNorm2d' + path: '*' + enable: false + - type: 'nn.BatchNorm3d' + path: '*' + enable: false + - type: 'nn.LeakyReLU' + path: '*' + 
enable: false diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index 67f587ddc..0b49210c7 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -211,7 +211,12 @@ def _as_dict(qc): result = {} for entry in qc: if isinstance(entry, dict): - if "path" in entry: + if "type" in entry: + sub_cfg = {} + if "enable" in entry: + sub_cfg["enable"] = entry["enable"] + result[entry["type"]] = {entry["path"]: sub_cfg} + elif "path" in entry: fmt = dict(entry.get("cfg") or {}) if "enable" in entry: fmt["enable"] = entry["enable"] From a19335f25055e9a312aa5e3d53d3f8da9e6c7ae4 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 19 Mar 2026 21:11:32 +0000 Subject: [PATCH 09/32] get rid of the special 'default' Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 4 ++-- modelopt/torch/quantization/conversion.py | 15 +++------------ modelopt/torch/quantization/utils/core_utils.py | 2 +- .../general/ptq/fp8_default-fp8_kv.yml | 4 ++-- .../general/ptq/nvfp4_default-fp8_kv.yml | 4 ++-- .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 4 ++-- .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 4 ++-- 7 files changed, 14 insertions(+), 23 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 38e7f951a..172507f38 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -66,8 +66,8 @@ The quantizer attributes are defined by :class:`QuantizerAttributeConfig`. See :class:`QuantizerAttributeConfig` for details on the quantizer attributes and their values. -The key `"default"` from the quantization configuration dictionary is applied if no other wildcard or filter functions -match the quantizer module name. +Use `"*"` as the first entry in the quantization configuration list to set a catch-all default +that applies to all quantizers not matched by a later, more specific entry. 
The quantizer attributes are applied in the order they are specified. For the missing attributes, the default attributes as defined by :class:`QuantizerAttributeConfig` are used. diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 705d9686a..17bd510b1 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -217,10 +217,9 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType `quant_cfg` is a list of ``(pattern, attrs)`` tuples mapping wildcards or filter functions to its quantizer attributes which are defined in :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. - The wildcards or filter functions are matched against the quantizer module names. + The wildcards or filter functions are matched against the quantizer module names. The specified quantizer attributes of the matched quantizer modules are set accordingly. - The key ``"default"`` is a special key that sets the quantizer attributes of all the quantizers for which - no other wildcard or filter functions match the quantizer module name. + Entries are applied in order; use ``"*"`` as the first entry to set a catch-all default. In addition, the dictionary entries could also be pytorch module class names mapping the class specific quantization configuration. The pytorch modules should have a quantized equivalent. @@ -228,15 +227,7 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType See :meth:`set_quantizer_attribute ` for more details. 
""" - items = list(quant_cfg) - for pattern, cfg in items: - if str(pattern) == "default": - set_quantizer_attribute(quant_model, "*", cfg) - break - - for pattern, cfg in items: - if str(pattern) == "default": - continue + for pattern, cfg in quant_cfg: if str(pattern) in QuantModuleRegistry: parent_class = QuantModuleRegistry[str(pattern)] assert isinstance(cfg, dict), ( diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index c201869ed..c5c582b8c 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -828,7 +828,7 @@ def update_quant_cfg_with_kv_cache_quant( """Update the quant_cfg with the kv cache quant_cfg.""" # If quant_cfg["quant_cfg"] is None, it corresponds to only kv cache quantization case quant_cfg = copy.deepcopy(quant_cfg) - inner: list = quant_cfg.get("quant_cfg") or [("default", {"enable": False})] + inner: list = quant_cfg.get("quant_cfg") or [("*", {"enable": False})] quant_cfg["quant_cfg"] = inner + list(kv_cache_quant_cfg.items()) # Set default algorithm for kv cache quantization if not provided. 
diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index 1c172e518..4cae9ff7c 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -19,6 +19,8 @@ metadata: ptq_cfg: algorithm: max quant_cfg: + - path: '*' + enable: false - path: '*input_quantizer' cfg: num_bits: e4m3 @@ -32,8 +34,6 @@ ptq_cfg: cfg: num_bits: e4m3 - - path: 'default' - enable: false - path: '*block_sparse_moe.gate*' enable: false - path: '*linear_attn.conv1d*' diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index 38ca1b024..2b5e97b19 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ -19,6 +19,8 @@ metadata: ptq_cfg: algorithm: max quant_cfg: + - path: '*' + enable: false - path: '*weight_quantizer' enable: true cfg: @@ -40,8 +42,6 @@ ptq_cfg: cfg: num_bits: e4m3 - - path: 'default' - enable: false - path: '*block_sparse_moe.gate*' enable: false - path: '*linear_attn.conv1d*' diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index f95c1aa46..69c51f87e 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -19,6 +19,8 @@ metadata: ptq_cfg: algorithm: max quant_cfg: + - path: '*' + enable: false - path: '*mlp*weight_quantizer' enable: true cfg: @@ -56,8 +58,6 @@ ptq_cfg: cfg: num_bits: e4m3 - - path: 'default' - enable: false - path: '*block_sparse_moe.gate*' enable: false - path: '*linear_attn.conv1d*' diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index 7d6885f70..a35b88cac 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ 
b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -19,6 +19,8 @@ metadata: ptq_cfg: algorithm: max quant_cfg: + - path: '*' + enable: false - path: '*mlp*weight_quantizer' enable: true cfg: @@ -72,8 +74,6 @@ ptq_cfg: cfg: num_bits: e4m3 - - path: 'default' - enable: false - path: '*block_sparse_moe.gate*' enable: false - path: '*linear_attn.conv1d*' From 04014ec7aad90c637dedbe643c4905035687b9e3 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 19 Mar 2026 22:39:02 +0000 Subject: [PATCH 10/32] remove default Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 38 ++++++++++++++++++++------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 172507f38..fd81b0d17 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -143,6 +143,10 @@ from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.utils.network import ConstructorLike +_base_disable_all: list[tuple] = [ + ("*", {"enable": False}), +] + _default_disabled_quantizer_cfg: list[tuple] = [ ("nn.BatchNorm1d", {"*": {"enable": False}}), ("nn.BatchNorm2d", {"*": {"enable": False}}), @@ -158,7 +162,6 @@ ("*mixer.conv1d*", {"enable": False}), # Skip mamba conv1d ("*output_layer*", {"enable": False}), ("output.*", {"enable": False}), - ("default", {"enable": False}), ] _mamba_moe_disabled_quantizer_cfg: list[tuple] = [ @@ -172,6 +175,7 @@ INT8_DEFAULT_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", {"num_bits": 8, "axis": 0}), ("*input_quantizer", {"num_bits": 8, "axis": None}), *_default_disabled_quantizer_cfg, @@ -181,6 +185,7 @@ INT8_SMOOTHQUANT_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", {"num_bits": 8, "axis": 0}), ("*input_quantizer", {"num_bits": 8, "axis": None}), *_default_disabled_quantizer_cfg, @@ -190,6 +195,7 @@ INT8_WEIGHT_ONLY_CFG = { "quant_cfg": [ + 
*_base_disable_all, ("*weight_quantizer", {"num_bits": 8, "axis": 0}), ("*input_quantizer", {"enable": False}), *_default_disabled_quantizer_cfg, @@ -199,6 +205,7 @@ FP8_DEFAULT_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), *_default_disabled_quantizer_cfg, @@ -208,6 +215,7 @@ MAMBA_MOE_FP8_AGGRESSIVE_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), *_default_disabled_quantizer_cfg, @@ -218,6 +226,7 @@ MAMBA_MOE_FP8_CONSERVATIVE_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), *_default_disabled_quantizer_cfg, @@ -230,6 +239,7 @@ FP8_PER_CHANNEL_PER_TOKEN_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", {"num_bits": (4, 3), "axis": 0}), ( "*input_quantizer", @@ -247,6 +257,7 @@ # FP8 2D blockwise fake quantization config for deepseek models FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -263,6 +274,7 @@ INT4_BLOCKWISE_WEIGHT_ONLY_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -280,6 +292,7 @@ INT4_AWQ_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -300,6 +313,7 @@ # for weights. 
This could change in the future W4A8_AWQ_BETA_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", [ @@ -328,6 +342,7 @@ MXFP8_DEFAULT_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -351,6 +366,7 @@ MXFP6_DEFAULT_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -374,6 +390,7 @@ MXFP4_DEFAULT_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -397,6 +414,7 @@ W4A8_MXFP4_FP8_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -413,6 +431,7 @@ MXINT8_DEFAULT_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -443,7 +462,6 @@ "enable": True, }, ), - ("default", {"enable": False}), ], "algorithm": "max", } @@ -457,7 +475,6 @@ "bias": {-2: None, -4: None, "type": "static"}, }, ), - ("default", {"enable": False}), ], "algorithm": "max", } @@ -496,6 +513,7 @@ def _nvfp4_selective_quant_cfg( NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -515,6 +533,7 @@ def _nvfp4_selective_quant_cfg( NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -534,6 +553,7 @@ def _nvfp4_selective_quant_cfg( MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", _nvfp4_quantizer), ("*input_quantizer", _nvfp4_quantizer), *_default_disabled_quantizer_cfg, @@ -543,6 +563,7 @@ def _nvfp4_selective_quant_cfg( } MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", _nvfp4_quantizer), ("*input_quantizer", _nvfp4_quantizer), *_default_disabled_quantizer_cfg, @@ -570,22 +591,19 @@ def _nvfp4_selective_quant_cfg( "bias": {-2: None, -4: None, "type": "static"}, }, ), - ("default", {"enable": False}), ], - "algorithm": "max", } NVFP4_KV_CFG = { "quant_cfg": [ ("*[kv]_bmm_quantizer", _nvfp4_quantizer), - ("default", {"enable": False}), ], - "algorithm": "max", } # Moved from 
examples/diffusers/quantization/config.py to here NVFP4_FP8_MHA_CONFIG = { "quant_cfg": [ + *_base_disable_all, ("*weight_quantizer", _nvfp4_quantizer), ("*input_quantizer", _nvfp4_quantizer), ("*output_quantizer", {"enable": False}), @@ -619,7 +637,6 @@ def _nvfp4_selective_quant_cfg( "num_bits": (4, 3), }, ), - ("default", {"enable": False}), ], "algorithm": "max", } @@ -651,6 +668,7 @@ def _nvfp4_selective_quant_cfg( W4A8_NVFP4_FP8_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*weight_quantizer", { @@ -673,6 +691,7 @@ def _nvfp4_selective_quant_cfg( MXFP4_MLP_WEIGHT_ONLY_CFG = { "quant_cfg": [ + *_base_disable_all, ( "*mlp*weight_quantizer", { @@ -701,6 +720,7 @@ def _nvfp4_selective_quant_cfg( NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"]) NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) + # DO NOT ADD NEW CONFIGS HERE. If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file choices: set[str] = { @@ -1456,7 +1476,7 @@ class QuantizeConfig(ModeloptBaseConfig): """Default configuration for ``quantize`` mode.""" quant_cfg: QuantizeQuantCfgType = ModeloptField( - default=[("default", {"num_bits": 8, "axis": None})], + default=[("*", {"num_bits": 8, "axis": None})], title="Quantization configuration", validate_default=True, ) From 22134efd55d8745e7d010f825c797f1b0a2ca17d Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 00:13:02 +0000 Subject: [PATCH 11/32] match yaml file format Signed-off-by: Shengliang Xu --- examples/llm_autodeploy/run_auto_quantize.py | 14 +- examples/llm_ptq/hf_ptq.py | 42 +- .../llm_export_utils/quantization_utils.py | 38 +- modelopt/torch/export/unified_export_hf.py | 3 +- modelopt/torch/quantization/algorithms.py | 34 +- .../backends/fp8_per_tensor_gemm.py | 20 +- .../torch/quantization/backends/nvfp4_gemm.py | 20 +- modelopt/torch/quantization/config.py | 531 +++++++++--------- 
modelopt/torch/quantization/conversion.py | 44 +- modelopt/torch/quantization/model_calib.py | 4 +- modelopt/torch/quantization/model_quant.py | 4 +- .../torch/quantization/utils/core_utils.py | 2 +- .../general/ptq/fp8_default-fp8_kv.yml | 44 +- .../general/ptq/nvfp4_default-fp8_kv.yml | 44 +- .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 48 +- .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 52 +- .../torch/quantization/quantize_common.py | 12 +- tests/unit/recipe/test_loader.py | 24 +- .../quantization/plugins/test_huggingface.py | 12 +- .../unit/torch/quantization/test_autoquant.py | 36 +- 20 files changed, 591 insertions(+), 437 deletions(-) diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index 6e49de5ad..73308ed7f 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -100,11 +100,21 @@ def loss_func(output, data): if enable_kv_cache_quantization: mtq.set_quantizer_by_cfg( model, - quant_cfg=[("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True})], + quant_cfg=[ + { + "quantizer_path": "*output_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": True, + } + ], ) # Lets calibrate only the output quantizer this time. Let's disable all other quantizers. with mtq.set_quantizer_by_cfg_context( - model, [("*", {"enable": False}), ("*output_quantizer", {"enable": True})] + model, + [ + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*output_quantizer", "enable": True}, + ], ): mtq.calibrate(model, algorithm="max", forward_loop=calibrate_loop) return model diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 24421598c..f8be6274d 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -82,9 +82,21 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None: Creates a new dict for the KV bmm quantizer config to avoid mutating shared references. 
""" - for i, (pattern, cfg) in enumerate(quant_cfg): + for i, entry in enumerate(quant_cfg): + pattern = ( + entry["quantizer_path"] + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[0] + ) if pattern == "*[kv]_bmm_quantizer": - quant_cfg[i] = ("*[kv]_bmm_quantizer", {**cfg, "use_constant_amax": True}) + assert isinstance(entry, dict) and isinstance(entry.get("cfg", {}), dict) + new_entry = { + "quantizer_path": "*[kv]_bmm_quantizer", + "cfg": {**entry.get("cfg", {}), "use_constant_amax": True}, + } + if entry.get("enable") is not None: + new_entry["enable"] = entry["enable"] + quant_cfg[i] = new_entry break @@ -317,7 +329,7 @@ def forward_step(model, batch): ), verbose=True, # Disable all default disabled layers such as lm_head, mlp.gate, router etc. - disabled_layers=[next(iter(entry)) for entry in _default_disabled_quantizer_cfg], + disabled_layers=[entry.quantizer_path for entry in _default_disabled_quantizer_cfg], method=auto_quantize_method, checkpoint=auto_quantize_checkpoint, ) @@ -331,7 +343,10 @@ def forward_step(model, batch): getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"] ) kv_cache_quant_cfg = [ - e for e in kv_cache_quant_cfg if e[0] != "default" + e + for e in kv_cache_quant_cfg + if (e["quantizer_path"] if isinstance(e, dict) and "quantizer_path" in e else e[0]) + != "default" ] # keep other quantizers from auto_quantize if args.kv_cache_qformat in _KV_CAST_FORMATS: @@ -341,7 +356,8 @@ def forward_step(model, batch): if args.kv_cache_qformat not in _KV_CAST_FORMATS: # Calibrate only the KV cache quantizers; disable all others. 
with mtq.set_quantizer_by_cfg_context( - language_model, [("*", {"enable": False}), *kv_cache_quant_cfg] + language_model, + [{"quantizer_path": "*", "enable": False}, *kv_cache_quant_cfg], ): mtq.calibrate(language_model, algorithm="max", forward_loop=calibrate_loop) return language_model @@ -544,14 +560,16 @@ def mono_quantize( # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") - quant_cfg["quant_cfg"].append(("*vision*", {"enable": False})) - quant_cfg["quant_cfg"].append(("*image*", {"enable": False})) + quant_cfg["quant_cfg"].append({"quantizer_path": "*vision*", "enable": False}) + quant_cfg["quant_cfg"].append({"quantizer_path": "*image*", "enable": False}) # Also disable radio model components specifically (for Nemotron-Parse) - quant_cfg["quant_cfg"].append(("*radio*", {"enable": False})) - quant_cfg["quant_cfg"].append(("*visual*", {"enable": False})) - quant_cfg["quant_cfg"].append(("*encoder*", {"enable": False})) # Disable encoder + quant_cfg["quant_cfg"].append({"quantizer_path": "*radio*", "enable": False}) + quant_cfg["quant_cfg"].append({"quantizer_path": "*visual*", "enable": False}) + quant_cfg["quant_cfg"].append( + {"quantizer_path": "*encoder*", "enable": False} + ) # Disable encoder quant_cfg["quant_cfg"].append( - ("*model_encoder*", {"enable": False}) + {"quantizer_path": "*model_encoder*", "enable": False} ) # Nemotron-Parse specific print("Quantization will only be applied to the decoder (text generation) component") @@ -971,7 +989,7 @@ def quantize_main( for prefix in mtp_layer_prefixes: # Add exclusion pattern for this MTP layer (e.g., "*layers.92*") pattern = f"*{prefix.split('.')[-2]}.{prefix.split('.')[-1]}*" - quant_cfg["quant_cfg"].append((pattern, {"enable": False})) + quant_cfg["quant_cfg"].append({"quantizer_path": pattern, "enable": False}) print(f"Excluding MTP layer from quantization: {pattern}") # Use constant 
amax for KV quantizers when a cast format is selected. diff --git a/modelopt/onnx/llm_export_utils/quantization_utils.py b/modelopt/onnx/llm_export_utils/quantization_utils.py index 4df393b70..a8fdcb98c 100644 --- a/modelopt/onnx/llm_export_utils/quantization_utils.py +++ b/modelopt/onnx/llm_export_utils/quantization_utils.py @@ -68,33 +68,45 @@ def get_quant_config(precision, lm_head_precision="fp16"): else: raise ValueError(f"Unsupported precision: {precision}") - quant_cfg_list: list[tuple] = list(quant_cfg["quant_cfg"]) # type: ignore[arg-type] + quant_cfg_list: list = [ + e for e in quant_cfg["quant_cfg"] if isinstance(e, dict) and "quantizer_path" in e + ] if lm_head_precision == "fp8": - quant_cfg_list.append(("*lm_head.input_quantizer", {"num_bits": (4, 3), "axis": None})) - quant_cfg_list.append(("*lm_head.weight_quantizer", {"num_bits": (4, 3), "axis": None})) + quant_cfg_list.append( + { + "quantizer_path": "*lm_head.input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) + quant_cfg_list.append( + { + "quantizer_path": "*lm_head.weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) elif lm_head_precision == "nvfp4": quant_cfg_list.append( - ( - "*lm_head.input_quantizer", - { + { + "quantizer_path": "*lm_head.input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ) + "enable": True, + } ) quant_cfg_list.append( - ( - "*lm_head.weight_quantizer", - { + { + "quantizer_path": "*lm_head.weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ) + "enable": True, + } ) quant_cfg["quant_cfg"] = quant_cfg_list return quant_cfg diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 55b6be56d..6ab7898d5 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ 
b/modelopt/torch/export/unified_export_hf.py @@ -52,7 +52,6 @@ from torch.distributed.fsdp import FSDPModule from modelopt.torch.quantization import set_quantizer_by_cfg_context -from modelopt.torch.quantization.config import QuantizerAttributeConfig from modelopt.torch.quantization.nn import ( NVFP4StaticQuantizer, SequentialQuantizer, @@ -221,7 +220,7 @@ def _output_hook(module, input, output): try: with ( torch.no_grad(), - set_quantizer_by_cfg_context(model, [("*", QuantizerAttributeConfig(enable=False))]), + set_quantizer_by_cfg_context(model, [{"quantizer_path": "*", "enable": False}]), ): dummy_forward_fn() finally: diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 76a294718..358253891 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -62,9 +62,22 @@ def estimate_quant_compression(quant_cfg: QuantizeConfig) -> float: def estimate_quant_compression_for_quantizer(quantizer_attr_cfg): if isinstance(quantizer_attr_cfg, list): + if not quantizer_attr_cfg: + return 1.0 return min(estimate_quant_compression_for_quantizer(q) for q in quantizer_attr_cfg) if isinstance(quantizer_attr_cfg, dict): - return estimate_quant_compression_for_quantizer(list(quantizer_attr_cfg.values())) + # Handle raw quantizer cfg dicts (e.g. 
{"num_bits": (4, 3), "axis": None}) + if not quantizer_attr_cfg.get("enable", True): + return 1.0 + num_bits = quantizer_attr_cfg.get("num_bits") + if num_bits is None: + return 1.0 + if isinstance(num_bits, tuple): + return (sum(num_bits) + 1) / 16 + elif isinstance(num_bits, int): + return num_bits / 16 + else: + raise ValueError(f"Unknown quantization config {num_bits}") if isinstance(quantizer_attr_cfg, QuantizerAttributeConfig): if not quantizer_attr_cfg.enable: @@ -80,7 +93,8 @@ def estimate_quant_compression_for_quantizer(quantizer_attr_cfg): raise ValueError(f"Unknown type {type(quantizer_attr_cfg)}, {quantizer_attr_cfg}") - return estimate_quant_compression_for_quantizer([v for _, v in quant_cfg.quant_cfg]) + cfgs = [e.get("cfg", {}) for e in quant_cfg.quant_cfg] + return estimate_quant_compression_for_quantizer(cfgs) if cfgs else 1.0 class QuantRecipe(CustomHPType): @@ -109,9 +123,7 @@ def __init__(self, quant_cfg: str | dict[str, Any] | None = None, name: str | No # Disable KV Cache quantization # Currently KV Cache quantization is enabled for some quantization formats and disabled for others # This breaks the monotonicity of the quantization formats in terms of weight compression Vs accuracy - self.config.quant_cfg.append( - ("*output_quantizer", mtq_config.QuantizerAttributeConfig(enable=False)) - ) + self.config.quant_cfg.append({"quantizer_path": "*output_quantizer", "enable": False}) self.compression = estimate_quant_compression(self.config) @@ -1361,7 +1373,17 @@ def _resolve_best_recipe(search_state, constraints, verbose=False): def _match_quantizer_cfg(quant_cfg, quantizer_attr): # Last-match-wins to mirror set_quantizer_by_cfg behavior matched = None - for pattern, cfg in quant_cfg: + for entry in quant_cfg: + pattern = ( + entry["quantizer_path"] + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[0] + ) + cfg = ( + entry.get("cfg", {}) + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[1] + ) if 
fnmatch.fnmatch(quantizer_attr, pattern): matched = cfg return matched diff --git a/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py b/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py index c77097299..a668b33b8 100644 --- a/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py +++ b/modelopt/torch/quantization/backends/fp8_per_tensor_gemm.py @@ -97,9 +97,23 @@ def fp8_per_tensor_gemm(quant_module, input, bias=None): def _fp8_availability_check(module, input, args, kwargs): """Comprehensive check for FP8 GEMM availability.""" # Quantizer configs - quant_cfg_list: list[tuple] = FP8_DEFAULT_CFG["quant_cfg"] - input_cfg = next(v for k, v in quant_cfg_list if k == "*input_quantizer") - weight_cfg = next(v for k, v in quant_cfg_list if k == "*weight_quantizer") + quant_cfg_list = FP8_DEFAULT_CFG["quant_cfg"] + input_cfg = next( + e.get("cfg", {}) + for e in quant_cfg_list + if isinstance(e, dict) + and "quantizer_path" in e + and e["quantizer_path"] == "*input_quantizer" + ) + weight_cfg = next( + e.get("cfg", {}) + for e in quant_cfg_list + if isinstance(e, dict) + and "quantizer_path" in e + and e["quantizer_path"] == "*weight_quantizer" + ) + assert isinstance(input_cfg, dict) + assert isinstance(weight_cfg, dict) # Check hardware support if not torch.cuda.is_available() or not fp8_compatible(): diff --git a/modelopt/torch/quantization/backends/nvfp4_gemm.py b/modelopt/torch/quantization/backends/nvfp4_gemm.py index ed7352800..e70d51ea1 100644 --- a/modelopt/torch/quantization/backends/nvfp4_gemm.py +++ b/modelopt/torch/quantization/backends/nvfp4_gemm.py @@ -211,10 +211,24 @@ def _nvfp4_availability_check(module, input, args, kwargs): if not hasattr(module, "input_quantizer") or not hasattr(module, "weight_quantizer"): return False - quant_cfg_list: list[tuple] = mtq.NVFP4_DEFAULT_CFG["quant_cfg"] + quant_cfg_list = mtq.NVFP4_DEFAULT_CFG["quant_cfg"] # Quantizer configs - input_cfg = next(v for k, v in quant_cfg_list if k == 
"*input_quantizer") - weight_cfg = next(v for k, v in quant_cfg_list if k == "*weight_quantizer") + input_cfg = next( + e.get("cfg", {}) + for e in quant_cfg_list + if isinstance(e, dict) + and "quantizer_path" in e + and e["quantizer_path"] == "*input_quantizer" + ) + weight_cfg = next( + e.get("cfg", {}) + for e in quant_cfg_list + if isinstance(e, dict) + and "quantizer_path" in e + and e["quantizer_path"] == "*weight_quantizer" + ) + assert isinstance(input_cfg, dict) + assert isinstance(weight_cfg, dict) # Check input quantizer config for key, value in input_cfg.items(): diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index fd81b0d17..42d2e25ea 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -135,7 +135,6 @@ """ -from collections.abc import Callable from typing import Any, Literal from pydantic import ValidationInfo, field_validator, model_validator @@ -143,41 +142,46 @@ from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.utils.network import ConstructorLike -_base_disable_all: list[tuple] = [ - ("*", {"enable": False}), +QuantCfgEntry = dict[str, Any] + +_base_disable_all: list[QuantCfgEntry] = [ + {"quantizer_path": "*", "enable": False}, ] -_default_disabled_quantizer_cfg: list[tuple] = [ - ("nn.BatchNorm1d", {"*": {"enable": False}}), - ("nn.BatchNorm2d", {"*": {"enable": False}}), - ("nn.BatchNorm3d", {"*": {"enable": False}}), - ("nn.LeakyReLU", {"*": {"enable": False}}), - ("*lm_head*", {"enable": False}), - ("*proj_out.*", {"enable": False}), # In Whisper model, lm_head has key name proj_out - ("*block_sparse_moe.gate*", {"enable": False}), # Skip the MOE router - ("*router*", {"enable": False}), # Skip the MOE router - ("*mlp.gate.*", {"enable": False}), # Skip the MOE router - ("*mlp.shared_expert_gate.*", {"enable": False}), # Skip the MOE router - ("*linear_attn.conv1d*", {"enable": False}), - ("*mixer.conv1d*", 
{"enable": False}), # Skip mamba conv1d - ("*output_layer*", {"enable": False}), - ("output.*", {"enable": False}), +_default_disabled_quantizer_cfg: list[QuantCfgEntry] = [ + {"parent_class": "nn.BatchNorm1d", "quantizer_path": "*", "enable": False}, + {"parent_class": "nn.BatchNorm2d", "quantizer_path": "*", "enable": False}, + {"parent_class": "nn.BatchNorm3d", "quantizer_path": "*", "enable": False}, + {"parent_class": "nn.LeakyReLU", "quantizer_path": "*", "enable": False}, + {"quantizer_path": "*lm_head*", "enable": False}, + { + "quantizer_path": "*proj_out.*", + "enable": False, + }, # In Whisper model, lm_head has key name proj_out + {"quantizer_path": "*block_sparse_moe.gate*", "enable": False}, # Skip the MOE router + {"quantizer_path": "*router*", "enable": False}, # Skip the MOE router + {"quantizer_path": "*mlp.gate.*", "enable": False}, # Skip the MOE router + {"quantizer_path": "*mlp.shared_expert_gate.*", "enable": False}, # Skip the MOE router + {"quantizer_path": "*linear_attn.conv1d*", "enable": False}, + {"quantizer_path": "*mixer.conv1d*", "enable": False}, # Skip mamba conv1d + {"quantizer_path": "*output_layer*", "enable": False}, + {"quantizer_path": "output.*", "enable": False}, ] -_mamba_moe_disabled_quantizer_cfg: list[tuple] = [ - ("*fc1_latent_proj*", {"enable": False}), # Skip Latent MOE - ("*fc2_latent_proj*", {"enable": False}), # Skip Latent MOE - ("*q_proj*", {"enable": False}), # Skip QKV Linear - ("*k_proj*", {"enable": False}), # Skip QKV Linear - ("*v_proj*", {"enable": False}), # Skip QKV Linear - ("*o_proj*", {"enable": False}), # Skip QKV Output Projection +_mamba_moe_disabled_quantizer_cfg: list[QuantCfgEntry] = [ + {"quantizer_path": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE + {"quantizer_path": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE + {"quantizer_path": "*q_proj*", "enable": False}, # Skip QKV Linear + {"quantizer_path": "*k_proj*", "enable": False}, # Skip QKV Linear + {"quantizer_path": 
"*v_proj*", "enable": False}, # Skip QKV Linear + {"quantizer_path": "*o_proj*", "enable": False}, # Skip QKV Output Projection ] INT8_DEFAULT_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -186,8 +190,8 @@ INT8_SMOOTHQUANT_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, *_default_disabled_quantizer_cfg, ], "algorithm": "smoothquant", @@ -196,8 +200,8 @@ INT8_WEIGHT_ONLY_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"enable": False}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "enable": False}, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -206,8 +210,8 @@ FP8_DEFAULT_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -216,8 +220,8 @@ MAMBA_MOE_FP8_AGGRESSIVE_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), 
"axis": None}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, ], @@ -227,12 +231,12 @@ MAMBA_MOE_FP8_CONSERVATIVE_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, - ("*mixer.in_proj*", {"enable": False}), # Skip mamba linear - ("*mixer.out_proj*", {"enable": False}), # Skip mamba linear + {"quantizer_path": "*mixer.in_proj*", "enable": False}, # Skip mamba linear + {"quantizer_path": "*mixer.out_proj*", "enable": False}, # Skip mamba linear ], "algorithm": "max", } @@ -240,15 +244,15 @@ FP8_PER_CHANNEL_PER_TOKEN_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", {"num_bits": (4, 3), "axis": 0}), - ( - "*input_quantizer", - { + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": 0}}, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (4, 3), "type": "dynamic", "block_sizes": {-1: None}, }, - ), + }, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -258,15 +262,15 @@ FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (4, 3), "block_sizes": {-1: 128, -2: 128}, - "enable": True, }, - ), - ("*input_quantizer", {"enable": False}), + "enable": True, + }, + {"quantizer_path": "*input_quantizer", "enable": False}, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -275,15 +279,15 @@ INT4_BLOCKWISE_WEIGHT_ONLY_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", 
+ "cfg": { "num_bits": 4, "block_sizes": {-1: 128}, - "enable": True, }, - ), - ("*input_quantizer", {"enable": False}), + "enable": True, + }, + {"quantizer_path": "*input_quantizer", "enable": False}, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -293,15 +297,15 @@ INT4_AWQ_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, - "enable": True, }, - ), - ("*input_quantizer", {"enable": False}), + "enable": True, + }, + {"quantizer_path": "*input_quantizer", "enable": False}, *_default_disabled_quantizer_cfg, ], "algorithm": {"method": "awq_lite", "alpha_step": 0.1}, @@ -314,9 +318,9 @@ W4A8_AWQ_BETA_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - [ + { + "quantizer_path": "*weight_quantizer", + "cfg": [ { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, @@ -327,14 +331,14 @@ "enable": True, }, ], - ), - ( - "*input_quantizer", - { + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (4, 3), - "enable": True, }, - ), + "enable": True, + }, *_default_disabled_quantizer_cfg, ], "algorithm": "awq_lite", @@ -343,22 +347,22 @@ MXFP8_DEFAULT_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), + "enable": True, + }, *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -367,22 +371,22 @@ MXFP6_DEFAULT_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (3, 2), "block_sizes": 
{-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (3, 2), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), + "enable": True, + }, *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -391,22 +395,22 @@ MXFP4_DEFAULT_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), + "enable": True, + }, *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -415,15 +419,15 @@ W4A8_MXFP4_FP8_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), + "enable": True, + }, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -432,22 +436,22 @@ MXINT8_DEFAULT_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), + "enable": True, + }, 
*_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -455,34 +459,33 @@ FP8_KV_CFG = { "quant_cfg": [ - ( - "*[kv]_bmm_quantizer", - { + { + "quantizer_path": "*[kv]_bmm_quantizer", + "cfg": { "num_bits": (4, 3), - "enable": True, }, - ), + "enable": True, + }, ], "algorithm": "max", } FP8_AFFINE_KV_CFG = { "quant_cfg": [ - ( - "*[kv]_bmm_quantizer", - { + { + "quantizer_path": "*[kv]_bmm_quantizer", + "cfg": { "num_bits": (4, 3), "bias": {-2: None, -4: None, "type": "static"}, }, - ), + }, ], "algorithm": "max", } -_nvfp4_quantizer = { +_nvfp4_cfg = { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "enable": True, } _nvfp4_quantizer_bs32 = { @@ -501,10 +504,11 @@ def _nvfp4_selective_quant_cfg( ) -> dict: """Build an NVFP4 config that quantizes only the specified layer patterns.""" quant_cfg: dict[str, object] = [] + quant_cfg.extend(_base_disable_all) for pattern in layer_patterns: - quant_cfg.append((f"{pattern}weight_quantizer", quantizer)) + quant_cfg.append({"quantizer_path": f"{pattern}weight_quantizer", "cfg": quantizer}) if not weight_only: - quant_cfg.append((f"{pattern}input_quantizer", quantizer)) + quant_cfg.append({"quantizer_path": f"{pattern}input_quantizer", "cfg": quantizer}) quant_cfg.extend(_default_disabled_quantizer_cfg) return {"quant_cfg": quant_cfg, "algorithm": algorithm} @@ -514,15 +518,15 @@ def _nvfp4_selective_quant_cfg( NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "enable": True, }, - ), - ("*input_quantizer", _nvfp4_quantizer), + "enable": True, + }, + {"quantizer_path": "*input_quantizer", "cfg": _nvfp4_cfg, "enable": True}, *_default_disabled_quantizer_cfg, ], "algorithm": { @@ -534,15 +538,15 @@ def _nvfp4_selective_quant_cfg( NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG = { "quant_cfg": [ 
*_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "enable": True, }, - ), - ("*input_quantizer", _nvfp4_quantizer), + "enable": True, + }, + {"quantizer_path": "*input_quantizer", "cfg": _nvfp4_cfg, "enable": True}, *_default_disabled_quantizer_cfg, ], "algorithm": { @@ -554,8 +558,8 @@ def _nvfp4_selective_quant_cfg( MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", _nvfp4_quantizer), - ("*input_quantizer", _nvfp4_quantizer), + {"quantizer_path": "*weight_quantizer", "cfg": _nvfp4_cfg, "enable": True}, + {"quantizer_path": "*input_quantizer", "cfg": _nvfp4_cfg, "enable": True}, *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, ], @@ -564,12 +568,12 @@ def _nvfp4_selective_quant_cfg( MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", _nvfp4_quantizer), - ("*input_quantizer", _nvfp4_quantizer), + {"quantizer_path": "*weight_quantizer", "cfg": _nvfp4_cfg, "enable": True}, + {"quantizer_path": "*input_quantizer", "cfg": _nvfp4_cfg, "enable": True}, *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, - ("*mixer.in_proj*", {"enable": False}), # Skip mamba linear - ("*mixer.out_proj*", {"enable": False}), # Skip mamba linear + {"quantizer_path": "*mixer.in_proj*", "enable": False}, # Skip mamba linear + {"quantizer_path": "*mixer.out_proj*", "enable": False}, # Skip mamba linear ], "algorithm": "max", } @@ -584,19 +588,20 @@ def _nvfp4_selective_quant_cfg( NVFP4_AFFINE_KV_CFG = { "quant_cfg": [ - ( - "*[kv]_bmm_quantizer", - { - **_nvfp4_quantizer, + { + "quantizer_path": "*[kv]_bmm_quantizer", + "cfg": { + **_nvfp4_cfg, "bias": {-2: None, -4: None, "type": "static"}, }, - ), + "enable": True, + }, ], } NVFP4_KV_CFG = { "quant_cfg": [ - ("*[kv]_bmm_quantizer", _nvfp4_quantizer), + {"quantizer_path": 
"*[kv]_bmm_quantizer", "cfg": _nvfp4_cfg, "enable": True}, ], } @@ -604,60 +609,61 @@ def _nvfp4_selective_quant_cfg( NVFP4_FP8_MHA_CONFIG = { "quant_cfg": [ *_base_disable_all, - ("*weight_quantizer", _nvfp4_quantizer), - ("*input_quantizer", _nvfp4_quantizer), - ("*output_quantizer", {"enable": False}), - ( - "*q_bmm_quantizer", - { + {"quantizer_path": "*weight_quantizer", "cfg": _nvfp4_cfg, "enable": True}, + {"quantizer_path": "*input_quantizer", "cfg": _nvfp4_cfg, "enable": True}, + {"quantizer_path": "*output_quantizer", "enable": False}, + { + "quantizer_path": "*q_bmm_quantizer", + "cfg": { "num_bits": (4, 3), }, - ), - ( - "*k_bmm_quantizer", - { + }, + { + "quantizer_path": "*k_bmm_quantizer", + "cfg": { "num_bits": (4, 3), }, - ), - ( - "*v_bmm_quantizer", - { + }, + { + "quantizer_path": "*v_bmm_quantizer", + "cfg": { "num_bits": (4, 3), }, - ), - ( - "*softmax_quantizer", - { + }, + { + "quantizer_path": "*softmax_quantizer", + "cfg": { "num_bits": (4, 3), }, - ), - ( - "transformer_blocks*bmm2_output_quantizer", - { + }, + { + "quantizer_path": "transformer_blocks*bmm2_output_quantizer", + "cfg": { "num_bits": (4, 3), }, - ), + }, ], "algorithm": "max", } NVFP4_KV_ROTATE_CFG = { "quant_cfg": [ - ( - "*q_bmm_quantizer", - { - "enable": False, + { + "quantizer_path": "*q_bmm_quantizer", + "cfg": { "rotate": True, }, - ), - ( - "*k_bmm_quantizer", - { - **_nvfp4_quantizer, + "enable": False, + }, + { + "quantizer_path": "*k_bmm_quantizer", + "cfg": { + **_nvfp4_cfg, "rotate": True, }, - ), - ("*v_bmm_quantizer", _nvfp4_quantizer), + "enable": True, + }, + {"quantizer_path": "*v_bmm_quantizer", "cfg": _nvfp4_cfg, "enable": True}, ], "algorithm": "max", } @@ -669,21 +675,21 @@ def _nvfp4_selective_quant_cfg( W4A8_NVFP4_FP8_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, - "enable": True, }, 
- ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (4, 3), - "enable": True, }, - ), + "enable": True, + }, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -692,22 +698,22 @@ def _nvfp4_selective_quant_cfg( MXFP4_MLP_WEIGHT_ONLY_CFG = { "quant_cfg": [ *_base_disable_all, - ( - "*mlp*weight_quantizer", - { + { + "quantizer_path": "*mlp*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), - ( - "*block_sparse_moe*weight_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*block_sparse_moe*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, - ), + "enable": True, + }, *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -720,7 +726,6 @@ def _nvfp4_selective_quant_cfg( NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"]) NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"]) - # DO NOT ADD NEW CONFIGS HERE. 
If you want to add a new general recipe, add it to # modelopt_recipes/general/ptq/ as a yaml file choices: set[str] = { @@ -1456,16 +1461,7 @@ class GPTQLiteConfig(QuantizeAlgorithmConfig): ) -_QuantizeQuantCfgEntryValueType = ( - QuantizerAttributeConfig - | list[QuantizerAttributeConfig] - | dict[str | Callable, QuantizerAttributeConfig | list[QuantizerAttributeConfig]] - | dict[str, Any] -) - -_QuantizeQuantCfgEntryType = tuple[str | Callable, _QuantizeQuantCfgEntryValueType] - -QuantizeQuantCfgType = list[_QuantizeQuantCfgEntryType] +QuantizeQuantCfgType = list[QuantCfgEntry] _QuantizeAlgoCfgType = str | dict | QuantizeAlgorithmConfig | None @@ -1476,7 +1472,7 @@ class QuantizeConfig(ModeloptBaseConfig): """Default configuration for ``quantize`` mode.""" quant_cfg: QuantizeQuantCfgType = ModeloptField( - default=[("*", {"num_bits": 8, "axis": None})], + default=[{"quantizer_path": "*", "cfg": {"num_bits": 8, "axis": None}}], title="Quantization configuration", validate_default=True, ) @@ -1491,57 +1487,70 @@ class QuantizeConfig(ModeloptBaseConfig): @field_validator("quant_cfg", mode="before") @classmethod def normalize_quant_cfg(cls, v): - """Normalize quant_cfg entries: convert dict forms to (key, value) tuples. + """Normalize quant_cfg entries: convert dict and tuple forms to QuantCfgEntry dicts. - Supports these dict forms for YAML/JSON compatibility: - - - ``{"path": ..., "enable": ..., "cfg": ...}`` — glob path match with top-level enable - - ``{"path": ..., "enable": ...}`` — glob path match, enable-only - - ``{"type": ..., "path": ..., "enable": ...}`` — type match with per-path-glob enable - - ``{"": ...}`` — single-key dict (legacy) - - The internal representation is always a list of ``(key, cfg)`` tuples. - For ``type`` entries the key is the type name and cfg is ``{path: {enable: ...}}``. - For ``path`` entries the key is the path glob and ``enable`` is merged into cfg. 
+ Supports these input forms: + - ``{"quantizer_path": ..., "enable": ..., "cfg": ...}`` — passed through as-is + - ``{"": ...}`` — single-key dict (legacy) + - ``(quantizer_path, cfg_dict)`` — tuple form (legacy) """ if not isinstance(v, list): return v result = [] for entry in v: - if isinstance(entry, dict): - if "type" in entry: - type_val = entry["type"] - path_val = entry["path"] - sub_cfg = {} - if "enable" in entry: - sub_cfg["enable"] = entry["enable"] - result.append((type_val, {path_val: sub_cfg})) - elif "path" in entry: - fmt = dict(entry.get("cfg") or {}) - if "enable" in entry: - fmt["enable"] = entry["enable"] - result.append((entry["path"], fmt)) - elif len(entry) == 1: - result.append(next(iter(entry.items()))) + if isinstance(entry, dict) and "quantizer_path" in entry: + result.append(entry) + elif isinstance(entry, dict): + if len(entry) == 1: + key, val = next(iter(entry.items())) + result.append(cls._tuple_to_entry(key, val)) else: raise ValueError( f"Invalid quant_cfg entry: {entry!r}. " - "Expected a single-key dict or an object with a 'path' or 'type' key." + "Expected a dict with 'quantizer_path', a single-key dict, or a (quantizer_path, cfg) tuple." 
) + elif isinstance(entry, (tuple, list)) and len(entry) == 2: + result.append(cls._tuple_to_entry(entry[0], entry[1])) else: - result.append(entry) + raise ValueError(f"Invalid quant_cfg entry: {entry!r}.") return result + @classmethod + def _tuple_to_entry(cls, key: str, value) -> "QuantCfgEntry": + """Convert a (key, value) tuple to a QuantCfgEntry dict.""" + if isinstance(key, str) and key.startswith("nn."): + # nn.* type entry: value is {quantizer_path: {enable: ...}} + assert isinstance(value, dict) and len(value) == 1 + q_path, sub_cfg = next(iter(value.items())) + sub_cfg = dict(sub_cfg) + enable = sub_cfg.pop("enable", None) + new_entry: QuantCfgEntry = { + "parent_class": key, + "quantizer_path": q_path, + "cfg": sub_cfg, + } + if enable is not None: + new_entry["enable"] = enable + return new_entry + else: + if isinstance(value, dict): + cfg = {k: v for k, v in value.items() if k != "enable"} + enable = value.get("enable") + else: + cfg = value + enable = None + new_entry = {"quantizer_path": key, "cfg": cfg} + if enable is not None: + new_entry["enable"] = enable + return new_entry + @field_validator("quant_cfg", mode="after") @classmethod def validate_quant_cfg_entries(cls, v): - """Validate quantizer attribute configs to surface errors (e.g. invalid axis/block_sizes). - - When a tuple's value contains keys that are QuantizerAttributeConfig fields, validate it - as a QuantizerAttributeConfig to catch invalid configurations early. - """ + """Validate quantizer attribute configs to surface errors (e.g. 
invalid axis/block_sizes).""" qac_fields = set(QuantizerAttributeConfig.model_fields.keys()) - for _pattern, cfg in v: + for entry in v: + cfg = entry.get("cfg", {}) if isinstance(cfg, dict) and qac_fields & set(cfg.keys()): QuantizerAttributeConfig.model_validate(cfg) return v @@ -1583,7 +1592,17 @@ def _not_dynamic(cfg): ) quant_cfg: list = config.get("quant_cfg") or [] - for name, cfg in quant_cfg: + for entry in quant_cfg: + name = ( + entry["quantizer_path"] + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[0] + ) + cfg = ( + entry.get("cfg", {}) + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[1] + ) if "weight_quantizer" in name: # We don't calibrate weight quantizer continue @@ -1593,7 +1612,7 @@ def _not_dynamic(cfg): if _not_dynamic(_config): print(f"{cfg}: True") return True - elif _not_dynamic(cfg): + elif isinstance(cfg, dict) and _not_dynamic(cfg): print(f"{cfg}: True") return True diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 17bd510b1..f3af07418 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -214,29 +214,37 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): """Update the quantizer attributes based on the specified `quant_cfg`. - `quant_cfg` is a list of ``(pattern, attrs)`` tuples mapping wildcards or filter functions - to its quantizer attributes which are defined in - :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. - The wildcards or filter functions are matched against the quantizer module names. 
+ `quant_cfg` is a list of :class:`QuantCfgEntry <.config.QuantCfgEntry>` objects mapping + quantizer paths (and optionally parent classes) to their quantizer attributes, which are + defined in :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. + The ``quantizer_path`` is matched against the quantizer module names. The specified quantizer attributes of the matched quantizer modules are set accordingly. Entries are applied in order; use ``"*"`` as the first entry to set a catch-all default. - In addition, the dictionary entries could also be pytorch module class names mapping the class specific - quantization configuration. The pytorch modules should have a quantized equivalent. + In addition, entries with a ``parent_class`` field filter by the pytorch module class, + which must have a quantized equivalent. See :meth:`set_quantizer_attribute ` for more details. """ - for pattern, cfg in quant_cfg: - if str(pattern) in QuantModuleRegistry: - parent_class = QuantModuleRegistry[str(pattern)] - assert isinstance(cfg, dict), ( - f"Expected a dictionary for quantizer configuration for child tensor quantizers of {parent_class}." 
- ) - for sub_pattern, sub_cfg in cfg.items(): - set_quantizer_attribute(quant_model, sub_pattern, sub_cfg, parent_class) - continue - set_quantizer_attribute(quant_model, pattern, cfg) + for entry in quant_cfg: + entry_cfg = entry.get("cfg", {}) if isinstance(entry, dict) else {} + effective_cfg = dict(entry_cfg) if isinstance(entry_cfg, dict) else list(entry_cfg) + enable = entry.get("enable") if isinstance(entry, dict) else None + if enable is not None and isinstance(effective_cfg, dict): + effective_cfg["enable"] = enable + parent_class_name = entry.get("parent_class") if isinstance(entry, dict) else None + quantizer_path = ( + entry["quantizer_path"] + if isinstance(entry, dict) and "quantizer_path" in entry + else entry + ) + assert isinstance(quantizer_path, str) + if parent_class_name is not None: + parent_class = QuantModuleRegistry[parent_class_name] + set_quantizer_attribute(quant_model, quantizer_path, effective_cfg, parent_class) + else: + set_quantizer_attribute(quant_model, quantizer_path, effective_cfg) def set_quantizer_attribute( @@ -312,7 +320,9 @@ def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuan Use this context manager with caution. Changing certain attributes of the quantizer such as `calibrator` can lead to unexpected behavior. """ - assert not any(isinstance(v, list) for _, v in quant_cfg), "list of config not support." + assert not any( + isinstance(entry.get("cfg", {}), list) for entry in quant_cfg if isinstance(entry, dict) + ), "list of config not support." 
original_attributes = {} for name, module in quant_model.named_modules(): diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index fc47e55fa..4616c82fc 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -1101,7 +1101,9 @@ def forward(self, input, *args, **kwargs): self.awq_lite.num_cache_steps += 1 self.awq_lite.num_tokens += input.numel() / input.shape[-1] if self.awq_lite.is_input_quantized: - with set_quantizer_by_cfg_context(self.input_quantizer, [("*", {"enable": True})]): + with set_quantizer_by_cfg_context( + self.input_quantizer, [{"quantizer_path": "*", "enable": True}] + ): max_calibrate(self.input_quantizer, lambda quantizer: quantizer(input), False) return out_actual diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index bb85723e3..2c601609c 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -35,7 +35,7 @@ from .algorithms import AutoQuantizeGradientSearcher, AutoQuantizeKLDivSearcher, QuantRecipe from .algorithms import get_auto_quantize_config as _get_auto_quantize_config -from .config import QuantizeAlgoCfgType, QuantizerAttributeConfig +from .config import QuantizeAlgoCfgType from .conversion import set_quantizer_attribute from .mode import QuantizeModeRegistry, get_modelike_from_algo_cfg from .nn import QuantModule, TensorQuantizer @@ -527,7 +527,7 @@ def forward_backward_step(model, batch) -> None: "checkpoint": checkpoint, } # Disable all quantizers; AutoQuantize will enable the needed ones - set_quantizer_by_cfg(model, [("*", QuantizerAttributeConfig(enable=False))]) + set_quantizer_by_cfg(model, [{"quantizer_path": "*", "enable": False}]) searcher.search(model, constraints, config=search_config) # type: ignore[arg-type] return model, searcher.state_dict() diff --git a/modelopt/torch/quantization/utils/core_utils.py 
b/modelopt/torch/quantization/utils/core_utils.py index c5c582b8c..e7e50aa83 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -828,7 +828,7 @@ def update_quant_cfg_with_kv_cache_quant( """Update the quant_cfg with the kv cache quant_cfg.""" # If quant_cfg["quant_cfg"] is None, it corresponds to only kv cache quantization case quant_cfg = copy.deepcopy(quant_cfg) - inner: list = quant_cfg.get("quant_cfg") or [("*", {"enable": False})] + inner: list = quant_cfg.get("quant_cfg") or [{"quantizer_path": "*", "enable": False}] quant_cfg["quant_cfg"] = inner + list(kv_cache_quant_cfg.items()) # Set default algorithm for kv cache quantization if not provided. diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index 4cae9ff7c..5322f18f5 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -19,50 +19,50 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - path: '*' + - quantizer_path: '*' enable: false - - path: '*input_quantizer' + - quantizer_path: '*input_quantizer' cfg: num_bits: e4m3 axis: - - path: '*weight_quantizer' + - quantizer_path: '*weight_quantizer' cfg: num_bits: e4m3 axis: - - path: '*[kv]_bmm_quantizer' + - quantizer_path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - path: '*block_sparse_moe.gate*' + - quantizer_path: '*block_sparse_moe.gate*' enable: false - - path: '*linear_attn.conv1d*' + - quantizer_path: '*linear_attn.conv1d*' enable: false - - path: '*lm_head*' + - quantizer_path: '*lm_head*' enable: false - - path: '*mixer.conv1d*' + - quantizer_path: '*mixer.conv1d*' enable: false - - path: '*mlp.gate.*' + - quantizer_path: '*mlp.gate.*' enable: false - - path: '*mlp.shared_expert_gate.*' + - quantizer_path: '*mlp.shared_expert_gate.*' enable: false - - path: '*output_layer*' + - quantizer_path: '*output_layer*' enable: false - - 
path: '*proj_out.*' + - quantizer_path: '*proj_out.*' enable: false - - path: '*router*' + - quantizer_path: '*router*' enable: false - - path: 'output.*' + - quantizer_path: 'output.*' enable: false - - type: 'nn.BatchNorm1d' - path: '*' + - parent_class: 'nn.BatchNorm1d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm2d' - path: '*' + - parent_class: 'nn.BatchNorm2d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm3d' - path: '*' + - parent_class: 'nn.BatchNorm3d' + quantizer_path: '*' enable: false - - type: 'nn.LeakyReLU' - path: '*' + - parent_class: 'nn.LeakyReLU' + quantizer_path: '*' enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index 2b5e97b19..f0ac09dd6 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ -19,9 +19,9 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - path: '*' + - quantizer_path: '*' enable: false - - path: '*weight_quantizer' + - quantizer_path: '*weight_quantizer' enable: true cfg: block_sizes: @@ -29,7 +29,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*input_quantizer' + - quantizer_path: '*input_quantizer' enable: true cfg: block_sizes: @@ -37,40 +37,40 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*[kv]_bmm_quantizer' + - quantizer_path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - path: '*block_sparse_moe.gate*' + - quantizer_path: '*block_sparse_moe.gate*' enable: false - - path: '*linear_attn.conv1d*' + - quantizer_path: '*linear_attn.conv1d*' enable: false - - path: '*lm_head*' + - quantizer_path: '*lm_head*' enable: false - - path: '*mixer.conv1d*' + - quantizer_path: '*mixer.conv1d*' enable: false - - path: '*mlp.gate.*' + - quantizer_path: '*mlp.gate.*' enable: false - - path: '*mlp.shared_expert_gate.*' + - quantizer_path: '*mlp.shared_expert_gate.*' enable: false - - path: 
'*output_layer*' + - quantizer_path: '*output_layer*' enable: false - - path: '*proj_out.*' + - quantizer_path: '*proj_out.*' enable: false - - path: '*router*' + - quantizer_path: '*router*' enable: false - - path: 'output.*' + - quantizer_path: 'output.*' enable: false - - type: 'nn.BatchNorm1d' - path: '*' + - parent_class: 'nn.BatchNorm1d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm2d' - path: '*' + - parent_class: 'nn.BatchNorm2d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm3d' - path: '*' + - parent_class: 'nn.BatchNorm3d' + quantizer_path: '*' enable: false - - type: 'nn.LeakyReLU' - path: '*' + - parent_class: 'nn.LeakyReLU' + quantizer_path: '*' enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index 69c51f87e..70b75b790 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -19,9 +19,9 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - path: '*' + - quantizer_path: '*' enable: false - - path: '*mlp*weight_quantizer' + - quantizer_path: '*mlp*weight_quantizer' enable: true cfg: block_sizes: @@ -29,7 +29,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*mlp*input_quantizer' + - quantizer_path: '*mlp*input_quantizer' enable: true cfg: block_sizes: @@ -37,7 +37,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*block_sparse_moe*weight_quantizer' + - quantizer_path: '*block_sparse_moe*weight_quantizer' enable: true cfg: block_sizes: @@ -45,7 +45,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*block_sparse_moe*input_quantizer' + - quantizer_path: '*block_sparse_moe*input_quantizer' enable: true cfg: block_sizes: @@ -53,40 +53,40 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*[kv]_bmm_quantizer' + - quantizer_path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - path: 
'*block_sparse_moe.gate*' + - quantizer_path: '*block_sparse_moe.gate*' enable: false - - path: '*linear_attn.conv1d*' + - quantizer_path: '*linear_attn.conv1d*' enable: false - - path: '*lm_head*' + - quantizer_path: '*lm_head*' enable: false - - path: '*mixer.conv1d*' + - quantizer_path: '*mixer.conv1d*' enable: false - - path: '*mlp.gate.*' + - quantizer_path: '*mlp.gate.*' enable: false - - path: '*mlp.shared_expert_gate.*' + - quantizer_path: '*mlp.shared_expert_gate.*' enable: false - - path: '*output_layer*' + - quantizer_path: '*output_layer*' enable: false - - path: '*proj_out.*' + - quantizer_path: '*proj_out.*' enable: false - - path: '*router*' + - quantizer_path: '*router*' enable: false - - path: 'output.*' + - quantizer_path: 'output.*' enable: false - - type: 'nn.BatchNorm1d' - path: '*' + - parent_class: 'nn.BatchNorm1d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm2d' - path: '*' + - parent_class: 'nn.BatchNorm2d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm3d' - path: '*' + - parent_class: 'nn.BatchNorm3d' + quantizer_path: '*' enable: false - - type: 'nn.LeakyReLU' - path: '*' + - parent_class: 'nn.LeakyReLU' + quantizer_path: '*' enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index a35b88cac..93cc90606 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -19,9 +19,9 @@ metadata: ptq_cfg: algorithm: max quant_cfg: - - path: '*' + - quantizer_path: '*' enable: false - - path: '*mlp*weight_quantizer' + - quantizer_path: '*mlp*weight_quantizer' enable: true cfg: block_sizes: @@ -29,7 +29,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*mlp*input_quantizer' + - quantizer_path: '*mlp*input_quantizer' enable: true cfg: block_sizes: @@ -37,7 +37,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: 
'*block_sparse_moe*weight_quantizer' + - quantizer_path: '*block_sparse_moe*weight_quantizer' enable: true cfg: block_sizes: @@ -45,7 +45,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*block_sparse_moe*input_quantizer' + - quantizer_path: '*block_sparse_moe*input_quantizer' enable: true cfg: block_sizes: @@ -53,7 +53,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*o_proj*weight_quantizer' + - quantizer_path: '*o_proj*weight_quantizer' enable: true cfg: block_sizes: @@ -61,7 +61,7 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*o_proj*input_quantizer' + - quantizer_path: '*o_proj*input_quantizer' enable: true cfg: block_sizes: @@ -69,40 +69,40 @@ ptq_cfg: type: dynamic scale_bits: e4m3 num_bits: e2m1 - - path: '*[kv]_bmm_quantizer' + - quantizer_path: '*[kv]_bmm_quantizer' enable: true cfg: num_bits: e4m3 - - path: '*block_sparse_moe.gate*' + - quantizer_path: '*block_sparse_moe.gate*' enable: false - - path: '*linear_attn.conv1d*' + - quantizer_path: '*linear_attn.conv1d*' enable: false - - path: '*lm_head*' + - quantizer_path: '*lm_head*' enable: false - - path: '*mixer.conv1d*' + - quantizer_path: '*mixer.conv1d*' enable: false - - path: '*mlp.gate.*' + - quantizer_path: '*mlp.gate.*' enable: false - - path: '*mlp.shared_expert_gate.*' + - quantizer_path: '*mlp.shared_expert_gate.*' enable: false - - path: '*output_layer*' + - quantizer_path: '*output_layer*' enable: false - - path: '*proj_out.*' + - quantizer_path: '*proj_out.*' enable: false - - path: '*router*' + - quantizer_path: '*router*' enable: false - - path: 'output.*' + - quantizer_path: 'output.*' enable: false - - type: 'nn.BatchNorm1d' - path: '*' + - parent_class: 'nn.BatchNorm1d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm2d' - path: '*' + - parent_class: 'nn.BatchNorm2d' + quantizer_path: '*' enable: false - - type: 'nn.BatchNorm3d' - path: '*' + - parent_class: 'nn.BatchNorm3d' + quantizer_path: '*' enable: 
false - - type: 'nn.LeakyReLU' - path: '*' + - parent_class: 'nn.LeakyReLU' + quantizer_path: '*' enable: false diff --git a/tests/_test_utils/torch/quantization/quantize_common.py b/tests/_test_utils/torch/quantization/quantize_common.py index ba0660ac2..03290dfab 100644 --- a/tests/_test_utils/torch/quantization/quantize_common.py +++ b/tests/_test_utils/torch/quantization/quantize_common.py @@ -47,9 +47,17 @@ def get_awq_config(algorithm="awq_lite", block_size=8): config = copy.deepcopy(mtq.INT4_AWQ_CFG) - for pat, cfg in config["quant_cfg"]: + for entry in config["quant_cfg"]: + pat = ( + entry["quantizer_path"] + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[0] + ) if pat == "*weight_quantizer": - cfg["block_sizes"] = {-1: block_size} + if isinstance(entry, dict) and "quantizer_path" in entry: + entry.setdefault("cfg", {})["block_sizes"] = {-1: block_size} + else: + entry[1]["block_sizes"] = {-1: block_size} break if "algorithm" not in config or not isinstance(config["algorithm"], dict): config["algorithm"] = {} diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index 0b49210c7..bf660eafd 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -210,19 +210,19 @@ def test_general_ptq_yaml_matches_config_dicts(yaml_path, model_cfg_name, kv_cfg def _as_dict(qc): result = {} for entry in qc: - if isinstance(entry, dict): - if "type" in entry: - sub_cfg = {} - if "enable" in entry: - sub_cfg["enable"] = entry["enable"] - result[entry["type"]] = {entry["path"]: sub_cfg} - elif "path" in entry: - fmt = dict(entry.get("cfg") or {}) - if "enable" in entry: - fmt["enable"] = entry["enable"] - result[entry["path"]] = fmt + if isinstance(entry, dict) and "quantizer_path" in entry: + parent_class = entry.get("parent_class") + key = parent_class if parent_class else entry["quantizer_path"] + cfg = entry.get("cfg", {}) + val = dict(cfg) if isinstance(cfg, dict) else cfg + if 
entry.get("enable") is not None: + val["enable"] = entry["enable"] + if parent_class: + result[key] = {entry["quantizer_path"]: val} else: - result.update(entry) + result[key] = val + elif isinstance(entry, dict): + result.update(entry) else: result[entry[0]] = entry[1] return result diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 0cd34da79..d04a8c026 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -196,9 +196,17 @@ def test_quantized_transformers_save_restore(tmp_path, model_cls, quant_config): import copy quant_config = copy.deepcopy(quant_config) - for pat, cfg in quant_config["quant_cfg"]: + for entry in quant_config["quant_cfg"]: + pat = ( + entry["quantizer_path"] + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[0] + ) if pat == "*weight_quantizer": - cfg["block_sizes"] = {-1: 16} + if isinstance(entry, dict) and "quantizer_path" in entry: + entry.setdefault("cfg", {})["block_sizes"] = {-1: 16} + else: + entry[1]["block_sizes"] = {-1: 16} break else: raise ValueError(f"Unsupported quant_config: {quant_config}") diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py index 6277fdc7f..d8ce15681 100644 --- a/tests/unit/torch/quantization/test_autoquant.py +++ b/tests/unit/torch/quantization/test_autoquant.py @@ -111,8 +111,8 @@ def test_quant_recipe_hparam(): # use this config to test custom quantization config INT8_CUSTOM_QUANT_TEST_CFG = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, *_default_disabled_quantizer_cfg, ], "algorithm": "smoothquant", @@ -231,15 +231,19 @@ def 
test_auto_quantize_disabled_layers_no_poison(): INT4INT8_AWQ_CFG = { "quant_cfg": [ - ( - "*weight_quantizer", - [ + { + "quantizer_path": "*weight_quantizer", + "cfg": [ {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, {"num_bits": 8, "axis": None, "enable": True}, ], - ), - ("*input_quantizer", {"num_bits": 8, "axis": None, "enable": True}), - ("default", {"enable": False}), + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + "enable": True, + }, + {"quantizer_path": "default", "enable": False}, ], "algorithm": "awq_lite", } @@ -484,7 +488,21 @@ def test_get_auto_quantize_config(method): config = mtq.get_auto_quantize_config(search_state) assert "quant_cfg" in config assert isinstance(config["quant_cfg"], list) - assert any(pattern == "*" and cfg == {"enable": False} for pattern, cfg in config["quant_cfg"]) + assert any( + ( + entry["quantizer_path"] + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[0] + ) + == "*" + and ( + entry.get("enable") + if isinstance(entry, dict) and "quantizer_path" in entry + else entry[1].get("enable") + ) + is False + for entry in config["quant_cfg"] + ) assert config["algorithm"] == "max" # Re-solve with different constraints From f52d213aa190ca3fcb82f7b582d57d2d7408c420 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 07:55:06 +0000 Subject: [PATCH 12/32] fix tests Signed-off-by: Shengliang Xu --- examples/diffusers/quantization/config.py | 87 +++---- examples/llm_ptq/example_utils.py | 15 +- examples/llm_ptq/hf_ptq.py | 7 +- examples/llm_qat/main.py | 16 +- examples/vllm_serve/fakequant_worker.py | 16 +- modelopt/torch/quantization/algorithms.py | 11 +- modelopt/torch/quantization/config.py | 183 ++++++++------ modelopt/torch/quantization/conversion.py | 37 +-- modelopt/torch/quantization/model_quant.py | 14 +- .../nn/modules/tensor_quantizer.py | 15 ++ .../general/ptq/fp8_default-fp8_kv.yml | 1 - 
.../general/ptq/nvfp4_default-fp8_kv.yml | 1 - .../general/ptq/nvfp4_mlp_only-fp8_kv.yml | 1 - .../general/ptq/nvfp4_omlp_only-fp8_kv.yml | 1 - tests/_test_utils/torch/export/utils.py | 223 +++++++++++------- .../torch/quantization/onnx_export.py | 6 +- .../torch/peft/plugins/test_megatron_peft.py | 29 ++- .../unit/torch/quantization/test_autoquant.py | 2 +- .../torch/quantization/test_custom_backend.py | 20 +- .../torch/quantization/test_quantize_cpu.py | 51 ++-- .../quantization/test_tensor_quant_cpu.py | 12 +- 21 files changed, 433 insertions(+), 315 deletions(-) diff --git a/examples/diffusers/quantization/config.py b/examples/diffusers/quantization/config.py index 3e2dbcc2e..9f24ec15f 100644 --- a/examples/diffusers/quantization/config.py +++ b/examples/diffusers/quantization/config.py @@ -18,77 +18,77 @@ FP8_DEFAULT_CONFIG = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*output_quantizer", {"enable": False}), - ("*softmax_quantizer", {"num_bits": (4, 3), "axis": None}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*output_quantizer", "enable": False}, + {"quantizer_path": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, ], "algorithm": "max", } INT8_DEFAULT_CONFIG = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), - ("*output_quantizer", {"enable": False}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, + {"quantizer_path": "*output_quantizer", "enable": False}, ], 
"algorithm": "max", } NVFP4_DEFAULT_CONFIG = { "quant_cfg": [ - ( - "*weight_quantizer", - { + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ("*output_quantizer", {"enable": False}), - ("*softmax_quantizer", {"num_bits": (4, 3), "axis": None}), - ("default", {"enable": False}), + "enable": True, + }, + {"quantizer_path": "*output_quantizer", "enable": False}, + {"quantizer_path": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, ], "algorithm": "max", } NVFP4_FP8_MHA_CONFIG = { "quant_cfg": [ - ( - "**weight_quantizer", - { + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "**weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "**input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "**input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ("*output_quantizer", {"enable": False}), - ("*[qkv]_bmm_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*softmax_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*bmm2_output_quantizer", {"num_bits": (4, 3), "axis": None}), - ("default", {"enable": False}), + "enable": True, + }, + {"quantizer_path": "*output_quantizer", "enable": False}, + {"quantizer_path": "*[qkv]_bmm_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*softmax_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*bmm2_output_quantizer", 
"cfg": {"num_bits": (4, 3), "axis": None}}, ], "algorithm": {"method": "svdquant", "lowrank": 32}, } @@ -103,8 +103,9 @@ def set_quant_config_attr(quant_config, trt_high_precision_dtype, quant_algo, ** algo_cfg["lowrank"] = kwargs["lowrank"] quant_config["algorithm"] = algo_cfg - for _pattern, p in quant_config["quant_cfg"]: - if "num_bits" in p and "trt_high_precision_dtype" not in p: + for entry in quant_config["quant_cfg"]: + p = entry.get("cfg", {}) + if isinstance(p, dict) and "num_bits" in p and "trt_high_precision_dtype" not in p: p["trt_high_precision_dtype"] = trt_high_precision_dtype @@ -125,9 +126,9 @@ def reset_set_int8_config(quant_config, percentile, n_steps, collect_method, bac if isinstance(module, nn.Conv2d): aq_name = f"*{name}*input_quantizer*" quant_config["quant_cfg"].append( - ( - aq_name, - { + { + "quantizer_path": aq_name, + "cfg": { "num_bits": 8, "axis": None, "calibrator": ( @@ -142,5 +143,5 @@ def reset_set_int8_config(quant_config, percentile, n_steps, collect_method, bac }, ), }, - ) + } ) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index ca6a3ea09..1387f2a6d 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -205,9 +205,12 @@ def build_quant_cfg( ) -> dict[str, Any]: quant_cfg = copy.deepcopy(quant_cfg) if "awq" in str(quant_cfg.get("algorithm")): - weight_quantizer = next( - cfg for pat, cfg in quant_cfg["quant_cfg"] if pat == "*weight_quantizer" + weight_quantizer_entry = next( + e + for e in quant_cfg["quant_cfg"] + if isinstance(e, dict) and e.get("quantizer_path") == "*weight_quantizer" ) + weight_quantizer = weight_quantizer_entry.get("cfg", {}) if isinstance(weight_quantizer, list): weight_quantizer = weight_quantizer[0] # If awq_block_size argument is provided, update weight_quantizer @@ -238,10 +241,10 @@ def build_quant_cfg( if model_type == "phi4mm": # Only quantize the language model - quant_cfg["quant_cfg"].append(("*speech*", {"enable": 
False})) - quant_cfg["quant_cfg"].append(("*audio*", {"enable": False})) - quant_cfg["quant_cfg"].append(("*image*", {"enable": False})) - quant_cfg["quant_cfg"].append(("*vision*", {"enable": False})) + quant_cfg["quant_cfg"].append({"quantizer_path": "*speech*", "enable": False}) + quant_cfg["quant_cfg"].append({"quantizer_path": "*audio*", "enable": False}) + quant_cfg["quant_cfg"].append({"quantizer_path": "*image*", "enable": False}) + quant_cfg["quant_cfg"].append({"quantizer_path": "*vision*", "enable": False}) return quant_cfg diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index f8be6274d..34d7bb0de 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -155,7 +155,7 @@ def extract_and_prepare_language_model_from_vl(full_model): # Apply disabled quant to all modules that are not part of language_model # This excludes them during HF export disabled_quant_cfg = { - "quant_cfg": ("default", {"enable": False}), + "quant_cfg": [{"quantizer_path": "*", "enable": False}], "algorithm": "max", } @@ -343,10 +343,7 @@ def forward_step(model, batch): getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"] ) kv_cache_quant_cfg = [ - e - for e in kv_cache_quant_cfg - if (e["quantizer_path"] if isinstance(e, dict) and "quantizer_path" in e else e[0]) - != "default" + e for e in kv_cache_quant_cfg if e["quantizer_path"] != "*" ] # keep other quantizers from auto_quantize if args.kv_cache_qformat in _KV_CAST_FORMATS: diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index 5312c2ad9..14d5a5c82 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -55,10 +55,18 @@ CUSTOM_QUANT_CFG = { "INT4_WEIGHT_INT8_ACTIVATIONS": { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}), - ("*input_quantizer", {"num_bits": 8, "axis": None, "enable": True}), - ("*lm_head*", {"enable": False}), - ("default", {"enable": False}), + {"quantizer_path": "*", 
"enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": 4, "block_sizes": {-1: 128}}, + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + "enable": True, + }, + {"quantizer_path": "*lm_head*", "enable": False}, ], "algorithm": "max", } diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py index 4a4bde1d3..284aba8f7 100644 --- a/examples/vllm_serve/fakequant_worker.py +++ b/examples/vllm_serve/fakequant_worker.py @@ -170,10 +170,18 @@ def update_kv_cfg_for_mla(model: torch.nn.Module, kv_quant_cfg: list) -> list: if not any(isinstance(m, MLAAttention) for m in model.modules()): return kv_quant_cfg - kv_config = next((cfg for pat, cfg in kv_quant_cfg if pat == "*[kv]_bmm_quantizer"), None) - if kv_config is not None: - kv_quant_cfg.append(("*kv_c_bmm_quantizer", kv_config)) - kv_quant_cfg.append(("*k_pe_bmm_quantizer", kv_config)) + kv_entry = next( + ( + e + for e in kv_quant_cfg + if isinstance(e, dict) and e.get("quantizer_path") == "*[kv]_bmm_quantizer" + ), + None, + ) + if kv_entry is not None: + kv_config = kv_entry.get("cfg", {}) + kv_quant_cfg.append({"quantizer_path": "*kv_c_bmm_quantizer", "cfg": kv_config}) + kv_quant_cfg.append({"quantizer_path": "*k_pe_bmm_quantizer", "cfg": kv_config}) print("MLA detected: added *kv_c_bmm_quantizer and k_pe_bmm_quantizer config") return kv_quant_cfg diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 358253891..03029edbe 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -111,7 +111,7 @@ def __init__(self, quant_cfg: str | dict[str, Any] | None = None, name: str | No name = self.get_auto_name_for_config(quant_cfg) or name if quant_cfg is None: - quant_cfg = {"quant_cfg": [("*", {"enable": False})]} + quant_cfg = {"quant_cfg": [{"quantizer_path": "*", "enable": False}]} elif 
isinstance(quant_cfg, str): assert hasattr(mtq_config, quant_cfg), f"Unknown quantization format {quant_cfg}" quant_cfg = getattr(mtq_config, quant_cfg) @@ -1322,7 +1322,7 @@ def _cfg_to_dict(v): return [_cfg_to_dict(c) for c in v] return v - quant_cfg: list[tuple] = [("*", {"enable": False})] + quant_cfg: list[dict] = [{"quantizer_path": "*", "enable": False}] for hparam_name, recipe in best_recipe.items(): if recipe == QuantRecipe(quant_cfg=None): continue @@ -1331,7 +1331,12 @@ def _cfg_to_dict(v): for quantizer_attr in ("input_quantizer", "weight_quantizer"): matched_cfg = _match_quantizer_cfg(recipe.config.quant_cfg, quantizer_attr) if matched_cfg is not None: - quant_cfg.append((f"{module_name}.{quantizer_attr}", _cfg_to_dict(matched_cfg))) + quant_cfg.append( + { + "quantizer_path": f"{module_name}.{quantizer_attr}", + "cfg": _cfg_to_dict(matched_cfg), + } + ) warnings.warn( "get_auto_quantize_config: returned config uses algorithm='max'. " "Per-recipe calibration algorithms (e.g. smoothquant, awq) are not preserved. " diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 42d2e25ea..0591e6ea6 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -135,20 +135,28 @@ """ -from typing import Any, Literal +from typing import Any, Literal, TypedDict, cast from pydantic import ValidationInfo, field_validator, model_validator from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.utils.network import ConstructorLike -QuantCfgEntry = dict[str, Any] -_base_disable_all: list[QuantCfgEntry] = [ +class QuantizerCfgEntry(TypedDict, total=False): + """A single entry in a ``quant_cfg`` list.""" + + quantizer_path: str # required; matched against quantizer module names + parent_class: str # optional; filters by pytorch module class name (e.g. 
"nn.Linear") + cfg: dict[str, Any] | list[dict[str, Any]] # quantizer attribute config(s) + enable: bool # shorthand to set/unset the quantizer's enable flag + + +_base_disable_all: list[QuantizerCfgEntry] = [ {"quantizer_path": "*", "enable": False}, ] -_default_disabled_quantizer_cfg: list[QuantCfgEntry] = [ +_default_disabled_quantizer_cfg: list[QuantizerCfgEntry] = [ {"parent_class": "nn.BatchNorm1d", "quantizer_path": "*", "enable": False}, {"parent_class": "nn.BatchNorm2d", "quantizer_path": "*", "enable": False}, {"parent_class": "nn.BatchNorm3d", "quantizer_path": "*", "enable": False}, @@ -158,17 +166,23 @@ "quantizer_path": "*proj_out.*", "enable": False, }, # In Whisper model, lm_head has key name proj_out - {"quantizer_path": "*block_sparse_moe.gate*", "enable": False}, # Skip the MOE router + { + "quantizer_path": "*block_sparse_moe.gate*", + "enable": False, + }, # Skip the MOE router {"quantizer_path": "*router*", "enable": False}, # Skip the MOE router {"quantizer_path": "*mlp.gate.*", "enable": False}, # Skip the MOE router - {"quantizer_path": "*mlp.shared_expert_gate.*", "enable": False}, # Skip the MOE router + { + "quantizer_path": "*mlp.shared_expert_gate.*", + "enable": False, + }, # Skip the MOE router {"quantizer_path": "*linear_attn.conv1d*", "enable": False}, {"quantizer_path": "*mixer.conv1d*", "enable": False}, # Skip mamba conv1d {"quantizer_path": "*output_layer*", "enable": False}, {"quantizer_path": "output.*", "enable": False}, ] -_mamba_moe_disabled_quantizer_cfg: list[QuantCfgEntry] = [ +_mamba_moe_disabled_quantizer_cfg: list[QuantizerCfgEntry] = [ {"quantizer_path": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE {"quantizer_path": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE {"quantizer_path": "*q_proj*", "enable": False}, # Skip QKV Linear @@ -210,8 +224,14 @@ FP8_DEFAULT_CFG = { "quant_cfg": [ *_base_disable_all, - {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - 
{"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, *_default_disabled_quantizer_cfg, ], "algorithm": "max", @@ -220,8 +240,14 @@ MAMBA_MOE_FP8_AGGRESSIVE_CFG = { "quant_cfg": [ *_base_disable_all, - {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, ], @@ -231,8 +257,14 @@ MAMBA_MOE_FP8_CONSERVATIVE_CFG = { "quant_cfg": [ *_base_disable_all, - {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, - {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, *_default_disabled_quantizer_cfg, *_mamba_moe_disabled_quantizer_cfg, {"quantizer_path": "*mixer.in_proj*", "enable": False}, # Skip mamba linear @@ -427,7 +459,10 @@ }, "enable": True, }, - {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, *_default_disabled_quantizer_cfg, ], "algorithm": None, @@ -1461,13 +1496,62 @@ class GPTQLiteConfig(QuantizeAlgorithmConfig): ) -QuantizeQuantCfgType = list[QuantCfgEntry] +QuantizeQuantCfgType = list[QuantizerCfgEntry] _QuantizeAlgoCfgType = str | dict | QuantizeAlgorithmConfig | None QuantizeAlgoCfgType = 
_QuantizeAlgoCfgType | list[_QuantizeAlgoCfgType] | None +def normalize_quant_cfg_list(v: list) -> list[QuantizerCfgEntry]: + """Normalize a raw quant_cfg list into a list of QuantizerCfgEntry dicts. + + Supports these input forms per entry: + - ``{"quantizer_path": ..., "enable": ..., "cfg": ...}`` — passed through as-is + - ``{"": ...}`` — single-key dict (legacy) + - ``(quantizer_path, cfg_dict)`` — tuple form (legacy) + """ + + def _tuple_to_entry(key: str, value) -> QuantizerCfgEntry: + if isinstance(key, str) and key.startswith("nn."): + assert isinstance(value, dict) and len(value) == 1 + q_path, sub_cfg = next(iter(value.items())) + sub_cfg = dict(sub_cfg) + enable = sub_cfg.pop("enable", None) + entry: QuantizerCfgEntry = { + "parent_class": key, + "quantizer_path": q_path, + "cfg": sub_cfg, + } + if enable is not None: + entry["enable"] = enable + return entry + else: + if isinstance(value, dict): + cfg = {k: val for k, val in value.items() if k != "enable"} + enable = value.get("enable") + else: + cfg = value + enable = None + entry = {"quantizer_path": key, "cfg": cfg} + if enable is not None: + entry["enable"] = enable + return entry + + result: list[QuantizerCfgEntry] = [] + for raw in v: + if isinstance(raw, dict) and "quantizer_path" in raw: + result.append(cast("QuantizerCfgEntry", raw)) + elif isinstance(raw, dict) and len(raw) == 1: + key, val = next(iter(raw.items())) + result.append(_tuple_to_entry(key, val)) + elif isinstance(raw, (tuple, list)) and len(raw) == 2: + result.append(_tuple_to_entry(raw[0], raw[1])) + else: + raise ValueError(f"Invalid quant_cfg entry: {raw!r}.") + return result + + class QuantizeConfig(ModeloptBaseConfig): """Default configuration for ``quantize`` mode.""" @@ -1487,62 +1571,10 @@ class QuantizeConfig(ModeloptBaseConfig): @field_validator("quant_cfg", mode="before") @classmethod def normalize_quant_cfg(cls, v): - """Normalize quant_cfg entries: convert dict and tuple forms to QuantCfgEntry dicts. 
- - Supports these input forms: - - ``{"quantizer_path": ..., "enable": ..., "cfg": ...}`` — passed through as-is - - ``{"": ...}`` — single-key dict (legacy) - - ``(quantizer_path, cfg_dict)`` — tuple form (legacy) - """ + """Normalize quant_cfg entries: convert dict and tuple forms to QuantizerCfgEntry dicts.""" if not isinstance(v, list): return v - result = [] - for entry in v: - if isinstance(entry, dict) and "quantizer_path" in entry: - result.append(entry) - elif isinstance(entry, dict): - if len(entry) == 1: - key, val = next(iter(entry.items())) - result.append(cls._tuple_to_entry(key, val)) - else: - raise ValueError( - f"Invalid quant_cfg entry: {entry!r}. " - "Expected a dict with 'quantizer_path', a single-key dict, or a (quantizer_path, cfg) tuple." - ) - elif isinstance(entry, (tuple, list)) and len(entry) == 2: - result.append(cls._tuple_to_entry(entry[0], entry[1])) - else: - raise ValueError(f"Invalid quant_cfg entry: {entry!r}.") - return result - - @classmethod - def _tuple_to_entry(cls, key: str, value) -> "QuantCfgEntry": - """Convert a (key, value) tuple to a QuantCfgEntry dict.""" - if isinstance(key, str) and key.startswith("nn."): - # nn.* type entry: value is {quantizer_path: {enable: ...}} - assert isinstance(value, dict) and len(value) == 1 - q_path, sub_cfg = next(iter(value.items())) - sub_cfg = dict(sub_cfg) - enable = sub_cfg.pop("enable", None) - new_entry: QuantCfgEntry = { - "parent_class": key, - "quantizer_path": q_path, - "cfg": sub_cfg, - } - if enable is not None: - new_entry["enable"] = enable - return new_entry - else: - if isinstance(value, dict): - cfg = {k: v for k, v in value.items() if k != "enable"} - enable = value.get("enable") - else: - cfg = value - enable = None - new_entry = {"quantizer_path": key, "cfg": cfg} - if enable is not None: - new_entry["enable"] = enable - return new_entry + return normalize_quant_cfg_list(v) @field_validator("quant_cfg", mode="after") @classmethod @@ -1598,11 +1630,12 @@ def 
_not_dynamic(cfg): if isinstance(entry, dict) and "quantizer_path" in entry else entry[0] ) - cfg = ( - entry.get("cfg", {}) - if isinstance(entry, dict) and "quantizer_path" in entry - else entry[1] - ) + if isinstance(entry, dict) and "quantizer_path" in entry: + cfg = dict(entry.get("cfg") or {}) + if "enable" in entry: + cfg["enable"] = entry["enable"] + else: + cfg = entry[1] if "weight_quantizer" in name: # We don't calibrate weight quantizer continue @@ -1610,10 +1643,8 @@ def _not_dynamic(cfg): if isinstance(cfg, list): for _config in cfg: if _not_dynamic(_config): - print(f"{cfg}: True") return True elif isinstance(cfg, dict) and _not_dynamic(cfg): - print(f"{cfg}: True") return True return False diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index f3af07418..4f0b99e87 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -33,6 +33,7 @@ QuantizeQuantCfgType, QuantizerAttributeConfig, _QuantizeExportConfig, + normalize_quant_cfg_list, ) from .nn import ( NVFP4StaticQuantizer, @@ -214,7 +215,7 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): """Update the quantizer attributes based on the specified `quant_cfg`. - `quant_cfg` is a list of :class:`QuantCfgEntry <.config.QuantCfgEntry>` objects mapping + `quant_cfg` is a list of :class:`QuantizerCfgEntry <.config.QuantizerCfgEntry>` objects mapping quantizer paths (and optionally parent classes) to their quantizer attributes, which are defined in :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. The ``quantizer_path`` is matched against the quantizer module names. @@ -227,24 +228,23 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType See :meth:`set_quantizer_attribute ` for more details. 
""" + quant_cfg = normalize_quant_cfg_list(quant_cfg) for entry in quant_cfg: - entry_cfg = entry.get("cfg", {}) if isinstance(entry, dict) else {} - effective_cfg = dict(entry_cfg) if isinstance(entry_cfg, dict) else list(entry_cfg) - enable = entry.get("enable") if isinstance(entry, dict) else None - if enable is not None and isinstance(effective_cfg, dict): - effective_cfg["enable"] = enable - parent_class_name = entry.get("parent_class") if isinstance(entry, dict) else None - quantizer_path = ( - entry["quantizer_path"] - if isinstance(entry, dict) and "quantizer_path" in entry - else entry - ) - assert isinstance(quantizer_path, str) + entry_cfg = entry.get("cfg", {}) + enable = entry.get("enable") + if isinstance(entry_cfg, dict): + if enable is not None: + entry_cfg["enable"] = enable + elif entry_cfg: + # cfg present without explicit enable → implicitly enable the quantizer + entry_cfg = {**entry_cfg, "enable": True} + quantizer_path: str = entry["quantizer_path"] + parent_class_name = entry.get("parent_class") if parent_class_name is not None: parent_class = QuantModuleRegistry[parent_class_name] - set_quantizer_attribute(quant_model, quantizer_path, effective_cfg, parent_class) + set_quantizer_attribute(quant_model, quantizer_path, entry_cfg, parent_class) else: - set_quantizer_attribute(quant_model, quantizer_path, effective_cfg) + set_quantizer_attribute(quant_model, quantizer_path, entry_cfg) def set_quantizer_attribute( @@ -320,9 +320,10 @@ def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuan Use this context manager with caution. Changing certain attributes of the quantizer such as `calibrator` can lead to unexpected behavior. """ - assert not any( - isinstance(entry.get("cfg", {}), list) for entry in quant_cfg if isinstance(entry, dict) - ), "list of config not support." 
+ quant_cfg = normalize_quant_cfg_list(quant_cfg) + assert not any(isinstance(entry.get("cfg", {}), list) for entry in quant_cfg), ( + "list of config not support." + ) original_attributes = {} for name, module in quant_model.named_modules(): diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 2c601609c..07b350e19 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -178,17 +178,15 @@ def quantize( .. code-block::python config = { - "quant_cfg": [ + # Disable all quantizers by default + {"quantizer_path": "*", "enable": False}, # "num_bits" specifies the number of bits for quantization # "axis" specifies the axis for quantization - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": -1}), - - # Default quantization settings - ("default", {"num_bits": 8, "axis": None}), - ] - "algorithm": "max" + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": -1}}, + ], + "algorithm": "max", } See :ref:`Quantization Formats ` to learn more about the supported diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index ec2c3cfc5..14d6a97f8 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -218,12 +218,27 @@ def _calibrator_setter(val): calib_cls, args, kwargs = standardize_constructor_args(val) return calib_cls(*args, **kwargs) + def _axis_setter(val): + if getattr(self, "_calibrator", None) is not None: + self._calibrator._axis = val + return val + + def _block_sizes_setter(val): + if val is not None: + # block_sizes and axis are mutually exclusive; clear axis when block_sizes is set + setattr(self, "_axis", None) + if getattr(self, "_calibrator", None) is not 
None: + self._calibrator._axis = None + return val + # Some attributes need custom handling. # By default, attributes from config are mapped to a name ``f"_{attribute}"`` _custom_setters: dict[str, tuple[str, Callable]] = { "enable": ("_disabled", lambda val: val is False), "type": ("_dynamic", lambda val: val == "dynamic"), "calibrator": ("_calibrator", _calibrator_setter), + "axis": ("_axis", _axis_setter), + "block_sizes": ("_block_sizes", _block_sizes_setter), "backend": ("backend", lambda val: val), "backend_extra_args": ("backend_extra_args", lambda val: val or {}), "use_constant_amax": ("_use_constant_amax", lambda val: val), diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml index 5322f18f5..1024a60c1 100644 --- a/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_kv.yml @@ -33,7 +33,6 @@ ptq_cfg: enable: true cfg: num_bits: e4m3 - - quantizer_path: '*block_sparse_moe.gate*' enable: false - quantizer_path: '*linear_attn.conv1d*' diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml index f0ac09dd6..524fb6d97 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_kv.yml @@ -41,7 +41,6 @@ ptq_cfg: enable: true cfg: num_bits: e4m3 - - quantizer_path: '*block_sparse_moe.gate*' enable: false - quantizer_path: '*linear_attn.conv1d*' diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml index 70b75b790..33fee0e3e 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml @@ -57,7 +57,6 @@ ptq_cfg: enable: true cfg: num_bits: e4m3 - - quantizer_path: '*block_sparse_moe.gate*' enable: false - quantizer_path: '*linear_attn.conv1d*' diff --git 
a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml index 93cc90606..29cb76bb5 100644 --- a/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml +++ b/modelopt_recipes/general/ptq/nvfp4_omlp_only-fp8_kv.yml @@ -73,7 +73,6 @@ ptq_cfg: enable: true cfg: num_bits: e4m3 - - quantizer_path: '*block_sparse_moe.gate*' enable: false - quantizer_path: '*linear_attn.conv1d*' diff --git a/tests/_test_utils/torch/export/utils.py b/tests/_test_utils/torch/export/utils.py index 36618de18..3501ad9ee 100644 --- a/tests/_test_utils/torch/export/utils.py +++ b/tests/_test_utils/torch/export/utils.py @@ -86,126 +86,127 @@ def forward(self, x): # Quantization configs partial_fp8_config = { "quant_cfg": [ - ("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), - ("default", {"num_bits": 8, "enable": False}), + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*.1.weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*.1.input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, ], "algorithm": "max", } partial_w4a8_config = { "quant_cfg": [ - ( - "*.2.weight_quantizer", - [ + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*.2.weight_quantizer", + "cfg": [ {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, {"num_bits": (4, 3), "axis": None, "enable": True}, ], - ), - ("*.2.input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), - ("default", {"num_bits": 8, "enable": False}), + }, + { + "quantizer_path": "*.2.input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": True, + }, ], "algorithm": "awq_lite", } partial_nvfp4_config = { "quant_cfg": [ - ( - "*.1.weight_quantizer", - { + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*.1.weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, 
"type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*.1.input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*.1.input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*.2.weight_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*.2.weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*.2.input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*.2.input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ("default", {"enable": False}), + "enable": True, + }, ], "algorithm": "max", } partial_nvfp4_awq_config = { "quant_cfg": [ - ( - "*.2.weight_quantizer", - { + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*.2.weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*.2.input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*.2.input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*.1.weight_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*.1.weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": False, }, - ), - ( - "*.1.input_quantizer", - { + "enable": False, + }, + { + "quantizer_path": "*.1.input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": False, }, - ), - ("default", {"enable": False}), + "enable": False, + }, ], "algorithm": 
"awq_lite", } partial_int4_awq_config = { "quant_cfg": [ - ( - "*.2.weight_quantizer", - { - "num_bits": 4, - "block_sizes": {-1: 128, "type": "static"}, - "enable": True, - }, - ), - ("*.2.input_quantizer", {"enable": False}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*.2.weight_quantizer", + "cfg": {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + "enable": True, + }, + {"quantizer_path": "*.2.input_quantizer", "enable": False}, ], "algorithm": {"method": "awq_lite", "alpha_step": 0.1}, # "algorithm": {"method": "awq_full", "alpha_step": 0.1, "max_co_batch_size": 1024}, @@ -214,66 +215,110 @@ def forward(self, x): partial_fp8_kv_cache_config = { "quant_cfg": [ - ("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*.1.weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*.1.input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*output_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": True, + }, ], "algorithm": "max", } partial_int8_kv_cache_config = { "quant_cfg": [ - ("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*output_quantizer", {"num_bits": 8, "axis": None, "enable": True}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*.1.weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*.1.input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*output_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + "enable": True, + }, ], "algorithm": "max", } 
partial_nvfp4_kv_cache_config = { "quant_cfg": [ - ("*.1.weight_quantizer", {"num_bits": (4, 3), "axis": None}), - ("*.1.input_quantizer", {"num_bits": (4, 3), "axis": None}), - ( - "*[kv]_bmm_quantizer", - { + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*.1.weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*.1.input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*[kv]_bmm_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ("default", {"enable": False}), + "enable": True, + }, ], "algorithm": "max", } only_weight_quantizer_fp8_config = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), - ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": False, + }, + { + "quantizer_path": "*output_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": False, + }, ], "algorithm": "max", } only_input_quantizer_fp8_config = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), - ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": False, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": 
True, + }, + { + "quantizer_path": "*output_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": False, + }, ], "algorithm": "max", } only_output_quantizer_fp8_config = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), - ("*input_quantizer", {"num_bits": (4, 3), "axis": None, "enable": False}), - ("*output_quantizer", {"num_bits": (4, 3), "axis": None, "enable": True}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": False, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": False, + }, + { + "quantizer_path": "*output_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + "enable": True, + }, ], "algorithm": "max", } diff --git a/tests/_test_utils/torch/quantization/onnx_export.py b/tests/_test_utils/torch/quantization/onnx_export.py index cf7b5bc40..57ee92ad0 100644 --- a/tests/_test_utils/torch/quantization/onnx_export.py +++ b/tests/_test_utils/torch/quantization/onnx_export.py @@ -30,9 +30,9 @@ def onnx_export_tester(model, device, num_bits, per_channel_quantization, consta axis = 0 if per_channel_quantization else None config = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": num_bits, "axis": axis}), - ("*input_quantizer", {"num_bits": num_bits}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": num_bits, "axis": axis}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": num_bits}}, ], "algorithm": "max", } diff --git a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py index d9c2d4dfd..cfa678b1a 100644 --- a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py +++ b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py @@ 
-34,27 +34,30 @@ NVFP4_DEFAULT_CONFIG = { "quant_cfg": [ - ( - "*weight_quantizer", - { + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ("*output_quantizer", {"enable": False}), - ("*output_layer*", {"enable": False}), # Note: only output_layer is disabled. - ("default", {"enable": False}), + "enable": True, + }, + {"quantizer_path": "*output_quantizer", "enable": False}, + { + "quantizer_path": "*output_layer*", + "enable": False, + }, # Note: only output_layer is disabled. ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py index d8ce15681..bd8f6f7aa 100644 --- a/tests/unit/torch/quantization/test_autoquant.py +++ b/tests/unit/torch/quantization/test_autoquant.py @@ -231,6 +231,7 @@ def test_auto_quantize_disabled_layers_no_poison(): INT4INT8_AWQ_CFG = { "quant_cfg": [ + {"quantizer_path": "*", "enable": False}, { "quantizer_path": "*weight_quantizer", "cfg": [ @@ -243,7 +244,6 @@ def test_auto_quantize_disabled_layers_no_poison(): "cfg": {"num_bits": 8, "axis": None}, "enable": True, }, - {"quantizer_path": "default", "enable": False}, ], "algorithm": "awq_lite", } diff --git a/tests/unit/torch/quantization/test_custom_backend.py b/tests/unit/torch/quantization/test_custom_backend.py index 2a5643677..1b9308559 100644 --- a/tests/unit/torch/quantization/test_custom_backend.py +++ b/tests/unit/torch/quantization/test_custom_backend.py @@ -43,17 +43,17 @@ def dummy_backend(inputs: torch.Tensor, tq) -> torch.Tensor: cfg = { "quant_cfg": [ - ( - "*weight_quantizer", - { - 
"enable": True, + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": 8, "axis": None, "backend": "dummy_backend", "backend_extra_args": {"offset": 2.5}, }, - ), - ("default", {"enable": False}), + "enable": True, + }, ], "algorithm": "max", } @@ -92,8 +92,12 @@ def cached_backend(inputs: torch.Tensor, tq: TensorQuantizer) -> torch.Tensor: model = torch.nn.Linear(16, 16, bias=False) cfg = { "quant_cfg": [ - ("*weight_quantizer", {"enable": True, "backend": "cached_backend"}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"backend": "cached_backend"}, + "enable": True, + }, ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_quantize_cpu.py b/tests/unit/torch/quantization/test_quantize_cpu.py index 8bf652d81..de12fc7f3 100644 --- a/tests/unit/torch/quantization/test_quantize_cpu.py +++ b/tests/unit/torch/quantization/test_quantize_cpu.py @@ -59,16 +59,15 @@ STATIC_WEIGHT_DYNAMIC_ACTIVATION_CFG = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), # Per-channel quantization - ( - "*input_quantizer", - { - "num_bits": 8, - "axis": (0, 1), - "type": "dynamic", - }, - ), # Dynamic per-token quantization - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, # Per-channel quantization + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "axis": (0, 1), "type": "dynamic"}, + }, # Dynamic per-token quantization ], "algorithm": "max", } @@ -137,7 +136,9 @@ def test_save_restore(model_cls, quant_config): def test_quantize_invalid_cfg(): model = SimpleLinear() config_invalid = { - "quant_cfg": [("*", {"num_bits": 4, "axis": 0, "block_sizes": {-1: 128}})], + "quant_cfg": [ + {"quantizer_path": "*", "cfg": {"num_bits": 4, "axis": 0, "block_sizes": {-1: 128}}} + ], 
"algorithm": "max", } with pytest.raises(ValidationError, match="axis must be None when block_sizes is not None."): @@ -229,24 +230,26 @@ def test_static_weight_dynamic_activations(): def test_block_sizes_axis_model(): REF_QUANT_CFG = { # noqa: N806 "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None, "type": "dynamic"}), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "axis": None, "type": "dynamic"}, + }, ], "algorithm": "max", } QUANT_CFG = { # noqa: N806 "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "block_sizes": {1: None}}), - ( - "*input_quantizer", - { - "num_bits": 8, - "block_sizes": {0: None, 1: None}, - "type": "dynamic", - }, - ), - ("default", {"enable": False}), + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": 8, "block_sizes": {1: None}}, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "block_sizes": {0: None, 1: None}, "type": "dynamic"}, + }, ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_tensor_quant_cpu.py b/tests/unit/torch/quantization/test_tensor_quant_cpu.py index f560fcac6..918f614f9 100644 --- a/tests/unit/torch/quantization/test_tensor_quant_cpu.py +++ b/tests/unit/torch/quantization/test_tensor_quant_cpu.py @@ -90,15 +90,15 @@ def test_num_bits(self): WINT4INT8_CFG = { "quant_cfg": [ - ( - "*weight_quantizer", - [ + {"quantizer_path": "*", "enable": False}, + { + "quantizer_path": "*weight_quantizer", + "cfg": [ {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, {"num_bits": 8, "axis": 0, "enable": True}, ], - ), - ("*input_quantizer", {"num_bits": 8, "enable": True}), - ("default", {"enable": False}), + }, + {"quantizer_path": "*input_quantizer", 
"cfg": {"num_bits": 8}, "enable": True}, ], "algorithm": "awq_full", } From 8f59142c68d1c2fd0305c38654420a4eb14a122f Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 16:57:16 +0000 Subject: [PATCH 13/32] fix guide Signed-off-by: Shengliang Xu --- docs/source/guides/_pytorch_quantization.rst | 25 +++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/docs/source/guides/_pytorch_quantization.rst b/docs/source/guides/_pytorch_quantization.rst index 0f7720523..edf45d98a 100644 --- a/docs/source/guides/_pytorch_quantization.rst +++ b/docs/source/guides/_pytorch_quantization.rst @@ -237,14 +237,16 @@ For debugging purposes or simple customizations, you can modify an existing conf .. code-block:: python - # Create a copy of the default INT8 configuration - config = mtq.INT8_DEFAULT_CFG.copy() + import copy - # Disable input quantizers for all layers - config["quant_cfg"]["*input_quantizer"]["enable"] = False + # Create a deep copy of the default INT8 configuration + config = copy.deepcopy(mtq.INT8_DEFAULT_CFG) + + # Disable input quantizers for all layers (appended last, so it takes precedence) + config["quant_cfg"].append({"quantizer_path": "*input_quantizer", "enable": False}) # Disable all quantizers for layers matching the pattern "layer1.*" - config["quant_cfg"]["*layer1.*"] = {"enable": False} + config["quant_cfg"].append({"quantizer_path": "*layer1.*", "enable": False}) Advanced Configuration Creation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -256,11 +258,14 @@ For exploring new quantization recipes, you can compose a completely new configu # Custom configuration for INT4 block-wise weights and INT8 dynamic activations MY_CUSTOM_CONFIG = { "quant_cfg": [ + # Disable all quantizers by default, then enable selectively + {"quantizer_path": "*", "enable": False}, + # Configure weight quantizers with 4-bit precision and 128-element blocks - ("*weight_quantizer", {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}), + 
{"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 4, "block_sizes": {-1: 128}}, "enable": True}, # Configure input quantizers with 8-bit dynamic quantization - ("*input_quantizer", {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}), + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}}, # Include default disabled quantizer configurations *_default_disabled_quantizer_cfg, @@ -394,8 +399,10 @@ You can specify ``custom_calib`` as ``algorithm`` in ``quant_cfg`` to use it. He # create quantization configuration with "custom_calib" method quant_cfg = { - 'quant_cfg': {'*weight_quantizer': ..}, - 'algorithm': {"method": 'custom_calib'}, + 'quant_cfg': [ + {"quantizer_path": "*weight_quantizer", "cfg": {...}}, + ], + 'algorithm': {"method": 'custom_calib'}, } From 3cda60f69483470d40f6e27310ad8084aa482b0f Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 17:34:54 +0000 Subject: [PATCH 14/32] default to disable Signed-off-by: Shengliang Xu --- examples/llm_eval/quantization_utils.py | 1 + .../llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb | 14 ++++++++++++-- tests/unit/torch/quantization/test_autoquant.py | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/llm_eval/quantization_utils.py b/examples/llm_eval/quantization_utils.py index 03b7039fa..3016885f4 100644 --- a/examples/llm_eval/quantization_utils.py +++ b/examples/llm_eval/quantization_utils.py @@ -34,6 +34,7 @@ CUSTOM_CONFIG = { "MY_QUANT_CONFIG": { "quant_cfg": [ + *mtq.config._base_disable_all, ("*weight_quantizer", {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}), ("*input_quantizer", {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}), # Disable sensitive layers such as `lm_head`, gate layers in MoE etc. 
diff --git a/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb b/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb index 096e80272..0892cec63 100644 --- a/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb +++ b/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb @@ -189,7 +189,17 @@ "id": "a3ce3b47-48ac-4a27-a5ed-351a10c104a9", "metadata": {}, "outputs": [], - "source": "# Get default AWQ config and optionally adjust block size\nquant_cfg = mtq.INT4_AWQ_CFG\nweight_quantizer = next(cfg for pat, cfg in quant_cfg[\"quant_cfg\"] if pat == \"*weight_quantizer\")\nif isinstance(weight_quantizer, list):\n weight_quantizer = weight_quantizer[0]\nweight_quantizer[\"block_sizes\"][-1] = 128 # Optional: override block size\n\n# Apply AWQ quantization\nmodel = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)" + "source": [ + "# Get default AWQ config and optionally adjust block size\n", + "quant_cfg = mtq.INT4_AWQ_CFG\n", + "weight_quantizer = next(cfg for pat, cfg in quant_cfg[\"quant_cfg\"] if pat == \"*weight_quantizer\")\n", + "if isinstance(weight_quantizer, list):\n", + " weight_quantizer = weight_quantizer[0]\n", + "weight_quantizer[\"block_sizes\"][-1] = 128 # Optional: override block size\n", + "\n", + "# Apply AWQ quantization\n", + "model = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)" + ] }, { "cell_type": "markdown", @@ -298,4 +308,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py index bd8f6f7aa..2de0aec5b 100644 --- a/tests/unit/torch/quantization/test_autoquant.py +++ b/tests/unit/torch/quantization/test_autoquant.py @@ -28,7 +28,7 @@ QuantRecipeHparam, estimate_quant_compression, ) -from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg +from modelopt.torch.quantization.config import _base_disable_all, _default_disabled_quantizer_cfg from 
modelopt.torch.utils.distributed import DistributedProcessGroup @@ -111,6 +111,7 @@ def test_quant_recipe_hparam(): # use this config to test custom quantization config INT8_CUSTOM_QUANT_TEST_CFG = { "quant_cfg": [ + *_base_disable_all, {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, *_default_disabled_quantizer_cfg, From 43f9a1a9e2a326a4c91bebb40d3a6c63173146a6 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 21:31:31 +0000 Subject: [PATCH 15/32] tuple format is not needed, remove all of them Signed-off-by: Shengliang Xu --- docs/source/guides/1_quantization.rst | 1 + docs/source/guides/_quant_cfg.rst | 206 ++++++++++++++++++ examples/llm_eval/quantization_utils.py | 11 +- modelopt/torch/quantization/config.py | 101 +++++---- modelopt/torch/quantization/model_quant.py | 7 +- .../torch/quantization/utils/core_utils.py | 12 +- .../torch/quantization/test_quantize_cuda.py | 12 +- .../plugins/test_attention_quant.py | 4 +- .../test_compute_quantization_mse.py | 4 +- .../torch/quantization/test_quantize_cpu.py | 46 ++-- .../quantization/test_tensor_quant_cpu.py | 10 +- 11 files changed, 332 insertions(+), 82 deletions(-) create mode 100644 docs/source/guides/_quant_cfg.rst diff --git a/docs/source/guides/1_quantization.rst b/docs/source/guides/1_quantization.rst index a838bfb10..38ce0956b 100644 --- a/docs/source/guides/1_quantization.rst +++ b/docs/source/guides/1_quantization.rst @@ -19,6 +19,7 @@ Below, you can find the documentation for the quantization toolkit in ModelOpt: ./_basic_quantization.rst ./_choosing_quant_methods.rst ./_pytorch_quantization.rst + ./_quant_cfg.rst ./_customized_model_quantization.rst ./_compress_quantized_models.rst ./_onnx_quantization.rst diff --git a/docs/source/guides/_quant_cfg.rst b/docs/source/guides/_quant_cfg.rst new file mode 100644 index 000000000..470cd9570 --- /dev/null +++ 
b/docs/source/guides/_quant_cfg.rst @@ -0,0 +1,206 @@ +.. _quant-cfg: + +====================================== +Quantization Configuration (quant_cfg) +====================================== + +The ``quant_cfg`` field is the primary mechanism for controlling which quantizers are active in a +model and how they are configured. This guide explains the format, ordering semantics, and common +patterns for composing quantization configurations. + +.. tip:: + + For the list of built-in configs and supported formats, see :any:`quantization-formats`. + For how to apply a config to a model, see :any:`_pytorch_quantization`. + +---------- + +Overview +======== + +A quantization config is a Python dictionary with two top-level keys: + +.. code-block:: python + + config = { + "quant_cfg": [...], # ordered list of QuantizerCfgEntry dicts + "algorithm": "max", # calibration algorithm + } + +The ``quant_cfg`` value is an **ordered list** of :class:`QuantizerCfgEntry +` dicts. Each entry targets a set of +quantizer modules in the model and specifies their configuration. + +---------- + +Entry Format +============ + +Each entry in the list is a dictionary with the following fields: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 65 + + * - Field + - Required + - Description + * - ``quantizer_path`` + - Yes + - Wildcard string matched against quantizer module names (e.g. ``"*weight_quantizer"``). + Uses :func:`fnmatch` rules. + * - ``parent_class`` + - No + - Restricts matching to quantizers whose immediate parent module is of this PyTorch class + (e.g. ``"nn.Linear"``). If omitted, all modules are targeted regardless of class. + * - ``cfg`` + - No + - A dict of quantizer attributes as defined by :class:`QuantizerAttributeConfig + `, or a list of such dicts + for sequential quantization (see :ref:`sequential-quantizers`). + * - ``enable`` + - No + - ``True`` or ``False``. Shorthand for enabling or disabling matched quantizers. 
When ``enable`` is omitted, the quantizer + is implicitly enabled. + +---------- + +Ordering and Precedence +======================= + +Entries are applied **in list order**. Later entries override earlier ones for any quantizer they +match. This gives a clear, composable precedence model: + +- Put broad rules (e.g. deny-all) **first**. +- Put format-specific enable rules **after**. +- Put fine-grained exclusions (specific layers, classes) **last**. + +The recommended pattern used by all built-in configs is: + +.. code-block:: python + + "quant_cfg": [ + # 1. Deny all quantizers by default + {"quantizer_path": "*", "enable": False}, + + # 2. Enable and configure the target quantizers + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, + + # 3. Apply standard exclusions last (BatchNorm, LM head, MoE routers, etc.) + *mtq.config._default_disabled_quantizer_cfg, + ] + +.. note:: + + The deny-all entry ``{"quantizer_path": "*", "enable": False}`` is available as + :data:`modelopt.torch.quantization.config._base_disable_all` and is prepended to every + built-in config. This ensures quantizers not explicitly targeted remain disabled. + +---------- + +Common Patterns +=============== + +Skipping Specific Layers +------------------------ + +Append a disable entry after the existing config to exclude layers matched by a path pattern. +Because it is appended last, it takes precedence over all earlier entries: + +.. 
code-block:: python + + import copy + import modelopt.torch.quantization as mtq + + config = copy.deepcopy(mtq.FP8_DEFAULT_CFG) + + # Skip the final projection layer + config["quant_cfg"].append({"quantizer_path": "*lm_head*", "enable": False}) + + model = mtq.quantize(model, config, forward_loop) + +Skipping Layers by Module Class +-------------------------------- + +Use ``parent_class`` to target quantizers only within a specific type of layer, leaving the +same quantizer path in other layer types unaffected: + +.. code-block:: python + + config["quant_cfg"].append({ + "quantizer_path": "*input_quantizer", + "parent_class": "nn.LayerNorm", + "enable": False, + }) + +Overriding Quantizer Precision for Specific Layers +--------------------------------------------------- + +A later entry with a matching ``quantizer_path`` replaces the configuration set by an earlier +entry. This allows per-layer precision overrides without restructuring the entire config: + +.. code-block:: python + + config = copy.deepcopy(mtq.FP8_DEFAULT_CFG) + + # Quantize attention output projections in higher-precision INT8 instead of FP8 + config["quant_cfg"].append({ + "quantizer_path": "*o_proj*weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }) + +Building a Config from Scratch +------------------------------- + +For entirely custom recipes, compose the list directly: + +.. code-block:: python + + from modelopt.torch.quantization.config import _base_disable_all, _default_disabled_quantizer_cfg + + MY_CUSTOM_CFG = { + "quant_cfg": [ + *_base_disable_all, + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 4, "block_sizes": {-1: 128}}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, + *_default_disabled_quantizer_cfg, + ], + "algorithm": "max", + } + + model = mtq.quantize(model, MY_CUSTOM_CFG, forward_loop) + +---------- + +.. 
_sequential-quantizers: + +Sequential Quantization +======================= + +When ``cfg`` is a **list** of attribute dicts, the matched +:class:`TensorQuantizer ` +is replaced with a +:class:`SequentialQuantizer ` +that applies each format in sequence. This is used, for example, in W4A8 quantization where weights +are quantized first in INT4 and then in FP8: + +.. code-block:: python + + { + "quantizer_path": "*weight_quantizer", + "cfg": [ + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, + {"num_bits": (4, 3), "enable": True}, # FP8 + ], + } + +---------- + +Reference +========= + +- :class:`QuantizerCfgEntry ` +- :class:`QuantizerAttributeConfig ` +- :class:`QuantizeConfig ` +- :func:`set_quantizer_by_cfg ` diff --git a/examples/llm_eval/quantization_utils.py b/examples/llm_eval/quantization_utils.py index 3016885f4..466f65ced 100644 --- a/examples/llm_eval/quantization_utils.py +++ b/examples/llm_eval/quantization_utils.py @@ -35,8 +35,15 @@ "MY_QUANT_CONFIG": { "quant_cfg": [ *mtq.config._base_disable_all, - ("*weight_quantizer", {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}), - ("*input_quantizer", {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}), + { + "quantizer_path": "*weight_quantizer", + "cfg": {"num_bits": 4, "block_sizes": {-1: 128}}, + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "type": "dynamic", "block_sizes": {-1: None}}, + }, # Disable sensitive layers such as `lm_head`, gate layers in MoE etc. *mtq.config._default_disabled_quantizer_cfg, ], diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 0591e6ea6..3e3828d31 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -50,40 +50,51 @@ Quantization Configs ================================ -Quantization config is dictionary specifying the values for keys ``"quant_cfg"`` and -``"algorithm"``. 
The ``"quant_cfg"`` key specifies the quantization configurations. The -``"algorithm"`` key specifies the ``algorithm`` argument to -:meth:`calibrate `. Please see :class:`QuantizeConfig` -for the quantization config definition. - -'Quantization configurations' is a dictionary mapping wildcards or filter functions -to its 'quantizer attributes'. The wildcards or filter functions are matched -against the quantizer module names. The quantizer modules have names ending with -``weight_quantizer`` and ``input_quantizer`` and they perform weight quantization and -input quantization (or activation quantization) respectively. The quantizer modules are generally -instances of -:class:`TensorQuantizer `. -The quantizer attributes are defined by :class:`QuantizerAttributeConfig`. See :class:`QuantizerAttributeConfig` -for details on the quantizer attributes and their values. - -Use `"*"` as the first entry in the quantization configuration list to set a catch-all default -that applies to all quantizers not matched by a later, more specific entry. - -The quantizer attributes are applied in the order they are specified. For the missing attributes, the default attributes -as defined by :class:`QuantizerAttributeConfig` are used. - -Quantizer attributes can also be a list of dictionaries. In this case, the matched quantizer module -is replaced with a -:class:`SequentialQuantizer ` -module which is used to quantize a tensor in multiple formats sequentially. Each quantizer attribute -dictionary in the list specifies the quantization formats for each quantization step of the -sequential quantizer. For example, `SequentialQuantizer` is used in 'INT4 Weights, FP8 Activations' -quantization in which the weights are quantized in INT4 followed by FP8. - -In addition, the dictionary entries could also be pytorch module class names mapping the class specific -quantization configurations. The pytorch modules should have a quantized equivalent. 
- -To get the string representation of a module class, do: +Quantization config is a dictionary with two top-level keys: + +- ``"quant_cfg"``: an ordered list of :class:`QuantizerCfgEntry` dicts that specify which + quantizers to configure and how. +- ``"algorithm"``: the calibration algorithm passed to + :meth:`calibrate `. + +Please see :class:`QuantizeConfig` for the full config schema. + +``quant_cfg`` — Entry Format +----------------------------- + +Each entry in the ``quant_cfg`` list is a :class:`QuantizerCfgEntry` with the following fields: + +- ``quantizer_path`` *(required)*: a wildcard string matched against quantizer module names. + Quantizer modules are instances of + :class:`TensorQuantizer ` + and have names ending with ``weight_quantizer``, ``input_quantizer``, etc. +- ``parent_class`` *(optional)*: restricts matching to quantizers whose immediate parent module is + of this PyTorch class (e.g. ``"nn.Linear"``). If omitted, all matching quantizers are targeted + regardless of their parent class. +- ``cfg`` *(optional)*: a dict of quantizer attributes as defined by + :class:`QuantizerAttributeConfig`, or a list of such dicts. When a list is given, the matched + :class:`TensorQuantizer ` + is replaced with a + :class:`SequentialQuantizer ` + that applies each format in sequence. This is used for example in W4A8 quantization where weights + are quantized first in INT4 and then in FP8. +- ``enable`` *(optional)*: shorthand to enable or disable matched quantizers without specifying a + full ``cfg``. When ``cfg`` is present but ``enable`` is absent, the quantizer is implicitly + enabled. + +``quant_cfg`` — Ordering and Precedence +----------------------------------------- + +Entries are applied **in list order**; later entries override earlier ones for any quantizer they +match. The recommended pattern is: + +1. 
Start with a deny-all entry ``{"quantizer_path": "*", "enable": False}`` (provided as + :data:`_base_disable_all`) to disable every quantizer by default. +2. Follow with format-specific entries that selectively enable and configure the desired quantizers. +3. Append :data:`_default_disabled_quantizer_cfg` to enforce standard exclusions (e.g. BatchNorm + layers, LM head, MoE routers). + +To get the string representation of a module class for use in ``parent_class``, do: .. code-block:: @@ -98,12 +109,15 @@ MY_QUANT_CFG = { "quant_cfg": [ - # Quantizer wildcard strings mapping to quantizer attributes - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), + # Deny all quantizers by default + {"quantizer_path": "*", "enable": False}, - # Module class names mapping to quantizer configurations - ("nn.LeakyReLU", {"*input_quantizer": {"enable": False}}), + # Enable and configure weight and input quantizers + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, + + # Disable input quantizers specifically for LeakyReLU layers + {"quantizer_path": "*input_quantizer", "parent_class": "nn.LeakyReLU", "enable": False}, ] } @@ -128,7 +142,7 @@ # Create custom config CUSTOM_INT4_AWQ_CFG = copy.deepcopy(mtq.INT4_AWQ_CFG) - CUSTOM_INT4_AWQ_CFG["quant_cfg"].append(("*lm_head*", {"enable": False})) + CUSTOM_INT4_AWQ_CFG["quant_cfg"].append({"quantizer_path": "*lm_head*", "enable": False}) # quantize model model = mtq.quantize(model, CUSTOM_INT4_AWQ_CFG, forward_loop) @@ -1509,10 +1523,9 @@ def normalize_quant_cfg_list(v: list) -> list[QuantizerCfgEntry]: Supports these input forms per entry: - ``{"quantizer_path": ..., "enable": ..., "cfg": ...}`` — passed through as-is - ``{"": ...}`` — single-key dict (legacy) - - ``(quantizer_path, cfg_dict)`` — tuple form (legacy) """ - def _tuple_to_entry(key: str, value) -> 
QuantizerCfgEntry: + def _dict_to_entry(key: str, value) -> QuantizerCfgEntry: if isinstance(key, str) and key.startswith("nn."): assert isinstance(value, dict) and len(value) == 1 q_path, sub_cfg = next(iter(value.items())) @@ -1544,9 +1557,7 @@ def _tuple_to_entry(key: str, value) -> QuantizerCfgEntry: result.append(cast("QuantizerCfgEntry", raw)) elif isinstance(raw, dict) and len(raw) == 1: key, val = next(iter(raw.items())) - result.append(_tuple_to_entry(key, val)) - elif isinstance(raw, (tuple, list)) and len(raw) == 2: - result.append(_tuple_to_entry(raw[0], raw[1])) + result.append(_dict_to_entry(key, val)) else: raise ValueError(f"Invalid quant_cfg entry: {raw!r}.") return result diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 07b350e19..13415a16e 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -322,8 +322,11 @@ def auto_quantize( INT8_CUSTOM_QUANT_CFG = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, ], "algorithm": "smoothquant", } diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index e7e50aa83..54f146072 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -310,11 +310,15 @@ def calibrate_with_adapters(model, args): def disable_lora_quantizers_in_config(config, layers): """Turns off input, weight, and output quantizers for LoRA weights and LoRALinear layers in config.""" - config["quant_cfg"].append(("*lora*", {"enable": False})) + config["quant_cfg"].append({"quantizer_path": "*lora*", "enable": False}) for layer in layers: - 
config["quant_cfg"].append((f"*{layer}.input_quantizer", {"enable": False})) - config["quant_cfg"].append((f"*{layer}.weight_quantizer", {"enable": False})) - config["quant_cfg"].append((f"*{layer}.output_quantizer", {"enable": False})) + config["quant_cfg"].append({"quantizer_path": f"*{layer}.input_quantizer", "enable": False}) + config["quant_cfg"].append( + {"quantizer_path": f"*{layer}.weight_quantizer", "enable": False} + ) + config["quant_cfg"].append( + {"quantizer_path": f"*{layer}.output_quantizer", "enable": False} + ) return config diff --git a/tests/gpu/torch/quantization/test_quantize_cuda.py b/tests/gpu/torch/quantization/test_quantize_cuda.py index 097b28a48..c97086d63 100644 --- a/tests/gpu/torch/quantization/test_quantize_cuda.py +++ b/tests/gpu/torch/quantization/test_quantize_cuda.py @@ -59,16 +59,16 @@ NVFP4_WEIGHT_MSE_FP8_SWEEP_CFG = { "quant_cfg": [ - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ("*input_quantizer", {"enable": False}), + "enable": True, + }, + {"quantizer_path": "*input_quantizer", "enable": False}, ], "algorithm": { "method": "mse", diff --git a/tests/unit/torch/quantization/plugins/test_attention_quant.py b/tests/unit/torch/quantization/plugins/test_attention_quant.py index 560533eaf..302e39496 100644 --- a/tests/unit/torch/quantization/plugins/test_attention_quant.py +++ b/tests/unit/torch/quantization/plugins/test_attention_quant.py @@ -62,8 +62,8 @@ def forward(self, hidden_states, **kwargs): kv_cache_config = { "quant_cfg": [ - ("*[kv]_bmm_quantizer", {"num_bits": 4, "enable": True}), - ("*softmax_quantizer", {"enable": False}), + {"quantizer_path": "*[kv]_bmm_quantizer", "cfg": {"num_bits": 4}, "enable": True}, + {"quantizer_path": "*softmax_quantizer", "enable": False}, ], "algorithm": "max", } diff --git 
a/tests/unit/torch/quantization/test_compute_quantization_mse.py b/tests/unit/torch/quantization/test_compute_quantization_mse.py index 3c28a42e1..26aa7144a 100644 --- a/tests/unit/torch/quantization/test_compute_quantization_mse.py +++ b/tests/unit/torch/quantization/test_compute_quantization_mse.py @@ -23,8 +23,8 @@ INT8_CFG = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_quantize_cpu.py b/tests/unit/torch/quantization/test_quantize_cpu.py index de12fc7f3..d5100ed02 100644 --- a/tests/unit/torch/quantization/test_quantize_cpu.py +++ b/tests/unit/torch/quantization/test_quantize_cpu.py @@ -36,14 +36,18 @@ # A test config with double-quant (using `SequentialQuantizers`) WINT4INT8_CFG = { "quant_cfg": [ - ( - "*weight_quantizer", - [ + { + "quantizer_path": "*weight_quantizer", + "cfg": [ {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, {"num_bits": 8, "axis": 0, "enable": True}, ], - ), - ("*input_quantizer", {"num_bits": 8, "axis": None, "enable": True}), + }, + { + "quantizer_path": "*input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + "enable": True, + }, ], "algorithm": "awq_lite", } @@ -51,8 +55,8 @@ # Test configs for per channel MSE calibration INT8_MSE_CFG = { "quant_cfg": [ - ("*weight_quantizer", {"num_bits": 8, "axis": 0}), - ("*input_quantizer", {"num_bits": 8, "axis": None}), + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, ], "algorithm": "mse", } @@ -80,15 +84,15 @@ def compute_amax(self): quant_cfg_custom_calib = { "quant_cfg": [ - ( - "*", - { + { + "quantizer_path": "*", + "cfg": { "num_bits": 4, 
"axis": None, - "enable": True, "calibrator": (NewMaxCalibrator, (4, None, False)), }, - ) + "enable": True, + } ], "algorithm": "max", } @@ -178,10 +182,20 @@ def test_class_wise_config(): model = SimpleConvLinear() config = { "quant_cfg": [ - ("nn.Linear", {"*": {"num_bits": 4, "axis": -1, "enable": True}}), - ("nn.Conv2d", {"*": {"num_bits": 8, "enable": True}}), - ("nn.BatchNorm2d", {"*": {"enable": False}}), - ("*output_quantizer", {"num_bits": 8, "enable": True}), + { + "parent_class": "nn.Linear", + "quantizer_path": "*", + "cfg": {"num_bits": 4, "axis": -1}, + "enable": True, + }, + { + "parent_class": "nn.Conv2d", + "quantizer_path": "*", + "cfg": {"num_bits": 8}, + "enable": True, + }, + {"parent_class": "nn.BatchNorm2d", "quantizer_path": "*", "enable": False}, + {"quantizer_path": "*output_quantizer", "cfg": {"num_bits": 8}, "enable": True}, ], "algorithm": "max", } diff --git a/tests/unit/torch/quantization/test_tensor_quant_cpu.py b/tests/unit/torch/quantization/test_tensor_quant_cpu.py index 918f614f9..a0720a046 100644 --- a/tests/unit/torch/quantization/test_tensor_quant_cpu.py +++ b/tests/unit/torch/quantization/test_tensor_quant_cpu.py @@ -112,10 +112,14 @@ def test_set_quantizer_cxt(): state_dict = model.state_dict() output_ref = model(inputs) - mtq.set_quantizer_by_cfg(model, [("*output_quantizer", {"enable": True})]) + mtq.set_quantizer_by_cfg(model, [{"quantizer_path": "*output_quantizer", "enable": True}]) with mtq.set_quantizer_by_cfg_context( - model, [("*", {"enable": False}), ("*output_quantizer", {"enable": True})] + model, + [ + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*output_quantizer", "enable": True}, + ], ): for name, module in model.named_modules(): if not isinstance(module, TensorQuantizer): @@ -126,7 +130,7 @@ def test_set_quantizer_cxt(): assert not module.is_enabled mtq.calibrate(model, "max", lambda model: model(inputs * 10)) - mtq.set_quantizer_by_cfg(model, [("*output_quantizer", {"enable": False})]) 
+ mtq.set_quantizer_by_cfg(model, [{"quantizer_path": "*output_quantizer", "enable": False}]) output_test = model(inputs) assert torch.allclose(output_ref, output_test) From 45490016873713fd45f1ac11f83fdc372bc5b0a5 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 21:36:34 +0000 Subject: [PATCH 16/32] final remove tuple format Signed-off-by: Shengliang Xu --- .../torch/quantization/test_quantize_cuda.py | 24 ++++++++++--------- tests/unit/recipe/test_loader.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/gpu/torch/quantization/test_quantize_cuda.py b/tests/gpu/torch/quantization/test_quantize_cuda.py index c97086d63..984aa5b2b 100644 --- a/tests/gpu/torch/quantization/test_quantize_cuda.py +++ b/tests/gpu/torch/quantization/test_quantize_cuda.py @@ -30,24 +30,24 @@ NVFP4_WEIGHT_ACT_MSE_CFG = { "quant_cfg": [ - ( - "*weight_quantizer", - { + { + "quantizer_path": "*weight_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), - ( - "*input_quantizer", - { + "enable": True, + }, + { + "quantizer_path": "*input_quantizer", + "cfg": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "axis": None, - "enable": True, }, - ), + "enable": True, + }, ], "algorithm": { "method": "mse", @@ -130,7 +130,9 @@ def test_quantize(model_cls, config): if config == mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG: # reduce block sizes for simple testing models - config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 8, -2: 8} + for entry in config["quant_cfg"]: + if entry.get("quantizer_path") == "*weight_quantizer": + entry.setdefault("cfg", {})["block_sizes"] = {-1: 8, -2: 8} model = model_cls().cuda() calib_data = [model.get_input().cuda() for _ in range(8)] quantize_model_and_forward(model, config, calib_data) diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index 
bf660eafd..f48695382 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -168,7 +168,7 @@ def test_load_recipe_dir(tmp_path): recipe = load_recipe(tmp_path) assert recipe.recipe_type == RecipeType.PTQ assert recipe.description == "Dir test." - assert recipe.ptq_cfg == {"algorithm": "max", "quant_cfg": {}} + assert recipe.ptq_cfg == {"algorithm": "max", "quant_cfg": []} def test_load_recipe_dir_missing_recipe_raises(tmp_path): From 30bb04185ad593fe88d601ee154e892ae9f6feaa Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 22:19:03 +0000 Subject: [PATCH 17/32] add atomicity to doc Signed-off-by: Shengliang Xu --- docs/source/guides/_quant_cfg.rst | 46 +++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/docs/source/guides/_quant_cfg.rst b/docs/source/guides/_quant_cfg.rst index 470cd9570..2afd7b70e 100644 --- a/docs/source/guides/_quant_cfg.rst +++ b/docs/source/guides/_quant_cfg.rst @@ -99,6 +99,52 @@ The recommended pattern used by all built-in configs is: ---------- +Entry Atomicity +=============== + +Each entry in ``quant_cfg`` is a **complete, self-contained configuration unit**. When an entry +matches a quantizer, it **completely replaces** that quantizer's configuration — it does not merge +with or incrementally update settings left by earlier entries. + +Concretely, if an entry specifies only a subset of quantizer attributes (e.g. only ``num_bits``), +all unspecified attributes are filled in with their default values from +:class:`QuantizerAttributeConfig `. +The resulting *complete* config is then written to the quantizer, discarding whatever any prior +matching entry had set. + +This means: + +- **Last entry wins, fully.** If two entries both match ``*weight_quantizer``, the second entry + does not inherit the first entry's settings — it replaces them entirely. 
+- **No hidden state accumulation.** The final configuration of a quantizer depends only on the + *last* entry in the list that matched it, making behavior easy to reason about. +- **Changing one field requires a full spec.** Because each entry is a complete replacement, to + change only one attribute of a quantizer that was already configured, you must reproduce the + full desired config in the new entry. Any attribute omitted from the entry will revert to its + default, not to the value set by an earlier entry. + +For example, given the following two entries both matching ``*weight_quantizer``: + +.. code-block:: python + + # Entry 1 — sets FP8 per-channel + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": 0}}, + + # Entry 2 — sets INT4 blockwise (axis is NOT inherited from Entry 1) + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 4, "block_sizes": {-1: 128}}}, + +After Entry 2 is applied, the quantizer has ``num_bits=4``, ``block_sizes={-1: 128}``, and +``axis=None`` (the default). The ``axis=0`` set by Entry 1 is gone. + +.. note:: + + This atomicity property is what makes the deny-all-then-re-enable pattern safe and + predictable: the deny-all entry (``{"quantizer_path": "*", "enable": False}``) completely + resets every quantizer, and subsequent entries each independently configure their targets from a + clean default state. 
+ +---------- + Common Patterns =============== From ff9fdd9f3856d16d113e3a82d995890386b8823a Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Fri, 20 Mar 2026 23:01:48 +0000 Subject: [PATCH 18/32] fix more quant_cfg args Signed-off-by: Shengliang Xu --- .../notebooks/3_PTQ_AutoQuantization.ipynb | 6 ++++-- tests/gpu/torch/quantization/test_hadamard.py | 18 +++++++----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb b/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb index 122569489..9634c615d 100644 --- a/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb +++ b/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb @@ -288,7 +288,9 @@ " mtq.set_quantizer_by_cfg(model, quant_cfg=kv_cfg)\n", "\n", " # Calibrate **only** those quantizers\n", - " with mtq.set_quantizer_by_cfg_context(model, {\"*\": {\"enable\": False}, **kv_cfg}):\n", + " with mtq.set_quantizer_by_cfg_context(\n", + " model, [{\"quantizer_path\": \"*\", \"enable\": False}, *kv_cfg]\n", + " ):\n", " mtq.calibrate(model, algorithm=\"max\", forward_loop=forward_loop)\n", "else:\n", " print(\"KV cache left unquantized.\")" @@ -427,4 +429,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/tests/gpu/torch/quantization/test_hadamard.py b/tests/gpu/torch/quantization/test_hadamard.py index 93d3e8ccb..430d7ddf6 100644 --- a/tests/gpu/torch/quantization/test_hadamard.py +++ b/tests/gpu/torch/quantization/test_hadamard.py @@ -77,7 +77,7 @@ def test_kv_rotate(rotate_fp32): model = nn.Sequential(SDPAAttention()) mtq.replace_quant_module(model) - set_quantizer_by_cfg(model, {"*": {"enable": False}}) + set_quantizer_by_cfg(model, [{"quantizer_path": "*", "enable": False}]) dummy_input = SDPAAttention.get_input(device="cuda") output_ref = model(dummy_input) if rotate_fp32: @@ -86,11 +86,9 @@ def test_kv_rotate(rotate_fp32): rotate = True with set_quantizer_by_cfg_context( model, - { - 
"*[qk]_bmm_quantizer": { - "rotate": rotate, - }, - }, + [ + {"quantizer_path": "*[qk]_bmm_quantizer", "cfg": {"rotate": rotate}}, + ], ): output_test = model(dummy_input) assert torch.allclose(output_ref, output_test, atol=0.05) @@ -98,11 +96,9 @@ def test_kv_rotate(rotate_fp32): # Test the rotation is actually applied by turning on only one of the query, key quantizers with set_quantizer_by_cfg_context( model, - { - "*k_bmm_quantizer": { - "rotate": rotate, - }, - }, + [ + {"quantizer_path": "*k_bmm_quantizer", "cfg": {"rotate": rotate}}, + ], ): output_test1 = model(dummy_input) assert not torch.allclose(output_ref, output_test1, atol=0.05) From a164f13373613d6c8f9986b2491379b8d249e99f Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sat, 21 Mar 2026 01:18:37 +0000 Subject: [PATCH 19/32] distinguish set_quantizer_attributes_full and set_quantizer_attributes_partial set_quantizer_attributes_full updates the full quantizer attributes, it has the atomic semantic set_quantizer_attributes_partial updates just a partial set of quantizer attributes, it has the merge semantic Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/compress.py | 6 +- modelopt/torch/quantization/conversion.py | 202 +++++++++++++----- modelopt/torch/quantization/model_quant.py | 10 +- .../nn/modules/tensor_quantizer.py | 9 +- .../sparsity/attention_sparsity/conversion.py | 2 +- .../torch/quantization/test_quant_rnn_cuda.py | 4 +- .../torch/quantization/plugins/test_apex.py | 6 +- .../quantization/plugins/test_megatron.py | 6 +- .../quantization/plugins/test_huggingface.py | 8 +- .../torch/quantization/plugins/test_peft.py | 2 +- .../quantization/test_quant_activations.py | 4 +- .../quantization/test_quant_batchnorm.py | 5 +- .../unit/torch/quantization/test_quant_rnn.py | 17 +- .../quantization/test_quantize_replace.py | 2 +- 14 files changed, 184 insertions(+), 99 deletions(-) diff --git a/modelopt/torch/quantization/compress.py b/modelopt/torch/quantization/compress.py index 
5477d0b61..2a5cbbee9 100644 --- a/modelopt/torch/quantization/compress.py +++ b/modelopt/torch/quantization/compress.py @@ -30,7 +30,7 @@ from .backends.gemm_registry import disable_real_quant_gemm, enable_real_quant_gemm from .config import CompressCfgType, CompressConfig -from .conversion import _replace_quant_module, set_quantizer_attribute +from .conversion import _replace_quant_module, set_quantizer_attributes_partial from .nn.modules.quant_linear import RealQuantLinear from .qtensor import QTensorWrapper, pack_real_quantize_weight from .utils import is_quantized_linear @@ -87,7 +87,7 @@ def compress_convert( compress_cfg = config.compress if "default" in compress_cfg and isinstance(compress_cfg["default"], bool): - set_quantizer_attribute( + set_quantizer_attributes_partial( model, "*weight_quantizer*", {"fake_quant": not compress_cfg["default"]} ) @@ -99,7 +99,7 @@ def compress_convert( def filter_func(name): return fnmatch.fnmatch(name, pattern) and "weight_quantizer" in name - set_quantizer_attribute(model, filter_func, {"fake_quant": not to_compress}) + set_quantizer_attributes_partial(model, filter_func, {"fake_quant": not to_compress}) else: raise ValueError( f"Invalid compression configuration: {to_compress}, expected a boolean as value." 
diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 4f0b99e87..dc5a6ece7 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -19,7 +19,7 @@ import warnings from collections.abc import Callable from contextlib import contextmanager -from typing import Any +from typing import Any, cast import torch.nn as nn @@ -48,7 +48,8 @@ __all__ = [ "register", "replace_quant_module", - "set_quantizer_attribute", + "set_quantizer_attributes_full", + "set_quantizer_attributes_partial", "set_quantizer_by_cfg", "set_quantizer_by_cfg_context", "unregister", @@ -225,89 +226,172 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType In addition, entries with a ``parent_class`` field filter by the pytorch module class, which must have a quantized equivalent. - See :meth:`set_quantizer_attribute ` + See :meth:`set_quantizer_attributes_full ` for more details. """ quant_cfg = normalize_quant_cfg_list(quant_cfg) for entry in quant_cfg: - entry_cfg = entry.get("cfg", {}) - enable = entry.get("enable") - if isinstance(entry_cfg, dict): - if enable is not None: - entry_cfg["enable"] = enable - elif entry_cfg: - # cfg present without explicit enable → implicitly enable the quantizer - entry_cfg = {**entry_cfg, "enable": True} quantizer_path: str = entry["quantizer_path"] parent_class_name = entry.get("parent_class") if parent_class_name is not None: parent_class = QuantModuleRegistry[parent_class_name] - set_quantizer_attribute(quant_model, quantizer_path, entry_cfg, parent_class) else: - set_quantizer_attribute(quant_model, quantizer_path, entry_cfg) + parent_class = None + + cfg = entry.get("cfg", {}) + enable = entry.get("enable", True) + if isinstance(cfg, dict): + attributes = QuantizerAttributeConfig(**cfg, enable=enable) + else: + attributes = [QuantizerAttributeConfig(**c, enable=enable) for c in cfg] + set_quantizer_attributes_full(quant_model, 
quantizer_path, attributes, parent_class) + + +def _match_quantizer( + wildcard_or_filter_func: str | Callable, + name: str, + module: nn.Module, + parent_class: type[nn.Module] | None, + full_model: nn.Module, +): + if not isinstance(module, (TensorQuantizer, SequentialQuantizer)): + return False + if isinstance(wildcard_or_filter_func, str): + if not fnmatch.fnmatch(name, wildcard_or_filter_func): + return False + elif callable(wildcard_or_filter_func): + if not wildcard_or_filter_func(name): + return False + else: + raise NotImplementedError(f"Unsupported type {type(wildcard_or_filter_func)}") + + return parent_class is None or isinstance( + full_model.get_submodule(".".join(name.split(".")[:-1])), parent_class + ) -def set_quantizer_attribute( +def set_quantizer_attributes_full( quant_model: nn.Module, wildcard_or_filter_func: str | Callable, - attribute: QuantizerAttributeConfig - | list[QuantizerAttributeConfig] - | dict[ - str | Callable, - QuantizerAttributeConfig | list[QuantizerAttributeConfig], - ] - | dict - | list[dict], - parent_class: type | None = None, + attributes: QuantizerAttributeConfig | list[QuantizerAttributeConfig], + parent_class: type[nn.Module] | None = None, ): - """Finegrained adjustment of quantizer attribute by wildcard or filter function. + """Set quantizer attributes by wildcard or filter function, fully overwriting existing attributes. + + Unlike :func:`set_quantizer_attributes_partial`, this function requires a complete + :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>` and **replaces** the + matched quantizer's attributes entirely rather than merging with existing ones. Args: - quant_model: A pytorch model - wildcard_or_filter_func: a wildcard string or a filter function. The wildcard string is matched - against the quantizer module names. The quantizer modules are - instances of + quant_model: A pytorch model. + wildcard_or_filter_func: A wildcard string or a filter function. 
The wildcard string is + matched against the quantizer module names. The quantizer modules are instances of :class:`TensorQuantizer `. - The filter function takes a quantized module name as input and returns ``True`` if the + The filter function takes a quantizer module name as input and returns ``True`` if the quantizer should be adjusted and ``False`` otherwise. - attribute: An instance of :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>` or an equivalent - dictionary or a list of these two types. - If ``attribute`` is a list, the matched + attributes: A :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>` (or a + list of them) that **fully replaces** the matched quantizer's current attributes. All + fields of the config are applied — unspecified fields revert to their defaults. + If ``attributes`` is a list, the matched :class:`TensorQuantizer ` - modules will be replaced with :class:`SequentialQuantizer ` - modules having one quantizer for each attribute instance from the list. + modules will be replaced with + :class:`SequentialQuantizer ` + modules having one quantizer per attribute instance in the list. See :meth:`set_from_attribute_config() ` - for more details on the supported attributes and their types. - parent_class: (Optional) The parent class of the quantizer modules matching ``wildcard_or_filter_func`` which - should be adjusted. If ``None``, all the matching quantizer modules will be adjusted. + for details on supported attributes and their types. + parent_class: (Optional) Restrict matching to quantizers whose immediate parent module is + an instance of this class. If ``None``, all quantizers matching + ``wildcard_or_filter_func`` are adjusted. """ + if not isinstance(attributes, (QuantizerAttributeConfig, list)): + raise ValueError( + f"Invalid type for attributes: {type(attributes)}, " + "expected QuantizerAttributeConfig or list of QuantizerAttributeConfig." 
+ ) + if isinstance(attributes, list) and not all( + isinstance(attr, QuantizerAttributeConfig) for attr in attributes + ): + raise ValueError( + "All elements in attributes list must be of type QuantizerAttributeConfig." + ) for name, module in quant_model.named_modules(): - if isinstance(module, (TensorQuantizer, SequentialQuantizer)): - if isinstance(wildcard_or_filter_func, str): - if not fnmatch.fnmatch(name, wildcard_or_filter_func): - continue - elif callable(wildcard_or_filter_func): - if not wildcard_or_filter_func(name): - continue + if _match_quantizer(wildcard_or_filter_func, name, module, parent_class, quant_model): + if isinstance(attributes, list): + if not isinstance(module, SequentialQuantizer): + parent_module = quant_model.get_submodule(name.rpartition(".")[0]) + module = SequentialQuantizer( + *(TensorQuantizer() for _ in range(len(attributes))) + ) + setattr(parent_module, name.split(".")[-1], module) + elif len(attributes) != len(module): + warnings.warn( + f"The number of attributes ({len(attributes)}) does not match the number of " + f"quantizers of {module} leading to partial assignment.", + ) + module.set_from_attribute_config(attributes) else: - raise NotImplementedError(f"Unsupported type {type(wildcard_or_filter_func)}") + cast("TensorQuantizer", module).set_from_attribute_config(attributes) - if parent_class is not None and not isinstance( - quant_model.get_submodule(".".join(name.split(".")[:-1])), parent_class - ): - continue - - if isinstance(attribute, list) and not isinstance(module, SequentialQuantizer): - parent_module = quant_model.get_submodule(name.rpartition(".")[0]) - module = SequentialQuantizer(*(TensorQuantizer() for _ in range(len(attribute)))) - setattr(parent_module, name.split(".")[-1], module) - elif isinstance(attribute, list) and len(attribute) != len(module): - warnings.warn( - f"The number of attributes ({len(attribute)}) does not match the number of " - f"quantizers of {module} leading to partial assignment.", 
+ +def set_quantizer_attributes_partial( + quant_model: nn.Module, + wildcard_or_filter_func: str | Callable, + partial_attributes: dict[str, Any] | list[dict[str, Any]], + parent_class: type[nn.Module] | None = None, +): + """Update a subset of quantizer attributes by wildcard or filter function, merging with existing attributes. + + Unlike :func:`set_quantizer_attributes_full`, this function accepts an arbitrary subset of + quantizer attributes as a plain ``dict`` and **merges** them into the matched quantizer's + current attributes, leaving unspecified attributes unchanged. + + Args: + quant_model: A pytorch model. + wildcard_or_filter_func: A wildcard string or a filter function. The wildcard string is + matched against the quantizer module names. The quantizer modules are instances of + :class:`TensorQuantizer `. + The filter function takes a quantizer module name as input and returns ``True`` if the + quantizer should be adjusted and ``False`` otherwise. + partial_attributes: A ``dict`` (or a list of ``dict``) containing only the attributes to + update. Keys must be valid fields of + :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. Only the + specified keys are written; all other attributes on the quantizer remain unchanged. + When a ``list`` is passed, the matched module must already be a + :class:`SequentialQuantizer ` — + unlike :func:`set_quantizer_attributes_full`, this function will **not** replace a + :class:`TensorQuantizer ` with a + ``SequentialQuantizer``. + See + :meth:`set_from_attribute_config() ` + for details on supported attributes and their types. + parent_class: (Optional) Restrict matching to quantizers whose immediate parent module is + an instance of this class. If ``None``, all quantizers matching + ``wildcard_or_filter_func`` are adjusted. 
+ """ + if not isinstance(partial_attributes, (dict, list)): + raise ValueError( + f"Invalid type for attributes: {type(partial_attributes)}, expected dictionary or list of dict." + ) + if isinstance(partial_attributes, list) and not all( + isinstance(attr, dict) for attr in partial_attributes + ): + raise ValueError("All elements in attributes list must be of type dict.") + + for name, module in quant_model.named_modules(): + if _match_quantizer(wildcard_or_filter_func, name, module, parent_class, quant_model): + module = cast("TensorQuantizer | SequentialQuantizer", module) # for type checker + if isinstance(partial_attributes, list) and not isinstance(module, SequentialQuantizer): + raise ValueError(f"Attributes is a list but {module} is not a SequentialQuantizer.") + if isinstance(partial_attributes, dict) and not isinstance(module, TensorQuantizer): + raise ValueError( + f"Attributes is a dictionary but {module} is not a TensorQuantizer." ) - module.set_from_attribute_config(attribute) + + if isinstance(partial_attributes, list): + cast("SequentialQuantizer", module).set_from_attribute_config(partial_attributes) + else: + cast("TensorQuantizer", module).set_from_attribute_config(partial_attributes) @contextmanager diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 13415a16e..1d0314185 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -30,13 +30,15 @@ from modelopt.torch.opt.searcher import ForwardLoop from modelopt.torch.opt.utils import forward_with_reshard from modelopt.torch.quantization.config import QuantizeConfig -from modelopt.torch.quantization.conversion import set_quantizer_by_cfg +from modelopt.torch.quantization.conversion import ( + set_quantizer_attributes_partial, + set_quantizer_by_cfg, +) from modelopt.torch.utils import atomic_print from .algorithms import AutoQuantizeGradientSearcher, AutoQuantizeKLDivSearcher, QuantRecipe from 
.algorithms import get_auto_quantize_config as _get_auto_quantize_config from .config import QuantizeAlgoCfgType -from .conversion import set_quantizer_attribute from .mode import QuantizeModeRegistry, get_modelike_from_algo_cfg from .nn import QuantModule, TensorQuantizer from .utils import is_quantized @@ -575,12 +577,12 @@ def get_auto_quantize_config(search_state, constraints=None, verbose=False): def disable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable): """Disable quantizer by wildcard or filter function.""" - set_quantizer_attribute(model, wildcard_or_filter_func, {"enable": False}) + set_quantizer_attributes_partial(model, wildcard_or_filter_func, {"enable": False}) def enable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable): """Enable quantizer by wildcard or filter function.""" - set_quantizer_attribute(model, wildcard_or_filter_func, {"enable": True}) + set_quantizer_attributes_partial(model, wildcard_or_filter_func, {"enable": True}) @atomic_print diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 14d6a97f8..3ff7401ec 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -203,8 +203,8 @@ def __init__( # Optional quantizer cache for caching quantizer related encoding or tensors. self._quantizer_cache = None - def set_from_attribute_config(self, attribute_cfg: QuantizerAttributeConfig | dict): - """Set quantizer attributes from attribute_dict. + def set_from_attribute_config(self, attribute_cfg: QuantizerAttributeConfig | dict[str, Any]): + """Set quantizer attributes from attribute_cfg. The attributes are defined in :class:`QuantizerAttributeConfig `. 
@@ -1423,10 +1423,7 @@ def get_modelopt_state(self) -> dict[str, Any]: return {"num_quantizers": len(self), "is_sequential_quantizer": True} def set_from_attribute_config( - self, - attributes: list[dict[str, Any] | QuantizerAttributeConfig] - | dict[str, Any] - | QuantizerAttributeConfig, + self, attributes: list[QuantizerAttributeConfig] | list[dict[str, Any]] ): """Set the attributes of contained quantizers from a list of attribute_dicts.""" if not isinstance(attributes, (list, tuple)): diff --git a/modelopt/torch/sparsity/attention_sparsity/conversion.py b/modelopt/torch/sparsity/attention_sparsity/conversion.py index cdc2aed94..0255caf4e 100644 --- a/modelopt/torch/sparsity/attention_sparsity/conversion.py +++ b/modelopt/torch/sparsity/attention_sparsity/conversion.py @@ -194,7 +194,7 @@ def set_sparse_attention_attribute( ): """Set sparse attention attributes for modules matching pattern. - Similar to quantization's set_quantizer_attribute. + Similar to quantization's set_quantizer_attributes_partial. 
Args: model: Model to configure diff --git a/tests/gpu/torch/quantization/test_quant_rnn_cuda.py b/tests/gpu/torch/quantization/test_quant_rnn_cuda.py index be40de8e5..8a245336f 100644 --- a/tests/gpu/torch/quantization/test_quant_rnn_cuda.py +++ b/tests/gpu/torch/quantization/test_quant_rnn_cuda.py @@ -21,7 +21,7 @@ import torch import torch.nn as nn -from modelopt.torch.quantization import set_quantizer_attribute +from modelopt.torch.quantization.conversion import set_quantizer_attributes_partial from modelopt.torch.quantization.nn import QuantModuleRegistry @@ -44,7 +44,7 @@ def test_no_quant_proj(original_cls, bidirectional, bias): rnn_object_original = copy.deepcopy(rnn_object) quant_rnn_object = QuantModuleRegistry.convert(rnn_object) - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"enable": False}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"enable": False}) test_input = torch.randn((3, 2, 8), device="cuda") diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_apex.py b/tests/gpu_megatron/torch/quantization/plugins/test_apex.py index 1c9bf1ec6..144c05f6d 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_apex.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_apex.py @@ -84,15 +84,15 @@ def test_convert_apex_parallel_linear(distributed_setup_size_1): assert hasattr(module, "weight_quantizer") assert hasattr(module, "output_quantizer") - mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) + mtq.set_quantizer_attributes_partial(model_test, "*", {"enable": False}) x = model_ref.get_dummy_input().cuda() out_1 = model_ref(x) out_2 = model_test(x) assert torch.allclose(out_1, out_2) - mtq.set_quantizer_attribute(model_test, "*input_quantizer", {"enable": True}) - mtq.set_quantizer_attribute(model_test, "*weight_quantizer", {"enable": True}) + mtq.set_quantizer_attributes_partial(model_test, "*input_quantizer", {"enable": True}) + 
mtq.set_quantizer_attributes_partial(model_test, "*weight_quantizer", {"enable": True}) model_ref = RegularQuantModelForTP().cuda() model_ref.load_state_dict(model_test.state_dict()) diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index d8ba6fbed..dca5b6023 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -82,15 +82,15 @@ def test_convert_megatron_parallel_linear(distributed_setup_size_1): assert hasattr(module, "weight_quantizer") assert hasattr(module, "output_quantizer") - mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) + mtq.set_quantizer_attributes_partial(model_test, "*", {"enable": False}) x = model_ref.get_dummy_input().cuda() out_1 = model_ref(x) out_2 = model_test(x) assert torch.allclose(out_1, out_2) - mtq.set_quantizer_attribute(model_test, "*input_quantizer", {"enable": True}) - mtq.set_quantizer_attribute(model_test, "*weight_quantizer", {"enable": True}) + mtq.set_quantizer_attributes_partial(model_test, "*input_quantizer", {"enable": True}) + mtq.set_quantizer_attributes_partial(model_test, "*weight_quantizer", {"enable": True}) model_ref = RegularQuantModelForTP().cuda() model_ref.load_state_dict(model_test.state_dict(), strict=False) diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index d04a8c026..771feb31a 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -87,7 +87,7 @@ def test_convert_conv1d(): assert hasattr(module, "weight_quantizer") assert hasattr(module, "output_quantizer") - mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) + mtq.set_quantizer_attributes_partial(model_test, "*", {"enable": False}) x = torch.randn(2, 3) out_1 = model_ref(x) @@ -95,8 
+95,8 @@ def test_convert_conv1d(): assert torch.allclose(out_1, out_2) - mtq.set_quantizer_attribute(model_test, "*input_quantizer", {"enable": True}) - mtq.set_quantizer_attribute(model_test, "*weight_quantizer", {"enable": True}) + mtq.set_quantizer_attributes_partial(model_test, "*input_quantizer", {"enable": True}) + mtq.set_quantizer_attributes_partial(model_test, "*weight_quantizer", {"enable": True}) model_ref = PytorchModel() model_ref.load_state_dict(model_test.state_dict()) @@ -136,7 +136,7 @@ def test_dbrx(): expertglu_ref.w1, ) - mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) + mtq.set_quantizer_attributes_partial(model_test, "*", {"enable": False}) x = torch.randn(1, 4, 32) out_1 = model_ref(x) diff --git a/tests/unit/torch/quantization/plugins/test_peft.py b/tests/unit/torch/quantization/plugins/test_peft.py index 7077801a4..007309361 100644 --- a/tests/unit/torch/quantization/plugins/test_peft.py +++ b/tests/unit/torch/quantization/plugins/test_peft.py @@ -49,7 +49,7 @@ def test_convert_loralinear(): assert hasattr(module, "weight_quantizer") assert hasattr(module, "output_quantizer") - mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) + mtq.set_quantizer_attributes_partial(model_test, "*", {"enable": False}) tf_output_tester(model_ref, model_test) diff --git a/tests/unit/torch/quantization/test_quant_activations.py b/tests/unit/torch/quantization/test_quant_activations.py index afc8decce..e27b85bb6 100644 --- a/tests/unit/torch/quantization/test_quant_activations.py +++ b/tests/unit/torch/quantization/test_quant_activations.py @@ -19,7 +19,7 @@ import torch.nn as nn import torch.nn.functional as F -from modelopt.torch.quantization import set_quantizer_attribute, tensor_quant +from modelopt.torch.quantization import set_quantizer_attributes_partial, tensor_quant from modelopt.torch.quantization.nn import QuantModuleRegistry @@ -42,7 +42,7 @@ def test_fake_quant_per_channel(self): negative_slope = 0.01 leaky_relu_object 
= nn.LeakyReLU(negative_slope=negative_slope) quant_leaky_relu_object = QuantModuleRegistry.convert(leaky_relu_object) - set_quantizer_attribute(quant_leaky_relu_object, lambda name: True, {"axis": (1)}) + set_quantizer_attributes_partial(quant_leaky_relu_object, lambda name: True, {"axis": (1)}) test_input = torch.randn(input_shape) quant_input = tensor_quant.fake_tensor_quant( diff --git a/tests/unit/torch/quantization/test_quant_batchnorm.py b/tests/unit/torch/quantization/test_quant_batchnorm.py index ee035dab1..c55b4b0b0 100644 --- a/tests/unit/torch/quantization/test_quant_batchnorm.py +++ b/tests/unit/torch/quantization/test_quant_batchnorm.py @@ -20,7 +20,8 @@ import torch.nn as nn import torch.nn.functional as F -from modelopt.torch.quantization import set_quantizer_attribute, tensor_quant +from modelopt.torch.quantization import tensor_quant +from modelopt.torch.quantization.conversion import set_quantizer_attributes_partial from modelopt.torch.quantization.nn import QuantModuleRegistry NUM_CHANNELS = 3 @@ -90,7 +91,7 @@ def test_fake_quant_per_tensor(self, original_cls, input_shape): def test_fake_quant_per_channel(self, original_cls, input_shape): batchnorm_object = original_cls(NUM_CHANNELS, affine=True) quant_batchnorm_object = QuantModuleRegistry.convert(batchnorm_object) - set_quantizer_attribute(quant_batchnorm_object, lambda name: True, {"axis": (1)}) + set_quantizer_attributes_partial(quant_batchnorm_object, lambda name: True, {"axis": (1)}) test_input = torch.randn(input_shape) reduce_dims = list(range(len(test_input.shape))) diff --git a/tests/unit/torch/quantization/test_quant_rnn.py b/tests/unit/torch/quantization/test_quant_rnn.py index 6f3d054c4..0ea6d755a 100644 --- a/tests/unit/torch/quantization/test_quant_rnn.py +++ b/tests/unit/torch/quantization/test_quant_rnn.py @@ -21,7 +21,8 @@ import torch import torch.nn as nn -from modelopt.torch.quantization import set_quantizer_attribute, tensor_quant +from modelopt.torch.quantization import 
tensor_quant +from modelopt.torch.quantization.conversion import set_quantizer_attributes_partial from modelopt.torch.quantization.nn import QuantModuleRegistry from modelopt.torch.quantization.nn.modules.quant_rnn import VFRNNForward @@ -52,7 +53,7 @@ def test_no_quant(self, original_cls, bidirectional, bias): quant_rnn_object = QuantModuleRegistry.convert(rnn_object) rnn_object.eval() rnn_object_original.eval() - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"enable": False}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"enable": False}) assert torch.allclose( quant_rnn_object.weight_ih_l0, rnn_object_original.weight_ih_l0, atol=1e-6 @@ -86,7 +87,7 @@ def test_no_quant_packed_sequence(self, original_cls, bidirectional, bias): quant_rnn_object = QuantModuleRegistry.convert(rnn_object) rnn_object.eval() rnn_object_original.eval() - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"enable": False}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"enable": False}) assert torch.allclose( quant_rnn_object.weight_ih_l0, rnn_object_original.weight_ih_l0, atol=1e-6 @@ -124,7 +125,7 @@ def test_no_quant_proj(self, original_cls, bidirectional, bias): rnn_object_original = copy.deepcopy(rnn_object) quant_rnn_object = QuantModuleRegistry.convert(rnn_object) - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"enable": False}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"enable": False}) test_input = torch.randn(INPUT_SHAPE) @@ -150,7 +151,7 @@ def test_no_quant_batch_first(self, original_cls, bidirectional): rnn_object_original = copy.deepcopy(rnn_object) quant_rnn_object = QuantModuleRegistry.convert(rnn_object) - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"enable": False}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"enable": False}) test_input = torch.randn([INPUT_SHAPE[1], INPUT_SHAPE[0], 
INPUT_SHAPE[2]]) @@ -176,7 +177,7 @@ def test_fake_quant_per_tensor(self, original_cls, bidirectional): ) rnn_object_original = copy.deepcopy(rnn_object) quant_rnn_object = QuantModuleRegistry.convert(rnn_object) - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"axis": None}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"axis": None}) quant_rnn_object._disable_input_quantizers() for name, weight in rnn_object_original.named_parameters(): @@ -205,7 +206,7 @@ def test_fake_quant_per_channel(self, original_cls, bidirectional): rnn_object = original_cls(HIDDEN_SIZE, HIDDEN_SIZE, NUM_LAYERS, bidirectional=bidirectional) rnn_object_original = copy.deepcopy(rnn_object) quant_rnn_object = QuantModuleRegistry.convert(rnn_object) - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"axis": (0)}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"axis": (0)}) quant_rnn_object._disable_input_quantizers() for name, weight in rnn_object_original.named_parameters(): @@ -234,7 +235,7 @@ def test_input_quant_per_tensor(self, original_cls, bidirectional): HIDDEN_SIZE, HIDDEN_SIZE, NUM_LAYERS, bidirectional=bidirectional, bias=True ) quant_rnn_object = QuantModuleRegistry.convert(rnn_object) - set_quantizer_attribute(quant_rnn_object, lambda name: True, {"axis": None}) + set_quantizer_attributes_partial(quant_rnn_object, lambda name: True, {"axis": None}) quant_rnn_object._disable_weight_quantizers() num_directions = 2 if bidirectional else 1 diff --git a/tests/unit/torch/quantization/test_quantize_replace.py b/tests/unit/torch/quantization/test_quantize_replace.py index 140da2b64..4b0f4edd2 100644 --- a/tests/unit/torch/quantization/test_quantize_replace.py +++ b/tests/unit/torch/quantization/test_quantize_replace.py @@ -47,7 +47,7 @@ def test_quantize_replace(model_cls): assert not isinstance(module, nn.Conv2d) or _is_quantized_linear_conv(module) assert not isinstance(module, nn.Linear) or 
_is_quantized_linear_conv(module) - mtq.set_quantizer_attribute(model_atq, "*", {"enable": False}) + mtq.set_quantizer_attributes_partial(model_atq, "*", {"enable": False}) out_ref = model_ref(dummy_input) out_atq = model_atq(dummy_input) From dc915f529c5291f3d7406535188e2c850ef3792a Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sun, 22 Mar 2026 04:01:38 +0000 Subject: [PATCH 20/32] new partial set quantizer cfg for internal merging logic Signed-off-by: Shengliang Xu --- docs/source/guides/_quant_cfg.rst | 11 +- modelopt/torch/quantization/algorithms.py | 34 +++--- modelopt/torch/quantization/config.py | 3 +- modelopt/torch/quantization/conversion.py | 114 +++++++++++++++--- modelopt/torch/quantization/model_calib.py | 7 +- tests/_test_utils/torch/export/utils.py | 5 +- .../unit/torch/quantization/test_autoquant.py | 5 +- .../torch/quantization/test_quantize_cpu.py | 5 +- .../quantization/test_tensor_quant_cpu.py | 5 +- 9 files changed, 138 insertions(+), 51 deletions(-) diff --git a/docs/source/guides/_quant_cfg.rst b/docs/source/guides/_quant_cfg.rst index 2afd7b70e..d0959fa91 100644 --- a/docs/source/guides/_quant_cfg.rst +++ b/docs/source/guides/_quant_cfg.rst @@ -60,8 +60,10 @@ Each entry in the list is a dictionary with the following fields: for sequential quantization (see :ref:`sequential-quantizers`). * - ``enable`` - No - - ``True`` or ``False``. Shorthand for enabling or disabling matched quantizers. When ``enable`` is omitted, the quantizer - is implicitly enabled. + - ``True`` or ``False``. When ``cfg`` is also absent, this is a **complete replacement**: + all quantizer attributes are reset to their defaults and ``enable`` is set accordingly. + When ``cfg`` is present, ``enable`` overrides the ``enable`` field inside ``cfg``. + When omitted, defaults to ``True``. 
---------- @@ -140,8 +142,9 @@ After Entry 2 is applied, the quantizer has ``num_bits=4``, ``block_sizes={-1: 1 This atomicity property is what makes the deny-all-then-re-enable pattern safe and predictable: the deny-all entry (``{"quantizer_path": "*", "enable": False}``) completely - resets every quantizer, and subsequent entries each independently configure their targets from a - clean default state. + resets every quantizer to defaults, and subsequent entries each independently configure their + targets from a clean default state. The same full-reset semantics apply to any entry with no + ``cfg`` — including ``{"quantizer_path": "*", "enable": True}``. ---------- diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 03029edbe..df090ffc9 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -1313,11 +1313,12 @@ def get_auto_quantize_config(search_state, constraints=None, verbose=False): def _cfg_to_dict(v): if isinstance(v, mtq_config.QuantizerAttributeConfig): - return { - "enable": v.enable, - "num_bits": v.num_bits, - **v.model_dump(exclude_defaults=True), - } + return ( + { + "num_bits": v.num_bits, + **v.model_dump(exclude_defaults=True), + }, + ) if isinstance(v, list): return [_cfg_to_dict(c) for c in v] return v @@ -1329,12 +1330,15 @@ def _cfg_to_dict(v): module_names = search_state["candidate_stats"][hparam_name]["module_names"] for module_name in module_names: for quantizer_attr in ("input_quantizer", "weight_quantizer"): - matched_cfg = _match_quantizer_cfg(recipe.config.quant_cfg, quantizer_attr) + matched_cfg, matched_enable = _match_quantizer_cfg( + recipe.config.quant_cfg, quantizer_attr + ) if matched_cfg is not None: quant_cfg.append( { "quantizer_path": f"{module_name}.{quantizer_attr}", "cfg": _cfg_to_dict(matched_cfg), + "enable": matched_enable, } ) warnings.warn( @@ -1378,17 +1382,13 @@ def _resolve_best_recipe(search_state, constraints, 
verbose=False): def _match_quantizer_cfg(quant_cfg, quantizer_attr): # Last-match-wins to mirror set_quantizer_by_cfg behavior matched = None + matched_enable = False for entry in quant_cfg: - pattern = ( - entry["quantizer_path"] - if isinstance(entry, dict) and "quantizer_path" in entry - else entry[0] - ) - cfg = ( - entry.get("cfg", {}) - if isinstance(entry, dict) and "quantizer_path" in entry - else entry[1] - ) + pattern = entry["quantizer_path"] + cfg = entry.get("cfg", {}) + enable = entry.get("enable", True) if fnmatch.fnmatch(quantizer_attr, pattern): matched = cfg - return matched + matched_enable = enable + + return matched, matched_enable diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 3e3828d31..e3ba3216e 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -370,13 +370,12 @@ class QuantizerCfgEntry(TypedDict, total=False): { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, - "enable": True, }, { "num_bits": (4, 3), - "enable": True, }, ], + "enable": True, }, { "quantizer_path": "*input_quantizer", diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index dc5a6ece7..26cdc47ae 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -32,6 +32,7 @@ QuantizeConfig, QuantizeQuantCfgType, QuantizerAttributeConfig, + QuantizerCfgEntry, _QuantizeExportConfig, normalize_quant_cfg_list, ) @@ -52,6 +53,7 @@ "set_quantizer_attributes_partial", "set_quantizer_by_cfg", "set_quantizer_by_cfg_context", + "set_quantizer_by_cfg_partial_context", "unregister", ] @@ -213,33 +215,56 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe _replace_quant_module(getattr(model, name), version=version, registry=registry) +def _parse_quant_cfg_entry(entry: QuantizerCfgEntry, enable_missing_as_true: bool = True): + parent_class_name = 
entry.get("parent_class") + if parent_class_name is not None: + parent_class = QuantModuleRegistry[parent_class_name] + else: + parent_class = None + + cfg = entry.get("cfg") or {} + enable = entry.get("enable") if entry.get("enable") is not None else enable_missing_as_true + + return cfg, enable, parent_class + + def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): - """Update the quantizer attributes based on the specified `quant_cfg`. + """Apply a quantization config list to the quantizers in ``quant_model``. - `quant_cfg` is a list of :class:`QuantizerCfgEntry <.config.QuantizerCfgEntry>` objects mapping - quantizer paths (and optionally parent classes) to their quantizer attributes, which are - defined in :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. - The ``quantizer_path`` is matched against the quantizer module names. - The specified quantizer attributes of the matched quantizer modules are set accordingly. - Entries are applied in order; use ``"*"`` as the first entry to set a catch-all default. + ``quant_cfg`` is an **ordered list** of :class:`QuantizerCfgEntry <.config.QuantizerCfgEntry>` + dicts. Each entry has the following fields: - In addition, entries with a ``parent_class`` field filter by the pytorch module class, - which must have a quantized equivalent. + - ``quantizer_path`` *(required)*: wildcard matched against quantizer module names via + :func:`fnmatch`. + - ``cfg`` *(optional)*: a dict of :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>` + fields, or a list of such dicts for sequential quantization. + - ``enable`` *(optional)*: ``True`` or ``False`` to enable or disable matched quantizers. + When omitted, defaults to ``True``. + - ``parent_class`` *(optional)*: restricts matching to quantizers whose immediate parent + module is of this PyTorch class name. - See :meth:`set_quantizer_attributes_full ` - for more details. 
+    **Ordering and atomicity:** entries are applied in list order; later entries override earlier
+    ones for any quantizer they match. Each entry with a ``cfg`` is a **complete replacement** —
+    unspecified attributes revert to their defaults rather than inheriting from a prior entry.
+    The typical pattern is to deny all first (``{"quantizer_path": "*", "enable": False}``), then
+    selectively enable and configure target quantizers in subsequent entries.
+
+    **Enable-False only entries:** an entry with no ``cfg`` but ``enable`` False is a complete reset
+    of all quantizer attributes of the matching quantizers to their defaults.
+
+    **Enable-True only entries:** an entry with no ``cfg`` but ``enable`` True is invalid. An error will be raised.
+
+    See :ref:`quant-cfg` for the full format reference and common patterns.
     """
     quant_cfg = normalize_quant_cfg_list(quant_cfg)
     for entry in quant_cfg:
         quantizer_path: str = entry["quantizer_path"]
-        parent_class_name = entry.get("parent_class")
-        if parent_class_name is not None:
-            parent_class = QuantModuleRegistry[parent_class_name]
-        else:
-            parent_class = None
+        cfg, enable, parent_class = _parse_quant_cfg_entry(entry, enable_missing_as_true=True)
+        if enable and not cfg:
+            raise ValueError(
+                f"Entry {entry} has enable=True but no cfg, which will reset all attributes to defaults."
+            )
 
-        cfg = entry.get("cfg", {})
-        enable = entry.get("enable", True)
         if isinstance(cfg, dict):
             attributes = QuantizerAttributeConfig(**cfg, enable=enable)
         else:
@@ -421,6 +446,59 @@ def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuan
         module.set_from_modelopt_state(original_attributes[name], properties_only=True)
 
 
+@contextmanager
+def set_quantizer_by_cfg_partial_context(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType):
+    """Context manager for partially updating quantizer attributes using ``quant_cfg``.
+
+    This API is intended for internal use only.
+ + Unlike :func:`set_quantizer_by_cfg_context`, only the attributes explicitly specified in each + entry's ``cfg`` (and ``enable``, if provided) are modified; all other quantizer attributes + remain unchanged. The modified attributes are restored to their original values on exit. + + ``enable`` is treated as optional here — if omitted from an entry it is **not** defaulted to + ``True`` (contrast with :func:`set_quantizer_by_cfg` where omitting ``enable`` defaults it to + ``True``). Pass ``enable`` explicitly to toggle the enabled state. + + Use this context manager with caution. Changing certain attributes of the quantizer such as + `calibrator` can lead to unexpected behavior. + + Args: + quant_model: A pytorch model with quantizers inserted. + quant_cfg: A quantization config list; see :func:`set_quantizer_by_cfg` for the format. + ``cfg`` values are treated as **partial** attribute dicts — unspecified fields are left + unchanged on matched quantizers. + """ + quant_cfg = normalize_quant_cfg_list(quant_cfg) + assert not any(isinstance(entry.get("cfg", {}), list) for entry in quant_cfg), ( + "list of config not supported." + ) + + # Save the full state of every quantizer that will be touched by at least one entry. + original_attributes: dict[str, dict] = {} + for name, module in quant_model.named_modules(): + if isinstance(module, TensorQuantizer): + original_attributes[name] = module.get_modelopt_state(properties_only=True) + + # Apply partial updates: only the keys present in cfg (+ enable when explicit). + for entry in quant_cfg: + quantizer_path = entry["quantizer_path"] + cfg, enable, parent_class = _parse_quant_cfg_entry(entry, enable_missing_as_true=False) + + if isinstance(cfg, dict): + attributes = dict(**cfg, enable=enable) + else: + attributes = [dict(**c, enable=enable) for c in cfg] + set_quantizer_attributes_partial(quant_model, quantizer_path, attributes, parent_class) + + yield + + # Restore only the quantizers that were modified. 
+ for name, module in quant_model.named_modules(): + if isinstance(module, TensorQuantizer): + module.set_from_modelopt_state(original_attributes[name], properties_only=True) + + def register(original_cls: nn.Module, quantized_cls: nn.Module): """Register a quantized class for the given un-quantized original class. diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 4616c82fc..db3c00fae 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -35,7 +35,10 @@ from modelopt.torch.utils.perf import get_used_gpu_mem_fraction from .calib import MseCalibrator, NVFP4MSECalibrator -from .conversion import create_and_replace_svdquant_linear_on_the_fly, set_quantizer_by_cfg_context +from .conversion import ( + create_and_replace_svdquant_linear_on_the_fly, + set_quantizer_by_cfg_partial_context, +) from .nn import NVFP4StaticQuantizer, QuantModule, SequentialQuantizer, TensorQuantizer from .utils import ( disable_calib, @@ -1101,7 +1104,7 @@ def forward(self, input, *args, **kwargs): self.awq_lite.num_cache_steps += 1 self.awq_lite.num_tokens += input.numel() / input.shape[-1] if self.awq_lite.is_input_quantized: - with set_quantizer_by_cfg_context( + with set_quantizer_by_cfg_partial_context( self.input_quantizer, [{"quantizer_path": "*", "enable": True}] ): max_calibrate(self.input_quantizer, lambda quantizer: quantizer(input), False) diff --git a/tests/_test_utils/torch/export/utils.py b/tests/_test_utils/torch/export/utils.py index 3501ad9ee..e0867bad7 100644 --- a/tests/_test_utils/torch/export/utils.py +++ b/tests/_test_utils/torch/export/utils.py @@ -99,9 +99,10 @@ def forward(self, x): { "quantizer_path": "*.2.weight_quantizer", "cfg": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": (4, 3), "axis": None, "enable": True}, + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": (4, 3), "axis": 
None}, ], + "enable": True, }, { "quantizer_path": "*.2.input_quantizer", diff --git a/tests/unit/torch/quantization/test_autoquant.py b/tests/unit/torch/quantization/test_autoquant.py index 2de0aec5b..e619c7e7b 100644 --- a/tests/unit/torch/quantization/test_autoquant.py +++ b/tests/unit/torch/quantization/test_autoquant.py @@ -236,9 +236,10 @@ def test_auto_quantize_disabled_layers_no_poison(): { "quantizer_path": "*weight_quantizer", "cfg": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": 8, "axis": None, "enable": True}, + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": 8, "axis": None}, ], + "enable": True, }, { "quantizer_path": "*input_quantizer", diff --git a/tests/unit/torch/quantization/test_quantize_cpu.py b/tests/unit/torch/quantization/test_quantize_cpu.py index d5100ed02..18b84bb5b 100644 --- a/tests/unit/torch/quantization/test_quantize_cpu.py +++ b/tests/unit/torch/quantization/test_quantize_cpu.py @@ -39,9 +39,10 @@ { "quantizer_path": "*weight_quantizer", "cfg": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": 8, "axis": 0, "enable": True}, + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": 8, "axis": 0}, ], + "enable": True, }, { "quantizer_path": "*input_quantizer", diff --git a/tests/unit/torch/quantization/test_tensor_quant_cpu.py b/tests/unit/torch/quantization/test_tensor_quant_cpu.py index a0720a046..78a79bbcb 100644 --- a/tests/unit/torch/quantization/test_tensor_quant_cpu.py +++ b/tests/unit/torch/quantization/test_tensor_quant_cpu.py @@ -94,9 +94,10 @@ def test_num_bits(self): { "quantizer_path": "*weight_quantizer", "cfg": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": 8, "axis": 0, "enable": True}, + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": 8, "axis": 0}, ], + "enable": True, }, {"quantizer_path": 
"*input_quantizer", "cfg": {"num_bits": 8}, "enable": True}, ], From 10c4cddbe8992ff47e8cb0c1586d98118ce47e90 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sun, 22 Mar 2026 18:37:26 +0000 Subject: [PATCH 21/32] enable semantic documentation Signed-off-by: Shengliang Xu --- docs/source/guides/_quant_cfg.rst | 90 +++++++++--- modelopt/torch/quantization/config.py | 67 +++++++-- modelopt/torch/quantization/conversion.py | 132 ++++++------------ modelopt/torch/quantization/model_calib.py | 7 +- .../quantization/test_tensor_quant_cpu.py | 10 +- 5 files changed, 174 insertions(+), 132 deletions(-) diff --git a/docs/source/guides/_quant_cfg.rst b/docs/source/guides/_quant_cfg.rst index d0959fa91..b3d37cdb3 100644 --- a/docs/source/guides/_quant_cfg.rst +++ b/docs/source/guides/_quant_cfg.rst @@ -60,10 +60,53 @@ Each entry in the list is a dictionary with the following fields: for sequential quantization (see :ref:`sequential-quantizers`). * - ``enable`` - No - - ``True`` or ``False``. When ``cfg`` is also absent, this is a **complete replacement**: - all quantizer attributes are reset to their defaults and ``enable`` is set accordingly. - When ``cfg`` is present, ``enable`` overrides the ``enable`` field inside ``cfg``. - When omitted, defaults to ``True``. + - ``True`` or ``False``. Toggles matched quantizers on or off, independently of ``cfg``. + When ``cfg`` is absent, **only** the enabled/disabled state is changed — all other + attributes remain untouched. When ``cfg`` is present, ``enable`` sets the enabled state + of the newly-configured quantizer. When ``cfg`` is present and ``enable`` is omitted, + the quantizer is implicitly enabled (``True``). + +.. note:: + + Every entry must specify at least one of ``cfg`` or ``enable`` in addition to + ``quantizer_path``. An entry with only ``quantizer_path`` and no other keys is **invalid** + and will raise a ``ValueError`` at config-processing time. 
This prevents subtle bugs where + a bare ``{"quantizer_path": "*"}`` would silently behave as ``enable=True`` for all + quantizers. + +---------- + +Default Quantizer Configuration +================================ + +When a quantizer is enabled but has never been touched by a ``cfg`` entry — either because no +entry in the list matched it, or because it was only reached by enable-only entries — it operates +with the default attributes of +:class:`QuantizerAttributeConfig `: + +.. code-block:: python + + { + "num_bits": 8, # 8-bit integer quantization + "axis": None, # per-tensor scale (no per-channel axis) + "fake_quant": True, # simulate quantization in forward pass (PTQ / QAT) + "unsigned": False, # signed integer range, e.g. [-128, 127] for INT8 + "narrow_range": False, # full range; True would restrict to [-127, 127] for INT8 + "type": "static", # static calibration (not dynamic per-inference) + "block_sizes": None, # no block quantization; set for NF4 / MXFP formats + "bias": None, # no affine bias correction + "calibrator": "max", # use max-abs calibration to determine amax + "rotate": False, # no Hadamard rotation (QuaRot / SpinQuant) + "pass_through_bwd": True, # straight-through estimator for QAT gradients + "trt_high_precision_dtype": "Float", # cast QDQ nodes to fp32 for TRT StronglyType export + "backend": None, # use the built-in quantization backend + "backend_extra_args": None, # no extra args for custom backends + "use_constant_amax": False, # calibrate amax; True hard-codes FP8 E4M3 max (448.0) + } + +In practice this means an un-configured but enabled quantizer performs **INT8 per-tensor static +fake-quantization** with a max-calibrated scale. This is rarely the intended behavior — every +quantizer you want active should be explicitly configured with a ``cfg`` entry. 
---------- @@ -104,9 +147,9 @@ The recommended pattern used by all built-in configs is: Entry Atomicity =============== -Each entry in ``quant_cfg`` is a **complete, self-contained configuration unit**. When an entry -matches a quantizer, it **completely replaces** that quantizer's configuration — it does not merge -with or incrementally update settings left by earlier entries. +Each ``cfg``-bearing entry in ``quant_cfg`` is a **complete, self-contained configuration unit**. +When an entry with ``cfg`` matches a quantizer, it **completely replaces** that quantizer's +configuration — it does not merge with or incrementally update settings left by earlier entries. Concretely, if an entry specifies only a subset of quantizer attributes (e.g. only ``num_bits``), all unspecified attributes are filled in with their default values from @@ -116,15 +159,25 @@ matching entry had set. This means: -- **Last entry wins, fully.** If two entries both match ``*weight_quantizer``, the second entry - does not inherit the first entry's settings — it replaces them entirely. +- **Last cfg-entry wins, fully.** If two entries both match ``*weight_quantizer`` and both carry + a ``cfg``, the second entry does not inherit the first entry's settings — it replaces them entirely. - **No hidden state accumulation.** The final configuration of a quantizer depends only on the - *last* entry in the list that matched it, making behavior easy to reason about. -- **Changing one field requires a full spec.** Because each entry is a complete replacement, to - change only one attribute of a quantizer that was already configured, you must reproduce the + *last* ``cfg``-bearing entry in the list that matched it, making behavior easy to reason about. +- **Changing one field requires a full spec.** Because each ``cfg`` entry is a complete replacement, + to change only one attribute of a quantizer that was already configured, you must reproduce the full desired config in the new entry. 
Any attribute omitted from the entry will revert to its default, not to the value set by an earlier entry. +**Enable-only entries are the exception.** An entry with no ``cfg`` (only ``enable``) is *not* a +full replacement — it solely flips the on/off state of matched quantizers, leaving all other +attributes unchanged: + +- ``{"quantizer_path": "*", "enable": False}`` disables all quantizers without touching their + configured attributes. Use this as the first step in a deny-all-then-configure pattern. +- ``{"quantizer_path": "*weight_quantizer", "enable": True}`` (no ``cfg``) re-enables weight + quantizers using whatever attributes they currently carry (or their defaults if they were never + configured by a ``cfg`` entry). + For example, given the following two entries both matching ``*weight_quantizer``: .. code-block:: python @@ -140,11 +193,9 @@ After Entry 2 is applied, the quantizer has ``num_bits=4``, ``block_sizes={-1: 1 .. note:: - This atomicity property is what makes the deny-all-then-re-enable pattern safe and - predictable: the deny-all entry (``{"quantizer_path": "*", "enable": False}``) completely - resets every quantizer to defaults, and subsequent entries each independently configure their - targets from a clean default state. The same full-reset semantics apply to any entry with no - ``cfg`` — including ``{"quantizer_path": "*", "enable": True}``. + The deny-all-then-configure pattern is safe and predictable precisely because + ``{"quantizer_path": "*", "enable": False}`` **only** disables quantizers without resetting + their attributes. Subsequent ``cfg`` entries then configure targets from a known default state. 
---------- @@ -239,9 +290,10 @@ are quantized first in INT4 and then in FP8: { "quantizer_path": "*weight_quantizer", "cfg": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": (4, 3), "enable": True}, # FP8 + {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}}, + {"num_bits": (4, 3)}, # FP8 ], + "enable": True, } ---------- diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index e3ba3216e..352ccc14f 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -78,9 +78,10 @@ :class:`SequentialQuantizer ` that applies each format in sequence. This is used for example in W4A8 quantization where weights are quantized first in INT4 and then in FP8. -- ``enable`` *(optional)*: shorthand to enable or disable matched quantizers without specifying a - full ``cfg``. When ``cfg`` is present but ``enable`` is absent, the quantizer is implicitly - enabled. +- ``enable`` *(optional)*: toggles matched quantizers on (``True``) or off (``False``), + independently of ``cfg``. When ``cfg`` is present and ``enable`` is absent, the quantizer is + implicitly enabled. When ``enable`` is the only field (no ``cfg``), it only flips the on/off + state — all other attributes remain unchanged. ``quant_cfg`` — Ordering and Precedence ----------------------------------------- @@ -161,9 +162,9 @@ class QuantizerCfgEntry(TypedDict, total=False): """A single entry in a ``quant_cfg`` list.""" quantizer_path: str # required; matched against quantizer module names - parent_class: str # optional; filters by pytorch module class name (e.g. "nn.Linear") - cfg: dict[str, Any] | list[dict[str, Any]] # quantizer attribute config(s) - enable: bool # shorthand to set/unset the quantizer's enable flag + parent_class: str | None # optional; filters by pytorch module class name (e.g. 
"nn.Linear") + cfg: dict[str, Any] | list[dict[str, Any]] | None # quantizer attribute config(s) + enable: bool | None # toggles matched quantizers on/off; independent of cfg _base_disable_all: list[QuantizerCfgEntry] = [ @@ -1517,11 +1518,39 @@ class GPTQLiteConfig(QuantizeAlgorithmConfig): def normalize_quant_cfg_list(v: list) -> list[QuantizerCfgEntry]: - """Normalize a raw quant_cfg list into a list of QuantizerCfgEntry dicts. + """Normalize a raw quant_cfg list into a list of :class:`QuantizerCfgEntry` dicts. - Supports these input forms per entry: - - ``{"quantizer_path": ..., "enable": ..., "cfg": ...}`` — passed through as-is - - ``{"": ...}`` — single-key dict (legacy) + Supports the following input forms per entry: + + - New format: ``{"quantizer_path": ..., "enable": ..., "cfg": ...}`` — passed through. + - Legacy single-key format: ``{"": }`` — converted to new format. + - Legacy ``nn.*``-scoped format: ``{"nn.": {"": }}`` — converted + to a new-format entry with ``parent_class`` set. + + **Validation** — an entry is rejected if it carries no instruction, i.e. it specifies neither + ``cfg`` nor ``enable``. Concretely, the following are invalid: + + - An empty entry ``{}``. + - An entry with only ``quantizer_path`` and no other keys — the only effect would be an + implicit ``enable=True``, which must be stated explicitly. + + **Normalization** — after conversion and validation every entry is put into canonical form: + + - ``enable`` is set to ``True`` if not explicitly specified. + - ``cfg`` is set to ``None`` if not present in the entry. + + Every returned entry is therefore guaranteed to have the keys ``quantizer_path``, ``enable``, + and ``cfg`` (plus optionally ``parent_class``). + + Args: + v: A list of raw quant_cfg entries in any supported format. + + Returns: + A list of :class:`QuantizerCfgEntry` dicts in canonical normalized form. 
+ + Raises: + ValueError: If any entry has only ``quantizer_path`` with neither ``cfg`` nor ``enable``, + or if the entry format is not recognized. """ def _dict_to_entry(key: str, value) -> QuantizerCfgEntry: @@ -1553,12 +1582,26 @@ def _dict_to_entry(key: str, value) -> QuantizerCfgEntry: result: list[QuantizerCfgEntry] = [] for raw in v: if isinstance(raw, dict) and "quantizer_path" in raw: - result.append(cast("QuantizerCfgEntry", raw)) + entry: dict = dict(raw) # copy to avoid mutating caller's data elif isinstance(raw, dict) and len(raw) == 1: key, val = next(iter(raw.items())) - result.append(_dict_to_entry(key, val)) + entry = dict(_dict_to_entry(key, val)) else: raise ValueError(f"Invalid quant_cfg entry: {raw!r}.") + + # Validate: must carry at least one instruction beyond the path selector. + if "cfg" not in entry and "enable" not in entry: + raise ValueError( + f"Invalid quant_cfg entry: {raw!r} — each entry must specify 'cfg', 'enable', " + "or both. An entry with only 'quantizer_path' has no effect (implicit " + "enable=True is not allowed; set it explicitly)." + ) + + # Normalize: make enable and cfg always explicit. 
+ entry.setdefault("enable", True) + entry.setdefault("cfg", None) + + result.append(cast("QuantizerCfgEntry", entry)) return result diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 26cdc47ae..ab2978ae6 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -32,7 +32,6 @@ QuantizeConfig, QuantizeQuantCfgType, QuantizerAttributeConfig, - QuantizerCfgEntry, _QuantizeExportConfig, normalize_quant_cfg_list, ) @@ -53,7 +52,6 @@ "set_quantizer_attributes_partial", "set_quantizer_by_cfg", "set_quantizer_by_cfg_context", - "set_quantizer_by_cfg_partial_context", "unregister", ] @@ -215,19 +213,6 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe _replace_quant_module(getattr(model, name), version=version, registry=registry) -def _parse_quant_cfg_entry(entry: QuantizerCfgEntry, enable_missing_as_true: bool = True): - parent_class_name = entry.get("parent_class") - if parent_class_name is not None: - parent_class = QuantModuleRegistry[parent_class_name] - else: - parent_class = None - - cfg = entry.get("cfg") or {} - enable = entry.get("enable") if entry.get("enable") is not None else enable_missing_as_true - - return cfg, enable, parent_class - - def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): """Apply a quantization config list to the quantizers in ``quant_model``. @@ -238,8 +223,9 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType :func:`fnmatch`. - ``cfg`` *(optional)*: a dict of :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>` fields, or a list of such dicts for sequential quantization. - - ``enable`` *(optional)*: ``True`` or ``False`` to enable or disable matched quantizers. - When omitted, defaults to ``True``. + - ``enable`` *(optional)*: ``True`` or ``False`` to toggle matched quantizers on or off. 
+ When omitted but ``cfg`` is present, defaults to ``True``. Every entry must specify at + least one of ``cfg`` or ``enable`` — an entry with only ``quantizer_path`` is invalid. - ``parent_class`` *(optional)*: restricts matching to quantizers whose immediate parent module is of this PyTorch class name. @@ -249,27 +235,38 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType The typical pattern is to deny all first (``{"quantizer_path": "*", "enable": False}``), then selectively enable and configure target quantizers in subsequent entries. - **Enable-Fale only entries:** an entry with no ``cfg`` but enalbe False would be a complete reset to - of the matching quantizers of all quantizer attributes to their defaults. + **``enable`` and ``cfg`` are independent:** - **Enable-True only entries:** an entry with no ``cfg`` but enalbe True is invalid. An error will be raised. + - An entry with ``cfg`` (and optionally ``enable``) fully replaces the matched quantizer's + attributes. If ``enable`` is omitted, the quantizer is implicitly enabled. + - ``{"enable": False}`` without ``cfg`` **only** toggles the matched quantizers off, leaving + all other attributes unchanged. + - ``{"enable": True}`` without ``cfg`` **only** toggles the matched quantizers on, using + whatever attributes they currently have (or their defaults if never configured). See :ref:`quant-cfg` for the full format reference and common patterns. """ quant_cfg = normalize_quant_cfg_list(quant_cfg) + for entry in quant_cfg: quantizer_path: str = entry["quantizer_path"] - cfg, enable, parent_class = _parse_quant_cfg_entry(entry, enable_missing_as_true=True) - if enable and not cfg: - raise ValueError( - f"Entry {entry} has enable=True but no cfg, which will reset all attributes to defaults." 
+ cfg = entry["cfg"] # None, dict, or list — always explicit after normalization + enable: bool = entry["enable"] # always explicit after normalization + parent_class_name = entry.get("parent_class") + parent_class = QuantModuleRegistry[parent_class_name] if parent_class_name else None + + if not cfg: + # No cfg: only toggle the enable state, leave all other attributes unchanged. + set_quantizer_attributes_partial( + quant_model, quantizer_path, {"enable": enable}, parent_class ) - - if isinstance(cfg, dict): - attributes = QuantizerAttributeConfig(**cfg, enable=enable) else: - attributes = [QuantizerAttributeConfig(**c, enable=enable) for c in cfg] - set_quantizer_attributes_full(quant_model, quantizer_path, attributes, parent_class) + # Has cfg: apply full replacement with the explicit enable value. + if isinstance(cfg, dict): + attributes = QuantizerAttributeConfig(**cfg, enable=enable) + else: + attributes = [QuantizerAttributeConfig(**c, enable=enable) for c in cfg] + set_quantizer_attributes_full(quant_model, quantizer_path, attributes, parent_class) def _match_quantizer( @@ -421,79 +418,34 @@ def set_quantizer_attributes_partial( @contextmanager def set_quantizer_by_cfg_context(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): - """Context manager for setting quantizer attributes using `quant_cfg`. - - The set attributes will be reset to the original attributes after exiting the context manager. - See :meth:`set_quantizer_by_cfg` for more details. - - Use this context manager with caution. Changing certain attributes of the quantizer such as - `calibrator` can lead to unexpected behavior. - """ - quant_cfg = normalize_quant_cfg_list(quant_cfg) - assert not any(isinstance(entry.get("cfg", {}), list) for entry in quant_cfg), ( - "list of config not support." 
- ) - - original_attributes = {} - for name, module in quant_model.named_modules(): - if isinstance(module, TensorQuantizer): - original_attributes[name] = module.get_modelopt_state(properties_only=True) - - set_quantizer_by_cfg(quant_model, quant_cfg) - yield - for name, module in quant_model.named_modules(): - if isinstance(module, TensorQuantizer): - module.set_from_modelopt_state(original_attributes[name], properties_only=True) - - -@contextmanager -def set_quantizer_by_cfg_partial_context(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType): - """Context manager for partially updating quantizer attributes using ``quant_cfg``. - - This API shall be used internal only. - - Unlike :func:`set_quantizer_by_cfg_context`, only the attributes explicitly specified in each - entry's ``cfg`` (and ``enable``, if provided) are modified; all other quantizer attributes - remain unchanged. The modified attributes are restored to their original values on exit. + """Context manager that temporarily applies a quantization config and restores the original state on exit. - ``enable`` is treated as optional here — if omitted from an entry it is **not** defaulted to - ``True`` (contrast with :func:`set_quantizer_by_cfg` where omitting ``enable`` defaults it to - ``True``). Pass ``enable`` explicitly to toggle the enabled state. + Calls :func:`set_quantizer_by_cfg` on entry and reverts every + :class:`TensorQuantizer ` in + ``quant_model`` to its original attributes on exit. - Use this context manager with caution. Changing certain attributes of the quantizer such as - `calibrator` can lead to unexpected behavior. + .. caution:: + Changing stateful attributes such as ``calibrator`` inside this context may produce + unexpected behavior because those objects are not deep-copied during save/restore. Args: - quant_model: A pytorch model with quantizers inserted. - quant_cfg: A quantization config list; see :func:`set_quantizer_by_cfg` for the format. 
- ``cfg`` values are treated as **partial** attribute dicts — unspecified fields are left - unchanged on matched quantizers. + quant_model: A quantized PyTorch model whose quantizers will be temporarily reconfigured. + quant_cfg: A quantization config (or list of + :class:`QuantizerCfgEntry <.config.QuantizerCfgEntry>` dicts) passed directly to + :func:`set_quantizer_by_cfg`. Sequential ``cfg`` lists are not allowed. + + Yields: + None — the context body runs with the new quantizer attributes active. """ quant_cfg = normalize_quant_cfg_list(quant_cfg) - assert not any(isinstance(entry.get("cfg", {}), list) for entry in quant_cfg), ( - "list of config not supported." - ) - # Save the full state of every quantizer that will be touched by at least one entry. - original_attributes: dict[str, dict] = {} + original_attributes = {} for name, module in quant_model.named_modules(): if isinstance(module, TensorQuantizer): original_attributes[name] = module.get_modelopt_state(properties_only=True) - # Apply partial updates: only the keys present in cfg (+ enable when explicit). - for entry in quant_cfg: - quantizer_path = entry["quantizer_path"] - cfg, enable, parent_class = _parse_quant_cfg_entry(entry, enable_missing_as_true=False) - - if isinstance(cfg, dict): - attributes = dict(**cfg, enable=enable) - else: - attributes = [dict(**c, enable=enable) for c in cfg] - set_quantizer_attributes_partial(quant_model, quantizer_path, attributes, parent_class) - + set_quantizer_by_cfg(quant_model, quant_cfg) yield - - # Restore only the quantizers that were modified. 
for name, module in quant_model.named_modules(): if isinstance(module, TensorQuantizer): module.set_from_modelopt_state(original_attributes[name], properties_only=True) diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index db3c00fae..4616c82fc 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -35,10 +35,7 @@ from modelopt.torch.utils.perf import get_used_gpu_mem_fraction from .calib import MseCalibrator, NVFP4MSECalibrator -from .conversion import ( - create_and_replace_svdquant_linear_on_the_fly, - set_quantizer_by_cfg_partial_context, -) +from .conversion import create_and_replace_svdquant_linear_on_the_fly, set_quantizer_by_cfg_context from .nn import NVFP4StaticQuantizer, QuantModule, SequentialQuantizer, TensorQuantizer from .utils import ( disable_calib, @@ -1104,7 +1101,7 @@ def forward(self, input, *args, **kwargs): self.awq_lite.num_cache_steps += 1 self.awq_lite.num_tokens += input.numel() / input.shape[-1] if self.awq_lite.is_input_quantized: - with set_quantizer_by_cfg_partial_context( + with set_quantizer_by_cfg_context( self.input_quantizer, [{"quantizer_path": "*", "enable": True}] ): max_calibrate(self.input_quantizer, lambda quantizer: quantizer(input), False) diff --git a/tests/unit/torch/quantization/test_tensor_quant_cpu.py b/tests/unit/torch/quantization/test_tensor_quant_cpu.py index 78a79bbcb..1f33c9615 100644 --- a/tests/unit/torch/quantization/test_tensor_quant_cpu.py +++ b/tests/unit/torch/quantization/test_tensor_quant_cpu.py @@ -59,12 +59,10 @@ def test_from_to_dict(self, verbose): def test_num_bits(self): """Test num_bits for both integer and tuple cases.""" - with pytest.raises( - ValueError, - match="Invalid quantizer config: Cannot specify only {'enable': True}. 
" - "Additional parameters are required when enabling quantization.", - ): - QuantizerAttributeConfig(enable=True) + # enable=True alone is valid: it produces a default 8-bit config with enable=True. + cfg = QuantizerAttributeConfig(enable=True) + assert cfg.enable is True + assert cfg.num_bits == 8 with pytest.raises( ValueError, match="num_bits must be a positive integer or a tuple of positive integers." From a03d97568b80fa2024b397bce46c29f8bdf2bbfb Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sun, 22 Mar 2026 18:49:48 +0000 Subject: [PATCH 22/32] revert accidental test change Signed-off-by: Shengliang Xu --- tests/unit/torch/quantization/test_tensor_quant_cpu.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/unit/torch/quantization/test_tensor_quant_cpu.py b/tests/unit/torch/quantization/test_tensor_quant_cpu.py index 1f33c9615..78a79bbcb 100644 --- a/tests/unit/torch/quantization/test_tensor_quant_cpu.py +++ b/tests/unit/torch/quantization/test_tensor_quant_cpu.py @@ -59,10 +59,12 @@ def test_from_to_dict(self, verbose): def test_num_bits(self): """Test num_bits for both integer and tuple cases.""" - # enable=True alone is valid: it produces a default 8-bit config with enable=True. - cfg = QuantizerAttributeConfig(enable=True) - assert cfg.enable is True - assert cfg.num_bits == 8 + with pytest.raises( + ValueError, + match="Invalid quantizer config: Cannot specify only {'enable': True}. " + "Additional parameters are required when enabling quantization.", + ): + QuantizerAttributeConfig(enable=True) with pytest.raises( ValueError, match="num_bits must be a positive integer or a tuple of positive integers." 
From fb3bb074355fa8889ca0d651f93cc9c2788d3a69 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Sun, 22 Mar 2026 20:05:02 +0000 Subject: [PATCH 23/32] fix mypy Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/algorithms.py | 1 + modelopt/torch/quantization/config.py | 8 ++++---- modelopt/torch/quantization/conversion.py | 21 ++++++++++++--------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index df090ffc9..c00b39f6a 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -94,6 +94,7 @@ def estimate_quant_compression_for_quantizer(quantizer_attr_cfg): raise ValueError(f"Unknown type {type(quantizer_attr_cfg)}, {quantizer_attr_cfg}") cfgs = [e.get("cfg", {}) for e in quant_cfg.quant_cfg] + cfgs = [c for c in cfgs if c is not None] return estimate_quant_compression_for_quantizer(cfgs) if cfgs else 1.0 diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 352ccc14f..064f9a671 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -537,7 +537,7 @@ class QuantizerCfgEntry(TypedDict, total=False): "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, } -_nvfp4_quantizer_bs32 = { +_nvfp4_cfg_bs32 = { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, "enable": True, @@ -547,12 +547,12 @@ class QuantizerCfgEntry(TypedDict, total=False): def _nvfp4_selective_quant_cfg( layer_patterns: list[str], *, - quantizer: dict = _nvfp4_quantizer, + quantizer: dict = _nvfp4_cfg, weight_only: bool = False, algorithm: str | dict = "max", ) -> dict: """Build an NVFP4 config that quantizes only the specified layer patterns.""" - quant_cfg: dict[str, object] = [] + quant_cfg: list[QuantizerCfgEntry] = [] quant_cfg.extend(_base_disable_all) for pattern in layer_patterns: 
quant_cfg.append({"quantizer_path": f"{pattern}weight_quantizer", "cfg": quantizer}) @@ -769,7 +769,7 @@ def _nvfp4_selective_quant_cfg( } NVFP4_MLP_WEIGHT_ONLY_CFG = _nvfp4_selective_quant_cfg( - ["*mlp*", "*block_sparse_moe*"], quantizer=_nvfp4_quantizer_bs32, weight_only=True + ["*mlp*", "*block_sparse_moe*"], quantizer=_nvfp4_cfg_bs32, weight_only=True ) NVFP4_EXPERTS_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp.experts*", "*block_sparse_moe*"]) NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"]) diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index ab2978ae6..4f1bd2a0b 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -379,6 +379,9 @@ def set_quantizer_attributes_partial( update. Keys must be valid fields of :class:`QuantizerAttributeConfig <.config.QuantizerAttributeConfig>`. Only the specified keys are written; all other attributes on the quantizer remain unchanged. + When a ``dict`` is passed and the matched module is a + :class:`SequentialQuantizer `, + the dict is broadcast to every sub-quantizer. When a ``list`` is passed, the matched module must already be a :class:`SequentialQuantizer ` — unlike :func:`set_quantizer_attributes_full`, this function will **not** replace a @@ -403,17 +406,17 @@ def set_quantizer_attributes_partial( for name, module in quant_model.named_modules(): if _match_quantizer(wildcard_or_filter_func, name, module, parent_class, quant_model): module = cast("TensorQuantizer | SequentialQuantizer", module) # for type checker - if isinstance(partial_attributes, list) and not isinstance(module, SequentialQuantizer): - raise ValueError(f"Attributes is a list but {module} is not a SequentialQuantizer.") - if isinstance(partial_attributes, dict) and not isinstance(module, TensorQuantizer): - raise ValueError( - f"Attributes is a dictionary but {module} is not a TensorQuantizer." 
- ) - if isinstance(partial_attributes, list): - cast("SequentialQuantizer", module).set_from_attribute_config(partial_attributes) + if not isinstance(module, SequentialQuantizer): + raise ValueError( + f"Attributes is a list but {module} is not a SequentialQuantizer." + ) + module.set_from_attribute_config(partial_attributes) + elif isinstance(module, SequentialQuantizer): + # Broadcast the dict to all sub-quantizers. + module.set_from_attribute_config([partial_attributes] * len(module)) else: - cast("TensorQuantizer", module).set_from_attribute_config(partial_attributes) + module.set_from_attribute_config(partial_attributes) @contextmanager From aecf832c44b282779d89d7080260ac160c1ded6e Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Mon, 23 Mar 2026 01:53:23 +0000 Subject: [PATCH 24/32] new tests and fix existing tests Signed-off-by: Shengliang Xu --- tests/unit/recipe/test_loader.py | 66 +++++++----- .../quantization/test_config_validation.py | 94 ++++++++++++++++ .../torch/quantization/test_quantize_cpu.py | 100 ++++++++++++++++++ 3 files changed, 235 insertions(+), 25 deletions(-) diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py index f48695382..251fc7fdc 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -15,6 +15,8 @@ """Unit tests for modelopt.recipe.loader and modelopt.recipe.loader.load_config.""" +import re + import pytest from modelopt.recipe.config import ModelOptPTQRecipe, RecipeType @@ -164,7 +166,7 @@ def test_load_recipe_dir(tmp_path): (tmp_path / "recipe.yml").write_text( "metadata:\n recipe_type: ptq\n description: Dir test.\n" ) - (tmp_path / "ptq_cfg.yml").write_text("algorithm: max\nquant_cfg: {}\n") + (tmp_path / "ptq_cfg.yml").write_text("algorithm: max\nquant_cfg: []\n") recipe = load_recipe(tmp_path) assert recipe.recipe_type == RecipeType.PTQ assert recipe.description == "Dir test." 
@@ -200,35 +202,49 @@ def test_load_recipe_dir_missing_ptq_cfg_raises(tmp_path): ], ) def test_general_ptq_yaml_matches_config_dicts(yaml_path, model_cfg_name, kv_cfg_name): - """Each general/ptq YAML's merged quant_cfg matches the corresponding config.py dicts.""" + """Each general/ptq YAML's quant_cfg list matches the merged Python config dicts.""" + import json + import modelopt.torch.quantization.config as qcfg + from modelopt.torch.quantization.config import normalize_quant_cfg_list model_cfg = getattr(qcfg, model_cfg_name) kv_cfg = getattr(qcfg, kv_cfg_name) yaml_data = load_config(yaml_path) - def _as_dict(qc): - result = {} - for entry in qc: - if isinstance(entry, dict) and "quantizer_path" in entry: - parent_class = entry.get("parent_class") - key = parent_class if parent_class else entry["quantizer_path"] - cfg = entry.get("cfg", {}) - val = dict(cfg) if isinstance(cfg, dict) else cfg - if entry.get("enable") is not None: - val["enable"] = entry["enable"] - if parent_class: - result[key] = {entry["quantizer_path"]: val} - else: - result[key] = val - elif isinstance(entry, dict): - result.update(entry) - else: - result[entry[0]] = entry[1] + def _normalize_fpx(val): + """Normalize FPx representations to a canonical ``[E, M]`` list. + + Python configs may use tuple form ``(E, M)`` or string alias ``"eEmM"``; + YAML always uses the string form. Both are converted to ``[E, M]`` so the + comparison is representation-agnostic. 
+ """ + if isinstance(val, str): + m = re.fullmatch(r"e(\d+)m(\d+)", val) + if m: + return [int(m.group(1)), int(m.group(2))] + if isinstance(val, tuple) and len(val) == 2 and all(isinstance(x, int) for x in val): + return list(val) + if isinstance(val, dict): + return {str(k): _normalize_fpx(v) for k, v in val.items()} + return val + + def _normalize_entries(raw_entries): + """Normalize a raw quant_cfg list to a canonical, JSON-serialisable form.""" + entries = normalize_quant_cfg_list(list(raw_entries)) + result = [] + for entry in entries: + e = {k: v for k, v in entry.items() if v is not None} + if "cfg" in e and e["cfg"] is not None: + e["cfg"] = _normalize_fpx(e["cfg"]) + result.append(e) return result - ptq = yaml_data["ptq_cfg"] - assert {**_as_dict(model_cfg["quant_cfg"]), **_as_dict(kv_cfg["quant_cfg"])} == _as_dict( - ptq["quant_cfg"] - ) - assert model_cfg["algorithm"] == ptq["algorithm"] + def _sort_key(entry): + return json.dumps(entry, sort_keys=True, default=str) + + python_entries = _normalize_entries(model_cfg["quant_cfg"] + kv_cfg["quant_cfg"]) + yaml_entries = _normalize_entries(yaml_data["ptq_cfg"]["quant_cfg"]) + + assert sorted(python_entries, key=_sort_key) == sorted(yaml_entries, key=_sort_key) + assert model_cfg["algorithm"] == yaml_data["ptq_cfg"]["algorithm"] diff --git a/tests/unit/torch/quantization/test_config_validation.py b/tests/unit/torch/quantization/test_config_validation.py index 6ed0c918a..cc8077ef2 100644 --- a/tests/unit/torch/quantization/test_config_validation.py +++ b/tests/unit/torch/quantization/test_config_validation.py @@ -15,6 +15,8 @@ """Test of quantization config validations.""" +import pytest + from modelopt.torch.quantization.config import ( FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, FP8_DEFAULT_CFG, @@ -23,6 +25,7 @@ NVFP4_DEFAULT_CFG, W4A8_AWQ_BETA_CFG, need_calibration, + normalize_quant_cfg_list, ) @@ -33,3 +36,94 @@ def test_need_calibration(): assert need_calibration(INT4_AWQ_CFG) assert 
need_calibration(W4A8_AWQ_BETA_CFG) assert need_calibration(NVFP4_DEFAULT_CFG) + + +class TestNormalizeQuantCfgList: + def test_new_format_passthrough(self): + """New-format entries are returned unchanged (only canonical defaults added).""" + raw = [{"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8, "axis": 0}}] + result = normalize_quant_cfg_list(raw) + assert len(result) == 1 + assert result[0]["quantizer_path"] == "*weight_quantizer" + assert result[0]["cfg"] == {"num_bits": 8, "axis": 0} + assert result[0]["enable"] is True # defaulted + + def test_new_format_enable_false(self): + """Explicit enable=False is preserved.""" + raw = [{"quantizer_path": "*", "enable": False}] + result = normalize_quant_cfg_list(raw) + assert result[0]["enable"] is False + assert result[0]["cfg"] is None # defaulted + + def test_new_format_explicit_enable_true_no_cfg(self): + """Explicit enable=True with no cfg is valid and cfg defaults to None.""" + raw = [{"quantizer_path": "*", "enable": True}] + result = normalize_quant_cfg_list(raw) + assert result[0]["enable"] is True + assert result[0]["cfg"] is None + + def test_legacy_single_key_dict(self): + """Legacy {'*path': {attrs}} is converted to new format.""" + raw = [{"*weight_quantizer": {"num_bits": 8, "axis": 0}}] + result = normalize_quant_cfg_list(raw) + assert result[0]["quantizer_path"] == "*weight_quantizer" + assert result[0]["cfg"] == {"num_bits": 8, "axis": 0} + assert result[0]["enable"] is True # defaulted + + def test_legacy_single_key_dict_with_enable(self): + """Legacy {'*path': {'enable': False}} splits enable out from cfg.""" + raw = [{"*input_quantizer": {"enable": False}}] + result = normalize_quant_cfg_list(raw) + assert result[0]["quantizer_path"] == "*input_quantizer" + assert result[0]["enable"] is False + assert result[0]["cfg"] == {} + + def test_legacy_nn_class_scoped(self): + """Legacy {'nn.Linear': {'*': {attrs}}} is converted with parent_class.""" + raw = [{"nn.Linear": {"*": {"enable": 
False}}}] + result = normalize_quant_cfg_list(raw) + assert result[0]["parent_class"] == "nn.Linear" + assert result[0]["quantizer_path"] == "*" + assert result[0]["enable"] is False + + def test_normalization_cfg_defaults_to_none(self): + """Entries without cfg get cfg=None after normalization.""" + raw = [{"quantizer_path": "*lm_head*", "enable": False}] + result = normalize_quant_cfg_list(raw) + assert "cfg" in result[0] + assert result[0]["cfg"] is None + + def test_normalization_enable_defaults_to_true(self): + """Entries with cfg but no enable get enable=True after normalization.""" + raw = [{"quantizer_path": "*", "cfg": {"num_bits": 4}}] + result = normalize_quant_cfg_list(raw) + assert result[0]["enable"] is True + + def test_empty_list(self): + """Empty list is returned unchanged.""" + assert normalize_quant_cfg_list([]) == [] + + def test_multiple_entries_order_preserved(self): + """The order of entries is preserved.""" + raw = [ + {"quantizer_path": "*", "enable": False}, + {"quantizer_path": "*weight_quantizer", "cfg": {"num_bits": 8}}, + ] + result = normalize_quant_cfg_list(raw) + assert result[0]["quantizer_path"] == "*" + assert result[1]["quantizer_path"] == "*weight_quantizer" + + def test_error_on_quantizer_path_only(self): + """Entry with only quantizer_path and no cfg or enable is rejected.""" + with pytest.raises(ValueError, match="must specify 'cfg', 'enable'"): + normalize_quant_cfg_list([{"quantizer_path": "*"}]) + + def test_error_on_empty_dict(self): + """An empty dict entry is rejected.""" + with pytest.raises(ValueError): + normalize_quant_cfg_list([{}]) + + def test_error_on_multi_key_legacy_dict(self): + """A multi-key legacy dict (no quantizer_path) is rejected.""" + with pytest.raises(ValueError): + normalize_quant_cfg_list([{"*weight_quantizer": {}, "*input_quantizer": {}}]) diff --git a/tests/unit/torch/quantization/test_quantize_cpu.py b/tests/unit/torch/quantization/test_quantize_cpu.py index 18b84bb5b..46f974a0c 100644 --- 
a/tests/unit/torch/quantization/test_quantize_cpu.py +++ b/tests/unit/torch/quantization/test_quantize_cpu.py @@ -32,6 +32,12 @@ import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq from modelopt.torch.quantization.calib import MaxCalibrator +from modelopt.torch.quantization.config import QuantizerAttributeConfig +from modelopt.torch.quantization.conversion import set_quantizer_attributes_full +from modelopt.torch.quantization.nn.modules.tensor_quantizer import ( + SequentialQuantizer, + TensorQuantizer, +) # A test config with double-quant (using `SequentialQuantizers`) WINT4INT8_CFG = { @@ -300,3 +306,97 @@ def forward_loop(model): out2 = model(inputs) assert torch.allclose(out1, out2), "Re-quantization with same config should be idempotent" + + +class TestSetQuantizerAttributesFull: + """Tests for set_quantizer_attributes_full and its atomicity semantics.""" + + def _quantize(self, model): + return mtq.quantize(model, mtq.INT8_DEFAULT_CFG, lambda m: m(m.get_input())) + + def test_basic_full_replacement(self): + """set_quantizer_attributes_full replaces all attributes on matched quantizers.""" + model = self._quantize(SimpleLinear()) + attrs = QuantizerAttributeConfig(num_bits=4, axis=0) + set_quantizer_attributes_full(model, "*weight_quantizer", attrs) + for name, module in model.named_modules(): + if name.endswith("weight_quantizer"): + assert isinstance(module, TensorQuantizer) + assert module.num_bits == 4 + assert module.axis == 0 + + def test_atomicity_unset_fields_revert_to_defaults(self): + """A full replacement reverts unspecified fields to QuantizerAttributeConfig defaults.""" + model = self._quantize(SimpleLinear()) + # First configure with axis=0 (non-default) + set_quantizer_attributes_full( + model, "*weight_quantizer", QuantizerAttributeConfig(num_bits=8, axis=0) + ) + for name, module in model.named_modules(): + if name.endswith("weight_quantizer"): + assert module.axis == 0 + + # Now replace with only num_bits=4; axis should 
revert to default (None) + set_quantizer_attributes_full( + model, "*weight_quantizer", QuantizerAttributeConfig(num_bits=4) + ) + default_axis = QuantizerAttributeConfig().axis + for name, module in model.named_modules(): + if name.endswith("weight_quantizer"): + assert module.num_bits == 4 + assert module.axis == default_axis + + def test_parent_class_filter(self): + """parent_class restricts which quantizers are affected.""" + model = self._quantize(SimpleConvLinear()) + # Only set num_bits=4 for quantizers inside nn.Linear modules + set_quantizer_attributes_full( + model, + "*weight_quantizer", + QuantizerAttributeConfig(num_bits=4), + parent_class=torch.nn.Linear, + ) + for name, module in model.named_modules(): + if not name.endswith("weight_quantizer"): + continue + parent_name = name.rpartition(".")[0] + parent = model.get_submodule(parent_name) + if isinstance(parent, torch.nn.Linear): + assert module.num_bits == 4 + else: + # Conv2d weight_quantizers should be unchanged (still 8-bit from INT8_DEFAULT_CFG) + assert module.num_bits == 8 + + def test_wildcard_no_match_is_noop(self): + """A wildcard that matches nothing silently does nothing.""" + model = self._quantize(SimpleLinear()) + # Record state before + bits_before = { + n: m.num_bits for n, m in model.named_modules() if isinstance(m, TensorQuantizer) + } + set_quantizer_attributes_full( + model, "*nonexistent_quantizer*", QuantizerAttributeConfig(num_bits=4) + ) + bits_after = { + n: m.num_bits for n, m in model.named_modules() if isinstance(m, TensorQuantizer) + } + assert bits_before == bits_after + + def test_invalid_attributes_type_raises(self): + """Passing a plain dict instead of QuantizerAttributeConfig raises ValueError.""" + model = self._quantize(SimpleLinear()) + with pytest.raises((ValueError, AttributeError)): + set_quantizer_attributes_full(model, "*weight_quantizer", {"num_bits": 4}) # type: ignore[arg-type] + + def test_list_attributes_creates_sequential_quantizer(self): + """A list 
of QuantizerAttributeConfig replaces TensorQuantizer with SequentialQuantizer.""" + model = self._quantize(SimpleLinear()) + attrs = [ + QuantizerAttributeConfig(num_bits=4, block_sizes={-1: 128}), + QuantizerAttributeConfig(num_bits=8, axis=0), + ] + set_quantizer_attributes_full(model, "*weight_quantizer", attrs) + for name, module in model.named_modules(): + if name.endswith("weight_quantizer"): + assert isinstance(module, SequentialQuantizer) + assert len(module) == 2 From 5115452d18b868effe7c8a947dd15f17ccc9f01e Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Mon, 23 Mar 2026 03:18:09 +0000 Subject: [PATCH 25/32] python < 3.12 Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 064f9a671..78dd51166 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -150,9 +150,10 @@ """ -from typing import Any, Literal, TypedDict, cast +from typing import Any, Literal, cast from pydantic import ValidationInfo, field_validator, model_validator +from typing_extensions import TypedDict from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.utils.network import ConstructorLike From a481bd17e44c479a2b2377cd5296606a4821b534 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Mon, 23 Mar 2026 07:36:27 +0000 Subject: [PATCH 26/32] more fix dict to list Signed-off-by: Shengliang Xu --- .../torch/quantization/utils/core_utils.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 54f146072..b9008a702 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -27,6 +27,7 @@ from torch.distributed.fsdp._fully_shard._fsdp_param import 
FSDPParam from torch.distributed.tensor import Replicate +from modelopt.torch.quantization.config import QuantizerCfgEntry from modelopt.torch.utils import get_unwrapped_name, print_rank_0 if TYPE_CHECKING: @@ -827,13 +828,25 @@ def fsdp2_aware_weight_update(root_model, modules_to_update, reshard=True): def update_quant_cfg_with_kv_cache_quant( - quant_cfg: dict[str, Any], kv_cache_quant_cfg: dict[str, Any] + quant_cfg: dict[str, Any], kv_cache_quant_cfg: list[QuantizerCfgEntry] ) -> dict[str, Any]: - """Update the quant_cfg with the kv cache quant_cfg.""" + """Update the quant_cfg with the kv cache quant_cfg. + + Args: + quant_cfg: The outer quantization config dict (with ``"quant_cfg"`` and ``"algorithm"`` keys). + kv_cache_quant_cfg: A list of :class:`QuantizerCfgEntry + ` dicts for KV cache quantization, + typically ``some_kv_cfg["quant_cfg"]``. + + Returns: + A deep copy of ``quant_cfg`` with the KV cache entries appended to ``quant_cfg["quant_cfg"]``. + """ # If quant_cfg["quant_cfg"] is None, it corresponds to only kv cache quantization case quant_cfg = copy.deepcopy(quant_cfg) - inner: list = quant_cfg.get("quant_cfg") or [{"quantizer_path": "*", "enable": False}] - quant_cfg["quant_cfg"] = inner + list(kv_cache_quant_cfg.items()) + inner: list[QuantizerCfgEntry] = quant_cfg.get("quant_cfg") or [ + {"quantizer_path": "*", "enable": False} + ] + quant_cfg["quant_cfg"] = inner + list(kv_cache_quant_cfg) # Set default algorithm for kv cache quantization if not provided. 
if not quant_cfg.get("algorithm"): From fe2d2f3db7ed507b493131dc6c8acbc4c27f9412 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Mon, 23 Mar 2026 07:47:07 +0000 Subject: [PATCH 27/32] KV config has only quant_cfg meaningful Signed-off-by: Shengliang Xu --- modelopt/torch/quantization/config.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 78dd51166..7968c56ba 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -516,8 +516,7 @@ class QuantizerCfgEntry(TypedDict, total=False): }, "enable": True, }, - ], - "algorithm": "max", + ] } FP8_AFFINE_KV_CFG = { @@ -529,8 +528,7 @@ class QuantizerCfgEntry(TypedDict, total=False): "bias": {-2: None, -4: None, "type": "static"}, }, }, - ], - "algorithm": "max", + ] } _nvfp4_cfg = { @@ -646,13 +644,13 @@ def _nvfp4_selective_quant_cfg( }, "enable": True, }, - ], + ] } NVFP4_KV_CFG = { "quant_cfg": [ {"quantizer_path": "*[kv]_bmm_quantizer", "cfg": _nvfp4_cfg, "enable": True}, - ], + ] } # Moved from examples/diffusers/quantization/config.py to here @@ -714,8 +712,7 @@ def _nvfp4_selective_quant_cfg( "enable": True, }, {"quantizer_path": "*v_bmm_quantizer", "cfg": _nvfp4_cfg, "enable": True}, - ], - "algorithm": "max", + ] } NVFP4_SVDQUANT_DEFAULT_CFG = _nvfp4_selective_quant_cfg( From b9d67d3b7a4b95737cf04746ff7a1f899baecc50 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Wed, 25 Mar 2026 18:28:05 +0000 Subject: [PATCH 28/32] fix tests Signed-off-by: Shengliang Xu --- .../quantization/test_real_quantize_cuda.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/gpu/torch/quantization/test_real_quantize_cuda.py b/tests/gpu/torch/quantization/test_real_quantize_cuda.py index 2c6512896..e94210ff7 100644 --- a/tests/gpu/torch/quantization/test_real_quantize_cuda.py +++ b/tests/gpu/torch/quantization/test_real_quantize_cuda.py 
@@ -47,10 +47,13 @@ def test_real_quantize(model_cls, config): # update config to fit test cases if config == mtq.INT4_AWQ_CFG: # reduce block sizes for simple testing models - config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = { - -1: 16, - "scale_bits": 8, - } + for entry in config["quant_cfg"]: + if entry.get("quantizer_path") == "*weight_quantizer": + entry.setdefault("cfg", {})["block_sizes"] = { + -1: 16, + "scale_bits": 8, + } + break if model_cls is SimpleConv or model_cls is SimpleConvLinear: pytest.skip( "INT4_AWQ_CFG requires even number of elements on last dimension for weights." @@ -101,10 +104,13 @@ def test_save_restore(model_cls, config): # update config to fit test cases if config == mtq.INT4_AWQ_CFG: # reduce block sizes for simple testing models - config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = { - -1: 16, - "scale_bits": 8, - } + for entry in config["quant_cfg"]: + if entry.get("quantizer_path") == "*weight_quantizer": + entry.setdefault("cfg", {})["block_sizes"] = { + -1: 16, + "scale_bits": 8, + } + break if model_cls is SimpleConv or model_cls is SimpleConvLinear: pytest.skip( "INT4_AWQ_CFG requires even number of elements on last dimension for weights." From 9bcd06e867cefeab289c745fda2da4a3033207af Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 26 Mar 2026 00:33:12 +0000 Subject: [PATCH 29/32] fix: entry is a dict Signed-off-by: Shengliang Xu --- examples/llm_ptq/hf_ptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 34d7bb0de..9c6335b9d 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -329,7 +329,7 @@ def forward_step(model, batch): ), verbose=True, # Disable all default disabled layers such as lm_head, mlp.gate, router etc. 
- disabled_layers=[entry.quantizer_path for entry in _default_disabled_quantizer_cfg], + disabled_layers=[entry["quantizer_path"] for entry in _default_disabled_quantizer_cfg], method=auto_quantize_method, checkpoint=auto_quantize_checkpoint, ) From 2721483b7b7b479eb6947db96c6b85465ec66fdb Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 26 Mar 2026 01:41:05 +0000 Subject: [PATCH 30/32] fix megatron tests Signed-off-by: Shengliang Xu --- .../quantization/plugins/test_megatron.py | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index dca5b6023..e19da18db 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -383,36 +383,44 @@ def _test_sharded_state_dict( mixed_precision_config = copy.deepcopy(mtq.W4A8_AWQ_BETA_CFG) -mixed_precision_config["quant_cfg"].update( - { - "*.1.*": {"enable": False}, - "*.2.*weight_quantizer": {"num_bits": (4, 3), "axis": None}, - "*.2.*input_quantizer": {"num_bits": (4, 3), "axis": None}, - "*.3.*weight_quantizer.0": {"num_bits": 8, "axis": 0}, - "*.3.*weight_quantizer.1": {"enable": False}, - "*.3.*input_quantizer": {"num_bits": 8, "axis": None}, - } +mixed_precision_config["quant_cfg"].extend( + [ + {"quantizer_path": "*.1.*", "enable": False}, + {"quantizer_path": "*.2.*weight_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*.2.*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + {"quantizer_path": "*.3.*weight_quantizer.0", "cfg": {"num_bits": 8, "axis": 0}}, + {"quantizer_path": "*.3.*weight_quantizer.1", "enable": False}, + {"quantizer_path": "*.3.*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, + ] ) mixed_block_size_config = copy.deepcopy(mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG) 
-mixed_block_size_config["quant_cfg"].update( - { - "*.1.*": {"enable": False}, - "*.2.*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 64}, "enable": True}, - "*.2.*input_quantizer": {"num_bits": (4, 3), "axis": None}, - "*.3.*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 128, -2: 64}, "enable": True}, - "*.3.*input_quantizer": {"num_bits": 8, "axis": None}, - } +mixed_block_size_config["quant_cfg"].extend( + [ + {"quantizer_path": "*.1.*", "enable": False}, + { + "quantizer_path": "*.2.*weight_quantizer", + "cfg": {"num_bits": 4, "block_sizes": {-1: 64}}, + "enable": True, + }, + {"quantizer_path": "*.2.*input_quantizer", "cfg": {"num_bits": (4, 3), "axis": None}}, + { + "quantizer_path": "*.3.*weight_quantizer", + "cfg": {"num_bits": 4, "block_sizes": {-1: 128, -2: 64}}, + "enable": True, + }, + {"quantizer_path": "*.3.*input_quantizer", "cfg": {"num_bits": 8, "axis": None}}, + ] ) # Combined NVFP4 GEMM + KV cache quantization config NVFP4_GEMM_KV_CFG = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG) -NVFP4_GEMM_KV_CFG["quant_cfg"].update(mtq.NVFP4_KV_CFG["quant_cfg"]) +NVFP4_GEMM_KV_CFG["quant_cfg"].extend(mtq.NVFP4_KV_CFG["quant_cfg"]) # Combined FP8 GEMM + KV cache quantization config FP8_GEMM_KV_CFG = copy.deepcopy(mtq.FP8_DEFAULT_CFG) -FP8_GEMM_KV_CFG["quant_cfg"].update(mtq.FP8_KV_CFG["quant_cfg"]) +FP8_GEMM_KV_CFG["quant_cfg"].extend(mtq.FP8_KV_CFG["quant_cfg"]) @pytest.mark.parametrize( From 9752f05b7f608225c635ca319110a48ba8373f20 Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 26 Mar 2026 01:48:56 +0000 Subject: [PATCH 31/32] fix deepseek example semantic Signed-off-by: Shengliang Xu --- examples/deepseek/ptq.py | 62 ++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/examples/deepseek/ptq.py b/examples/deepseek/ptq.py index bcfd9de40..faad47eca 100644 --- a/examples/deepseek/ptq.py +++ b/examples/deepseek/ptq.py @@ -309,38 +309,70 @@ def calibrate_loop(model): mtq_cfg = getattr(mtq, 
quant_cfg) # disable head that corresponds to lm_head (for the huggingface checkpoint) - mtq_cfg["quant_cfg"]["*head*"] = {"enable": False} + mtq_cfg["quant_cfg"].append({"quantizer_path": "*head*", "enable": False}) allowed_mla_quant = [None, "per_tensor_fp8", "nvfp4"] assert mla_quant in allowed_mla_quant, f"mla_quant must be {allowed_mla_quant}" if not mla_quant: - mtq_cfg["quant_cfg"]["*attn*"] = {"enable": False} + mtq_cfg["quant_cfg"].append({"quantizer_path": "*attn*", "enable": False}) elif mla_quant == "per_tensor_fp8": - mtq_cfg["quant_cfg"]["*attn*weight_quantizer"] = {"num_bits": (4, 3), "axis": None} - mtq_cfg["quant_cfg"]["*attn*input_quantizer"] = {"num_bits": (4, 3), "axis": None} + mtq_cfg["quant_cfg"].extend( + [ + { + "quantizer_path": "*attn*weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + { + "quantizer_path": "*attn*input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + }, + ] + ) elif mla_quant == "nvfp4": # for DeepSeek-R1-0528-NVFP4-Turbo mla_linear_layers = ["*wq_a*", "*wq_b*", "*wkv_a*", "*wkv_b*", "*wo*"] mla_nvfp4_linear_layers = ["*wq_a*", "*wkv_a*", "*wq_b*", "*wo*"] for layer in mla_linear_layers: if layer in mla_nvfp4_linear_layers: # wq_a, wkv_a, wq_b, wo use NVFP4 quantization - mtq_cfg["quant_cfg"][layer + "_quantizer"] = { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, - } + mtq_cfg["quant_cfg"].append( + { + "quantizer_path": layer + "_quantizer", + "cfg": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + }, + "enable": True, + } + ) else: - mtq_cfg["quant_cfg"][layer + "_quantizer"] = {"enable": False} + mtq_cfg["quant_cfg"].append( + {"quantizer_path": layer + "_quantizer", "enable": False} + ) # Disable BMM quantizers - mtq_cfg["quant_cfg"]["*attn.kv_bmm_quantizer*"] = {"enable": False} - mtq_cfg["quant_cfg"]["*attn.pe_bmm_quantizer*"] = {"enable": False} + 
mtq_cfg["quant_cfg"].extend( + [ + {"quantizer_path": "*attn.kv_bmm_quantizer*", "enable": False}, + {"quantizer_path": "*attn.pe_bmm_quantizer*", "enable": False}, + ] + ) if not args.disable_wo_quant and "FP4" in quant_cfg: - mtq_cfg["quant_cfg"]["*wo*weight_quantizer"] = mtq_cfg["quant_cfg"]["*input_quantizer"] - mtq_cfg["quant_cfg"]["*wo*input_quantizer"] = mtq_cfg["quant_cfg"]["*weight_quantizer"] + # Find the default input/weight quantizer cfgs to swap for wo layers + input_cfg = next( + e["cfg"] for e in mtq_cfg["quant_cfg"] if e.get("quantizer_path") == "*input_quantizer" + ) + weight_cfg = next( + e["cfg"] for e in mtq_cfg["quant_cfg"] if e.get("quantizer_path") == "*weight_quantizer" + ) + mtq_cfg["quant_cfg"].extend( + [ + {"quantizer_path": "*wo*weight_quantizer", "cfg": input_cfg}, + {"quantizer_path": "*wo*input_quantizer", "cfg": weight_cfg}, + ] + ) ## ptq transformer = mtq.quantize(transformer, mtq_cfg, calibrate_loop) From cd65849004699264bc7cc2a98c8a0bc169c2b44a Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 26 Mar 2026 06:10:57 +0000 Subject: [PATCH 32/32] more fixes Signed-off-by: Shengliang Xu --- examples/diffusers/quantization/quantize.py | 7 ++++++- .../torch/quantization/plugins/test_megatron.py | 2 +- .../torch/quantization/plugins/test_transformer_engine.py | 5 ++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/diffusers/quantization/quantize.py b/examples/diffusers/quantization/quantize.py index 612357f6e..cb4b1e003 100644 --- a/examples/diffusers/quantization/quantize.py +++ b/examples/diffusers/quantization/quantize.py @@ -137,7 +137,12 @@ def get_quant_config(self, n_steps: int, backbone: torch.nn.Module) -> Any: else: raise NotImplementedError(f"Unknown format {self.config.format}") if self.config.quantize_mha: - quant_config["quant_cfg"]["*[qkv]_bmm_quantizer"] = {"num_bits": (4, 3), "axis": None} # type: ignore[index] + quant_config["quant_cfg"].append( + { + "quantizer_path": 
"*[qkv]_bmm_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) set_quant_config_attr( quant_config, self.model_config.trt_high_precision_dtype.value, diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index e19da18db..8075ddc13 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -304,7 +304,7 @@ def _test_sharded_state_dict( ): # Must disable output_layer quantization since output_layer amax cannot be restore via # sharded_state_dict. All output_layer quantizers state are removed. - config["quant_cfg"]["*output_layer*"] = {"enable": False} + config["quant_cfg"].append({"quantizer_path": "*output_layer*", "enable": False}) if modelopt_version is not None: mto.conversion.__version__ = modelopt_version diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py b/tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py index 288cc7519..348d89af2 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_transformer_engine.py @@ -73,7 +73,10 @@ def test_quantize(model_cls, config): if config == mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG: # reduce block sizes for simple testing models - config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 8, -2: 8} + for entry in config["quant_cfg"]: + if entry.get("quantizer_path") == "*weight_quantizer": + entry["cfg"]["block_sizes"] = {-1: 8, -2: 8} + break model = model_cls().cuda() calib_data = [model.get_input().cuda() for _ in range(1)] quantize_model_and_forward(model, config, calib_data)