diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 4ceb51cd2c..95b06d9a6d 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -1451,6 +1451,7 @@ def get_quant_config(
         default_quantization = {
             "quant_algo": None,
             "kv_cache_quant_algo": None,
+            "scale_fmt": "ue8m0",
         }
         quant_config["quantization"] = default_quantization

diff --git a/tests/gpu/torch/export/test_export.py b/tests/gpu/torch/export/test_export.py
index 55eee2c138..beaca16787 100644
--- a/tests/gpu/torch/export/test_export.py
+++ b/tests/gpu/torch/export/test_export.py
@@ -399,6 +399,7 @@ def test_get_scaling_factor(
                 "exclude_modules": ["linears.0", "linears.2"],
                 "kv_cache_quant_algo": None,
                 "quant_algo": "FP8",
+                "scale_fmt": "ue8m0",
             },
         ),
         (
@@ -410,6 +411,7 @@ def test_get_scaling_factor(
                 "kv_cache_quant_algo": None,
                 "pre_quant_scale": True,
                 "quant_algo": "W4A8_AWQ",
+                "scale_fmt": "ue8m0",
             },
         ),
         (
@@ -419,6 +421,7 @@ def test_get_scaling_factor(
                 "group_size": 16,
                 "kv_cache_quant_algo": None,
                 "quant_algo": "NVFP4",
+                "scale_fmt": "ue8m0",
             },
         ),
         (
@@ -430,6 +433,7 @@ def test_get_scaling_factor(
                 "pre_quant_scale": True,
                 "kv_cache_quant_algo": None,
                 "quant_algo": "NVFP4_AWQ",
+                "scale_fmt": "ue8m0",
             },
         ),
         (
@@ -441,6 +445,7 @@ def test_get_scaling_factor(
                 "kv_cache_quant_algo": None,
                 "pre_quant_scale": True,
                 "quant_algo": "W4A16_AWQ",
+                "scale_fmt": "ue8m0",
             },
         ),
         (
@@ -449,6 +454,7 @@ def test_get_scaling_factor(
                 "exclude_modules": ["linears.0", "linears.2"],
                 "quant_algo": "FP8",
                 "kv_cache_quant_algo": "FP8",
+                "scale_fmt": "ue8m0",
             },
         ),
         (
@@ -457,6 +463,7 @@ def test_get_scaling_factor(
                 "exclude_modules": ["linears.0", "linears.2"],
                 "quant_algo": "FP8",
                 "kv_cache_quant_algo": "INT8",
+                "scale_fmt": "ue8m0",
             },
         ),
     ],