Commit bcfc520

up

1 parent dfbd681 commit bcfc520

File tree

2 files changed: +230 -8 lines changed

torchao/experimental/quant_api.py

Lines changed: 149 additions & 0 deletions
@@ -1108,3 +1108,152 @@ def quantize(self, model: nn.Module) -> nn.Module:
             },
         )
         return model
+
+
+def _get_q_dq_patterns_and_replacements(weight_bit_width, has_weight_zeros, target):
+    w_qmin = -(1 << (weight_bit_width - 1))
+    w_qmax = (1 << (weight_bit_width - 1)) - 1
+    a_qmin = -128
+    a_qmax = 127
+
+    if not has_weight_zeros:
+
+        def pattern(a, w_int, w_scale, bias, group_size, a_block):
+            a_scale, a_zero = torch.ops.quant.choose_qparams_affine.default(
+                a,
+                "ASYMMETRIC",
+                a_block,
+                torch.int32,
+                a_qmin,
+                a_qmax,
+                None,
+                torch.float32,
+                torch.int32,
+            )
+            q_a = torch.ops.quant.quantize_affine.default(
+                a, a_block, a_scale, a_zero, torch.int32, a_qmin, a_qmax
+            )
+            dq_a = torch.ops.quant.dequantize_affine.default(
+                q_a, a_block, a_scale, a_zero, torch.int32, a_qmin, a_qmax
+            )
+            dq_w = torch.ops.quant.dequantize_affine.default(
+                w_int,
+                [1, group_size],
+                w_scale,
+                None,
+                torch.int32,
+                w_qmin,
+                w_qmax,
+                "NONE",
+            )
+            return torch.ops.aten.linear.default(dq_a, dq_w, bias)
+
+        def replacement(a, w_int, w_scale, bias, group_size, a_block):
+            n = w_int.size(0)
+            k = a_block[-1]
+            out_shape = a.shape[:-1] + (n,)
+            packed_weight = getattr(
+                torch.ops.torchao,
+                f"_pack_8bit_act_{weight_bit_width}bit_weight",
+            )(
+                w_int.to(torch.int8),
+                w_scale.reshape(-1),
+                None,
+                group_size,
+                bias,
+                target,
+            )
+            return getattr(
+                torch.ops.torchao, f"_linear_8bit_act_{weight_bit_width}bit_weight"
+            )(a.reshape(-1, k), packed_weight, group_size, n, k).reshape(out_shape)
+
+    else:
+
+        def pattern(a, w_int, w_scale, w_zero, bias, group_size, a_block):
+            a_scale, a_zero = torch.ops.quant.choose_qparams_affine.default(
+                a,
+                "ASYMMETRIC",
+                a_block,
+                torch.int32,
+                a_qmin,
+                a_qmax,
+                None,
+                torch.float32,
+                torch.int32,
+            )
+            q_a = torch.ops.quant.quantize_affine.default(
+                a, a_block, a_scale, a_zero, torch.int32, a_qmin, a_qmax
+            )
+            dq_a = torch.ops.quant.dequantize_affine.default(
+                q_a, a_block, a_scale, a_zero, torch.int32, a_qmin, a_qmax
+            )
+            dq_w = torch.ops.quant.dequantize_affine.default(
+                w_int, [1, group_size], w_scale, w_zero, torch.int32, w_qmin, w_qmax
+            )
+            return torch.ops.aten.linear.default(dq_a, dq_w, bias)
+
+        def replacement(a, w_int, w_scale, w_zero, bias, group_size, a_block):
+            n = w_int.size(0)
+            k = a_block[-1]
+            out_shape = a.shape[:-1] + (n,)
+            packed_weight = getattr(
+                torch.ops.torchao,
+                f"_pack_8bit_act_{weight_bit_width}bit_weight",
+            )(
+                w_int.to(torch.int8),
+                w_scale.reshape(-1),
+                w_zero.reshape(-1).to(torch.int8),
+                group_size,
+                bias,
+                target,
+            )
+            return getattr(
+                torch.ops.torchao, f"_linear_8bit_act_{weight_bit_width}bit_weight"
+            )(a.reshape(-1, k), packed_weight, group_size, n, k).reshape(out_shape)
+
+    return pattern, replacement
+
+
+def replace_q_dq_with_torchao_quantized_linear_ops(
+    ep: torch.export.ExportedProgram, target=None
+):
+    # TODO: figure out how to do this with dynamic_shapes (not saved on EP for easy re-export)
+    assert (
+        len(ep.range_constraints) == 0
+    ), "ExportedProgram with range constraints are not supported"
+
+    import itertools
+
+    from torch._export.passes.constant_folding import constant_fold
+    from torch.fx import subgraph_rewriter
+
+    def filter_invalid_a_block(match, x, y):
+        """
+        We only want a_block with shape [1, ..., 1, k]
+        """
+        a_block_node = [n for n in match.nodes_map if n.name == "a_block"]
+        assert len(a_block_node) == 1
+        a_block_node = a_block_node[0]
+        a_block_node_val = match.nodes_map[a_block_node]
+        for v in a_block_node_val[0:-1]:
+            if v != 1:
+                return False
+        return True
+
+    gm = (
+        ep.module()
+    )  # module() unlifts the inputs, which is needed for constant folding
+    for weight_bit_width, has_weight_zeros in itertools.product(
+        range(1, 9), [False, True]
+    ):
+        pattern, replacement = _get_q_dq_patterns_and_replacements(
+            weight_bit_width, has_weight_zeros, target
+        )
+        subgraph_rewriter.replace_pattern_with_filters(
+            gm, pattern, replacement, match_filters=[filter_invalid_a_block]
+        )
+
+    # Constant fold evaluates and removes the packing ops
+    constant_fold(gm)
+
+    # Re-export
+    return torch.export.export(gm, *ep.example_inputs)
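
For orientation, a minimal usage sketch of the new pass, condensed from the test added in the second file below; the toy module, bit width, and granularity are illustrative only and are not part of this commit.

import torch

from torchao.experimental.q_dq_layout import QDQLayout
from torchao.experimental.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
    replace_q_dq_with_torchao_quantized_linear_ops,
)
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import quantize_
from torchao.utils import unwrap_tensor_subclass

# Quantize with QDQLayout so the exported graph carries explicit
# choose_qparams / quantize / dequantize (Q/DQ) ops around each linear.
model = torch.nn.Sequential(torch.nn.Linear(256, 128, bias=True))
activations = torch.randn(2, 1, 256, dtype=torch.float32)
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,
        granularity=PerGroup(64),
        has_weight_zeros=True,
        layout=QDQLayout(),
    ),
)

# Export, then rewrite the Q/DQ patterns into torchao's packed quantized
# linear ops; the packing ops are constant-folded away inside the pass.
unwrap_tensor_subclass(model)
exported = torch.export.export(model, (activations,), strict=True)
exported = replace_q_dq_with_torchao_quantized_linear_ops(exported)

After the pass, each quantized linear appears as a single torch.ops.torchao._linear_8bit_act_*bit_weight call and no Q/DQ or packing ops remain, which is what the FileCheck assertions in the new test verify.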

torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py

Lines changed: 81 additions & 8 deletions
@@ -17,7 +17,10 @@
     PackedLinearInt8DynamicActivationIntxWeightLayout,
 )
 from torchao.experimental.q_dq_layout import QDQLayout
-from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight
+from torchao.experimental.quant_api import (
+    Int8DynamicActivationIntxWeightConfig,
+    replace_q_dq_with_torchao_quantized_linear_ops,
+)
 from torchao.quantization.granularity import PerGroup, PerRow
 from torchao.quantization.quant_api import quantize_
 from torchao.utils import unwrap_tensor_subclass
@@ -79,7 +82,7 @@ def test_accuracy(self, layout, weight_dtype, has_weight_zeros, granularity):
         quantized_model = copy.deepcopy(model)
         quantize_(
             quantized_model,
-            int8_dynamic_activation_intx_weight(
+            Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=weight_dtype,
                 granularity=granularity,
                 has_weight_zeros=has_weight_zeros,
@@ -91,7 +94,7 @@ def test_accuracy(self, layout, weight_dtype, has_weight_zeros, granularity):
         quantized_model_reference = copy.deepcopy(model)
         quantize_(
             quantized_model_reference,
-            int8_dynamic_activation_intx_weight(
+            Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=weight_dtype,
                 granularity=granularity,
                 has_weight_zeros=has_weight_zeros,
@@ -124,7 +127,7 @@ def test_accuracy_aten(self):
         quantized_model = copy.deepcopy(model)
         quantize_(
             quantized_model,
-            int8_dynamic_activation_intx_weight(
+            Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=weight_dtype,
                 granularity=granularity,
                 has_weight_zeros=has_weight_zeros,
@@ -136,7 +139,7 @@ def test_accuracy_aten(self):
         quantized_model_reference = copy.deepcopy(model)
         quantize_(
             quantized_model_reference,
-            int8_dynamic_activation_intx_weight(
+            Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=weight_dtype,
                 granularity=granularity,
                 has_weight_zeros=has_weight_zeros,
@@ -183,7 +186,7 @@ def test_export_compile_aoti_PackedLinearInt8DynamicActivationIntxWeightLayout(
 
         quantize_(
             model,
-            int8_dynamic_activation_intx_weight(
+            Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=weight_dtype,
                 granularity=granularity,
                 has_weight_zeros=has_weight_zeros,
@@ -245,7 +248,7 @@ def test_export_dynamic_shape_PackedLinearInt8DynamicActivationIntxWeightLayout(
 
         quantize_(
             model,
-            int8_dynamic_activation_intx_weight(
+            Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=weight_dtype,
                 granularity=granularity,
                 has_weight_zeros=has_weight_zeros,
@@ -278,7 +281,7 @@ def test_export_QDQLayout(self):
 
         quantize_(
             model,
-            int8_dynamic_activation_intx_weight(
+            Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=weight_dtype,
                 granularity=granularity,
                 has_weight_zeros=has_weight_zeros,
@@ -304,6 +307,76 @@ def test_export_QDQLayout(self):
             exported.graph_module.code
         )
 
+    def test_replace_q_dq_with_torchao_quantized_linear_ops(self):
+        layers = [
+            torch.nn.Linear(256, 128, bias=True),
+            torch.nn.Linear(128, 64, bias=False),
+            torch.nn.Linear(64, 32, bias=True),
+        ]
+        model = torch.nn.Sequential(*layers)
+        activations = torch.randn(2, 1, 256, dtype=torch.float32)
+        quantize_(
+            model,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int4,
+                granularity=PerGroup(64),
+                has_weight_zeros=True,
+                layout=QDQLayout(),
+            ),
+            lambda m, fqn: fqn == "0",
+        )
+        quantize_(
+            model,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int3,
+                granularity=PerRow(),
+                has_weight_zeros=False,
+                layout=QDQLayout(),
+            ),
+            lambda m, fqn: fqn == "1",
+        )
+        quantize_(
+            model,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int5,
+                granularity=PerGroup(32),
+                has_weight_zeros=False,
+                layout=QDQLayout(),
+            ),
+            lambda m, fqn: fqn == "2",
+        )
+
+        eager_results = model(activations)
+
+        unwrap_tensor_subclass(model)
+        exported = torch.export.export(model, (activations,), strict=True)
+        exported = replace_q_dq_with_torchao_quantized_linear_ops(exported)
+
+        # We should not find pack op because it gets constant folded
+        FileCheck().check_not("torch.ops.torchao._pack_8bit_act").run(
+            exported.graph_module.code
+        )
+
+        # We should find 3 torchao linear ops
+        FileCheck().check_count(
+            "torch.ops.torchao._linear_8bit_act_", count=3, exactly=True
+        ).run(exported.graph_module.code)
+
+        # We should not find Q/DQ ops
+        FileCheck().check_not("torch.ops.quant.quantize_affine.default").run(
+            exported.graph_module.code
+        )
+        FileCheck().check_not("torch.ops.quant.dequantize_affine.default").run(
+            exported.graph_module.code
+        )
+        FileCheck().check_not("torch.ops.quant.choose_qparams_affine.default").run(
+            exported.graph_module.code
+        )
+
+        # Numerics should match
+        exported_results = exported.module()(activations)
+        self.assertTrue(torch.allclose(exported_results, eager_results))
+
 
 if __name__ == "__main__":
     unittest.main()
