From 6e80cce4e32cc97db923157ed7bc04b858ba930f Mon Sep 17 00:00:00 2001
From: samcheng <samcheng@nvidia.com>
Date: Fri, 20 Mar 2026 09:37:30 +0000
Subject: [PATCH 1/5] Exclude small-channel Conv nodes from FP8 quantization

Small-channel convolutions (IC or OC <= 16) do not benefit from FP8
quantization and cause performance regressions due to poor kernel
selection. Hardcode the threshold directly in
graph_utils rather than exposing it as a CLI flag.

Signed-off-by: samcheng <samcheng@nvidia.com>
---
 modelopt/onnx/quantization/graph_utils.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py
index 131723e61..48c970f71 100755
--- a/modelopt/onnx/quantization/graph_utils.py
+++ b/modelopt/onnx/quantization/graph_utils.py
@@ -1082,12 +1082,17 @@ def find_nodes_from_matmul_to_exclude(
     return [*set(nodes_to_exclude)]
 
 
+_MIN_CHANNELS_FP8 = 16
+
+
 def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
     """Find unsupported Conv nodes to exclude from quantization.
 
     - The input and output channels should be >= 16. The exception is for Conv layers in INT8 quantization mode,
       which supports it if the input or output channel % 8.
     - The filter size for FP8 conv kernels should be less than 32.
+    - For FP8 mode, Conv nodes with input or output channels < _MIN_CHANNELS_FP8 are excluded.
+      Small-channel convolutions do not benefit from FP8 quantization.
 
     Args:
         graph: Onnx model graph.
@@ -1147,6 +1152,20 @@ def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
             if quantize_mode == "fp8" and filter_size > 32:
                 logger.debug(f"Found large filter conv for FP8: {node.name}")
                 unsupported_conv_nodes.append(node.name)
+                continue
+
+            # For FP8, exclude small-channel convolutions. These layers do not benefit from
+            # FP8 quantization and cause perf regressions on GPUs where the FP8 conv kernels
+            # are slower than FP16 CASK kernels for small channels.
+            if (
+                quantize_mode == "fp8"
+                and (output_channel <= _MIN_CHANNELS_FP8 or input_channel <= _MIN_CHANNELS_FP8)
+            ):
+                logger.debug(
+                    f"Excluding small-channel Conv from FP8 quantization: {node.name} "
+                    f"(IC={input_channel}, OC={output_channel}, threshold={_MIN_CHANNELS_FP8})"
+                )
+                unsupported_conv_nodes.append(node.name)
 
     logger.info(f"Found {len(unsupported_conv_nodes)} unsupported Conv nodes for quantization")
     return unsupported_conv_nodes

From 7ac3649bd79ff9c73ffd75d14e640c3b8e77d2d4 Mon Sep 17 00:00:00 2001
From: nv-samcheng <130026689+nv-samcheng@users.noreply.github.com>
Date: Fri, 20 Mar 2026 17:51:59 +0800
Subject: [PATCH 2/5] Update modelopt/onnx/quantization/graph_utils.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Signed-off-by: nv-samcheng <130026689+nv-samcheng@users.noreply.github.com>
---
 modelopt/onnx/quantization/graph_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py
index 48c970f71..ea6e1bc3f 100755
--- a/modelopt/onnx/quantization/graph_utils.py
+++ b/modelopt/onnx/quantization/graph_utils.py
@@ -1091,7 +1091,7 @@ def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
     - The input and output channels should be >= 16. The exception is for Conv layers in INT8 quantization mode,
       which supports it if the input or output channel % 8.
     - The filter size for FP8 conv kernels should be less than 32.
-    - For FP8 mode, Conv nodes with input or output channels < _MIN_CHANNELS_FP8 are excluded.
+    - For FP8 mode, Conv nodes with input or output channels <= _MIN_CHANNELS_FP8 are excluded.
       Small-channel convolutions do not benefit from FP8 quantization.
 
     Args:

From acc4cc532fd853107e25579f0251bd5c923aeb73 Mon Sep 17 00:00:00 2001
From: samcheng <samcheng@nvidia.com>
Date: Fri, 20 Mar 2026 14:39:32 +0000
Subject: [PATCH 3/5] Fix pre-commit formatting and add unit tests for FP8
 small-channel exclusion

Apply ruff formatting fix for the if-condition and add unit tests
to cover the FP8 small-channel Conv exclusion logic in graph_utils.

Signed-off-by: samcheng <samcheng@nvidia.com>
Made-with: Cursor
---
 modelopt/onnx/quantization/graph_utils.py     |  5 +-
 .../onnx/quantization/test_graph_utils.py     | 71 +++++++++++++++++++
 2 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 tests/unit/onnx/quantization/test_graph_utils.py

diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py
index ea6e1bc3f..4141d7067 100755
--- a/modelopt/onnx/quantization/graph_utils.py
+++ b/modelopt/onnx/quantization/graph_utils.py
@@ -1157,9 +1157,8 @@ def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
             # For FP8, exclude small-channel convolutions. These layers do not benefit from
             # FP8 quantization and cause perf regressions on GPUs where the FP8 conv kernels
             # are slower than FP16 CASK kernels for small channels.
-            if (
-                quantize_mode == "fp8"
-                and (output_channel <= _MIN_CHANNELS_FP8 or input_channel <= _MIN_CHANNELS_FP8)
+            if quantize_mode == "fp8" and (
+                output_channel <= _MIN_CHANNELS_FP8 or input_channel <= _MIN_CHANNELS_FP8
             ):
                 logger.debug(
                     f"Excluding small-channel Conv from FP8 quantization: {node.name} "
diff --git a/tests/unit/onnx/quantization/test_graph_utils.py b/tests/unit/onnx/quantization/test_graph_utils.py
new file mode 100644
index 000000000..3840a87f4
--- /dev/null
+++ b/tests/unit/onnx/quantization/test_graph_utils.py
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import onnx_graphsurgeon as gs
+import pytest
+
+from modelopt.onnx.quantization.graph_utils import find_nodes_from_convs_to_exclude
+
+
+def _make_conv_graph(output_channels, input_channels, kernel_shape=(3, 3), name="Conv_0"):
+    """Build a minimal graph with a single Conv node."""
+    spatial = [32, 32]
+    inp = gs.Variable(name="input", dtype=np.float32, shape=[1, input_channels, *spatial])
+    out = gs.Variable(name="output", dtype=np.float32)
+
+    weight_shape = (output_channels, input_channels, *kernel_shape)
+    weight = gs.Constant(name="weight", values=np.ones(weight_shape, dtype=np.float32))
+
+    conv = gs.Node(
+        name=name,
+        op="Conv",
+        inputs=[inp, weight],
+        outputs=[out],
+        attrs={"kernel_shape": list(kernel_shape)},
+    )
+
+    return gs.Graph(nodes=[conv], inputs=[inp], outputs=[out], opset=13)
+
+
+@pytest.mark.parametrize(
+    "oc, ic, expected_excluded",
+    [
+        (16, 64, True),
+        (64, 16, True),
+        (8, 8, True),
+        (16, 16, True),
+        (17, 64, False),
+        (64, 17, False),
+        (17, 17, False),
+        (32, 32, False),
+        (64, 64, False),
+    ],
+)
+def test_fp8_small_channel_conv_exclusion(oc, ic, expected_excluded):
+    """FP8 mode should exclude Conv nodes with OC or IC <= 16."""
+    graph = _make_conv_graph(output_channels=oc, input_channels=ic)
+    excluded = find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")
+    if expected_excluded:
+        assert "Conv_0" in excluded
+    else:
+        assert "Conv_0" not in excluded
+
+
+def test_fp8_small_channel_exclusion_does_not_affect_int8():
+    """The small-channel FP8 exclusion should not apply in int8 mode."""
+    graph = _make_conv_graph(output_channels=16, input_channels=64, kernel_shape=(3, 3))
+    excluded = find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")
+    assert "Conv_0" not in excluded

From 389ae47455ddf48e73ef3cf1f226a8ba27fca76e Mon Sep 17 00:00:00 2001
From: samcheng <samcheng@nvidia.com>
Date: Sat, 21 Mar 2026 13:19:12 +0000
Subject: [PATCH 4/5] fix

Signed-off-by: samcheng <samcheng@nvidia.com>
---
 modelopt/onnx/quantization/graph_utils.py     |  3 ++-
 .../onnx/quantization/test_graph_utils.py     | 20 +++++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py
index 4141d7067..b05825bdb 100755
--- a/modelopt/onnx/quantization/graph_utils.py
+++ b/modelopt/onnx/quantization/graph_utils.py
@@ -1152,7 +1152,8 @@ def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
             if quantize_mode == "fp8" and filter_size > 32:
                 logger.debug(f"Found large filter conv for FP8: {node.name}")
                 unsupported_conv_nodes.append(node.name)
-                continue
+                # skip the small-channel check below; already excluded
+                continue  
 
             # For FP8, exclude small-channel convolutions. These layers do not benefit from
             # FP8 quantization and cause perf regressions on GPUs where the FP8 conv kernels
diff --git a/tests/unit/onnx/quantization/test_graph_utils.py b/tests/unit/onnx/quantization/test_graph_utils.py
index 3840a87f4..322ef3bbb 100644
--- a/tests/unit/onnx/quantization/test_graph_utils.py
+++ b/tests/unit/onnx/quantization/test_graph_utils.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -66,6 +66,22 @@ def test_fp8_small_channel_conv_exclusion(oc, ic, expected_excluded):
 
 def test_fp8_small_channel_exclusion_does_not_affect_int8():
     """The small-channel FP8 exclusion should not apply in int8 mode."""
-    graph = _make_conv_graph(output_channels=16, input_channels=64, kernel_shape=(3, 3))
+    # OC=8 would be excluded in FP8 (see oc=8, ic=8 case above), but not in int8.
+    graph = _make_conv_graph(output_channels=8, input_channels=64, kernel_shape=(3, 3))
     excluded = find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")
     assert "Conv_0" not in excluded
+
+
+@pytest.mark.parametrize(
+    "oc, ic",
+    [
+        (15, 64),
+        (64, 15),
+        (1, 1),
+    ],
+)
+def test_fp8_channels_below_16_excluded_by_general_check(oc, ic):
+    """Channels strictly < 16 are excluded by the general channel check, not the FP8 check."""
+    graph = _make_conv_graph(output_channels=oc, input_channels=ic, kernel_shape=(3, 3))
+    excluded = find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")
+    assert "Conv_0" in excluded

From f3c310054841c69ed4f5c82e3b5d2952bf0725e8 Mon Sep 17 00:00:00 2001
From: samcheng <samcheng@nvidia.com>
Date: Wed, 25 Mar 2026 17:01:28 +0000
Subject: [PATCH 5/5] fix pre-commit

Signed-off-by: samcheng <samcheng@nvidia.com>
---
 modelopt/onnx/quantization/graph_utils.py        | 2 +-
 tests/unit/onnx/quantization/test_graph_utils.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py
index b05825bdb..d80ac68e2 100755
--- a/modelopt/onnx/quantization/graph_utils.py
+++ b/modelopt/onnx/quantization/graph_utils.py
@@ -1153,7 +1153,7 @@ def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
                 logger.debug(f"Found large filter conv for FP8: {node.name}")
                 unsupported_conv_nodes.append(node.name)
                 # skip the small-channel check below; already excluded
-                continue  
+                continue
 
             # For FP8, exclude small-channel convolutions. These layers do not benefit from
             # FP8 quantization and cause perf regressions on GPUs where the FP8 conv kernels
diff --git a/tests/unit/onnx/quantization/test_graph_utils.py b/tests/unit/onnx/quantization/test_graph_utils.py
index 322ef3bbb..1deaa1b8d 100644
--- a/tests/unit/onnx/quantization/test_graph_utils.py
+++ b/tests/unit/onnx/quantization/test_graph_utils.py
@@ -41,7 +41,7 @@ def _make_conv_graph(output_channels, input_channels, kernel_shape=(3, 3), name=
 
 
 @pytest.mark.parametrize(
-    "oc, ic, expected_excluded",
+    ("oc", "ic", "expected_excluded"),
     [
         (16, 64, True),
         (64, 16, True),
@@ -73,7 +73,7 @@ def test_fp8_small_channel_exclusion_does_not_affect_int8():
 
 
 @pytest.mark.parametrize(
-    "oc, ic",
+    ("oc", "ic"),
     [
         (15, 64),
         (64, 15),