62 commits
6c038f9
Add modelopt/torch/_compress CODEOWNERS
kevalmorabia97 Oct 27, 2025
230cee1
Merge branch 'main' into feature/compress
kevalmorabia97 Oct 27, 2025
54c5f0f
Remove llm_ptq example tests from CICD
kevalmorabia97 Oct 27, 2025
9eeee25
E2E test for the experimental compress algorithm based on https://arx…
danielkorzekwa Oct 28, 2025
ad1d18e
Merge branch 'main' into feature/compress
kevalmorabia97 Oct 28, 2025
cef3655
Add convert_llama3_config_to_decilm_config + unit test (#465)
danielkorzekwa Oct 29, 2025
002b8b5
Implement nas.convert() api for the compress algorithm (#482)
danielkorzekwa Oct 31, 2025
1c12fd8
modelopt nas search() implementation for the compress algorithm (#490)
danielkorzekwa Nov 3, 2025
f7d547f
Add decilm modelling code (#505)
danielkorzekwa Nov 12, 2025
50a580c
Compress tutorial (PoC) (#492)
danielkorzekwa Nov 12, 2025
b121945
Add llama converter (no dependency on internal Nvidia code) - part 1/…
danielkorzekwa Nov 13, 2025
866e400
llama converter is self-contained now (no dependency on internal nvid…
danielkorzekwa Nov 14, 2025
0868f1c
Add integration test for attention pruning (#562)
danielkorzekwa Nov 14, 2025
69726cc
Merge branch 'main' into feature/compress
kevalmorabia97 Nov 15, 2025
07ca24d
Merge branch 'main' into feature/compress
kevalmorabia97 Nov 15, 2025
1dde209
Add score_pruning_activations (step 2/6) (#563)
danielkorzekwa Nov 18, 2025
2e559e7
Update README.md
kevalmorabia97 Nov 18, 2025
f10be0d
Add activation hooks used for pruning (#576)
danielkorzekwa Nov 20, 2025
194b532
Add sewing kit and utilities used for pruning scoring - pruning scori…
danielkorzekwa Nov 24, 2025
8c9cdd4
Add L2NormHook and use it in megatron.py (#599)
danielkorzekwa Nov 26, 2025
1f72466
Add pruning checkpoints for the compress algorithm (#607)
danielkorzekwa Nov 27, 2025
97fe7f0
Add build replacement library to the compress algorithm. (#616)
danielkorzekwa Dec 1, 2025
954103e
Add subblock stats to the compress algorithm (#623)
danielkorzekwa Dec 1, 2025
dcc425f
Add 1-block scoring to the compress algorithm (#625)
danielkorzekwa Dec 2, 2025
56d95de
Add checkpoint save/load to ForwardHook + add IterativeChannelContrib…
danielkorzekwa Dec 2, 2025
74aae83
Add MIP step to the compress algorithm (#627)
danielkorzekwa Dec 4, 2025
a1f63bc
Merge branch 'main' into feature/compress
kevalmorabia97 Dec 8, 2025
a99f503
Remove unused mip functions + fix multi-gpu test (#660)
kevalmorabia97 Dec 8, 2025
67489f4
Fix a bug in IterativeChannelContributionHook + tools for activation …
danielkorzekwa Dec 11, 2025
1d8bd20
Remove runtime.py and directly use torch dist utils + remove unused f…
kevalmorabia97 Dec 11, 2025
f7a0cb0
Use shared activation hooks component in the puzzle algorithm (#687)
danielkorzekwa Dec 17, 2025
db866d9
Clean up Puzzle Compress Tutorial (#711)
LianaMikael Dec 22, 2025
2e813bf
Two bug fixes: mix checkpointing and dtype (#718)
danielkorzekwa Dec 22, 2025
83ac3b1
Merge remote-tracking branch 'origin/main' into feature/compress
kevalmorabia97 Jan 13, 2026
0eecfc6
Fix test assertions for 2-gpu (#772)
kevalmorabia97 Jan 13, 2026
43b3cfa
Rename compress to puzzletron (#776)
kevalmorabia97 Jan 14, 2026
4c30bd5
Add NeMo Conversion Scripts to Puzzletron (#784)
LianaMikael Jan 15, 2026
96bb0ba
Merge branch 'main' into feature/compress
kevalmorabia97 Mar 3, 2026
8c84fee
[CI] Update to only run puzzletron tests
kevalmorabia97 Mar 3, 2026
5812777
Merge branch 'main' into feature/puzzletron
kevalmorabia97 Mar 3, 2026
5f77c81
Pin torchprofile==0.0.4 to fix CI
kevalmorabia97 Mar 10, 2026
82df595
Add anymodel-core to feature/puzzletron (#974)
danielkorzekwa Mar 11, 2026
4dc9932
Draft: anymodel activation scoring (#989)
danielkorzekwa Mar 12, 2026
d358eb3
Draft: Merge anymodel pruning (#990)
danielkorzekwa Mar 12, 2026
8e827f3
Draft: Merging anymodel:build_library_and_stats (#993)
danielkorzekwa Mar 12, 2026
eb4b210
Draft: merge any model calc one block scores (#994)
danielkorzekwa Mar 12, 2026
8fe318d
Draft: merge any_model: mip_and_realize_models (#995)
danielkorzekwa Mar 13, 2026
2fbdf0e
Update uv.lock for nspect puzzletron scanning
kevalmorabia97 Mar 13, 2026
1b42f0b
Dkorzekwa/any model other models (#1007)
danielkorzekwa Mar 17, 2026
67999eb
Dkorzekwa/anymodel gptoss (#1020)
danielkorzekwa Mar 17, 2026
660dc17
Merge any_model tutorial (#1035)
danielkorzekwa Mar 19, 2026
01cba6a
Merge mbridge distillation for any_model (#1036)
danielkorzekwa Mar 20, 2026
2b6572c
MR branch for the remaining difference between dkorzekwa/any_model an…
danielkorzekwa Mar 20, 2026
110316a
Dkorzekwa/decilm hf code cleanup (#1071)
danielkorzekwa Mar 23, 2026
4190275
Dkorzekwa/decilm hf code cleanup 2 (#1073)
danielkorzekwa Mar 23, 2026
0708ca2
Dkorzekwa/anymodel subblock stats (#1085)
danielkorzekwa Mar 24, 2026
3193f30
Dkorzekwa/anymodel subblock stats nodecilm (#1102)
danielkorzekwa Mar 24, 2026
928036e
Dkorzekwa/decilm cleanup post subblockstats (#1103)
danielkorzekwa Mar 24, 2026
e508b76
code clean up (#1110)
danielkorzekwa Mar 24, 2026
f460d16
Merge branch 'main' into feature/puzzletron
kevalmorabia97 Mar 25, 2026
2f55c73
Dkorzekwa/puzzletron use importance hooks from prune (#1115)
danielkorzekwa Mar 25, 2026
c5ec50b
Merge remote-tracking branch 'origin/main' into feature/puzzletron
kevalmorabia97 Mar 25, 2026
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -24,6 +24,7 @@ modelopt/torch/nas @NVIDIA/modelopt-torch-nas-prune-codeowners
modelopt/torch/opt @NVIDIA/modelopt-torch-opt-codeowners
modelopt/torch/peft @NVIDIA/modelopt-torch-peft-codeowners
modelopt/torch/prune @NVIDIA/modelopt-torch-nas-prune-codeowners
modelopt/torch/puzzletron @NVIDIA/modelopt-torch-puzzletron-codeowners
modelopt/torch/quantization @NVIDIA/modelopt-torch-quantization-codeowners
modelopt/torch/sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
modelopt/torch/speculative @NVIDIA/modelopt-torch-speculative-codeowners
102 changes: 5 additions & 97 deletions .github/workflows/example_tests.yml
@@ -56,83 +56,21 @@ jobs:
match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
delay: 300s

##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
torch-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy: &torch_strategy
fail-fast: false
matrix:
example: [llm_distill, llm_qat, llm_sparsity]
include:
- example: speculative_decoding
docker_image: "26.01"
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-1

torch-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy: *torch_strategy
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

##### TensorRT-LLM Example Tests #####
trtllm-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy:
fail-fast: false
matrix:
example: [llm_ptq, vlm_ptq]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-1

trtllm-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy:
fail-fast: false
matrix:
example: [llm_autodeploy, llm_eval, llm_ptq, vlm_ptq]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

##### NeMo Example Tests #####
nemo-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy: &nemo_strategy
fail-fast: false
matrix:
example: [megatron_bridge]
example: [megatron_bridge, puzzletron]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/nemo:26.02"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
pip_install_extras: "[hf,puzzletron,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-1

nemo-non-pr:
@@ -144,50 +82,20 @@ jobs:
docker_image: "nvcr.io/nvidia/nemo:26.02"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

##### ONNX/TensorRT Example Tests #####
onnx-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy: &onnx_strategy
fail-fast: false
matrix:
example: [diffusers, torch_onnx]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:26.02-py3"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
runner: linux-amd64-gpu-l4-latest-1

onnx-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy: *onnx_strategy
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:26.02-py3"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
pip_install_extras: "[hf,puzzletron,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

##### Required Check for PR #####
example-pr-required-check:
# Run even if example tests are skipped
if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
needs: [check-file-changes, torch-pr, trtllm-pr, nemo-pr, onnx-pr]
needs: [check-file-changes, nemo-pr]
runs-on: ubuntu-latest
steps:
- name: Required GPU tests did not succeed
if: |
needs.check-file-changes.result != 'success' ||
(needs.check-file-changes.outputs.any_changed == 'true' && (
needs.torch-pr.result != 'success' ||
needs.trtllm-pr.result != 'success' ||
needs.nemo-pr.result != 'success' ||
needs.onnx-pr.result != 'success'
needs.nemo-pr.result != 'success'
))
run: exit 1
20 changes: 11 additions & 9 deletions .github/workflows/gpu_tests.yml
@@ -62,16 +62,16 @@ jobs:
fail-fast: false
matrix:
include:
- example: gpu
timeout: 45
container_image: pytorch:26.01-py3
- example: gpu-megatron
timeout: 45
container_image: pytorch:26.01-py3
- example: gpu-trtllm
- example: gpu-puzzletron
timeout: 30
container_image: tensorrt-llm/release:1.3.0rc5
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container_image: pytorch:26.01-py3
# - example: gpu-megatron
# timeout: 45
# container_image: pytorch:26.01-py3
# - example: gpu-trtllm
# timeout: 30
# container_image: tensorrt-llm/release:1.3.0rc5
runs-on: linux-amd64-gpu-rtxpro6000-latest-2
timeout-minutes: ${{ matrix.timeout }}
container: &gpu_container
image: nvcr.io/nvidia/${{ matrix.container_image }}
@@ -85,6 +85,8 @@
- name: Setup environment variables
run: |
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
- name: Install dependencies for mip
run: apt-get update && apt-get install -y libffi-dev
- name: Run gpu tests
run: pip install tox-current-env && tox -e cuda13-${{ matrix.example }} --current-env
gpu-tests-non-pr:
16 changes: 14 additions & 2 deletions .pre-commit-config.yaml
@@ -25,9 +25,18 @@ repos:
hooks:
- id: ruff-check
args: [--fix, --exit-non-zero-on-fix]
exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
# See: commit hooks modifies block_config.py leading to test_puzzletron.py failing (#25) · Issues · omniml / modelopt · GitLab
exclude: >
(?x)^(
^examples/specdec_bench/specdec_bench/datasets/speed\.py$|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config\.py
)$
- id: ruff-format
exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
exclude: >
(?x)^(
^examples/specdec_bench/specdec_bench/datasets/speed\.py$|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config\.py
)$
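The verbose-mode (`(?x)`) exclude patterns above can be sanity-checked locally with Python's `re` module. The sketch below assumes pre-commit-style semantics (the pattern is matched against the repo-relative file path); the paths are taken from the diff above:

```python
import re

# Verbose regex mirroring the pre-commit `exclude` pattern added in this PR.
# (?x) lets us split alternatives across lines; whitespace is ignored.
pattern = re.compile(r"""(?x)^(
    examples/specdec_bench/specdec_bench/datasets/speed\.py|
    modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config\.py
)$""")

# Excluded file matches; any other path does not.
print(bool(pattern.match(
    "modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config.py")))  # True
print(bool(pattern.match("modelopt/torch/puzzletron/other.py")))  # False
```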

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.17.1
@@ -95,10 +104,13 @@ repos:
examples/llm_eval/modeling.py|
examples/llm_qat/main.py|
examples/llm_sparsity/weight_sparsity/finetune.py|
examples/puzzletron/evaluation/lm_eval_anymodel.py|
examples/specdec_bench/specdec_bench/models/specbench_medusa.py|
examples/speculative_decoding/main.py|
examples/speculative_decoding/medusa_utils.py|
examples/speculative_decoding/server_generate.py|
modelopt/torch/puzzletron/anymodel/models/gpt_oss/gpt_oss_pruned_to_mxfp4.py|
experimental/dms/models/qwen3/configuration_qwen3_dms.py|
experimental/dms/models/qwen3/modeling_qwen3_dms.py|
)$
1 change: 1 addition & 0 deletions examples/pruning/README.md
@@ -7,6 +7,7 @@ Pruning can involve removal (prune) of Linear and Conv layers; and Transformer a
This section focuses on applying Model Optimizer's state-of-the-art complementary pruning modes to enable you to search for the best subnet architecture from your provided base model:

1. [Minitron](https://arxiv.org/pdf/2408.11796): A pruning method developed by NVIDIA Research for pruning GPT (and later extended to Mamba, MoE, and Hybrid Transformer Mamba) models in the NVIDIA Megatron-LM (M-LM) or Megatron-Bridge (M-Bridge) framework. It uses activation magnitudes to prune the embedding hidden size; MLP FFN hidden size; transformer attention heads; Mamba heads and head dimension; MoE number of experts, FFN hidden size, and shared expert intermediate size; and number of layers of the model.
1. [Puzzletron](../puzzletron/README.md): An advanced pruning method by NVIDIA that uses a Mixed Integer Programming (MIP)-based NAS search algorithm.
1. FastNAS: A pruning method recommended for Computer Vision models. Given a pretrained model, FastNAS finds the subnet which maximizes the score function while meeting the given constraints.
1. GradNAS: A light-weight pruning method recommended for language models like Hugging Face BERT, GPT-J. It uses the gradient information to prune the model's linear layers and attention heads to meet the given constraints.
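The MIP-based search behind Puzzletron can be illustrated with a toy selection problem: pick one pruned variant per layer so that total quality score is maximized under a parameter budget. The numbers and the exhaustive search below are purely illustrative (the real algorithm runs a MIP solver over measured subblock statistics):

```python
# Toy sketch of the per-layer variant selection a MIP-based NAS search solves.
# Scores and parameter counts are made-up; exhaustive search stands in for a solver.
from itertools import product

# (score, num_params) per candidate variant, per layer.
layers = [
    [(1.00, 100), (0.90, 60), (0.70, 30)],  # layer 0: full / pruned / heavily pruned
    [(1.00, 100), (0.85, 50), (0.60, 25)],  # layer 1
]
budget = 130  # maximum total parameters allowed

best = max(
    (combo for combo in product(*layers)
     if sum(p for _, p in combo) <= budget),
    key=lambda combo: sum(s for s, _ in combo),
)
print(best)  # ((0.9, 60), (0.85, 50)) -- best total score (1.75) within budget
```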

14 changes: 14 additions & 0 deletions examples/puzzletron/GPTOSS.md
@@ -0,0 +1,14 @@

## GptOss

In this release, the Puzzle algorithm supports only expert removal for `Gpt-Oss`.

This model ships as a quantized checkpoint, i.e. its MoE expert matrices are stored in the _MXFP4_ format.
During the pruning steps, Puzzle uses the decompressed model (converted back to BF16) to compute statistics and scores.
This means that during conversion to the Puzzle format, the model is decompressed and stored in BF16.
Once pruning is done, i.e. the experts to remove have been identified and the process has finished, you may want to restore the checkpoint to the _MXFP4_ format.
An additional script takes the original and the pruned checkpoints and outputs the pruned checkpoint in _MXFP4_ format:

```bash
python -m modelopt.torch.puzzletron.anymodel.models.gpt_oss.gpt_oss_pruned_to_mxfp4 \
    --student-path /workspaces/any_model_gpt_oss/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ \
    --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ \
    --output-path /workspaces/any_model_gpt_oss/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ \
    --num-layers 24
```
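For intuition on what the BF16 decompression undoes, here is an illustrative decode of MXFP4 elements: FP4 (E2M1) values sharing one power-of-two (E8M0) scale, normally 32 elements per scale block. This is a sketch of the format only, not the script's actual implementation; `decode_mxfp4_block` is a hypothetical helper:

```python
# The 8 non-negative E2M1 magnitudes; bit 3 of each 4-bit code is the sign.
_E2M1 = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]

def decode_mxfp4_block(codes: list[int], scale_exp: int) -> list[float]:
    """Map 4-bit FP4 codes to floats and apply the block's shared 2**scale_exp scale."""
    out = []
    for c in codes:
        mag = _E2M1[c & 0b0111]            # low 3 bits select the magnitude
        sign = -1.0 if c & 0b1000 else 1.0  # high bit is the sign
        out.append(sign * mag * (2.0 ** scale_exp))
    return out

# A weight of 6.0 stored as code 0b0111, and -6.0 as 0b1111, with scale 2**0:
print(decode_mxfp4_block([0b0111, 0b1111], 0))  # [6.0, -6.0]
```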