Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 0 additions & 47 deletions .github/workflows/delete_outdated_pr_branches.yml

This file was deleted.

8 changes: 4 additions & 4 deletions .github/workflows/example_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
Expand All @@ -82,7 +82,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
Expand All @@ -99,7 +99,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-1
Expand All @@ -113,7 +113,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ jobs:
include:
- example: gpu
timeout: 45
container_image: pytorch:26.01-py3
container_image: pytorch:26.03-py3
- example: gpu-megatron
timeout: 45
container_image: pytorch:26.01-py3
container_image: pytorch:26.03-py3
- example: gpu-trtllm
timeout: 30
container_image: tensorrt-llm/release:1.3.0rc5
container_image: tensorrt-llm/release:1.3.0rc9
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
timeout-minutes: ${{ matrix.timeout }}
container: &gpu_container
Expand Down
9 changes: 7 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
NVIDIA Model Optimizer Changelog
================================
Changelog
=========

0.44 (2026-05-xx)
^^^^^^^^^^^^^^^^^

Expand All @@ -13,6 +14,10 @@ NVIDIA Model Optimizer Changelog

- Fix Minitron pruning (``mcore_minitron``) for MoE models. Importance estimation hooks were incorrectly registered for MoE modules and NAS step was hanging before this.

**Misc**

- Add experimental support for ``transformers>=5.0``. Unified Hugging Face checkpoint export for quantized checkpoints may not yet work for some models with ``transformers>=5.0``.

0.43 (2026-04-09)
^^^^^^^^^^^^^^^^^

Expand Down
4 changes: 2 additions & 2 deletions examples/gpt-oss/configs/sft_full.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Model
model_name_or_path: openai/gpt-oss-20b
attn_implementation: eager
torch_dtype: bfloat16
dtype: bfloat16

# Dataset
dataset_name: HuggingFaceH4/Multilingual-Thinking
Expand All @@ -16,7 +16,7 @@ per_device_train_batch_size: 2
per_device_eval_batch_size: 2
gradient_accumulation_steps: 2
max_length: 4096
warmup_ratio: 0.03
warmup_steps: 0.03
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
Expand Down
4 changes: 2 additions & 2 deletions examples/gpt-oss/configs/sft_lora.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Model
model_name_or_path: openai/gpt-oss-20b
attn_implementation: eager
torch_dtype: bfloat16
dtype: bfloat16

# Dataset
dataset_name: HuggingFaceH4/Multilingual-Thinking
Expand All @@ -21,7 +21,7 @@ lora_alpha: 16
lora_dropout: 0.0
lora_target_modules: all-linear
max_length: 4096
warmup_ratio: 0.03
warmup_steps: 0.03
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
Expand Down
14 changes: 8 additions & 6 deletions examples/gpt-oss/convert_oai_mxfp4_weight_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,21 +95,23 @@ def convert_and_save(model, tokenizer, output_path: str):

def create_parser():
parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument("--model_path", type=str, help="path to the fake-quantized model from QAT.")

parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
default=False,
action="store_true",
)
parser.add_argument(
"--lora_path",
type=str,
help="path to the LoRA-QAT adapter weights. You can only specify lora_path or model_path, not both.",
)

parser.add_argument(
"--base_path",
type=str,
help="path to the base model used for LoRA-QAT. Only used if lora_path is specified.",
)

parser.add_argument(
"--output_path", type=str, required=True, help="location to save converted model."
)
Expand All @@ -121,7 +123,7 @@ def create_parser():
parser = create_parser()
args = parser.parse_args()

kwargs = {"device_map": "auto", "torch_dtype": "auto", "trust_remote_code": True}
kwargs = {"device_map": "auto", "dtype": "auto", "trust_remote_code": args.trust_remote_code}
if args.lora_path:
assert args.model_path is None, "You can only specify lora_path or model_path, not both."
model_path = args.base_path
Expand All @@ -140,7 +142,7 @@ def create_parser():
gc.collect()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=args.trust_remote_code)

# Quantize and save model
convert_and_save(model, tokenizer, args.output_path)
2 changes: 1 addition & 1 deletion examples/gpt-oss/qat-finetune-transformers.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@
" per_device_eval_batch_size=1,\n",
" gradient_accumulation_steps=2,\n",
" max_length=4096,\n",
" warmup_ratio=0.03,\n",
" warmup_steps=0.03,\n",
" eval_strategy=\"steps\",\n",
" eval_on_start=True,\n",
" logging_steps=10,\n",
Expand Down
1 change: 0 additions & 1 deletion examples/gpt-oss/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
kernels>=0.9.0
torch>2.7.1
trackio
transformers>=4.55.0
trl>=0.21.0
2 changes: 1 addition & 1 deletion examples/gpt-oss/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def main(script_args, training_args, model_args, quant_args):
"revision": model_args.model_revision,
"trust_remote_code": model_args.trust_remote_code,
"attn_implementation": model_args.attn_implementation,
"torch_dtype": getattr(model_args, "dtype", "float32"),
"dtype": getattr(model_args, "dtype", "float32"),
"use_cache": not training_args.gradient_checkpointing,
}

Expand Down
12 changes: 10 additions & 2 deletions examples/llm_autodeploy/run_auto_quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,19 @@ def modelopt_ptq(
auto_quantize_bits: float | None = None,
calib_dataset: str = "cnn_dailymail",
calib_batch_size: int = 8,
trust_remote_code: bool = False,
) -> torch.nn.Module:
"""Quantize the model with modelopt."""
model = AutoModelForCausalLM.from_pretrained(
model_path, trust_remote_code=True, torch_dtype="auto", device_map="auto"
model_path, trust_remote_code=trust_remote_code, dtype="auto", device_map="auto"
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(
model_path,
model_max_length=2048,
padding_side="left",
trust_remote_code=True,
trust_remote_code=trust_remote_code,
)
# sanitize tokenizer
if tokenizer.pad_token != "<unk>":
Expand Down Expand Up @@ -203,6 +204,12 @@ def modelopt_ptq(
"regular quantization without auto_quantize search will be applied."
),
)
parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
default=False,
action="store_true",
)

args = parser.parse_args()

Expand All @@ -213,4 +220,5 @@ def modelopt_ptq(
args.num_samples,
auto_quantize_bits=args.effective_bits,
calib_batch_size=args.calib_batch_size,
trust_remote_code=args.trust_remote_code,
)
1 change: 0 additions & 1 deletion examples/llm_distill/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pyarrow
torchao>=0.14.1
transformers<5.0
trl>=0.23.0
3 changes: 1 addition & 2 deletions examples/llm_eval/lm_eval_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
# limitations under the License.
import warnings

import datasets
from lm_eval import utils
from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
from lm_eval.api.model import T
Expand Down Expand Up @@ -180,8 +181,6 @@ def setup_parser_with_modelopt_args():
model_args = utils.simple_parse_args_string(args.model_args)

if args.trust_remote_code:
import datasets

datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
model_args["trust_remote_code"] = True
args.trust_remote_code = None
Expand Down
29 changes: 18 additions & 11 deletions examples/llm_eval/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@

class EvalModel(BaseModel, arbitrary_types_allowed=True):
model_path: str
trust_remote_code: bool = False
max_input_length: int = 512
max_output_length: int = 512
dtype: str = "auto"
Expand All @@ -92,7 +93,6 @@ def load(self):


class OpenAIModel(EvalModel):
model_path: str
engine: str = ""
use_azure: bool = False
tokenizer: tiktoken.Encoding | None
Expand Down Expand Up @@ -173,7 +173,6 @@ def handler(signum, frame):


class SeqToSeqModel(EvalModel):
model_path: str
model: PreTrainedModel | None = None
tokenizer: PreTrainedTokenizer | None = None
lora_path: str = ""
Expand All @@ -188,18 +187,22 @@ def load(self):
args.update(device_map="auto")
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args)
self.model = AutoModelForSeq2SeqLM.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code, **args
)
print_gpu_utilization()
if self.lora_path:
self.model = PeftModel.from_pretrained(self.model, self.lora_path)
self.model.eval()
if "device_map" not in args:
self.model.to(self.device)
if self.tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code
)

def run(self, prompt: str, **kwargs) -> str:
self.load()
Expand Down Expand Up @@ -243,11 +246,11 @@ def load(self):
args.update(device_map="auto")
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path, trust_remote_code=True, **args
self.model_path, trust_remote_code=self.trust_remote_code, **args
)
self.model.eval()
if "device_map" not in args:
Expand All @@ -256,7 +259,9 @@ def load(self):
# Sampling with temperature will cause MMLU to drop
self.model.generation_config.do_sample = False
if self.tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code
)

def run(self, prompt: str, **kwargs) -> str:
self.load()
Expand Down Expand Up @@ -322,7 +327,7 @@ def load(self):
args.update(device_map="auto")
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
self.model = LlamaForCausalLM.from_pretrained(self.model_path, **args)
print_gpu_utilization()
if self.lora_path:
Expand Down Expand Up @@ -487,10 +492,12 @@ def test_max_length(self):
class ChatGLMModel(SeqToSeqModel):
def load(self):
if self.tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code
)
if self.model is None:
self.model = AutoModel.from_pretrained(
self.model_path, trust_remote_code=True
self.model_path, trust_remote_code=self.trust_remote_code
).half() # FP16 is required for ChatGLM
self.model.eval()
self.model.to(self.device)
Expand Down
Loading
Loading