Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 0 additions & 47 deletions .github/workflows/delete_outdated_pr_branches.yml

This file was deleted.

8 changes: 4 additions & 4 deletions .github/workflows/example_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
Expand All @@ -82,7 +82,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
Expand All @@ -99,7 +99,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-1
Expand All @@ -113,7 +113,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ jobs:
include:
- example: gpu
timeout: 45
container_image: pytorch:26.01-py3
container_image: pytorch:26.03-py3
- example: gpu-megatron
timeout: 45
container_image: pytorch:26.01-py3
container_image: pytorch:26.03-py3
- example: gpu-trtllm
timeout: 30
container_image: tensorrt-llm/release:1.3.0rc5
container_image: tensorrt-llm/release:1.3.0rc9
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
timeout-minutes: ${{ matrix.timeout }}
container: &gpu_container
Expand Down
9 changes: 7 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
NVIDIA Model Optimizer Changelog
================================
Changelog
=========

0.44 (2026-05-xx)
^^^^^^^^^^^^^^^^^

Expand All @@ -13,6 +14,10 @@ NVIDIA Model Optimizer Changelog

- Fix Minitron pruning (``mcore_minitron``) for MoE models. Importance estimation hooks were incorrectly registered for MoE modules and NAS step was hanging before this.

**Misc**

- Add experimental support for ``transformers>=5.0``. Unified Hugging Face checkpoint export for quantized checkpoints may not yet work for some models with ``transformers>=5.0``.

0.43 (2026-04-09)
^^^^^^^^^^^^^^^^^

Expand Down
4 changes: 2 additions & 2 deletions examples/gpt-oss/configs/sft_full.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Model
model_name_or_path: openai/gpt-oss-20b
attn_implementation: eager
torch_dtype: bfloat16
dtype: bfloat16

# Dataset
dataset_name: HuggingFaceH4/Multilingual-Thinking
Expand All @@ -16,7 +16,7 @@ per_device_train_batch_size: 2
per_device_eval_batch_size: 2
gradient_accumulation_steps: 2
max_length: 4096
warmup_ratio: 0.03
warmup_steps: 0.03
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
Expand Down
4 changes: 2 additions & 2 deletions examples/gpt-oss/configs/sft_lora.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Model
model_name_or_path: openai/gpt-oss-20b
attn_implementation: eager
torch_dtype: bfloat16
dtype: bfloat16

# Dataset
dataset_name: HuggingFaceH4/Multilingual-Thinking
Expand All @@ -21,7 +21,7 @@ lora_alpha: 16
lora_dropout: 0.0
lora_target_modules: all-linear
max_length: 4096
warmup_ratio: 0.03
warmup_steps: 0.03
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
Expand Down
14 changes: 8 additions & 6 deletions examples/gpt-oss/convert_oai_mxfp4_weight_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,21 +95,23 @@ def convert_and_save(model, tokenizer, output_path: str):

def create_parser():
parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument("--model_path", type=str, help="path to the fake-quantized model from QAT.")

parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
default=False,
action="store_true",
)
parser.add_argument(
"--lora_path",
type=str,
help="path to the LoRA-QAT adapter weights. You can only specify lora_path or model_path, not both.",
)

parser.add_argument(
"--base_path",
type=str,
help="path to the base model used for LoRA-QAT. Only used if lora_path is specified.",
)

parser.add_argument(
"--output_path", type=str, required=True, help="location to save converted model."
)
Expand All @@ -121,7 +123,7 @@ def create_parser():
parser = create_parser()
args = parser.parse_args()

kwargs = {"device_map": "auto", "torch_dtype": "auto", "trust_remote_code": True}
kwargs = {"device_map": "auto", "dtype": "auto", "trust_remote_code": args.trust_remote_code}
if args.lora_path:
assert args.model_path is None, "You can only specify lora_path or model_path, not both."
model_path = args.base_path
Expand All @@ -140,7 +142,7 @@ def create_parser():
gc.collect()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=args.trust_remote_code)

# Quantize and save model
convert_and_save(model, tokenizer, args.output_path)
2 changes: 1 addition & 1 deletion examples/gpt-oss/qat-finetune-transformers.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@
" per_device_eval_batch_size=1,\n",
" gradient_accumulation_steps=2,\n",
" max_length=4096,\n",
" warmup_ratio=0.03,\n",
" warmup_steps=0.03,\n",
" eval_strategy=\"steps\",\n",
" eval_on_start=True,\n",
" logging_steps=10,\n",
Expand Down
1 change: 0 additions & 1 deletion examples/gpt-oss/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
kernels>=0.9.0
torch>2.7.1
trackio
transformers>=4.55.0
trl>=0.21.0
2 changes: 1 addition & 1 deletion examples/gpt-oss/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def main(script_args, training_args, model_args, quant_args):
"revision": model_args.model_revision,
"trust_remote_code": model_args.trust_remote_code,
"attn_implementation": model_args.attn_implementation,
"torch_dtype": getattr(model_args, "dtype", "float32"),
"dtype": getattr(model_args, "dtype", "float32"),
"use_cache": not training_args.gradient_checkpointing,
}

Expand Down
12 changes: 10 additions & 2 deletions examples/llm_autodeploy/run_auto_quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,19 @@ def modelopt_ptq(
auto_quantize_bits: float | None = None,
calib_dataset: str = "cnn_dailymail",
calib_batch_size: int = 8,
trust_remote_code: bool = False,
) -> torch.nn.Module:
"""Quantize the model with modelopt."""
model = AutoModelForCausalLM.from_pretrained(
model_path, trust_remote_code=True, torch_dtype="auto", device_map="auto"
model_path, trust_remote_code=trust_remote_code, dtype="auto", device_map="auto"
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(
model_path,
model_max_length=2048,
padding_side="left",
trust_remote_code=True,
trust_remote_code=trust_remote_code,
)
# sanitize tokenizer
if tokenizer.pad_token != "<unk>":
Expand Down Expand Up @@ -203,6 +204,12 @@ def modelopt_ptq(
"regular quantization without auto_quantize search will be applied."
),
)
parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
default=False,
action="store_true",
)

args = parser.parse_args()

Expand All @@ -213,4 +220,5 @@ def modelopt_ptq(
args.num_samples,
auto_quantize_bits=args.effective_bits,
calib_batch_size=args.calib_batch_size,
trust_remote_code=args.trust_remote_code,
)
1 change: 0 additions & 1 deletion examples/llm_distill/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pyarrow
torchao>=0.14.1
transformers<5.0
trl>=0.23.0
3 changes: 1 addition & 2 deletions examples/llm_eval/lm_eval_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
# limitations under the License.
import warnings

import datasets
from lm_eval import utils
from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
from lm_eval.api.model import T
Expand Down Expand Up @@ -180,8 +181,6 @@ def setup_parser_with_modelopt_args():
model_args = utils.simple_parse_args_string(args.model_args)

if args.trust_remote_code:
import datasets

datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
model_args["trust_remote_code"] = True
args.trust_remote_code = None
Expand Down
29 changes: 18 additions & 11 deletions examples/llm_eval/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@

class EvalModel(BaseModel, arbitrary_types_allowed=True):
model_path: str
trust_remote_code: bool = False
max_input_length: int = 512
max_output_length: int = 512
dtype: str = "auto"
Expand All @@ -92,7 +93,6 @@ def load(self):


class OpenAIModel(EvalModel):
model_path: str
engine: str = ""
use_azure: bool = False
tokenizer: tiktoken.Encoding | None
Expand Down Expand Up @@ -173,7 +173,6 @@ def handler(signum, frame):


class SeqToSeqModel(EvalModel):
model_path: str
model: PreTrainedModel | None = None
tokenizer: PreTrainedTokenizer | None = None
lora_path: str = ""
Expand All @@ -188,18 +187,22 @@ def load(self):
args.update(device_map="auto")
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args)
self.model = AutoModelForSeq2SeqLM.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code, **args
)
print_gpu_utilization()
if self.lora_path:
self.model = PeftModel.from_pretrained(self.model, self.lora_path)
self.model.eval()
if "device_map" not in args:
self.model.to(self.device)
if self.tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code
)

def run(self, prompt: str, **kwargs) -> str:
self.load()
Expand Down Expand Up @@ -243,11 +246,11 @@ def load(self):
args.update(device_map="auto")
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path, trust_remote_code=True, **args
self.model_path, trust_remote_code=self.trust_remote_code, **args
)
self.model.eval()
if "device_map" not in args:
Expand All @@ -256,7 +259,9 @@ def load(self):
# Sampling with temperature will cause MMLU to drop
self.model.generation_config.do_sample = False
if self.tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code
)

def run(self, prompt: str, **kwargs) -> str:
self.load()
Expand Down Expand Up @@ -322,7 +327,7 @@ def load(self):
args.update(device_map="auto")
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
self.model = LlamaForCausalLM.from_pretrained(self.model_path, **args)
print_gpu_utilization()
if self.lora_path:
Expand Down Expand Up @@ -487,10 +492,12 @@ def test_max_length(self):
class ChatGLMModel(SeqToSeqModel):
def load(self):
if self.tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=self.trust_remote_code
)
if self.model is None:
self.model = AutoModel.from_pretrained(
self.model_path, trust_remote_code=True
self.model_path, trust_remote_code=self.trust_remote_code
).half() # FP16 is required for ChatGLM
self.model.eval()
self.model.to(self.device)
Expand Down
Loading
Loading