diff --git a/.github/workflows/delete_outdated_pr_branches.yml b/.github/workflows/delete_outdated_pr_branches.yml deleted file mode 100644 index 532b5c5b7..000000000 --- a/.github/workflows/delete_outdated_pr_branches.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Delete Outdated PR Branches - -on: - schedule: - - cron: "0 9 * * 1" # Every Monday at 9:00 UTC - workflow_dispatch: # On-demand - -permissions: - contents: write - pull-requests: read - -jobs: - delete-outdated-pr-branches: - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Delete branches for closed/merged PRs - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - REPO="${{ github.repository }}" - DELETED=0 - SKIPPED=0 - - # List all remote branches matching pull-request/ - git fetch --prune origin - for branch in $(git branch -r | grep -oP 'origin/pull-request/\K[0-9]+' | sort -un); do - FULL_BRANCH="pull-request/${branch}" - STATE=$(gh pr view "$branch" --repo "$REPO" --json state --jq '.state' 2>/dev/null || echo "") - - if [ "$STATE" = "CLOSED" ] || [ "$STATE" = "MERGED" ]; then - echo "Deleting branch '${FULL_BRANCH}' (PR #${branch} is ${STATE})" - git push origin --delete "$FULL_BRANCH" && DELETED=$((DELETED + 1)) || true - elif [ "$STATE" = "OPEN" ]; then - echo "Skipping branch '${FULL_BRANCH}' (PR #${branch} is still OPEN)" - SKIPPED=$((SKIPPED + 1)) - else - echo "Skipping branch '${FULL_BRANCH}' (could not determine PR #${branch} state)" - SKIPPED=$((SKIPPED + 1)) - fi - done - - echo "" - echo "Done. 
Deleted: ${DELETED}, Skipped: ${SKIPPED}" diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index f3f390804..1c2a8fbb4 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -70,7 +70,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} timeout_minutes: 30 pip_install_extras: "[hf,dev-test]" @@ -82,7 +82,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} timeout_minutes: 30 pip_install_extras: "[hf,dev-test]" @@ -99,7 +99,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-1 @@ -113,7 +113,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc9" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-2 diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 538e05e75..214afa756 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -64,13 +64,13 @@ jobs: include: - example: gpu timeout: 45 - container_image: pytorch:26.01-py3 + container_image: pytorch:26.03-py3 - example: gpu-megatron timeout: 45 - 
container_image: pytorch:26.01-py3 + container_image: pytorch:26.03-py3 - example: gpu-trtllm timeout: 30 - container_image: tensorrt-llm/release:1.3.0rc5 + container_image: tensorrt-llm/release:1.3.0rc9 runs-on: linux-amd64-gpu-rtxpro6000-latest-1 timeout-minutes: ${{ matrix.timeout }} container: &gpu_container diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3993f4670..d259b3733 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,5 +1,6 @@ -NVIDIA Model Optimizer Changelog -================================ +Changelog +========= + 0.44 (2026-05-xx) ^^^^^^^^^^^^^^^^^ @@ -13,6 +14,10 @@ NVIDIA Model Optimizer Changelog - Fix Minitron pruning (``mcore_minitron``) for MoE models. Importance estimation hooks were incorrectly registered for MoE modules and NAS step was hanging before this. +**Misc** + +- Add experimental support for transformers>=5.0. Unified Hugging Face checkpoint export for quantized checkpoints may not work for some models with transformers>=5.0 yet. + 0.43 (2026-04-09) ^^^^^^^^^^^^^^^^^ diff --git a/examples/gpt-oss/configs/sft_full.yaml b/examples/gpt-oss/configs/sft_full.yaml index bbb74e660..7d980b9d0 100644 --- a/examples/gpt-oss/configs/sft_full.yaml +++ b/examples/gpt-oss/configs/sft_full.yaml @@ -1,7 +1,7 @@ # Model model_name_or_path: openai/gpt-oss-20b attn_implementation: eager -torch_dtype: bfloat16 +dtype: bfloat16 # Dataset dataset_name: HuggingFaceH4/Multilingual-Thinking @@ -16,7 +16,7 @@ per_device_train_batch_size: 2 per_device_eval_batch_size: 2 gradient_accumulation_steps: 2 max_length: 4096 -warmup_ratio: 0.03 +warmup_steps: 0.03 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 diff --git a/examples/gpt-oss/configs/sft_lora.yaml b/examples/gpt-oss/configs/sft_lora.yaml index 81dbac07a..4b44ca4af 100644 --- a/examples/gpt-oss/configs/sft_lora.yaml +++ b/examples/gpt-oss/configs/sft_lora.yaml @@ -1,7 +1,7 @@ # Model model_name_or_path: openai/gpt-oss-20b attn_implementation: eager -torch_dtype: bfloat16 
+dtype: bfloat16 # Dataset dataset_name: HuggingFaceH4/Multilingual-Thinking @@ -21,7 +21,7 @@ lora_alpha: 16 lora_dropout: 0.0 lora_target_modules: all-linear max_length: 4096 -warmup_ratio: 0.03 +warmup_steps: 0.03 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 diff --git a/examples/gpt-oss/convert_oai_mxfp4_weight_only.py b/examples/gpt-oss/convert_oai_mxfp4_weight_only.py index bebb91486..4f471ef48 100644 --- a/examples/gpt-oss/convert_oai_mxfp4_weight_only.py +++ b/examples/gpt-oss/convert_oai_mxfp4_weight_only.py @@ -95,21 +95,24 @@ def convert_and_save(model, tokenizer, output_path: str): def create_parser(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--model_path", type=str, help="path to the fake-quantized model from QAT.") - + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) parser.add_argument( "--lora_path", type=str, help="path to the LoRA-QAT adapter weights. You can only specify lora_path or model_path, not both.", ) - parser.add_argument( "--base_path", type=str, help="path to the base model used for LoRA-QAT. Only used if lora_path is specified.", ) - parser.add_argument( "--output_path", type=str, required=True, help="location to save converted model." ) @@ -121,7 +123,7 @@ def create_parser(): parser = create_parser() args = parser.parse_args() - kwargs = {"device_map": "auto", "torch_dtype": "auto", "trust_remote_code": True} + kwargs = {"device_map": "auto", "dtype": "auto", "trust_remote_code": args.trust_remote_code} if args.lora_path: assert args.model_path is None, "You can only specify lora_path or model_path, not both." 
model_path = args.base_path @@ -140,7 +142,7 @@ def create_parser(): gc.collect() # Load tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=args.trust_remote_code) # Quantize and save model convert_and_save(model, tokenizer, args.output_path) diff --git a/examples/gpt-oss/qat-finetune-transformers.ipynb b/examples/gpt-oss/qat-finetune-transformers.ipynb index 695ed39f6..58dba84cb 100644 --- a/examples/gpt-oss/qat-finetune-transformers.ipynb +++ b/examples/gpt-oss/qat-finetune-transformers.ipynb @@ -207,7 +207,7 @@ " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=2,\n", " max_length=4096,\n", - " warmup_ratio=0.03,\n", + " warmup_steps=0.03,\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=10,\n", diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index 368097d33..76c3b0a2e 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -1,5 +1,4 @@ kernels>=0.9.0 torch>2.7.1 trackio -transformers>=4.55.0 trl>=0.21.0 diff --git a/examples/gpt-oss/sft.py b/examples/gpt-oss/sft.py index 9c2d6aeb8..4d30fc0fd 100644 --- a/examples/gpt-oss/sft.py +++ b/examples/gpt-oss/sft.py @@ -72,7 +72,7 @@ def main(script_args, training_args, model_args, quant_args): "revision": model_args.model_revision, "trust_remote_code": model_args.trust_remote_code, "attn_implementation": model_args.attn_implementation, - "torch_dtype": getattr(model_args, "dtype", "float32"), + "dtype": getattr(model_args, "dtype", "float32"), "use_cache": not training_args.gradient_checkpointing, } diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index e9ecb0731..389d8207b 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -118,10 +118,11 @@ def modelopt_ptq( auto_quantize_bits: 
float | None = None, calib_dataset: str = "cnn_dailymail", calib_batch_size: int = 8, + trust_remote_code: bool = False, ) -> torch.nn.Module: """Quantize the model with modelopt.""" model = AutoModelForCausalLM.from_pretrained( - model_path, trust_remote_code=True, torch_dtype="auto", device_map="auto" + model_path, trust_remote_code=trust_remote_code, dtype="auto", device_map="auto" ) model.eval() @@ -129,7 +130,7 @@ def modelopt_ptq( model_path, model_max_length=2048, padding_side="left", - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) # sanitize tokenizer if tokenizer.pad_token != "": @@ -203,6 +204,12 @@ def modelopt_ptq( "regular quantization without auto_quantize search will be applied." ), ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) args = parser.parse_args() @@ -213,4 +220,5 @@ def modelopt_ptq( args.num_samples, auto_quantize_bits=args.effective_bits, calib_batch_size=args.calib_batch_size, + trust_remote_code=args.trust_remote_code, ) diff --git a/examples/llm_distill/requirements.txt b/examples/llm_distill/requirements.txt index 91dda9daf..4bcd19083 100644 --- a/examples/llm_distill/requirements.txt +++ b/examples/llm_distill/requirements.txt @@ -1,4 +1,3 @@ pyarrow torchao>=0.14.1 -transformers<5.0 trl>=0.23.0 diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py index 405e8590a..11d736a42 100755 --- a/examples/llm_eval/lm_eval_hf.py +++ b/examples/llm_eval/lm_eval_hf.py @@ -38,6 +38,7 @@ # limitations under the License. 
import warnings +import datasets from lm_eval import utils from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser from lm_eval.api.model import T @@ -180,8 +181,6 @@ def setup_parser_with_modelopt_args(): model_args = utils.simple_parse_args_string(args.model_args) if args.trust_remote_code: - import datasets - datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True model_args["trust_remote_code"] = True args.trust_remote_code = None diff --git a/examples/llm_eval/modeling.py b/examples/llm_eval/modeling.py index d06d05560..71e048e1a 100644 --- a/examples/llm_eval/modeling.py +++ b/examples/llm_eval/modeling.py @@ -74,6 +74,7 @@ class EvalModel(BaseModel, arbitrary_types_allowed=True): model_path: str + trust_remote_code: bool = False max_input_length: int = 512 max_output_length: int = 512 dtype: str = "auto" @@ -92,7 +93,6 @@ def load(self): class OpenAIModel(EvalModel): - model_path: str engine: str = "" use_azure: bool = False tokenizer: tiktoken.Encoding | None @@ -173,7 +173,6 @@ def handler(signum, frame): class SeqToSeqModel(EvalModel): - model_path: str model: PreTrainedModel | None = None tokenizer: PreTrainedTokenizer | None = None lora_path: str = "" @@ -188,10 +187,12 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") if self.attn_implementation: args["attn_implementation"] = self.attn_implementation - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code, **args + ) print_gpu_utilization() if self.lora_path: self.model = PeftModel.from_pretrained(self.model, self.lora_path) @@ -199,7 +200,9 @@ def load(self): if "device_map" not in args: self.model.to(self.device) 
if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -243,11 +246,11 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") if self.attn_implementation: args["attn_implementation"] = self.attn_implementation self.model = AutoModelForCausalLM.from_pretrained( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -256,7 +259,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.generation_config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -322,7 +327,7 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = LlamaForCausalLM.from_pretrained(self.model_path, **args) print_gpu_utilization() if self.lora_path: @@ -487,10 +492,12 @@ def test_max_length(self): class ChatGLMModel(SeqToSeqModel): def load(self): if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + 
self.model_path, trust_remote_code=self.trust_remote_code + ) if self.model is None: self.model = AutoModel.from_pretrained( - self.model_path, trust_remote_code=True + self.model_path, trust_remote_code=self.trust_remote_code ).half() # FP16 is required for ChatGLM self.model.eval() self.model.to(self.device) diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 5db36a972..3780284cf 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -115,7 +115,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http | Kimi K2 | - | - | - | - | ✅ | | MiniMax M2.1 | - | - | - | - | ✅ | | T5 | ✅ | ✅ | ✅ | ✅ | - | -| Whisper | ✅ | ❌ | ❌ | ❌ | - | +| Whisper9 | ✅ | ❌ | ❌ | ❌ | - | | Nemotron-3 | ✅ | ❌ | ❌ | ❌ | ✅ | > *This is a subset of the models supported. For the full list please check the [TensorRT-LLM support matrix](https://nvidia.github.io/TensorRT-LLM/reference/precision.html#support-matrix)* @@ -127,7 +127,8 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http > *5.A selective set of the popular models are internally tested. The actual model support list may be longer. NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later* \ > *6.Some models currently support export to HF format only.* \ > *7.[PTQ for DeepSeek](../deepseek/README.md)* \ -> *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* +> *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* \ +> *9.Running Whisper model with transformers>=5.0 requires [torchcodec](https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-cuda-enabled-torchcodec) and other system packages (e.g. ffmpeg).* > *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. 
Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead. For NVFP4 quantization specifically, we recommend `nvfp4_mlp_only`, `nvfp4_experts_only`, or `nvfp4_omlp_only` to achieve higher accuracy by restricting quantization to the MLP/expert layers (and optionally the `o_proj` layer) while keeping the attention QKV projections unquantized.* diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 58eb67611..a4515baac 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -53,7 +53,13 @@ def run_nemotron_vl_preview( - full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False + full_model, + tokenizer, + input_ids, + pyt_ckpt_path, + stage_name, + allow_fallback=False, + trust_remote_code=False, ): """Run text-only and VL preview generation for Nemotron VL models. 
@@ -64,7 +70,7 @@ def run_nemotron_vl_preview( pyt_ckpt_path: Path to the model checkpoint stage_name: Description of the stage (e.g., "before quantization", "after quantization") allow_fallback: Whether to allow fallback to standard generate on failure - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated text response or None if generation failed """ @@ -80,7 +86,7 @@ def run_nemotron_vl_preview( # Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse) text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path + full_model, tokenizer, question, generation_config, pyt_ckpt_path, trust_remote_code ) generated_ids = None @@ -93,7 +99,7 @@ def run_nemotron_vl_preview( # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") - run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name) + run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name, trust_remote_code) return generated_ids @@ -567,7 +573,7 @@ def get_model( model_kwargs = config_kwargs.copy() # Don't set torch_dtype for VILA models as they handle it explicitly in their builder if "vila" not in ckpt_path.lower(): - model_kwargs.setdefault("torch_dtype", "auto") + model_kwargs.setdefault("dtype", "auto") if "vila" in ckpt_path.lower(): hf_vila = AutoModel.from_pretrained( @@ -618,7 +624,7 @@ def has_pack_quantized_config(config): ckpt_path, device_map="auto", trust_remote_code=trust_remote_code, - torch_dtype="auto", + dtype="auto", ) else: architecture = hf_config.architectures[0] @@ -650,7 +656,7 @@ def has_pack_quantized_config(config): model_kwargs2 = model_kwargs.copy() if auto_model_module not in [AutoModelForCausalLM, AutoModel]: model_kwargs2.pop("trust_remote_code", None) - model_kwargs2["torch_dtype"] = torch_dtype + model_kwargs2["dtype"] = torch_dtype model_kwargs2.pop("max_memory", 
None) model = from_config(hf_config, **model_kwargs2) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 5620ddf6a..0fabed049 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -745,6 +745,7 @@ def pre_quantize( args.pyt_ckpt_path, "before quantization", allow_fallback=False, + trust_remote_code=args.trust_remote_code, ) else: generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) @@ -795,6 +796,7 @@ def post_quantize( args.pyt_ckpt_path, "after quantization", allow_fallback=False, + trust_remote_code=args.trust_remote_code, ) else: warnings.warn( diff --git a/examples/llm_ptq/multinode_ptq.py b/examples/llm_ptq/multinode_ptq.py index 624307cda..93ef21ea4 100644 --- a/examples/llm_ptq/multinode_ptq.py +++ b/examples/llm_ptq/multinode_ptq.py @@ -149,9 +149,7 @@ def load_and_prepare_model( Tuple of (prepared_model, model_type, original_architectures, calibration_dataloader) """ model = AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - trust_remote_code=trust_remote_code, + model_path, dtype="auto", trust_remote_code=trust_remote_code ) model.eval() model_type = get_model_type(model) diff --git a/examples/llm_ptq/requirements-t5.txt b/examples/llm_ptq/requirements-t5.txt deleted file mode 100644 index 034713546..000000000 --- a/examples/llm_ptq/requirements-t5.txt +++ /dev/null @@ -1 +0,0 @@ -transformers==4.48.0 diff --git a/examples/llm_ptq/requirements-whisper.txt b/examples/llm_ptq/requirements-whisper.txt deleted file mode 100644 index a79b19aee..000000000 --- a/examples/llm_ptq/requirements-whisper.txt +++ /dev/null @@ -1,2 +0,0 @@ -librosa -soundfile diff --git a/examples/llm_ptq/requirements.txt b/examples/llm_ptq/requirements.txt index 1469d5552..460be2fe6 100644 --- a/examples/llm_ptq/requirements.txt +++ b/examples/llm_ptq/requirements.txt @@ -3,5 +3,6 @@ fire flash-attn>=2.6.0 rouge_score>=0.1.2 tiktoken +transformers<5.0 transformers_stream_generator 
zstandard diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 9919e405b..abfebbd4f 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -21,7 +21,7 @@ from transformers import AutoImageProcessor, AutoProcessor -def run_vl_preview_generation(model, tokenizer, model_path, stage_name): +def run_vl_preview_generation(model, tokenizer, model_path, stage_name, trust_remote_code=False): """Run preview generation for VL models using sample images. Args: @@ -29,7 +29,7 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): tokenizer: The tokenizer model_path: Path to the model (for loading image processor) stage_name: Description of the stage (e.g., "before quantization") - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated response text for logging/comparison """ @@ -85,7 +85,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): # Try to detect the VL model has chat method or generate method if hasattr(model, "chat"): - image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True) + image_processor = AutoImageProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) image_features = image_processor([image]) # Pass as list with single image @@ -103,7 +105,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): **image_features, ) else: - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + processor = AutoProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) # Use chat template if available, otherwise fall back to default task prompt if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None: @@ -190,7 +194,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): return None -def run_text_only_generation(model, tokenizer, question, generation_config, 
model_path): +def run_text_only_generation( + model, tokenizer, question, generation_config, model_path, trust_remote_code=False +): """Run text-only generation for VL models, supporting both chat and generate methods. Args: @@ -199,7 +205,7 @@ def run_text_only_generation(model, tokenizer, question, generation_config, mode question: The text question to ask generation_config: Generation configuration model_path: Path to the model (for loading processor if needed) - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated response text or None if failed """ @@ -209,7 +215,9 @@ def run_text_only_generation(model, tokenizer, question, generation_config, mode response = model.chat(tokenizer, None, question, generation_config, history=None) return response else: - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + processor = AutoProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) # Create text-only messages messages = [ diff --git a/examples/llm_qad/data_utils/download_dataset.py b/examples/llm_qad/data_utils/download_dataset.py index e3e3d0646..42ef6280e 100644 --- a/examples/llm_qad/data_utils/download_dataset.py +++ b/examples/llm_qad/data_utils/download_dataset.py @@ -30,14 +30,14 @@ _TOKENIZER = None -def init_tokenizer(name: str) -> None: +def init_tokenizer(name: str, trust_remote_code: bool = False) -> None: """Load HuggingFace tokenizer for chat template.""" global _TOKENIZER if name: from transformers import AutoTokenizer print(f"Loading tokenizer: {name}") - _TOKENIZER = AutoTokenizer.from_pretrained(name, trust_remote_code=True) + _TOKENIZER = AutoTokenizer.from_pretrained(name, trust_remote_code=trust_remote_code) def format_text(messages: list[dict], reasoning: str = "") -> str: @@ -159,10 +159,16 @@ def main(): p.add_argument( "--include-reasoning", action="store_true", help="Include COT for Thinking models" ) + p.add_argument( + 
"--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) args = p.parse_args() if args.tokenizer: - init_tokenizer(args.tokenizer) + init_tokenizer(args.tokenizer, args.trust_remote_code) # Build suffix suffix = f"{int(args.sample_percent)}pct" diff --git a/examples/llm_qat/launch.sh b/examples/llm_qat/launch.sh index 6120476f1..cc3adc74f 100755 --- a/examples/llm_qat/launch.sh +++ b/examples/llm_qat/launch.sh @@ -165,7 +165,7 @@ CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \ --save_total_limit 2 \ --learning_rate $LR \ --weight_decay 0.0 \ - --warmup_ratio 0.1 \ + --warmup_steps 0.1 \ --lr_scheduler_type linear \ --logging_steps 1 \ --report_to tensorboard \ diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index 943515725..2edbf3ccb 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -166,9 +166,7 @@ def train(): print_rank_0(f"Last checkpoint detected: {last_checkpoint}") model = transformers.AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - torch_dtype=torch.bfloat16, + model_args.model_name_or_path, cache_dir=training_args.cache_dir, dtype=torch.bfloat16 ) model.generation_config.do_sample = True tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -223,7 +221,7 @@ def train(): teacher_model = transformers.AutoModelForCausalLM.from_pretrained( model_args.teacher_model, cache_dir=training_args.cache_dir, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, ) distill_config = { "teacher_model": teacher_model, diff --git a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb index a9bb6589b..9c10c55c2 100644 --- a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb +++ b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb @@ -290,7 +290,7 @@ " per_device_eval_batch_size=1,\n", " 
gradient_accumulation_steps=2,\n", " max_length=4096,\n", - " warmup_ratio=0.03,\n", + " warmup_steps=0.03,\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=50,\n", diff --git a/examples/llm_sparsity/attention_sparsity/hf_sa.py b/examples/llm_sparsity/attention_sparsity/hf_sa.py index 8115d4aaf..84f36c8e4 100644 --- a/examples/llm_sparsity/attention_sparsity/hf_sa.py +++ b/examples/llm_sparsity/attention_sparsity/hf_sa.py @@ -115,7 +115,7 @@ def generate_sample_output(model, tokenizer, args): padding=False, ) if torch.cuda.is_available(): - inputs = {k: v.cuda() for k, v in inputs.items()} + inputs = {k: v.to(model.device) for k, v in inputs.items()} # Generate with torch.no_grad(): @@ -147,10 +147,7 @@ def main(args): # No need to specify attn_implementation here — mtsa.sparsify() sets it # automatically ("eager" for pytorch backend, "modelopt_triton" for triton). model = AutoModelForCausalLM.from_pretrained( - args.pyt_ckpt_path, - attn_implementation="eager", - torch_dtype="auto", - device_map="auto", + args.pyt_ckpt_path, attn_implementation="eager", dtype="auto", device_map="auto" ) tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path) diff --git a/examples/llm_sparsity/weight_sparsity/eval.py b/examples/llm_sparsity/weight_sparsity/eval.py index 6b1d4ef17..a5f2fb91b 100644 --- a/examples/llm_sparsity/weight_sparsity/eval.py +++ b/examples/llm_sparsity/weight_sparsity/eval.py @@ -129,7 +129,7 @@ def __call__(self, instances: Sequence[dict]) -> dict[str, torch.Tensor]: [instance[key] for instance in instances] for key in ("src_idx", "label_idx") ) - batch_encoded = self.tokenizer.batch_encode_plus( + batch_encoded = self.tokenizer( sources, return_tensors="pt", padding=True, @@ -254,7 +254,7 @@ def main(): dataloader = get_dataloader( accelerator, dataset, tokenizer, args.model_max_length, args.batch_size, shuffle=False ) - model = AutoModelForCausalLM.from_pretrained(args.model_dir, torch_dtype=torch.float16).to( + model = 
AutoModelForCausalLM.from_pretrained(args.model_dir, dtype=torch.float16).to( accelerator.device ) diff --git a/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py b/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py index 0fb64f958..2cf7ca3a7 100644 --- a/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py +++ b/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py @@ -74,7 +74,7 @@ def get_model(ckpt_path, dtype="fp16", trust_remote_code=False): dtype = torch.float32 else: raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"dtype": dtype} model = AutoModelForCausalLM.from_pretrained( ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=trust_remote_code diff --git a/examples/llm_sparsity/weight_sparsity/finetune.py b/examples/llm_sparsity/weight_sparsity/finetune.py index 711084668..6eb199adc 100644 --- a/examples/llm_sparsity/weight_sparsity/finetune.py +++ b/examples/llm_sparsity/weight_sparsity/finetune.py @@ -297,13 +297,12 @@ def train(): ) last_checkpoint = None - if os.path.isdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if os.path.isdir(args.output_dir) and args.do_train and args.resume_from_checkpoint is None: last_checkpoint = get_last_checkpoint(args.output_dir) - if last_checkpoint is not None and args.resume_from_checkpoint is None: + if last_checkpoint is not None: print_rank_0( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train" - " from scratch." + " behavior, change the `--output_dir` or pass `--resume_from_checkpoint`." ) model = transformers.AutoModelForCausalLM.from_pretrained( @@ -335,18 +334,12 @@ def train(): # Detecting last checkpoint. 
last_checkpoint = None - if os.path.isdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if os.path.isdir(args.output_dir) and args.do_train and args.resume_from_checkpoint is None: last_checkpoint = get_last_checkpoint(args.output_dir) - if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0: - raise ValueError( - f"Output directory ({args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and args.resume_from_checkpoint is None: + if last_checkpoint is not None: print_rank_0( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train" - " from scratch." + " behavior, change the `--output_dir` or pass `--resume_from_checkpoint`." ) # Training diff --git a/examples/llm_sparsity/weight_sparsity/hf_pts.py b/examples/llm_sparsity/weight_sparsity/hf_pts.py index ad8061211..77574c1c2 100644 --- a/examples/llm_sparsity/weight_sparsity/hf_pts.py +++ b/examples/llm_sparsity/weight_sparsity/hf_pts.py @@ -40,7 +40,7 @@ def get_calib_dataloader( else: raise NotImplementedError - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( dataset, return_tensors="pt", padding=True, truncation=True, max_length=block_size ) if device: @@ -98,7 +98,7 @@ def get_model(ckpt_path, dtype="fp16", trust_remote_code=False): dtype = torch.float32 else: raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"dtype": dtype} model = AutoModelForCausalLM.from_pretrained( ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=trust_remote_code diff --git a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh index a65e1e600..7f8e71f25 100755 --- a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh +++ 
b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh @@ -88,11 +88,11 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \ --save_total_limit 10 \ --learning_rate 2e-5 \ --weight_decay 0.1 \ - --warmup_ratio 0.0 \ + --warmup_steps 0.0 \ --lr_scheduler_type cosine \ --logging_steps 1 \ --fsdp 'full_shard auto_wrap' \ - --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \ + --fsdp_config '{\"transformer_layer_cls_to_wrap\": \"LlamaDecoderLayer\"}' \ --tf32 True \ --modelopt_restore_path $MODELOPT_RESTORE_PATH \ --report_to tensorboard \ diff --git a/examples/specdec_bench/specdec_bench/models/specbench_medusa.py b/examples/specdec_bench/specdec_bench/models/specbench_medusa.py index e483f379c..0165505d2 100644 --- a/examples/specdec_bench/specdec_bench/models/specbench_medusa.py +++ b/examples/specdec_bench/specdec_bench/models/specbench_medusa.py @@ -100,7 +100,7 @@ def __init__( self.draft_model_path, model_dir, medusa_num_heads=self.medusa_num_heads, - torch_dtype=torch_dtype, + dtype=torch_dtype, low_cpu_mem_usage=True, ) self.model = self.model.to(self.device) diff --git a/examples/speculative_decoding/README.md b/examples/speculative_decoding/README.md index 2a29f644e..8d75eb06f 100644 --- a/examples/speculative_decoding/README.md +++ b/examples/speculative_decoding/README.md @@ -308,7 +308,7 @@ This will modify the model in-place with eagle training forward, making it compa ```python # Create a trainer -trainer = transformers.Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module) +trainer = transformers.Trainer(model=model, processing_class=tokenizer, args=training_args, **data_module) trainer._move_model_to_device(model, trainer.args.device) # Enable HF checkpointing so that the saved model will contain the speculative decoding module diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py 
index a3d1681c4..449b261c5 100644 --- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py +++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py @@ -85,6 +85,12 @@ def parse_args() -> argparse.Namespace: default=1, help="""Data parallel world size. Number of tasks on SLURM.""", ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) return parser.parse_args() @@ -130,11 +136,11 @@ def keep_conversation(entry): dataset = dataset.select(range(args.debug_max_num_conversations)) model = AutoModel.from_pretrained( - args.model, torch_dtype="auto", device_map="auto", trust_remote_code=True + args.model, dtype="auto", device_map="auto", trust_remote_code=args.trust_remote_code ) num_hidden_layers = getattr(model.config, "num_hidden_layers", None) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "") diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py index 0db3867cc..d6e363090 100644 --- a/examples/speculative_decoding/main.py +++ b/examples/speculative_decoding/main.py @@ -60,6 +60,10 @@ @dataclass class ModelArguments: model_name_or_path: str | None = field(default="TinyLlama/TinyLlama-1.1B-Chat-v1.0") + trust_remote_code: bool = field( + default=False, + metadata={"help": "Set trust_remote_code for Huggingface models and tokenizers"}, + ) @dataclass @@ -179,18 +183,20 @@ def train(): if checkpoint: with patch_transformers5_params_loading(): _, model = load_vlm_or_llm_with_kwargs( - checkpoint, torch_dtype="auto", trust_remote_code=True + checkpoint, dtype="auto", 
trust_remote_code=model_args.trust_remote_code ) - tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) + tokenizer = transformers.AutoTokenizer.from_pretrained( + checkpoint, trust_remote_code=model_args.trust_remote_code + ) else: # To avoid OOM for large models, we load and convert model on CPU first. # Model will be moved to GPU during HF trainer.init(). offline_kwargs = {"num_hidden_layers": 0} if use_offline_training else {} model_config, model = load_vlm_or_llm_with_kwargs( model_args.model_name_or_path, - torch_dtype="auto", + dtype="auto", device_map="cpu", - trust_remote_code=True, + trust_remote_code=model_args.trust_remote_code, **offline_kwargs, ) if use_offline_training: @@ -200,7 +206,7 @@ def train(): tokenizer = transformers.AutoTokenizer.from_pretrained( model_args.model_name_or_path, model_max_length=training_args.training_seq_len, - trust_remote_code=True, + trust_remote_code=model_args.trust_remote_code, ) if training_args.mode == "medusa": config = { diff --git a/examples/speculative_decoding/requirements.txt b/examples/speculative_decoding/requirements.txt deleted file mode 100644 index 6324bac62..000000000 --- a/examples/speculative_decoding/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -accelerate==1.12.0 -transformers==5.0.0rc1 diff --git a/examples/speculative_decoding/scripts/ar_validate.py b/examples/speculative_decoding/scripts/ar_validate.py index d5c37a895..f07aaafb8 100644 --- a/examples/speculative_decoding/scripts/ar_validate.py +++ b/examples/speculative_decoding/scripts/ar_validate.py @@ -55,6 +55,7 @@ def validate_ar(model, tokenizer, ds, steps=3, osl=20, num_samples=80, device=No def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True, help="Path to model directory") + parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") parser.add_argument("--steps", type=int, default=3, help="Steps for AR 
validation") parser.add_argument( "--osl", type=int, default=32, help="Output sequence length for AR validation" @@ -72,8 +73,12 @@ def main(): accelerator = Accelerator() # Load model and tokenizer - _, model = load_vlm_or_llm_with_kwargs(args.model_path, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(args.model_path) + _, model = load_vlm_or_llm_with_kwargs( + args.model_path, device_map="auto", trust_remote_code=args.trust_remote_code + ) + tokenizer = AutoTokenizer.from_pretrained( + args.model_path, trust_remote_code=args.trust_remote_code + ) model.eval() model = accelerator.prepare(model) diff --git a/examples/speculative_decoding/scripts/export_hf_checkpoint.py b/examples/speculative_decoding/scripts/export_hf_checkpoint.py index 23a7560f7..ea30f0bba 100644 --- a/examples/speculative_decoding/scripts/export_hf_checkpoint.py +++ b/examples/speculative_decoding/scripts/export_hf_checkpoint.py @@ -29,6 +29,7 @@ def parse_args(): description="Export a HF checkpoint (with ModelOpt state) for deployment." ) parser.add_argument("--model_path", type=str, default="Path of the trained checkpoint.") + parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") parser.add_argument( "--export_path", type=str, default="Destination directory for exported files." 
) @@ -38,7 +39,9 @@ def parse_args(): mto.enable_huggingface_checkpointing() args = parse_args() -_, model = load_vlm_or_llm_with_kwargs(args.model_path, torch_dtype="auto") +_, model = load_vlm_or_llm_with_kwargs( + args.model_path, trust_remote_code=args.trust_remote_code, torch_dtype="auto" +) model.eval() with torch.inference_mode(): export_speculative_decoding( diff --git a/examples/speculative_decoding/scripts/send_conversation_vllm.py b/examples/speculative_decoding/scripts/send_conversation_vllm.py index 5101b4e6f..d1a5ac5c1 100644 --- a/examples/speculative_decoding/scripts/send_conversation_vllm.py +++ b/examples/speculative_decoding/scripts/send_conversation_vllm.py @@ -55,6 +55,12 @@ def parse_args() -> argparse.Namespace: "the local serving engine. This should match the value used by the server." ), ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) ## Client Parameters ## parser.add_argument( "--base-url", @@ -133,7 +139,9 @@ async def main(args: argparse.Namespace) -> None: base_url=args.base_url, ) - tokenizer = AutoTokenizer.from_pretrained(args.model_card, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.model_card, trust_remote_code=args.trust_remote_code + ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token bos_token_id = tokenizer.bos_token_id diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py index 772c6fe66..e23dce255 100644 --- a/examples/vllm_serve/fakequant_worker.py +++ b/examples/vllm_serve/fakequant_worker.py @@ -189,7 +189,7 @@ def _create_new_data_cls(data_cls, **kwargs): def _fakequant_run_prolog_worker(self) -> None: tokenizer = AutoTokenizer.from_pretrained( self.model_runner.model_config.tokenizer, - trust_remote_code=True, + trust_remote_code=False, ) if tokenizer.pad_token != "" or tokenizer.pad_token is None: 
tokenizer.pad_token = tokenizer.eos_token diff --git a/examples/vlm_ptq/requirements-vila.txt b/examples/vlm_ptq/requirements-vila.txt deleted file mode 100644 index 7391a5f26..000000000 --- a/examples/vlm_ptq/requirements-vila.txt +++ /dev/null @@ -1,3 +0,0 @@ -deepspeed>=0.16.0 -git+https://github.com/bfshi/scaling_on_scales.git -transformers<=4.50.0 diff --git a/examples/vlm_ptq/requirements.txt b/examples/vlm_ptq/requirements.txt new file mode 100644 index 000000000..180f53411 --- /dev/null +++ b/examples/vlm_ptq/requirements.txt @@ -0,0 +1 @@ +transformers<5.0 diff --git a/examples/windows/accuracy_benchmark/modeling.py b/examples/windows/accuracy_benchmark/modeling.py index 273a944c5..f17300be9 100644 --- a/examples/windows/accuracy_benchmark/modeling.py +++ b/examples/windows/accuracy_benchmark/modeling.py @@ -49,6 +49,7 @@ class EvalModel(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) model_path: str + trust_remote_code: bool = False max_input_length: int = 512 max_output_length: int = 512 dtype: str = "auto" @@ -84,7 +85,9 @@ def load(self): args.update(torch_dtype=getattr(torch, self.dtype)) else: args.update(torch_dtype="auto") - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code, **args + ) print_gpu_utilization() if self.lora_path: self.model = PeftModel.from_pretrained(self.model, self.lora_path) @@ -92,7 +95,9 @@ def load(self): if "device_map" not in args: self.model.to(self.device) if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -143,7 +148,7 @@ def load(self): args.update(device_map="auto", load_in_8bit=True) args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != 
"auto" else "auto") self.model = AutoModelForCausalLM.from_pretrained( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -152,7 +157,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.generation_config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -200,7 +207,7 @@ def load(self): args.update(device_map="auto", load_in_8bit=True) args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = AutoAWQForCausalLM.from_quantized( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -209,7 +216,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() diff --git a/examples/windows/onnx_ptq/genai_llm/quantize.py b/examples/windows/onnx_ptq/genai_llm/quantize.py index d21d1d796..13f6ac804 100644 --- a/examples/windows/onnx_ptq/genai_llm/quantize.py +++ b/examples/windows/onnx_ptq/genai_llm/quantize.py @@ -180,7 +180,7 @@ def get_initial_inputs( """ # tokenizer.pad_token = "[PAD]" tokenizer.pad_token = tokenizer.eos_token - encodings_dict = tokenizer.batch_encode_plus(prompt, padding=True) + encodings_dict = tokenizer(prompt, padding=True) # max_length = 
model.config.max_position_embeddings # input_ids = tokenizer.encode(text, truncation=True, padding='max_length', max_length=max_length) @@ -242,7 +242,7 @@ def get_calib_inputs( # dataset2 = dataset2.shuffle(seed=42) dataset2 = dataset2[column][:calib_size] - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( dataset2, return_tensors="pt", padding=True, truncation=True, max_length=block_size ) # return_tensors="pt", batch_encoded = batch_encoded.to(device) diff --git a/examples/windows/onnx_ptq/whisper/README.md b/examples/windows/onnx_ptq/whisper/README.md index 8757aaeb5..82ae78220 100644 --- a/examples/windows/onnx_ptq/whisper/README.md +++ b/examples/windows/onnx_ptq/whisper/README.md @@ -174,7 +174,7 @@ These scripts are currently validated with following settings: - Calibration size - 32 - Calibration EPs - \[`cuda`, `cpu`\] - Audio dataset - `librispeech_asr` dataset (32 samples used for calibration, 100+ samples used for WER test) - - `load_dataset("librispeech_asr", "clean", split="test", trust_remote_code=True)` + - `load_dataset("librispeech_asr", "clean", split="test")` - Quantization support for various ONNX files - `encoder_model.onnx`, `decoder_model.onnx`, `decoder_with_past_model.onnx` - The `use_merged` argument in optimum-ORT's Whisper model API is kept False. 
diff --git a/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py b/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py index 7b3e3d319..03d2c4980 100644 --- a/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py +++ b/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py @@ -275,7 +275,7 @@ def main(args): processor = WhisperProcessor.from_pretrained(args.model_name, cache_dir=args.cache_dir) - asr_dataset = load_dataset("librispeech_asr", "clean", split="test", trust_remote_code=True) + asr_dataset = load_dataset("librispeech_asr", "clean", split="test") # asr_dataset = load_dataset("librispeech_asr", "all", split="test.clean") calib_data = None diff --git a/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py b/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py index 52d56fe04..a1f39b8f0 100644 --- a/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py +++ b/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py @@ -85,9 +85,7 @@ def main(args): print(f"\n\n-- Content of input audio-file = {prediction}\n\n") if args.run_wer_test: - librispeech_test_clean = load_dataset( - "librispeech_asr", "clean", split="test", trust_remote_code=True - ) + librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test") references = [] predictions = [] diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py index 4009b119e..4f50628dc 100644 --- a/modelopt/onnx/llm_export_utils/export_utils.py +++ b/modelopt/onnx/llm_export_utils/export_utils.py @@ -53,7 +53,7 @@ def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM: """Load HuggingFace model based on model type.""" print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}") self.hf_model = AutoModelForCausalLM.from_pretrained( - self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + 
self.hf_model_path, dtype=torch.float16, trust_remote_code=trust_remote_code ) return self.hf_model.eval().cuda() # type: ignore[attr-defined] @@ -86,7 +86,8 @@ def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple): outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True) hidden_states = outputs[0] - past_key_values = outputs.past_key_values.to_legacy_cache() + cache = outputs.past_key_values + past_key_values = tuple(zip(cache.key_cache, cache.value_cache)) logits = self.lm_head(hidden_states) return logits, past_key_values diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index ec62b86ff..456bf7b4a 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -15,10 +15,13 @@ """Model optimization and deployment subpackage for torch.""" +import sys as _sys import warnings as _warnings from packaging.version import Version as _Version from torch import __version__ as _torch_version +from torch import device as _device +from torch import dtype as _dtype from . import distill, nas, opt, peft, prune, quantization, sparsity, speculative, utils @@ -27,16 +30,88 @@ "nvidia-modelopt will drop torch<2.7 support in a future release.", DeprecationWarning ) + # Since `hf` dependencies are optional and users have pre-installed transformers, we need to ensure # correct version is installed to avoid incompatibility issues. 
+def _patch_transformers_compat(mod) -> None: + """Compatibility shims for names removed in transformers 5.0.""" + import torch.nn as _nn + + # AutoModelForVision2Seq -> AutoModelForImageTextToText + if not hasattr(mod, "AutoModelForVision2Seq") and hasattr(mod, "AutoModelForImageTextToText"): + mod.AutoModelForVision2Seq = mod.AutoModelForImageTextToText + + # get_parameter_device and get_parameter_dtype were removed in transformers 5.0 + modeling_utils = _sys.modules.get("transformers.modeling_utils") + if modeling_utils is not None: + if not hasattr(modeling_utils, "get_parameter_device"): + + def get_parameter_device(parameter: _nn.Module) -> _device: + return next(parameter.parameters()).device + + modeling_utils.get_parameter_device = get_parameter_device # type: ignore[attr-defined] + + if not hasattr(modeling_utils, "get_parameter_dtype"): + + def get_parameter_dtype(parameter: _nn.Module) -> _dtype: + return next(parameter.parameters()).dtype + + modeling_utils.get_parameter_dtype = get_parameter_dtype # type: ignore[attr-defined] + + if not hasattr(modeling_utils, "load_sharded_checkpoint"): + try: + from transformers.trainer_utils import ( + load_sharded_checkpoint as _load_sharded_checkpoint, + ) + + modeling_utils.load_sharded_checkpoint = _load_sharded_checkpoint # type: ignore[attr-defined] + except ImportError: + pass + + # AutoConfig.register raises ValueError when a model type is already built into + # transformers (e.g. exaone_moe added in 5.0). Older packages like TRT-LLM call + # register without exist_ok=True. Patch CONFIG_MAPPING.register to silently skip. 
+ try: + from transformers.models.auto.configuration_auto import CONFIG_MAPPING as _CONFIG_MAPPING + + _orig_cfg_register = _CONFIG_MAPPING.register + + def _patched_cfg_register(key, value, exist_ok=False): + _orig_cfg_register(key, value, exist_ok=True) + + _CONFIG_MAPPING.register = _patched_cfg_register + except Exception: + pass + + try: from transformers import __version__ as _transformers_version - if not (_Version("4.56") <= _Version(_transformers_version) < _Version("5.0")): + if _Version(_transformers_version) < _Version("4.56"): _warnings.warn( - f"transformers version {_transformers_version} is not tested with nvidia-modelopt and may cause issues. " - "Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.", + f"transformers {_transformers_version} is not tested with current version of modelopt and may cause issues." + " Please install recommended version with `pip install -U nvidia-modelopt[hf]` if working with HF models.", ) + elif _Version(_transformers_version) >= _Version("5.0"): + _warnings.warn( + "transformers>=5.0 support is experimental. 
Unified Hugging Face checkpoint export for quantized " + "checkpoints may not work for some models yet.", + ) + + # Temporary workaround until TRT-LLM container supports transformers 5.0 + if "transformers" in _sys.modules: + _patch_transformers_compat(_sys.modules["transformers"]) + else: + + class _TransformersCompatFinder: + def find_module(self, fullname, path=None): + if fullname == "transformers": + _sys.meta_path.remove(self) # type: ignore[arg-type] + import importlib as _importlib + + _patch_transformers_compat(_importlib.import_module(fullname)) + + _sys.meta_path.insert(0, _TransformersCompatFinder()) # type: ignore[arg-type] except ImportError: pass diff --git a/modelopt/torch/export/model_config_export.py b/modelopt/torch/export/model_config_export.py index b9acb80c8..ae92e2776 100644 --- a/modelopt/torch/export/model_config_export.py +++ b/modelopt/torch/export/model_config_export.py @@ -151,7 +151,8 @@ def torch_to_tensorrt_llm_checkpoint( model_metadata_config = model.config.__dict__ vocab_size = model.config.vocab_size hf_config = model.config - architecture = model.config.architectures[0] + architectures = getattr(model.config, "architectures", None) + architecture = architectures[0] if architectures else "" # For Baichuan 13B, we check if alibi is used with the alibi_mask property. 
if hasattr(model, "model") and hasattr(model.model, "alibi_mask"): diff --git a/modelopt/torch/export/tensorrt_llm_utils.py b/modelopt/torch/export/tensorrt_llm_utils.py index 75708dbcd..f49fcd489 100755 --- a/modelopt/torch/export/tensorrt_llm_utils.py +++ b/modelopt/torch/export/tensorrt_llm_utils.py @@ -48,6 +48,7 @@ "gemma": "GemmaForCausalLM", "gemma3": "Gemma3ForCausalLM", "gpt": "GPTForCausalLM", + "qwen": "QWenForCausalLM", "enc": "EncoderModel", "dec": "DecoderModel", "mllama": "MLLaMAModel", @@ -240,7 +241,7 @@ def convert_to_tensorrt_llm_config( layernorm_type_map = {i.name: i.value for i in LayerNormType} layernorm_position_map = {i.name: i.value for i in LayerNormPositionType} - if decoder_type in ["gpt", "gemma", "llama"]: + if decoder_type in ["gpt", "gemma", "llama", "qwen"]: pass elif decoder_type == "mpt": config.update( diff --git a/modelopt/torch/opt/plugins/transformers.py b/modelopt/torch/opt/plugins/transformers.py index 7cfdc8ca0..9cc729723 100644 --- a/modelopt/torch/opt/plugins/transformers.py +++ b/modelopt/torch/opt/plugins/transformers.py @@ -15,6 +15,7 @@ """ModelOpt plugin for enabling automatic save/restore of ModelOpt state for HuggingFace models.""" +import os import types from contextlib import contextmanager @@ -24,8 +25,9 @@ from modelopt.torch.utils import report_memory -from ..conversion import ModeloptStateManager +from ..conversion import ModeloptStateManager, load_modelopt_state from .huggingface import ( + _get_modelopt_state_path, _new_save_pretrained, _patch_model_init_for_modelopt, enable_huggingface_checkpointing, @@ -60,6 +62,39 @@ def _undo_torch_init_override_by_transformers(): setattr(torch.nn.init, name, init_func) +def _restore_qtensor_wrappers(model, model_path): + """Re-wrap QTensorWrapper weights that were replaced during HF weight loading. + + Transformers>=5.0 uses ``setattr`` to load weights, which replaces ``QTensorWrapper`` + objects with plain ``Parameter`` tensors. 
The compressed data is loaded correctly but + the wrapper metadata (original shape, dtype, qtensor class) is lost. This function + reads the saved ``q_tensor_state`` from ``modelopt_state.pth`` and re-wraps the affected + weights. + """ + modelopt_state_path = _get_modelopt_state_path(model_path) + if not os.path.isfile(modelopt_state_path): + return + + from modelopt.torch.quantization.nn.modules.quant_linear import RealQuantLinear + from modelopt.torch.quantization.qtensor import QTensorWrapper + + state = load_modelopt_state(modelopt_state_path) + for _, mode_config in state["modelopt_state_dict"]: + q_tensor_state = mode_config.get("metadata", {}).get("q_tensor_state", {}) + if not q_tensor_state: + continue + for name, module in model.named_modules(): + if ( + isinstance(module, RealQuantLinear) + and name in q_tensor_state + and not isinstance(module.weight, QTensorWrapper) + ): + module._parameters["weight"] = QTensorWrapper( + qtensor=module.weight.data, + metadata=q_tensor_state[name]["metadata"], + ) + + def _new_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwargs): """Patch for `cls.from_pretrained` method to restore ModelOpt state.""" with _patch_model_init_for_modelopt( @@ -69,6 +104,8 @@ def _new_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwargs) pretrained_model_name_or_path, *args, **kwargs ) + _restore_qtensor_wrappers(model, pretrained_model_name_or_path) + return model @@ -93,12 +130,12 @@ def _save_pretrained_with_checks(self, save_directory, *args, **kwargs): # [Fix for huggingface bug] deepspeed zero3 training backend only loads params into the model from # state_dict, but not buffers. So lets explicitly load the buffers into the model from state_dict. 
-def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict): +def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict, load_config=None): buffer_names = [name for name, _ in model_to_load.named_buffers()] buffer_state_dict = {k: v for k, v in state_dict.items() if k in buffer_names} model_to_load.load_state_dict(buffer_state_dict, strict=False) return tf_modeling_utils._modelopt_cache["_load_state_dict_into_zero3_model"]( - model_to_load, state_dict + model_to_load, state_dict, load_config ) diff --git a/modelopt/torch/quantization/backends/nvfp4_gemm.py b/modelopt/torch/quantization/backends/nvfp4_gemm.py index ffc18fea3..e7d2b90ff 100644 --- a/modelopt/torch/quantization/backends/nvfp4_gemm.py +++ b/modelopt/torch/quantization/backends/nvfp4_gemm.py @@ -176,7 +176,7 @@ def backward(ctx, grad_outputs): grad_weight = grad_outputs.reshape(-1, grad_outputs.shape[-1]).T @ input_tensor.reshape( -1, input_tensor.shape[-1] ) - if ctx.compute_bias_grad is not None: + if ctx.compute_bias_grad: # Sum all dimensions except the last one grad_bias = grad_outputs.sum(dim=list(range(grad_outputs.dim() - 1))) diff --git a/modelopt/torch/quantization/nn/modules/quant_linear.py b/modelopt/torch/quantization/nn/modules/quant_linear.py index bcb71e4c9..bb65d5907 100644 --- a/modelopt/torch/quantization/nn/modules/quant_linear.py +++ b/modelopt/torch/quantization/nn/modules/quant_linear.py @@ -246,26 +246,39 @@ def __init__(self, weight_quantizer: TensorQuantizer, *args, **kwargs): self.weight_quantizer = weight_quantizer def __setitem__(self, key, value): - if ( - key == "weight" - and self.weight_quantizer - and self.weight_quantizer.is_enabled - and not self.weight_quantizer._fake_quant - and value.element_size() > 1 - ): - # reset the amax for later calibration + if key == "weight" and not isinstance(value, QTensorWrapper): + existing = self.get("weight") if ( - self.weight_quantizer.amax is not None - and self.weight_quantizer.amax.is_meta + 
isinstance(existing, QTensorWrapper) + and not existing.is_meta + and existing.shape == value.shape ): - delattr(self.weight_quantizer, "_amax") - self.weight_quantizer.amax = self.weight_quantizer._get_amax(value) - self.weight_quantizer._calibrator.reset() - # compress the weight - real_quant_tensor = self.weight_quantizer(value) - real_quant_value = QTensorWrapper(real_quant_tensor) - del value # delete the original weight to save memory - value = real_quant_value + # Loading a compressed weight (e.g. from safetensors in transformers>=5.0 + # which replaces parameters via setattr rather than copy_). Preserve the + # QTensorWrapper type and metadata. + super().__setitem__( + key, QTensorWrapper(qtensor=value.data, metadata=existing.metadata) + ) + return + if ( + self.weight_quantizer + and self.weight_quantizer.is_enabled + and not self.weight_quantizer._fake_quant + and value.element_size() > 1 + ): + # reset the amax for later calibration + if ( + self.weight_quantizer.amax is not None + and self.weight_quantizer.amax.is_meta + ): + delattr(self.weight_quantizer, "_amax") + self.weight_quantizer.amax = self.weight_quantizer._get_amax(value) + self.weight_quantizer._calibrator.reset() + # compress the weight + real_quant_tensor = self.weight_quantizer(value) + real_quant_value = QTensorWrapper(real_quant_tensor) + del value # delete the original weight to save memory + value = real_quant_value super().__setitem__(key, value) # Monkey patch the _parameters.__setitem__ to real quant the weight when loading diff --git a/modelopt/torch/quantization/plugins/accelerate.py b/modelopt/torch/quantization/plugins/accelerate.py index 59731cc8a..13999df0f 100644 --- a/modelopt/torch/quantization/plugins/accelerate.py +++ b/modelopt/torch/quantization/plugins/accelerate.py @@ -190,8 +190,10 @@ def patched_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwar with init_empty_weights(): # Fix torch_dtype to match original model - torch_dtype = 
kwargs.get("torch_dtype", getattr(config, "torch_dtype", torch.float16)) - model = cls.from_config(config, torch_dtype=torch_dtype) + torch_dtype = kwargs.get( + "dtype", kwargs.get("torch_dtype", getattr(config, "torch_dtype", torch.float16)) + ) + model = cls.from_config(config, dtype=torch_dtype) mtq.quantize(model, quant_cfg) mtq.compress(model, config=mtq.CompressConfig(quant_gemm=quant_gemm)) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index ead989bc6..00a8df4f8 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -172,14 +172,20 @@ def forward(self, *args, **kwargs): The forward method is used to patch the attention interface with _quantized_attention. Once output tensors are generated, it restores the original attention interface. """ + # In transformers>=5.0 some attention classes (e.g. BertAttention) no longer store + # `self.config` directly; fall back to searching child modules for a config attribute. 
+ _config = getattr(self, "config", None) + if _config is None: + _config = next( + (getattr(m, "config", None) for m in self.children() if hasattr(m, "config")), + None, + ) + _attn_impl = getattr(_config, "_attn_implementation", None) if _config is not None else None def _is_eager_attention(): - if self.config._attn_implementation == "eager": + if _attn_impl is None or _attn_impl == "eager": return True - return bool( - self.config._attn_implementation == "sdpa" - and kwargs.get("output_attentions", False) - ) + return bool(_attn_impl == "sdpa" and kwargs.get("output_attentions", False)) # Get the original transformers module before wrapped in any ModelOpt DynamicModule module: ModuleType = inspect.getmodule(self.get_attn_type(self)) @@ -188,7 +194,7 @@ def _is_eager_attention(): original_attention_interface = ( module.eager_attention_forward if _is_eager_attention() - else module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + else module.ALL_ATTENTION_FUNCTIONS[_attn_impl] ) patch_fn = partial(self._quantized_attention, original_attention_interface) @@ -201,7 +207,7 @@ def _is_eager_attention(): ) module.eager_attention_forward = patch_fn # type: ignore[attr-defined] else: - module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] = patch_fn + module.ALL_ATTENTION_FUNCTIONS[_attn_impl] = patch_fn try: outputs = super().forward(*args, **kwargs) @@ -210,9 +216,7 @@ def _is_eager_attention(): if _is_eager_attention(): module.eager_attention_forward = original_attention_interface # type: ignore[attr-defined] else: - module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] = ( - original_attention_interface - ) + module.ALL_ATTENTION_FUNCTIONS[_attn_impl] = original_attention_interface return outputs @@ -333,10 +337,14 @@ class HFParallelLinear(torch.nn.Linear, DynamicModule): shard = None def _setup(self): - assert self.weight.placements == self.shard, ( - f"Received unexpected shard {self.weight.placements} for {self}" - ) - tp_group = 
self.weight.device_mesh.get_group() + if isinstance(self.weight, torch.distributed.tensor.DTensor): # transformers<5.0 + assert self.weight.placements == self.shard, ( + f"Received unexpected shard {self.weight.placements} for {self}" + ) + device_mesh = self.weight.device_mesh + else: # transformers>=5.0: weights are plain Parameters, mesh is on the module + device_mesh = self._hf_device_mesh + tp_group = device_mesh.get_group() self._parallel_state = ParallelState(data_parallel_group=-1, tensor_parallel_group=tp_group) @classmethod @@ -371,14 +379,17 @@ def fold_weight(self, keep_attrs: bool = False): @contextmanager def enable_weight_access_and_writeback(self): - assert self.weight.placements == self.shard, ( - f"Received unexpected shard {self.weight.placements} for {self}" - ) - weight = self.weight - # TODO: To support TP + FSDP, we need to redistribute the tensor with replicate instead of shard - self.weight = nn.Parameter(weight.to_local()) - yield - self.weight = weight + if isinstance(self.weight, torch.distributed.tensor.DTensor): # transformers<5.0 + assert self.weight.placements == self.shard, ( + f"Received unexpected shard {self.weight.placements} for {self}" + ) + weight = self.weight + # TODO: To support TP + FSDP, we need to redistribute the tensor with replicate instead of shard + self.weight = nn.Parameter(weight.to_local()) + yield + self.weight = weight + else: # transformers>=5.0: weights are already plain Parameters + yield @QuantModuleRegistry.register({HFColumnParallelLinear: "HFColumnParallelLinear"}) @@ -523,7 +534,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: super().forward(hidden_states) self.gate.top_k = original_top_k else: - # Path for transformers < 5.0 + # Path for transformers<5.0 if hasattr(self, "gate") and hasattr(self.gate, "top_k"): top_k_owner = self.gate else: @@ -591,22 +602,20 @@ def _setup(self): """Modify the DbrxExpert.""" # No setup is needed for DbrxExpert, we only need to update 
DbrxExpertGLU - # forward method copied from the original dbrx repo - https://github.com/databricks/dbrx/blob/a3200393/model/modeling_dbrx.py#L795 def forward( self, x: torch.Tensor, - weights: torch.Tensor, - top_weights: torch.Tensor, top_experts: torch.LongTensor, + top_weights: torch.Tensor, ) -> torch.Tensor: bsz, q_len, hidden_size = x.shape x = x.view(-1, hidden_size) out = torch.zeros_like(x) - expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute( + expert_mask = nn.functional.one_hot(top_experts, num_classes=self.num_experts).permute( 2, 1, 0 ) - for expert_idx in range(self.moe_num_experts): + for expert_idx in range(self.num_experts): topk_idx, token_idx = torch.where(expert_mask[expert_idx]) if token_idx.shape[0] == 0: continue @@ -636,41 +645,48 @@ def _copy_weights(modules, weights): with torch.no_grad(): module.weight.copy_(weights[expert_idx].detach()) + # In transformers 5.0, DbrxExpertGLU.forward uses raw matmul: x @ w1[i] where + # w1[i] has shape (ffn_hidden_size, hidden_size). To match via F.linear (which + # computes x @ W.T), we store weights transposed: W = w1[i].T. 
self.w1_linear = nn.ModuleList( [ - nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) + nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.w1_linear, - self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), + self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( + 1, 2 + ), ) delattr(self, "w1") self.v1_linear = nn.ModuleList( [ - nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) + nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.v1_linear, - self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), + self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( + 1, 2 + ), ) delattr(self, "v1") + # w2: down_proj uses intermediate.matmul(w2[i].t()) = F.linear(intermediate, w2[i]) + # so W = w2[i] directly (no extra transpose needed). self.w2_linear = nn.ModuleList( [ - nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) + nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.w2_linear, - self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( - 1, 2 - ), + self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), ) delattr(self, "w2") @@ -870,11 +886,18 @@ def num_experts(self): @property def top_k(self): - return self.router.moe_top_k + # In older transformers, top_k was stored on DbrxRouter as moe_top_k. + # In transformers 5.0, DbrxFFN stores it as a plain attribute (top_k). 
+ if hasattr(self.router, "moe_top_k"): + return self.router.moe_top_k + return self.__dict__.get("top_k", 1) @top_k.setter def top_k(self, value): - self.router.moe_top_k = value + if hasattr(self.router, "moe_top_k"): + self.router.moe_top_k = value + else: + self.__dict__["top_k"] = value @contextmanager @@ -897,10 +920,7 @@ def patch_compressed_linear_loading(): with patch_compressed_linear_loading(): model = AutoModelForCausalLM.from_pretrained( - ckpt_path, - device_map="auto", - trust_remote_code=True, - torch_dtype="auto", + ckpt_path, device_map="auto", trust_remote_code=True, dtype="auto" ) """ try: diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index b0d278650..253632784 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -29,7 +29,7 @@ import modelopt.torch.quantization as mtq from modelopt.torch.distill.plugins.huggingface import KDTrainer from modelopt.torch.opt.plugins import ModelOptHFTrainer -from modelopt.torch.utils import print_rank_0 +from modelopt.torch.utils import get_module_device, print_rank_0 from ..config import QuantizeConfig from ..nn import TensorQuantizer @@ -344,8 +344,10 @@ def _load_best_model(self, *args, **kwargs): ), "Some base_layer parameters are not frozen" adapter_name = self.model.active_adapters()[0] + device = get_module_device(self.model) self.model.delete_adapter(adapter_name) self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) + self.model.to(device) else: super()._load_best_model(*args, **kwargs) diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 4340b8dc1..22f207964 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -524,7 +524,14 @@ def sync_moe_expert_amax(experts): 2. 
For any ``weight_quantizer`` that is enabled but has ``amax is None`` (expert received no tokens during calibration), runs a weight-only ``max_calibrate`` to populate the missing amax. + + No-op for batched expert modules (e.g. transformers>=5.0 ``Qwen3MoeExperts``) + that store all expert weights in a single 3D tensor without per-expert sub-modules. """ + if not hasattr(experts, "__iter__"): + # transformers>=5.0: batched experts, no per-expert quantizers + return + from ..nn import TensorQuantizer amax_dict: dict[str, torch.Tensor] = {} diff --git a/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py b/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py index 28c18943a..a33938b05 100644 --- a/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py +++ b/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py @@ -139,7 +139,7 @@ def forward_loop(model) -> float: model = AutoModelForCausalLM.from_pretrained( model_path, attn_implementation="eager", # Required for sparse attention - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, ) This is because sparse attention works by patching torch.nn.functional.softmax, diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index 25946f2c1..5fd3fcba9 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -75,11 +75,6 @@ CACHED_SHARD_TTT_MASKS = {} -def _get_empty_cache(config): - """Return an empty cache. 
Handle different versions of transformers for unit tests.""" - return DynamicCache(config=config) - - @MedusaDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"}) class HFMedusaModel(MedusaModel): """Medusa Model Class for huggingface models.""" @@ -927,9 +922,9 @@ def forward( ) if not isinstance(past_key_values, Cache): - past_key_values = _get_empty_cache(self._base_llm_config) + past_key_values = DynamicCache(config=self._base_llm_config) if not isinstance(eagle_cache, Cache): - eagle_cache = _get_empty_cache(self.eagle_module.config) + eagle_cache = DynamicCache(config=self.eagle_module.config) past_key_values.eagle_cache = eagle_cache # ====Prepare inputs for the first eagle forward pass==== diff --git a/modelopt/torch/speculative/utils.py b/modelopt/torch/speculative/utils.py index 72c5b5dbc..396e998d6 100644 --- a/modelopt/torch/speculative/utils.py +++ b/modelopt/torch/speculative/utils.py @@ -474,10 +474,10 @@ def enable_cp_ttt_patch(): modelopt.torch.speculative.plugins.transformers.ENABLE_CP_TTT_PATCH = False -def load_vlm_or_llm_with_kwargs(model_name_or_path: str, **kwargs): +def load_vlm_or_llm_with_kwargs(model_name_or_path: str, trust_remote_code: bool = False, **kwargs): """Load a VLM or LLM with kwargs. 
Returns the model and model config.""" model_config = transformers.AutoConfig.from_pretrained( - model_name_or_path, trust_remote_code=True + model_name_or_path, trust_remote_code=trust_remote_code ) if "vl" in model_config.model_type.lower(): model_cls = transformers.AutoModelForVision2Seq @@ -488,7 +488,9 @@ def load_vlm_or_llm_with_kwargs(model_name_or_path: str, **kwargs): if hasattr(model_config, "layer_types"): kwargs["layer_types"] = [] - return model_config, model_cls.from_pretrained(model_name_or_path, **kwargs) + return model_config, model_cls.from_pretrained( + model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) @contextlib.contextmanager diff --git a/modelopt/torch/trace/plugins/transformers.py b/modelopt/torch/trace/plugins/transformers.py index f07a37601..ad7a8cf01 100644 --- a/modelopt/torch/trace/plugins/transformers.py +++ b/modelopt/torch/trace/plugins/transformers.py @@ -15,8 +15,11 @@ """Utilities to describe symbols in the dynamic attention module.""" +import torch +import transformers +from packaging.version import Version from torch import nn -from transformers.models.bert.modeling_bert import BertAttention +from transformers.models.bert.modeling_bert import BertAttention, BertLayer from transformers.models.gptj.modeling_gptj import GPTJAttention from ..symbols import Symbol, SymInfo, SymMap @@ -56,3 +59,57 @@ def get_hf_attn_sym_info_sortable(mod: nn.Module) -> SymInfo: @SymMap.register([GPTJAttention]) def get_hf_attn_sym_info_unsortable(mod: nn.Module) -> SymInfo: return get_hf_attn_sym_info(sortable_attn=True) + + +# In transformers>=5.0, BertLayer.forward uses tuple unpacking on the BertAttention output +# (e.g. `self_attn_out, _ = self.attention(...)`), which FX symbolic tracing cannot handle when +# BertAttention is a registered leaf (the proxy is not iterable). 
Patch BertLayer.forward to use +# indexing instead, and call feed_forward_chunk directly (equivalent to apply_chunking_to_forward +# with chunk_size=0, which is the default for BERT). +if Version(transformers.__version__) >= Version("5.0"): + + def _fx_friendly_bert_layer_forward( + self, + hidden_states: torch.Tensor, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + cache_position=None, + **kwargs, + ): + # Use indexing instead of tuple-unpacking so FX can trace through BertLayer + # when BertAttention is a registered leaf (returns an opaque Proxy). + # Accept **kwargs so that a parent trace (e.g. BertEncoder) passing extra kwargs + # like position_ids does not mark BertLayer as failed. However, do NOT forward + # **kwargs into self.attention: FX represents **kwargs as a Proxy(_kwargs), so + # unpacking it with ** would trigger "Proxy cannot be iterated". Additionally, + # BertSelfAttention ignores these kwargs (e.g. position_ids) in practice. + _attn_outputs = self.attention( + hidden_states, + attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = _attn_outputs[0] + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with" + " cross-attention layers by setting `config.add_cross_attention=True`" + ) + _cross_outputs = self.crossattention( + attention_output, + None, + encoder_hidden_states, + encoder_attention_mask, + past_key_values=past_key_values, + ) + attention_output = _cross_outputs[0] + + # Call feed_forward_chunk directly (equivalent to apply_chunking_to_forward when + # chunk_size_feed_forward=0, which is the BERT default). 
+ return self.feed_forward_chunk(attention_output) + + BertLayer.forward = _fx_friendly_bert_layer_forward diff --git a/modelopt/torch/utils/speech_dataset_utils.py b/modelopt/torch/utils/speech_dataset_utils.py index a71d73773..ef0660175 100644 --- a/modelopt/torch/utils/speech_dataset_utils.py +++ b/modelopt/torch/utils/speech_dataset_utils.py @@ -48,9 +48,7 @@ def _get_speech_dataset(dataset_name: str, num_samples: int): # Use streaming can reduce the downloading time for large datasets dataset = load_dataset( - **SUPPORTED_SPEECH_DATASET_CONFIG[dataset_name]["config"], - trust_remote_code=True, - streaming=True, + **SUPPORTED_SPEECH_DATASET_CONFIG[dataset_name]["config"], streaming=True ) else: raise NotImplementedError( diff --git a/pyproject.toml b/pyproject.toml index 96490dff0..52ee43ccb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,7 @@ hf = [ "nltk", "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export - "transformers>=4.56,<5.0", # Should match modelopt/torch/__init__.py and tox.ini + "transformers>=4.56.0", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] dev-lint = [ diff --git a/tests/_test_utils/examples/models.py b/tests/_test_utils/examples/models.py index abedd7b2a..8bf2b95a6 100644 --- a/tests/_test_utils/examples/models.py +++ b/tests/_test_utils/examples/models.py @@ -64,8 +64,8 @@ def _select_path(remote_id: str, local_id: str) -> str: ) QWEN_VL_PATH = _select_path( - remote_id="Qwen/Qwen2-VL-2B-Instruct", - local_id="Qwen2-VL-2B-Instruct", + remote_id="Qwen/Qwen3-VL-2B-Instruct", + local_id="Qwen3-VL-2B-Instruct", ) # Diffusers diff --git a/tests/_test_utils/torch/quantization/tensor_quantizer_common.py b/tests/_test_utils/torch/quantization/tensor_quantizer_common.py index ad2722dca..855919271 100644 --- a/tests/_test_utils/torch/quantization/tensor_quantizer_common.py +++ 
b/tests/_test_utils/torch/quantization/tensor_quantizer_common.py @@ -144,10 +144,9 @@ def test_max_calib(self): rtol=0, ) - @pytest.mark.manual(reason="slow test, run with --run-manual") def test_entropy_and_percentile_calib(self): """Don't really have a good way to test it.""" - quant_attr_cfg1 = QuantizerAttributeConfig(calib_method="histogram") + quant_attr_cfg1 = QuantizerAttributeConfig(calibrator="histogram") quantizer1 = TensorQuantizer(quant_attr_cfg1, if_calib=True, if_quant=False).to(self.device) x_1 = torch.rand(3, 6, 7, 7).to(self.device) diff --git a/tests/examples/llm_ptq/test_llm_ptq.py b/tests/examples/llm_ptq/test_llm_ptq.py index f5d0b39c1..358516e99 100644 --- a/tests/examples/llm_ptq/test_llm_ptq.py +++ b/tests/examples/llm_ptq/test_llm_ptq.py @@ -12,10 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - import pytest -from _test_utils.examples.llm_ptq_utils import PTQCommand, WithRequirements +import transformers +from _test_utils.examples.llm_ptq_utils import PTQCommand from _test_utils.examples.models import ( BART_PATH, MIXTRAL_PATH, @@ -23,6 +22,7 @@ TINY_LLAMA_PATH, WHISPER_PATH, ) +from packaging.version import Version @pytest.mark.parametrize( @@ -36,18 +36,9 @@ def test_ptq_bart(command): command.run(BART_PATH) -class TestT5(WithRequirements): - requirements = [("transformers", "4.48.0")] - - @pytest.mark.parametrize( - "command", - [ - PTQCommand(quant="fp8", min_sm=89), - ], - ids=PTQCommand.param_str, - ) - def test_ptq_t5(self, command): - command.run(T5_PATH) +@pytest.mark.parametrize("command", [PTQCommand(quant="fp8", min_sm=89)], ids=PTQCommand.param_str) +def test_ptq_t5(command): + command.run(T5_PATH) @pytest.mark.parametrize( @@ -61,22 +52,20 @@ def test_ptq_mixtral(command): command.run(MIXTRAL_PATH) -class TestWhisper(WithRequirements): - requirements = [ - ("librosa", None), - 
("soundfile", None), - ] - - @pytest.mark.parametrize( - "command", - [ - # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size - PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), - ], - ids=PTQCommand.param_str, - ) - def test_ptq_whisper(self, command): - command.run(WHISPER_PATH) +@pytest.mark.skipif( + Version(transformers.__version__) >= Version("5.0"), + reason="Whisper requires torchcodec and other system packages for transformers>=5.0", +) +@pytest.mark.parametrize( + "command", + [ + # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size + PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), + ], + ids=PTQCommand.param_str, +) +def test_ptq_whisper(command): + command.run(WHISPER_PATH) @pytest.mark.parametrize( diff --git a/tests/examples/llm_qat/test_llm_qat.py b/tests/examples/llm_qat/test_llm_qat.py index ebdb67024..5a0e7ad44 100644 --- a/tests/examples/llm_qat/test_llm_qat.py +++ b/tests/examples/llm_qat/test_llm_qat.py @@ -17,6 +17,7 @@ import pytest import torch from _test_utils.examples.run_command import run_example_command +from _test_utils.torch.misc import minimum_sm # fmt: off @@ -98,7 +99,7 @@ def test_llama_lora_qat_nvfp4(tiny_llama_path, tmp_path): ] ) - +@minimum_sm(90) def test_llama_qlora_nvfp4(tiny_llama_path, tmp_path): _run_command( [ diff --git a/tests/gpu/torch/quantization/test_gptq.py b/tests/gpu/torch/quantization/test_gptq.py index 0c60bcd00..d43177cae 100644 --- a/tests/gpu/torch/quantization/test_gptq.py +++ b/tests/gpu/torch/quantization/test_gptq.py @@ -163,9 +163,7 @@ def test_gptq_e2e_flow(quant_cfg): model = AutoModelForCausalLM.from_pretrained( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto" ) - tokenizer = AutoTokenizer.from_pretrained( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", trust_remote_code=True - ) + tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") # can't set attribute 'pad_token' for "" 
# We skip this step for Nemo models diff --git a/tests/unit/torch/opt/plugins/test_transformers_save_load.py b/tests/unit/torch/opt/plugins/test_transformers_save_load.py index 25b182b9b..fced5734e 100644 --- a/tests/unit/torch/opt/plugins/test_transformers_save_load.py +++ b/tests/unit/torch/opt/plugins/test_transformers_save_load.py @@ -17,6 +17,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import pytest +import torch from _test_utils.torch.opt.utils import apply_mode_with_sampling from _test_utils.torch.transformers_models import ( create_tiny_llama_dir, @@ -27,7 +28,7 @@ @pytest.mark.parametrize("model_cls", [LlamaForCausalLM, AutoModelForCausalLM]) def test_causal_lm_save_restore(tmp_path, model_cls): - tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128, dtype=torch.float32) model_ref = model_cls.from_pretrained(tiny_llama_dir) # TODO: Add calibrate, compress mode to the test model_ref = apply_mode_with_sampling( @@ -41,7 +42,7 @@ def test_causal_lm_save_restore(tmp_path, model_cls): def test_causal_lm_from_config(tmp_path): """Test loading a model using from_config after applying optimizations""" - tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128, dtype=torch.float32) model_ref = AutoModelForCausalLM.from_pretrained(tiny_llama_dir) model_ref = apply_mode_with_sampling( diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 33730409a..d9b1f9e43 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import os import warnings from contextlib import nullcontext @@ -28,6 +27,7 @@ get_tiny_qwen3_moe, tf_modelopt_state_and_output_tester, ) +from packaging.version import Version import modelopt.torch.quantization as mtq from modelopt.torch.quantization.nn import QuantLinear, QuantModuleRegistry @@ -105,12 +105,17 @@ def test_convert_conv1d(): assert torch.allclose(out_1, out_2) +@pytest.mark.skipif( + Version(transformers.__version__) < Version("5.0"), + reason="test_dbrx is not supported for transformers<5.0", +) def test_dbrx(): assert DbrxExperts in QuantModuleRegistry assert DbrxExpertGLU in QuantModuleRegistry config = DbrxConfig( - ffn_config=DbrxFFNConfig(ffn_hidden_size=8, moe_num_experts=2), hidden_size=32 + ffn_config=DbrxFFNConfig(ffn_hidden_size=8, moe_num_experts=2, hidden_size=32), + hidden_size=32, ) model_ref = DbrxFFN(config) @@ -131,14 +136,17 @@ def test_dbrx(): assert hasattr(expertglu_test, "v1_linear") and not hasattr(expertglu_test, "v1") assert hasattr(expertglu_test, "w2_linear") and not hasattr(expertglu_test, "w2") + # Weights are stored transposed (W = w1[i].T) to match F.linear semantics with + # transformers 5.0's raw matmul: x @ w1[i] = F.linear(x, w1[i].T) assert torch.allclose( - torch.concat(list(expertglu_test.w1_linear.parameters()), dim=0), + torch.concat([m.weight.T for m in expertglu_test.w1_linear], dim=0), expertglu_ref.w1, ) mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) - x = torch.randn(1, 4, 32) + # In transformers 5.0, the FFN input dimension is ffn_hidden_size (not hidden_size) + x = torch.randn(1, 4, 8) out_1 = model_ref(x) out_2 = model_test(x) assert torch.allclose(out_1[0], out_2[0]) @@ -190,7 +198,7 @@ def forward_step(model, batch): ], ) def test_quantized_transformers_save_restore(tmp_path, model_cls, quant_config): - tiny_llama_dir = create_tiny_llama_dir(tmp_path) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, dtype=torch.float32) # update config to fit test cases if quant_config == 
mtq.INT4_AWQ_CFG: quant_config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 16} diff --git a/tests/unit/torch/quantization/test_calibrator.py b/tests/unit/torch/quantization/test_calibrator.py index 4cb745891..19c86b0b9 100644 --- a/tests/unit/torch/quantization/test_calibrator.py +++ b/tests/unit/torch/quantization/test_calibrator.py @@ -88,8 +88,8 @@ def test_track_amax_raises(self): max_calibrator.collect(x_3) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestHistogramCalibrator: + @pytest.mark.skip(reason="TODO: Fix assertions in test_grow") def test_grow(self, verbose): x_1 = torch.tensor([0, 255, 255, 255, 255, 255]) x_2 = torch.tensor([0, 255, 255, 255, 255, 256]) @@ -181,7 +181,6 @@ def test_torch_hist(self): ) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestEntropyCalibrator: def test_one_tensor(self, verbose): hist_calibrator = calib.HistogramCalibrator( @@ -244,7 +243,6 @@ def test_repr(self): repr(hist_calibrator) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestMSECalibrator: def test_one_tensor(self, verbose): calibrator = calib.HistogramCalibrator(8, None, False, num_bins=32) @@ -299,7 +297,6 @@ def test_repr(self): repr(calibrator) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestPercentileCalibrator: def test_one_tensor(self, verbose): calibrator = calib.HistogramCalibrator(8, None, False) @@ -359,7 +356,6 @@ def test_range(self): calibrator.compute_amax("percentile", percentile=200) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestCalibrateWeights: def test_max(self): ref_lenet = QuantConvLinear()