Skip to content

Commit 6d45b3a

Browse files
Simplify megatron to hf export logic
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent c750237 commit 6d45b3a

4 files changed

Lines changed: 12 additions & 108 deletions

File tree

examples/puzzletron/mbridge_distillation/README.md

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,6 @@ torchrun --nproc_per_node=8 examples/puzzletron/mbridge_distillation/distill_hf.
7171
--data_paths 1.0 /path/to/hf_datasets/wikitext-103-v1/Salesforce--wikitext_wikitext-103-v1_train_text_document \
7272
--output_dir /path/to/distilled/checkpoint \
7373
--hf_export_path /path/to/exported/hf/model \
74-
--hf_model meta-llama/Llama-3.1-8B-Instruct \
7574
--seq_length 4096 \
7675
--tp_size 8 \
7776
--pp_size 1 \

examples/puzzletron/mbridge_distillation/distill_hf.py

Lines changed: 11 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -44,10 +44,9 @@
4444
from megatron.core.datasets.utils import get_blend_from_list
4545
from megatron.core.distributed import DistributedDataParallelConfig
4646

47+
# Import to register heterogeneous bridges (side effect)
48+
import modelopt.torch.puzzletron.export.mbridge # noqa: F401
4749
import modelopt.torch.utils.distributed as dist
48-
from modelopt.torch.puzzletron.export.mbridge.export_mbridge_to_hf import (
49-
export_to_hf_and_copy_config,
50-
)
5150
from modelopt.torch.utils import print_rank_0
5251

5352
SEED = 1234
@@ -129,13 +128,6 @@ def get_args():
129128
"If provided, exports last iteration checkpoint to HF format after distillation."
130129
),
131130
)
132-
parser.add_argument(
133-
"--hf_model",
134-
type=str,
135-
required=True,
136-
help="HuggingFace model ID to use as template for export (e.g., meta-llama/Llama-3.1-8B-Instruct). "
137-
"Should match the base architecture of the student model.",
138-
)
139131
args = parser.parse_args()
140132

141133
# Sanity checks
@@ -272,13 +264,15 @@ def _build_model_provider(hf_path):
272264

273265
# Only rank 0 exports
274266
if is_rank_0:
275-
export_to_hf_and_copy_config(
276-
student_hf_path=args.student_hf_path,
277-
checkpoint_dir=checkpoint_dir,
278-
train_iters=args.train_iters,
279-
hf_export_path=args.hf_export_path,
280-
hf_model=args.hf_model,
281-
trust_remote_code=args.trust_remote_code,
267+
bridge = AutoBridge.from_hf_pretrained(
268+
args.student_hf_path, trust_remote_code=args.trust_remote_code
269+
)
270+
os.makedirs(os.path.join(args.hf_export_path, "subblocks_safetensors"), exist_ok=True)
271+
bridge.export_ckpt(
272+
megatron_path=f"{checkpoint_dir}/iter_{args.train_iters:07d}",
273+
hf_path=args.hf_export_path,
274+
show_progress=True,
275+
strict=True,
282276
)
283277

284278

modelopt/torch/puzzletron/export/mbridge/export_mbridge_to_hf.py

Lines changed: 0 additions & 89 deletions
This file was deleted.

tests/examples/puzzletron/mbridge_distillation/test_distill_hf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ def test_distill_hf(project_root_path: Path, tmp_path: Path):
3232
and runs mbridge distillation. The models are created with reduced size for faster testing.
3333
Models are converted to include block_configs.
3434
"""
35+
tmp_path = Path("/tmp/test_distill_hf")
3536
# Prepare student and teacher models
3637
student_hf_path, teacher_hf_path = _prepare_student_and_teacher_models(
3738
project_root_path, tmp_path
@@ -74,7 +75,6 @@ def test_distill_hf(project_root_path: Path, tmp_path: Path):
7475
eval_iters=0,
7576
log_interval=5,
7677
hf_export_path=hf_export_dir,
77-
hf_model="Qwen/Qwen3-0.6B",
7878
)
7979

8080
run_example_command(cmd_parts, example_path="puzzletron/mbridge_distillation")

0 commit comments

Comments (0)