         UserWarning,
     )
 
+from instructlab.training.hpu_utils import is_torch_hpu_available
+
+if is_torch_hpu_available():
+    import habana_frameworks.torch.core as htcore
+    import habana_frameworks.torch.distributed.hccl
+    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+    adapt_transformers_to_gaudi()
+
 # Third Party
 from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM
 from torch.utils.data import DataLoader
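Note: `instructlab.training.hpu_utils` is added elsewhere in this commit and does not appear in this diff. As a rough idea of what the gate does, a minimal sketch (an assumption, not the committed file) is:

    # Hypothetical sketch of instructlab/training/hpu_utils.py, not the committed file.
    from functools import lru_cache


    @lru_cache(maxsize=None)
    def is_torch_hpu_available() -> bool:
        # Gaudi support is usable only if the Habana PyTorch bridge imports cleanly.
        try:
            import habana_frameworks.torch.core  # noqa: F401
        except ImportError:
            return False
        return True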
@@ -221,7 +229,22 @@ def setup_model(
     )
     model.config.eos_token_id = tokenizer.eos_token_id
 
-    if "ForCausalLM" not in model.__class__.__name__:
+    if not is_torch_hpu_available():
+        class_name = model.__class__.__name__
+    else:
+        class_name = model._orig_mod.__class__.__name__ if model.__class__.__name__ == 'OptimizedModule' else model.__class__.__name__
+
+    replace_no_split_modules = {
+        'GaudiLlamaForCausalLM': ['GaudiLlamaDecoderLayer',]
+    }
+
+    if class_name in replace_no_split_modules:
+        if model.__class__.__name__ == 'OptimizedModule':
+            model._orig_mod._no_split_modules = replace_no_split_modules[class_name]
+        else:
+            model._no_split_modules = replace_no_split_modules[class_name]
+
+    if "ForCausalLM" not in class_name:
         raise ValueError(
             f"Model class name: {model.__class__.__name__} is not supported."
         )
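The `_no_split_modules` override above matters because the FSDP auto-wrap policy is derived from that list; once `adapt_transformers_to_gaudi()` swaps in the Gaudi class names, each `GaudiLlamaDecoderLayer` is wrapped as its own FSDP unit. An illustrative sketch of how such a policy is typically built from `_no_split_modules` (not the committed code):

    import functools

    from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy


    def wrap_policy_from_no_split(model):
        # Collect the module classes named in _no_split_modules and wrap each one.
        no_split = getattr(model, "_no_split_modules", None) or []
        layer_classes = {
            module.__class__
            for module in model.modules()
            if module.__class__.__name__ in no_split
        }
        return functools.partial(
            transformer_auto_wrap_policy, transformer_layer_cls=layer_classes
        )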
@@ -271,6 +294,11 @@ def make_inputs_require_grad(module, input, output): # pylint: disable=unused-a
     model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
 
     accelerator = setup_accelerator(args, model, grad_accum)
+
+    if is_torch_hpu_available():
+        accelerator.state.fsdp_plugin.use_orig_params = True
+        accelerator.state.fsdp_plugin.sync_module_states = True
+
     if args.distributed_training_framework == DistributedBackend.FSDP.value:
         model = accelerator.prepare(model)
     optimizer = setup_optimizer(args, model)
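Here `use_orig_params` and `sync_module_states` are forced on after the accelerator is built: `use_orig_params` is commonly required when combining FSDP with `torch.compile`, and `sync_module_states` broadcasts rank-0 weights when sharding. If the plugin were constructed directly, the equivalent settings (illustrative, not what this commit does) would be:

    from accelerate.utils import FullyShardedDataParallelPlugin

    # Illustrative: the same flags set at plugin construction time instead of after the fact.
    fsdp_plugin = FullyShardedDataParallelPlugin(
        use_orig_params=True,
        sync_module_states=True,
    )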
@@ -413,10 +441,19 @@ def train(
             total_length = float(torch.tensor([batch.pop("total_length")]))
             if not args.use_dolomite:
                 for k in batch:
-                    batch[k] = batch[k].to(local_rank)
+                    batch[k] = batch[k].to('hpu' if is_torch_hpu_available() else local_rank)
+
+            hpu_args = {}
+            if is_torch_hpu_available():
+                hpu_args = {
+                    "use_flash_attention": True,
+                    "lazy_mode": False,
+                }
+
             output = model(
                 **batch,
                 use_cache=False,
+                **hpu_args,
             )
             loss = output.loss
             log_loss = loss.detach().item()
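`use_flash_attention` and `lazy_mode` are keyword arguments understood only by the Gaudi-patched forward that `adapt_transformers_to_gaudi()` installs; on CUDA the dict stays empty and the call is unchanged. A defensive variant (hypothetical, not in this commit) could filter the extra kwargs against the forward signature, e.g. `hpu_args = filter_forward_kwargs(model, hpu_args)`:

    import inspect


    def filter_forward_kwargs(model, extra_kwargs):
        # Keep only kwargs that the (possibly Gaudi-patched) forward() actually accepts.
        accepted = inspect.signature(model.forward).parameters
        return {k: v for k, v in extra_kwargs.items() if k in accepted}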
@@ -453,8 +490,14 @@ def train(
                 elapsed_time = time.time() - start
                 overall_throughput = args.samples_per_gpu * world_size / elapsed_time
                 current_lr = lr_scheduler.get_last_lr()[0]
-                cuda_mem_allocated = torch.cuda.memory_allocated() / (1024**3)
-                cuda_malloc_retries = torch.cuda.memory_stats()["num_alloc_retries"]
+
+                if is_torch_hpu_available():
+                    mem_allocated = torch.hpu.memory_allocated() / (1024**3)
+                    malloc_retries = 0
+                else:
+                    mem_allocated = torch.cuda.memory_allocated() / (1024**3)
+                    malloc_retries = torch.cuda.memory_stats()["num_alloc_retries"]
+
                 global_grad_norm = (
                     model.get_global_grad_norm()
                     if hasattr(model, "get_global_grad_norm")
@@ -476,8 +519,8 @@ def train(
476519 "rank" : torch .distributed .get_rank (),
477520 "overall_throughput" : overall_throughput ,
478521 "lr" : current_lr ,
479- "cuda_mem_allocated" : cuda_mem_allocated ,
480- "cuda_malloc_retries" : cuda_malloc_retries ,
522+ ( "hpu" if is_torch_hpu_available () else "cuda" ) + "_mem_allocated" : mem_allocated ,
523+ ( "hpu" if is_torch_hpu_available () else "cuda" ) + "_malloc_retries" : malloc_retries ,
481524 "num_loss_counted_tokens" : int (num_loss_counted_tokens ),
482525 "num_tokens_rank0" : int (total_length ),
483526 "batch_size" : int (micro_batch_size ),
@@ -518,7 +561,10 @@ def train(
             global_step += 1
             if local_rank == 0:
                 inner_pb.update(1)
-            torch.cuda.empty_cache()
+
+            if not is_torch_hpu_available():
+                torch.cuda.empty_cache()
+
         if args.checkpoint_at_epoch:
             base_logger.debug(f"Saving checkpoint at epoch {epoch}")
             save_checkpoint(
@@ -575,13 +621,22 @@ def main(args):
     # gets converted to a timedelta of 1:40:00 if the default is kept
     nccl_timeout = int(os.getenv("INSTRUCTLAB_NCCL_TIMEOUT_MS", "6000000"))
     #### distributed init #####
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+    if is_torch_hpu_available():
+        torch.hpu.set_device(int(os.environ["LOCAL_RANK"]))
+    else:
+        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+
     args.local_rank = int(os.environ["LOCAL_RANK"])
     torch.distributed.init_process_group(
-        "nccl", timeout=datetime.timedelta(milliseconds=nccl_timeout)
+        "hccl" if is_torch_hpu_available() else "nccl", timeout=datetime.timedelta(milliseconds=nccl_timeout)
     )
     args.global_rank = torch.distributed.get_rank()
-    tensor = torch.ByteTensor([False]).cuda()
+
+    if is_torch_hpu_available():
+        tensor = torch.ByteTensor([False]).to('hpu')
+    else:
+        tensor = torch.ByteTensor([False]).cuda()
+
     torch.distributed.all_reduce(tensor)
     torch.distributed.barrier()