
Commit f529f21

mamtsing authored and quic-mamta committed
fix review comments
Signed-off-by: Mamta Singh <[email protected]>
1 parent 28f5f6d commit f529f21

6 files changed (+45, -20 lines)


QEfficient/cloud/finetune.py

Lines changed: 4 additions & 3 deletions
@@ -26,6 +26,7 @@
     update_config,
 )
 from QEfficient.finetune.utils.dataset_utils import get_dataloader
+from QEfficient.finetune.utils.helper import PEFT_METHOD
 from QEfficient.finetune.utils.parser import get_finetune_parser
 from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
 from QEfficient.utils._utils import login_and_download_hf_lm
@@ -64,8 +65,8 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
     assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"

-    dist_backend = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
-    dist.init_process_group(backend=dist_backend[torch_device.type])
+    dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
+    dist.init_process_group(backend=dist_backend_map[torch_device.type])
     # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
     getattr(torch, torch_device.type).set_device(dist.get_rank())

@@ -181,7 +182,7 @@ def apply_peft(
         then PeftModel object is returned else original model object
         (AutoModel) is returned.
     """
-    if train_config.peft_method != "lora":
+    if train_config.peft_method not in PEFT_METHOD:
         return model

     # Load the pre-trained peft model checkpoint and setup its configuration
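For context, the renamed dist_backend_map is consumed exactly as before: the torch device type selects the process-group backend. A minimal sketch of the pattern (the init_ddp wrapper is illustrative, not part of the codebase; "qaic" only resolves as a device type once torch_qaic is imported):

import torch
import torch.distributed as dist

# Map each supported device type to its collective-communication backend;
# "qccl" is the QAIC backend, while CPU and CUDA fall back to gloo here.
dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}

def init_ddp(device: str) -> None:
    device_type = torch.device(device).type
    dist.init_process_group(backend=dist_backend_map[device_type])
    # Bind this rank to its own device index, e.g. "qaic:0", "qaic:1", ...
    getattr(torch, device_type).set_device(dist.get_rank())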

QEfficient/finetune/configs/training.py

Lines changed: 2 additions & 2 deletions
@@ -66,11 +66,11 @@ class TrainConfig:
     weight_decay: float = 0.0
     gamma: float = 0.85  # multiplicatively decay the learning rate by gamma after each epoch
     seed: int = 42
-    dataset = "samsum_dataset"
+    dataset = "alpaca_dataset"
     task_type = "generation"  # "generation" / "seq_classification"
     peft_method: str = "lora"
     from_peft_checkpoint: str = ""  # if not empty and peft_method='lora', will load the peft checkpoint and resume the fine-tuning on that checkpoint
-    output_dir: str = "meta-llama-samsum"
+    output_dir: str = "training_results"
     save_model: bool = True
     save_metrics: bool = True  # saves training metrics to a json file for later plotting
     intermediate_step_save: int = 1000
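For reference, the touched defaults now read roughly as below (a trimmed sketch, other fields elided; the type annotations on dataset and task_type are added here for clarity and are not part of the diff):

from dataclasses import dataclass

@dataclass
class TrainConfig:
    # ... unrelated fields elided ...
    seed: int = 42
    dataset: str = "alpaca_dataset"       # previously "samsum_dataset"
    task_type: str = "generation"         # "generation" / "seq_classification"
    peft_method: str = "lora"
    from_peft_checkpoint: str = ""        # resume LoRA fine-tuning from this checkpoint if set
    output_dir: str = "training_results"  # previously "meta-llama-samsum"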

QEfficient/finetune/eval.py

Lines changed: 6 additions & 2 deletions
@@ -44,8 +44,12 @@ def main(**kwargs):
     random.seed(train_config.seed)
     np.random.seed(train_config.seed)

-    # Load the pre-trained model and setup its configuration
-    save_dir = os.path.join(train_config.output_dir, "complete_epoch_1")
+    # Load the pre-trained model from latest checkpoint
+    trained_weights_path = os.path.join(train_config.output_dir, "trained_weights")
+    epoch_max_index = max([int(name.split("_")[-1]) for name in os.listdir(trained_weights_path)])
+    epochs_path = os.path.join(trained_weights_path, "epoch_" + str(epoch_max_index))
+    step_max_index = max([int(name.split("_")[-1]) for name in os.listdir(epochs_path)])
+    save_dir = os.path.join(epochs_path, "step_" + str(step_max_index))

     # Load PEFT model on CPU
     model_peft = AutoPeftModelForCausalLM.from_pretrained(save_dir)
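The lookup above assumes checkpoints are written as <output_dir>/trained_weights/epoch_<N>/step_<M>. The same "pick the newest checkpoint" logic, factored into a standalone helper (the function name is illustrative, not part of the codebase):

import os

def latest_checkpoint(output_dir: str) -> str:
    """Return the newest step directory under the newest epoch directory."""
    trained_weights_path = os.path.join(output_dir, "trained_weights")
    # Directory names look like "epoch_3" or "step_1200"; take the largest numeric suffix.
    latest_epoch = max(int(name.split("_")[-1]) for name in os.listdir(trained_weights_path))
    epochs_path = os.path.join(trained_weights_path, f"epoch_{latest_epoch}")
    latest_step = max(int(name.split("_")[-1]) for name in os.listdir(epochs_path))
    return os.path.join(epochs_path, f"step_{latest_step}")

# e.g. save_dir = latest_checkpoint(train_config.output_dir)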

QEfficient/finetune/utils/helper.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+TASK_TYPE = ["generation", "seq_classification"]
+PEFT_METHOD = ["lora"]
+DEVICE = ["qaic", "cpu", "cuda"]
+BATCHING_STRATEGY = ["padding", "packing"]
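These lists act as the single source of truth for allowed values across the parser and the training code. A minimal usage sketch (the validate helper is illustrative only):

from QEfficient.finetune.utils.helper import DEVICE, PEFT_METHOD, TASK_TYPE

def validate(device: str, peft_method: str, task_type: str) -> None:
    # Membership checks mirror how the parser choices and train_utils consume these lists.
    if device not in DEVICE:
        raise ValueError(f"Unsupported device {device!r}; expected one of {DEVICE}")
    if peft_method not in PEFT_METHOD:
        raise ValueError(f"Unsupported PEFT method {peft_method!r}; expected one of {PEFT_METHOD}")
    if task_type not in TASK_TYPE:
        raise ValueError(f"Unsupported task type {task_type!r}; expected one of {TASK_TYPE}")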

QEfficient/finetune/utils/parser.py

Lines changed: 6 additions & 5 deletions
@@ -8,6 +8,7 @@
 import argparse

 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
+from QEfficient.finetune.utils.helper import BATCHING_STRATEGY, DEVICE, PEFT_METHOD, TASK_TYPE


 def get_finetune_parser():
@@ -83,7 +84,7 @@ def get_finetune_parser():
         default=0,
         help="Maximum evaluation steps, unlimited if 0",
     )
-    parser.add_argument("--device", required=False, type=str, default="qaic", help="Device to train on")
+    parser.add_argument("--device", required=False, type=str, default="qaic", choices=DEVICE, help="Device to train on")
     parser.add_argument(
         "--num_workers_dataloader",
         "--num-workers-dataloader",
@@ -118,7 +119,7 @@ def get_finetune_parser():
         required=False,
         type=str,
         default="generation",
-        choices=["generation", "seq_classification"],
+        choices=TASK_TYPE,
         help="Task used for finetuning. Use 'generation' for decoder based models and 'seq_classification' for encoder based models.",
     )
     parser.add_argument(
@@ -127,7 +128,7 @@ def get_finetune_parser():
         required=False,
         type=str,
         default="lora",
-        choices=["lora"],
+        choices=PEFT_METHOD,
         help="Parameter efficient finetuning technique to be used. Currently only 'lora' is supported.",
     )
     parser.add_argument(
@@ -143,7 +144,7 @@ def get_finetune_parser():
         "--output-dir",
         required=False,
         type=str,
-        default="meta-llama-samsum",
+        default="training_results",
         help="Directory to save outputs of training",
     )
     parser.add_argument(
@@ -172,7 +173,7 @@ def get_finetune_parser():
         required=False,
         type=str,
         default="padding",
-        choices=["padding", "packing"],
+        choices=BATCHING_STRATEGY,
         help="Strategy for making batches of data points. Packing groups data points into batches by minimizing unnecessary empty spaces. Padding adds extra values (often zeros) to batch sequences so they align in size. Currently only padding is supported which is by default.",
     )
     parser.add_argument(
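With choices wired to the shared constants, invalid values are rejected at parse time rather than deep inside training. A quick sketch, using only the flags visible in this diff:

from QEfficient.finetune.utils.parser import get_finetune_parser

parser = get_finetune_parser()
# Values outside DEVICE / TASK_TYPE / PEFT_METHOD / BATCHING_STRATEGY now fail here.
args = parser.parse_args(["--device", "qaic", "--output-dir", "training_results"])
# parser.parse_args(["--device", "tpu"])  # argparse error: invalid choice: 'tpu'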

QEfficient/finetune/utils/train_utils.py

Lines changed: 16 additions & 8 deletions
@@ -18,6 +18,7 @@
 from tqdm import tqdm

 from QEfficient.finetune.configs.training import TrainConfig
+from QEfficient.finetune.utils.helper import PEFT_METHOD

 try:
     import torch_qaic  # noqa: F401
@@ -80,7 +81,6 @@ def train(
     best_val_loss = float("inf")
     total_train_steps = 0
     max_steps_reached = False  # Flag to indicate max training steps reached
-    device_type = device.split(":")[0]

     tensorboard_updates = None
     if train_config.enable_ddp:
@@ -92,7 +92,7 @@
     if device.startswith("qaic"):
        scaler = QAicGradScaler()
     else:
-        scaler = GradScaler(device_type)
+        scaler = GradScaler(torch.device(device).type)

     loss_0_counter = torch.tensor([0]).to(device)

@@ -121,7 +121,7 @@
            )
            break

        if train_config.peft_method == "lora" and train_config.from_peft_checkpoint:
+        if train_config.peft_method in PEFT_METHOD and train_config.from_peft_checkpoint:
            intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
            if epoch < intermediate_epoch:
                print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
@@ -151,7 +151,7 @@

        for step, batch in enumerate(train_dataloader):
            # resume training from a particular checkpoint, assuming the dataset is not shuffled
-            if train_config.peft_method == "lora" and train_config.from_peft_checkpoint:
+            if train_config.peft_method in PEFT_METHOD and train_config.from_peft_checkpoint:
                intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
                # to bring the count of train_step in sync with where it left off
@@ -171,7 +171,7 @@
                break
            batch = {k: v.to(device) for k, v in batch.items()}  # move the batch elements to qaic device

-            with torch.autocast(device_type=device, dtype=torch.float16):
+            with torch.autocast(device_type=torch.device(device).type, dtype=torch.float16):
                # an additional condition can be put here to avoid opByOpVerifier getting triggered for each step
                if train_config.opByOpVerifier:
                    with qaic_debug.OpByOpVerifierMode(
@@ -282,12 +282,20 @@
        epoch_times.append(epoch_end_time)

        if loss_0_counter.item() == train_config.convergence_counter:
-            if train_config.peft_method == "lora" and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
+            if (
+                train_config.peft_method in PEFT_METHOD
+                and train_config.from_peft_checkpoint
+                and epoch == intermediate_epoch
+            ):
                train_epoch_loss = total_loss / (step - intermediate_step)
            else:
                train_epoch_loss = total_loss / step
        else:
-            if train_config.peft_method == "lora" and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
+            if (
+                train_config.peft_method in PEFT_METHOD
+                and train_config.from_peft_checkpoint
+                and epoch == intermediate_epoch
+            ):
                train_epoch_loss = total_loss / (len(train_dataloader) - intermediate_step)
            else:
                train_epoch_loss = total_loss / len(train_dataloader)
@@ -417,7 +425,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
    # Ensure no gradients are computed for this scope to save memory
    with torch.no_grad():
        # Forward pass and compute loss
-        with torch.autocast(device_type=device, dtype=torch.float16):
+        with torch.autocast(device_type=torch.device(device).type, dtype=torch.float16):
            outputs = model(**batch)
            loss = outputs.loss
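The repeated torch.device(device).type expression simply normalizes a possibly indexed device string ("qaic:0", "cuda:1", ...) to its bare type, which is what autocast and GradScaler expect. A small sketch of the pattern (values are illustrative; "qaic" only resolves as a device type once torch_qaic is imported):

import torch

device = "cuda:1"                          # in this repo typically "qaic:<rank>"
device_type = torch.device(device).type    # -> "cuda" (or "qaic", "cpu")

# Pass the bare device type, not the indexed device string, to autocast.
with torch.autocast(device_type=device_type, dtype=torch.float16):
    ...  # forward pass and loss computation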
