from tqdm import tqdm

from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG
+ from QEfficient.utils.logging_utils import logger

try:
    import torch_qaic  # noqa: F401
    import torch_qaic.utils as qaic_utils  # noqa: F401
    from torch.qaic.amp import GradScaler as QAicGradScaler
except ImportError as e:
-     print(f"Warning: {e}. Moving ahead without these qaic modules.")
+     logger.warning(f"{e}. Moving ahead without these qaic modules.")

from torch.amp import GradScaler
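For reference, the `logger` imported above comes from `QEfficient.utils.logging_utils`, which is not part of this diff. A minimal sketch of what that module is assumed to provide (a module-level `logging.Logger` with a console handler); the actual module may configure names, handlers, and format differently:

import logging

# Hypothetical stand-in for QEfficient/utils/logging_utils.py; the logger name and format are assumptions.
logger = logging.getLogger("QEfficient")
if not logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(_handler)
logger.setLevel(logging.INFO)

With a setup like this, the `logger.warning(...)` fallback above carries a level and timestamp that the old `print` call did not.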
@@ -116,26 +117,26 @@ def train(
    for epoch in range(train_config.num_epochs):
        if loss_0_counter.item() == train_config.convergence_counter:
            if train_config.enable_ddp:
-                 print(
+                 logger.info(
                    f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
                )
                break
            else:
-                 print(
+                 logger.info(
                    f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
                )
                break

        if train_config.use_peft and train_config.from_peft_checkpoint:
            intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
            if epoch < intermediate_epoch:
-                 print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
+                 logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
                # to bring the count of train_step in sync with where it left off
                total_train_steps += len(train_dataloader)
                continue

-         print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
-         print(f"train_config.max_train_step: {train_config.max_train_step}")
+         logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
+         logger.info(f"train_config.max_train_step: {train_config.max_train_step}")
        # stop when the maximum number of training steps is reached
        if max_steps_reached:
            break
@@ -162,7 +163,7 @@ def train(
            # to bring the count of train_step in sync with where it left off
            if epoch == intermediate_epoch and step == 0:
                total_train_steps += intermediate_step
-                 print(
+                 logger.info(
                    f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them."
                )
            if epoch == intermediate_epoch and step < intermediate_step:
@@ -197,7 +198,7 @@ def train(
                    labels = batch["labels"][:, 0]
                    preds = torch.nn.functional.softmax(logits, dim=-1)
                    acc_helper.forward(preds, labels)
-             print("Mismatches detected:", verifier.get_perop_mismatch_count())
+             logger.info(f"Mismatches detected: {verifier.get_perop_mismatch_count()}")
        else:
            model_outputs = model(**batch)
            loss = model_outputs.loss  # Forward call
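One subtlety in the hunk above: unlike `print`, `logging` methods treat extra positional arguments as lazy %-style format arguments. Passing a value without a matching placeholder (e.g. `logger.info("Mismatches detected:", count)`) drops the value and produces a logging formatting error instead of the message. A short illustration of the two safe forms, assuming a standard `logging.Logger`:

# Either interpolate eagerly with an f-string...
logger.info(f"Mismatches detected: {verifier.get_perop_mismatch_count()}")
# ...or let logging interpolate lazily via a %s placeholder.
logger.info("Mismatches detected: %s", verifier.get_perop_mismatch_count())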
@@ -279,13 +280,13 @@ def train(
                )
            if train_config.enable_ddp:
                if loss_0_counter.item() == train_config.convergence_counter:
-                     print(
+                     logger.info(
                        f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}."
                    )
                    break
            else:
                if loss_0_counter.item() == train_config.convergence_counter:
-                     print(
+                     logger.info(
                        f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning."
                    )
                    break
@@ -347,15 +348,15 @@ def train(
        if train_config.run_validation:
            if eval_epoch_loss < best_val_loss:
                best_val_loss = eval_epoch_loss
-                 print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
+                 logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
            val_loss.append(float(eval_epoch_loss))
            val_metric.append(float(eval_metric))
        if train_config.task_type == "seq_classification":
-             print(
+             logger.info(
                f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
            )
        else:
-             print(
+             logger.info(
                f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
            )
@@ -459,7 +460,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
    eval_metric = torch.exp(eval_epoch_loss)

    # Print evaluation metrics
-     print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")
+     logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")

    return eval_metric, eval_epoch_loss, val_step_loss, val_step_metric
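For context, the `torch.exp(eval_epoch_loss)` context line above is the usual perplexity computation: when the epoch loss is a mean token-level cross-entropy L, the reported metric is exp(L). A tiny self-contained check, with the loss value assumed for illustration:

import torch

# Assumed example value: mean cross-entropy over the eval set.
eval_epoch_loss = torch.tensor(2.0)
perplexity = torch.exp(eval_epoch_loss)  # exp(2.0) ~= 7.389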
@@ -489,9 +490,9 @@ def print_model_size(model, config) -> None:
        model_name (str): Name of the model.
    """

-     print(f"--> Model {config.model_name}")
+     logger.info(f"--> Model {config.model_name}")
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
-     print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n")
+     logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n")


def save_to_json(