diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index f312d00cb..474f7864c 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -31,11 +31,12 @@
 )
 from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
 from QEfficient.utils._utils import login_and_download_hf_lm
+from QEfficient.utils.logging_utils import logger
 
 try:
     import torch_qaic  # noqa: F401
 except ImportError as e:
-    print(f"Warning: {e}. Moving ahead without these qaic modules.")
+    logger.warning(f"{e}. Moving ahead without these qaic modules.")
 
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 
@@ -114,7 +115,7 @@ def main(**kwargs):
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
+        logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
         model.resize_token_embeddings(len(tokenizer))
 
     print_model_size(model, train_config)
@@ -163,10 +164,10 @@ def main(**kwargs):
     #     )
     ##
     train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
-    print("length of dataset_train", len(dataset_train))
+    logger.info(f"length of dataset_train: {len(dataset_train)}")
     custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
     if custom_data_collator:
-        print("custom_data_collator is used")
+        logger.info("custom_data_collator is used")
         train_dl_kwargs["collate_fn"] = custom_data_collator
 
     # Create DataLoaders for the training and validation dataset
@@ -176,7 +177,7 @@ def main(**kwargs):
         pin_memory=True,
         **train_dl_kwargs,
     )
-    print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
+    logger.info(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
 
     eval_dataloader = None
     if train_config.run_validation:
@@ -200,7 +201,7 @@ def main(**kwargs):
                 f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
             )
         else:
-            print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+            logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
 
         longest_seq_length, _ = get_longest_seq_length(
             torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
@@ -208,7 +209,7 @@ def main(**kwargs):
     else:
         longest_seq_length, _ = get_longest_seq_length(train_dataloader.dataset)
 
-    print(
+    logger.info(
         f"The longest sequence length in the train data is {longest_seq_length}, "
         f"passed context length is {train_config.context_length} and overall model's context length is "
         f"{model.config.max_position_embeddings}"
diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py
index 4bee06c58..7164e13a1 100644
--- a/QEfficient/finetune/dataset/custom_dataset.py
+++ b/QEfficient/finetune/dataset/custom_dataset.py
@@ -8,6 +8,8 @@
 import importlib
 from pathlib import Path
 
+from QEfficient.utils.logging_utils import logger
+
 
 def load_module_from_py_file(py_file: str) -> object:
     """
@@ -40,7 +42,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split)
     except AttributeError as e:
-        print(
+        logger.error(
             f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
         )
         raise e
@@ -63,6 +65,6 @@ def get_data_collator(dataset_processer, dataset_config):
     try:
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
-        print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
-        print("Using the default data_collator instead.")
+        logger.info(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
+        logger.info("Using the default data_collator instead.")
         return None
diff --git a/QEfficient/finetune/dataset/grammar_dataset.py b/QEfficient/finetune/dataset/grammar_dataset.py
index 8f04b7544..6ebeeb2d1 100644
--- a/QEfficient/finetune/dataset/grammar_dataset.py
+++ b/QEfficient/finetune/dataset/grammar_dataset.py
@@ -10,6 +10,8 @@
 from datasets import load_dataset
 from torch.utils.data import Dataset
 
+from QEfficient.utils.logging_utils import logger
+
 
 class grammar(Dataset):
     def __init__(self, tokenizer, csv_name=None, context_length=None):
@@ -20,7 +22,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
                 delimiter=",",
             )
         except Exception as e:
-            print(
+            logger.error(
                 "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
             )
             raise e
 
@@ -36,7 +38,7 @@ def convert_to_features(self, example_batch):
 
         # Create prompt and tokenize contexts and questions
         if self.print_text:
-            print("Input Text: ", self.clean_text(example_batch["text"]))
+            logger.info(f"Input Text: {self.clean_text(example_batch['text'])}")
 
         input_ = example_batch["input"]
         target_ = example_batch["target"]
@@ -71,9 +73,9 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
     """cover function for handling loading the working dataset"""
     """dataset loading"""
     currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
-    print(f"Loading dataset {currPath}")
+    logger.info(f"Loading dataset {currPath}")
     csv_name = str(currPath)
-    print(csv_name)
+    logger.info(csv_name)
     dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)
 
     return dataset
diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py
index 918230554..fe9d9ae6a 100644
--- a/QEfficient/finetune/eval.py
+++ b/QEfficient/finetune/eval.py
@@ -25,12 +25,14 @@
 )
 from utils.train_utils import evaluation, print_model_size
 
+from QEfficient.utils.logging_utils import logger
+
 try:
     import torch_qaic  # noqa: F401
 
     device = "qaic:0"
 except ImportError as e:
-    print(f"Warning: {e}. Moving ahead without these qaic modules.")
+    logger.warning(f"{e}. Moving ahead without these qaic modules.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Suppress all warnings
@@ -76,7 +78,7 @@ def main(**kwargs):
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
+        logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
         model.resize_token_embeddings(len(tokenizer))
 
     print_model_size(model, train_config)
@@ -107,13 +109,13 @@ def main(**kwargs):
         pin_memory=True,
         **val_dl_kwargs,
     )
-    print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+    logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
     if len(eval_dataloader) == 0:
         raise ValueError(
             f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
         )
     else:
-        print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+        logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
 
     model.to(device)
     _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device)
diff --git a/QEfficient/finetune/utils/plot_metrics.py b/QEfficient/finetune/utils/plot_metrics.py
index 5fc54f279..e2dd37f49 100644
--- a/QEfficient/finetune/utils/plot_metrics.py
+++ b/QEfficient/finetune/utils/plot_metrics.py
@@ -11,6 +11,8 @@
 
 import matplotlib.pyplot as plt
 
+from QEfficient.utils.logging_utils import logger
+
 
 def plot_metric(data, metric_name, x_label, y_label, title, colors):
     plt.figure(figsize=(7, 6))
@@ -67,14 +69,14 @@ def plot_metrics_by_step(data, metric_name, x_label, y_label, colors):
 
 def plot_metrics(file_path):
     if not os.path.exists(file_path):
-        print(f"File {file_path} does not exist.")
+        logger.error(f"File {file_path} does not exist.")
         return
 
     with open(file_path, "r") as f:
         try:
             data = json.load(f)
         except json.JSONDecodeError:
-            print("Invalid JSON file.")
+            logger.error("Invalid JSON file.")
             return
 
     directory = os.path.dirname(file_path)
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 2bc701008..68b884cab 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -19,6 +19,7 @@
 from tqdm import tqdm
 
 from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG
+from QEfficient.utils.logging_utils import logger
 
 try:
     import torch_qaic  # noqa: F401
@@ -27,7 +28,7 @@
     import torch_qaic.utils as qaic_utils  # noqa: F401
     from torch.qaic.amp import GradScaler as QAicGradScaler
 except ImportError as e:
-    print(f"Warning: {e}. Moving ahead without these qaic modules.")
+    logger.warning(f"{e}. Moving ahead without these qaic modules.")
 
 from torch.amp import GradScaler
 
@@ -116,12 +117,12 @@ def train(
     for epoch in range(train_config.num_epochs):
         if loss_0_counter.item() == train_config.convergence_counter:
             if train_config.enable_ddp:
-                print(
+                logger.info(
                     f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
                 )
                 break
             else:
-                print(
+                logger.info(
                     f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
                 )
                 break
 
@@ -129,13 +130,13 @@ def train(
         if train_config.use_peft and train_config.from_peft_checkpoint:
             intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
             if epoch < intermediate_epoch:
-                print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
+                logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
                 # to bring the count of train_step in sync with where it left off
                 total_train_steps += len(train_dataloader)
                 continue
 
-        print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
-        print(f"train_config.max_train_step: {train_config.max_train_step}")
+        logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
+        logger.info(f"train_config.max_train_step: {train_config.max_train_step}")
         # stop when the maximum number of training steps is reached
         if max_steps_reached:
             break
@@ -162,7 +163,7 @@ def train(
                     # to bring the count of train_step in sync with where it left off
                     if epoch == intermediate_epoch and step == 0:
                         total_train_steps += intermediate_step
-                        print(
+                        logger.info(
                             f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them."
                         )
                     if epoch == intermediate_epoch and step < intermediate_step:
@@ -197,7 +198,7 @@ def train(
                         labels = batch["labels"][:, 0]
                         preds = torch.nn.functional.softmax(logits, dim=-1)
                         acc_helper.forward(preds, labels)
-                    print("Mismatches detected:", verifier.get_perop_mismatch_count())
+                    logger.info(f"Mismatches detected: {verifier.get_perop_mismatch_count()}")
                 else:
                     model_outputs = model(**batch)
                     loss = model_outputs.loss  # Forward call
@@ -279,13 +280,13 @@ def train(
                 )
             if train_config.enable_ddp:
                 if loss_0_counter.item() == train_config.convergence_counter:
-                    print(
+                    logger.info(
                         f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}."
                     )
                     break
             else:
                 if loss_0_counter.item() == train_config.convergence_counter:
-                    print(
+                    logger.info(
                         f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning."
                     )
                     break
@@ -347,15 +348,15 @@ def train(
         if train_config.run_validation:
             if eval_epoch_loss < best_val_loss:
                 best_val_loss = eval_epoch_loss
-                print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
+                logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
             val_loss.append(float(eval_epoch_loss))
             val_metric.append(float(eval_metric))
         if train_config.task_type == "seq_classification":
-            print(
+            logger.info(
                 f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
             )
         else:
-            print(
+            logger.info(
                 f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
             )
 
@@ -459,7 +460,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
     eval_metric = torch.exp(eval_epoch_loss)
 
     # Print evaluation metrics
-    print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")
+    logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")
 
     return eval_metric, eval_epoch_loss, val_step_loss, val_step_metric
 
@@ -489,9 +490,9 @@ def print_model_size(model, config) -> None:
         model_name (str): Name of the model.
""" - print(f"--> Model {config.model_name}") + logger.info(f"--> Model {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") + logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") def save_to_json(