
[QEff Finetune]: Use logger in place of print statements in finetuning scripts #371


Draft · wants to merge 1 commit into base: main
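For context, the shared logger used throughout this diff is imported from QEfficient.utils.logging_utils. That module is not part of the changes shown here, so the following is only a minimal sketch of what such a module typically provides; the logger name "QEfficient" and the handler/format choices are assumptions, not the repository's actual implementation.

import logging

def _build_logger(name: str = "QEfficient") -> logging.Logger:
    # Hypothetical sketch; only the exported name `logger` is taken from the diff below.
    log = logging.getLogger(name)
    if not log.handlers:  # avoid adding duplicate handlers if the module is re-imported
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        )
        log.addHandler(handler)
        log.setLevel(logging.INFO)
    return log

logger = _build_logger()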
15 changes: 8 additions & 7 deletions QEfficient/cloud/finetune.py
@@ -31,11 +31,12 @@
)
from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
from QEfficient.utils._utils import login_and_download_hf_lm
from QEfficient.utils.logging_utils import logger

try:
import torch_qaic # noqa: F401
except ImportError as e:
print(f"Warning: {e}. Moving ahead without these qaic modules.")
logger.warning(f"{e}. Moving ahead without these qaic modules.")


from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
@@ -114,7 +115,7 @@ def main(**kwargs):
# If there is a mismatch between tokenizer vocab size and embedding matrix,
# throw a warning and then expand the embedding matrix
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
model.resize_token_embeddings(len(tokenizer))

print_model_size(model, train_config)
@@ -163,10 +164,10 @@ def main(**kwargs):
# )
##
train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
print("length of dataset_train", len(dataset_train))
logger.info("length of dataset_train", len(dataset_train))
custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
if custom_data_collator:
print("custom_data_collator is used")
logger.info("custom_data_collator is used")
train_dl_kwargs["collate_fn"] = custom_data_collator

# Create DataLoaders for the training and validation dataset
@@ -176,7 +177,7 @@
pin_memory=True,
**train_dl_kwargs,
)
print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
logger.info(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")

eval_dataloader = None
if train_config.run_validation:
@@ -200,15 +201,15 @@ def main(**kwargs):
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
)
else:
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")

longest_seq_length, _ = get_longest_seq_length(
torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
)
else:
longest_seq_length, _ = get_longest_seq_length(train_dataloader.dataset)

print(
logger.info(
f"The longest sequence length in the train data is {longest_seq_length}, "
f"passed context length is {train_config.context_length} and overall model's context length is "
f"{model.config.max_position_embeddings}"
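A note on argument handling while converting these calls: unlike print, the logging methods do not join extra positional arguments into the message; anything after the format string is treated as a %-style formatting argument, and a message without placeholders raises a logging formatting error instead of printing the value. The converted calls in this diff therefore interpolate values directly. A minimal sketch of the two equivalent, correct forms (the logger name is assumed, as in the sketch above):

import logging

logger = logging.getLogger("QEfficient")  # assumed name
dataset_len = 42  # placeholder value for illustration

# Lazy %-style formatting: the string is only built if the record is actually emitted.
logger.info("length of dataset_train: %d", dataset_len)

# f-string interpolation, matching the style used elsewhere in this diff.
logger.info(f"length of dataset_train: {dataset_len}")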
8 changes: 5 additions & 3 deletions QEfficient/finetune/dataset/custom_dataset.py
@@ -8,6 +8,8 @@
import importlib
from pathlib import Path

from QEfficient.utils.logging_utils import logger


def load_module_from_py_file(py_file: str) -> object:
"""
@@ -40,7 +42,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
try:
return getattr(module, func_name)(dataset_config, tokenizer, split)
except AttributeError as e:
print(
logger.error(
f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
)
raise e
@@ -63,6 +65,6 @@ def get_data_collator(dataset_processer, dataset_config):
try:
return getattr(module, func_name)(dataset_processer)
except AttributeError:
print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
print("Using the default data_collator instead.")
logger.info(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
logger.info("Using the default data_collator instead.")
return None
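A side note on the error paths in custom_dataset.py: when a log call sits inside an except block and the exception is re-raised, the standard library also offers logger.exception(...) (equivalent to logger.error(..., exc_info=True)), which attaches the traceback to the record. This diff does not use it; the snippet below is only a sketch of that pattern with a hypothetical helper name.

import logging

logger = logging.getLogger("QEfficient")  # assumed name

def resolve_dataset_callable(module, func_name):
    # Hypothetical helper mirroring the attribute lookup pattern in custom_dataset.py.
    try:
        return getattr(module, func_name)
    except AttributeError:
        # Logs at ERROR level and appends the current traceback to the record.
        logger.exception(f"Method {func_name} not found in the dataset module.")
        raise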
10 changes: 6 additions & 4 deletions QEfficient/finetune/dataset/grammar_dataset.py
@@ -10,6 +10,8 @@
from datasets import load_dataset
from torch.utils.data import Dataset

from QEfficient.utils.logging_utils import logger


class grammar(Dataset):
def __init__(self, tokenizer, csv_name=None, context_length=None):
@@ -20,7 +22,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
delimiter=",",
)
except Exception as e:
print(
logger.error(
"Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
)
raise e
@@ -36,7 +38,7 @@ def convert_to_features(self, example_batch):
# Create prompt and tokenize contexts and questions

if self.print_text:
print("Input Text: ", self.clean_text(example_batch["text"]))
logger.info("Input Text: ", self.clean_text(example_batch["text"]))

input_ = example_batch["input"]
target_ = example_batch["target"]
@@ -71,9 +73,9 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
"""cover function for handling loading the working dataset"""
"""dataset loading"""
currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
print(f"Loading dataset {currPath}")
logger.info(f"Loading dataset {currPath}")
csv_name = str(currPath)
print(csv_name)
logger.info(csv_name)
dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)

return dataset
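Related design note for convert_to_features: because logging already filters by level, the self.print_text flag could alternatively be expressed as a DEBUG-level message that only appears when the logger is lowered to logging.DEBUG. This is not what the diff does; the snippet is only a sketch of that alternative with a hypothetical function name.

import logging

logger = logging.getLogger("QEfficient")  # assumed name

def log_input_text(cleaned_text: str) -> None:
    # Hypothetical alternative to gating on self.print_text: DEBUG records are
    # suppressed unless the logger level is set to logging.DEBUG.
    logger.debug(f"Input Text: {cleaned_text}")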
10 changes: 6 additions & 4 deletions QEfficient/finetune/eval.py
@@ -25,12 +25,14 @@
)
from utils.train_utils import evaluation, print_model_size

from QEfficient.utils.logging_utils import logger

try:
import torch_qaic # noqa: F401

device = "qaic:0"
except ImportError as e:
print(f"Warning: {e}. Moving ahead without these qaic modules.")
logger.warning(f"{e}. Moving ahead without these qaic modules.")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Suppress all warnings
@@ -76,7 +78,7 @@ def main(**kwargs):
# If there is a mismatch between tokenizer vocab size and embedding matrix,
# throw a warning and then expand the embedding matrix
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
model.resize_token_embeddings(len(tokenizer))

print_model_size(model, train_config)
@@ -107,13 +109,13 @@ def main(**kwargs):
pin_memory=True,
**val_dl_kwargs,
)
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
if len(eval_dataloader) == 0:
raise ValueError(
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
)
else:
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")

model.to(device)
_ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device)
6 changes: 4 additions & 2 deletions QEfficient/finetune/utils/plot_metrics.py
@@ -11,6 +11,8 @@

import matplotlib.pyplot as plt

from QEfficient.utils.logging_utils import logger


def plot_metric(data, metric_name, x_label, y_label, title, colors):
plt.figure(figsize=(7, 6))
@@ -67,14 +69,14 @@ def plot_metrics_by_step(data, metric_name, x_label, y_label, colors):

def plot_metrics(file_path):
if not os.path.exists(file_path):
print(f"File {file_path} does not exist.")
logger.error(f"File {file_path} does not exist.")
return

with open(file_path, "r") as f:
try:
data = json.load(f)
except json.JSONDecodeError:
print("Invalid JSON file.")
logger.error("Invalid JSON file.")
return

directory = os.path.dirname(file_path)
33 changes: 17 additions & 16 deletions QEfficient/finetune/utils/train_utils.py
@@ -19,6 +19,7 @@
from tqdm import tqdm

from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG
from QEfficient.utils.logging_utils import logger

try:
import torch_qaic # noqa: F401
@@ -27,7 +28,7 @@
import torch_qaic.utils as qaic_utils # noqa: F401
from torch.qaic.amp import GradScaler as QAicGradScaler
except ImportError as e:
print(f"Warning: {e}. Moving ahead without these qaic modules.")
logger.warning(f"{e}. Moving ahead without these qaic modules.")

from torch.amp import GradScaler

@@ -116,26 +117,26 @@ def train(
for epoch in range(train_config.num_epochs):
if loss_0_counter.item() == train_config.convergence_counter:
if train_config.enable_ddp:
print(
logger.info(
f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
)
break
else:
print(
logger.info(
f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
)
break

if train_config.use_peft and train_config.from_peft_checkpoint:
intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
if epoch < intermediate_epoch:
print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
# to bring the count of train_step in sync with where it left off
total_train_steps += len(train_dataloader)
continue

print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
print(f"train_config.max_train_step: {train_config.max_train_step}")
logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
logger.info(f"train_config.max_train_step: {train_config.max_train_step}")
# stop when the maximum number of training steps is reached
if max_steps_reached:
break
@@ -162,7 +163,7 @@
# to bring the count of train_step in sync with where it left off
if epoch == intermediate_epoch and step == 0:
total_train_steps += intermediate_step
print(
logger.info(
f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them."
)
if epoch == intermediate_epoch and step < intermediate_step:
@@ -197,7 +198,7 @@
labels = batch["labels"][:, 0]
preds = torch.nn.functional.softmax(logits, dim=-1)
acc_helper.forward(preds, labels)
print("Mismatches detected:", verifier.get_perop_mismatch_count())
logger.info("Mismatches detected:", verifier.get_perop_mismatch_count())
else:
model_outputs = model(**batch)
loss = model_outputs.loss # Forward call
@@ -279,13 +280,13 @@
)
if train_config.enable_ddp:
if loss_0_counter.item() == train_config.convergence_counter:
print(
logger.info(
f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}."
)
break
else:
if loss_0_counter.item() == train_config.convergence_counter:
print(
logger.info(
f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning."
)
break
Expand Down Expand Up @@ -347,15 +348,15 @@ def train(
if train_config.run_validation:
if eval_epoch_loss < best_val_loss:
best_val_loss = eval_epoch_loss
print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
val_loss.append(float(eval_epoch_loss))
val_metric.append(float(eval_metric))
if train_config.task_type == "seq_classification":
print(
logger.info(
f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
)
else:
print(
logger.info(
f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
)

@@ -459,7 +460,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
eval_metric = torch.exp(eval_epoch_loss)

# Print evaluation metrics
print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")
logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")

return eval_metric, eval_epoch_loss, val_step_loss, val_step_metric

@@ -489,9 +490,9 @@ def print_model_size(model, config) -> None:
model_name (str): Name of the model.
"""

print(f"--> Model {config.model_name}")
logger.info(f"--> Model {config.model_name}")
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n")
logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n")


def save_to_json(
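Finally, one behavioural difference to keep in mind after this change: print always writes to stdout, whereas logger.info(...) is only emitted when the logger's effective level is INFO or lower (the root logger defaults to WARNING). If QEfficient.utils.logging_utils does not already set the level, a caller can enable the new INFO messages as sketched below (logger name assumed, as above).

import logging

# Raise verbosity for the assumed "QEfficient" logger so the new logger.info calls are visible.
logging.getLogger("QEfficient").setLevel(logging.INFO)

# Or, for a quick script or debug run, configure the root logger globally.
logging.basicConfig(level=logging.INFO)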