Commit fc1ae31

Merge branch 'main' into pp_ddp
Signed-off-by: Mamta Singh <[email protected]>
2 parents ba3e45a + 740f7c2 commit fc1ae31


59 files changed: +7190 −471 lines changed

QEfficient/__init__.py

Lines changed: 7 additions & 11 deletions
@@ -6,24 +6,20 @@
 # -----------------------------------------------------------------------------
 
 import os
+import warnings
+
+from QEfficient.utils import custom_format_warning
 
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
-from transformers import AutoConfig
-
-from QEfficient.transformers.modeling_utils import MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+# Placeholder for all non-transformer models registered in QEfficient
+import QEfficient.utils.model_registery  # noqa: F401
 from QEfficient.utils.logging_utils import logger
 
-# loop over all the model types which are not present in transformers and register them
-for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
-    # Register the model config class based on the model type. This will be first element in the tuple
-    AutoConfig.register(model_type, model_cls[0])
-
-    # Register the non transformer library Class and config class using AutoModelClass
-    model_cls[2].register(model_cls[0], model_cls[1])
+# custom warning for the better logging experience
+warnings.formatwarning = custom_format_warning
 
 
 def check_qaic_sdk():
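Editor's note on the warnings change above: warnings.formatwarning is the standard-library hook that warnings.warn uses to render each message, so assigning custom_format_warning changes how every QEfficient warning is printed. A minimal sketch of how such a hook could look; the function body here is hypothetical (the real one lives in QEfficient.utils), only the hook mechanism and signature are standard.

import warnings


def custom_format_warning(message, category, filename, lineno, line=None):
    # Hypothetical formatter body: prefix the category name and drop the
    # file/line noise emitted by the default formatter.
    return f"[Warning]: {category.__name__}: {message}\n"


warnings.formatwarning = custom_format_warning
warnings.warn("example message", UserWarning)  # rendered via custom_format_warning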

QEfficient/base/common.py

Lines changed: 7 additions & 5 deletions
@@ -18,7 +18,7 @@
 from transformers import AutoConfig
 
 from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING
+from QEfficient.transformers.modeling_utils import EXTERNAL_MODEL_CLASS_MAPPING, MODEL_CLASS_MAPPING
 from QEfficient.utils import login_and_download_hf_lm
 
 
@@ -40,16 +40,18 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
         """
         Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model.
         """
-        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-        architecture = config.architectures[0] if config.architectures else None
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
-        class_name = MODEL_CLASS_MAPPING.get(architecture)
+        class_name = (
+            MODEL_CLASS_MAPPING.get(config.__class__.__name__, None)
+            or EXTERNAL_MODEL_CLASS_MAPPING[config.__class__.__name__]
+        )
         if class_name:
             module = __import__("QEfficient.transformers.models.modeling_auto")
             model_class = getattr(module, class_name)
         else:
             raise NotImplementedError(
-                f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"
+                f"Unknown architecture={config.__class__.__name__}, either use specific auto model class for loading the model or raise an issue for support!"
             )
 
         local_model_dir = kwargs.pop("local_model_dir", None)
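The dispatch above now keys on the config class name (config.__class__.__name__) instead of config.architectures[0], with EXTERNAL_MODEL_CLASS_MAPPING acting as the fallback for non-transformers models registered by QEfficient. A minimal sketch of that two-level lookup, using illustrative mapping contents rather than the real tables:

# Illustrative stand-ins for the real mapping tables in modeling_utils.
MODEL_CLASS_MAPPING = {"LlamaConfig": "QEFFAutoModelForCausalLM"}
EXTERNAL_MODEL_CLASS_MAPPING = {"MyCustomConfig": "QEFFCustomAutoModel"}


def resolve_class_name(config) -> str:
    key = config.__class__.__name__
    # Try the transformers-backed mapping first, then the externally
    # registered models; the square-bracket lookup raises KeyError if both miss.
    return MODEL_CLASS_MAPPING.get(key) or EXTERNAL_MODEL_CLASS_MAPPING[key]

Note that, as written in the diff, a completely unknown config class would raise a KeyError from the bracketed fallback before the NotImplementedError branch is reached.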

QEfficient/base/modeling_qeff.py

Lines changed: 11 additions & 2 deletions
@@ -241,10 +241,12 @@ def _compile(
         :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
         :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
         :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
-        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
-        :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+        :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
+        :compiler_options: Pass any compiler option as input.
+            Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
             - aic_num_cores=16 -> -aic-num-cores=16
             - convert_to_fp16=True -> -convert-to-fp16
+            For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
         """
         if onnx_path is None and self.onnx_path is None:
             self.export()
@@ -256,6 +258,11 @@ def _compile(
             raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
 
         if enable_qnn:
+            if compiler_options:
+                logger.warning(
+                    f"Extra arguments to QNN compilation are supported only via qnn_config file. Ignoring {compiler_options}"
+                )
+
             self.qpc_path = qnn_compile(
                 onnx_path=onnx_path,
                 qpc_base_path=compile_dir,
@@ -292,6 +299,8 @@ def _compile(
 
         if num_speculative_tokens:
            compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens}))
+        # Hash num_devices too, since default value would always be 1.
+        compile_hash.update(to_hashable(mdp_ts_num_devices))
 
         # Check if already compiled
         compile_hash = compile_hash.hexdigest()[:16]
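The docstring above describes how snake_case compiler options are converted into qaic-exec flags. A small sketch of that conversion, matching the two documented examples; the helper name and implementation are illustrative, not the library's code:

def options_to_flags(**compiler_options) -> list:
    flags = []
    for key, value in compiler_options.items():
        flag = "-" + key.replace("_", "-")
        if value is True:
            flags.append(flag)               # convert_to_fp16=True -> -convert-to-fp16
        else:
            flags.append(f"{flag}={value}")  # aic_num_cores=16 -> -aic-num-cores=16
    return flags


print(options_to_flags(aic_num_cores=16, convert_to_fp16=True))
# ['-aic-num-cores=16', '-convert-to-fp16']

As the added warning in the diff notes, when enable_qnn is True these keyword options are ignored and QNN-specific parameters must come from the qnn_config file instead.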

QEfficient/base/pytorch_transforms.py

Lines changed: 69 additions & 1 deletion
@@ -9,6 +9,8 @@
 
 from torch import nn
 
+from QEfficient.utils.logging_utils import logger
+
 
 class PytorchTransform:
     """
@@ -90,7 +92,7 @@ def mutate(cls, original_module: nn.Module, parent_module: nn.Module):
         raise NotImplementedError("Please implement your own method by inheriting this class")
 
 
-class ModuleMethodMapperTransform(PytorchTransform):
+class ExternalModuleMapperTransform(PytorchTransform):
     """
     Serves as base class for any transform that want to map a particular method of a class to a new method implementation.
     """
@@ -107,6 +109,72 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
             ):
                 for orig_method_name, mapped_method in repl_method_map.items():
                     setattr(module, orig_method_name, MethodType(mapped_method, module))
+
+                if hasattr(module, "__qeff_init__"):
+                    module.__qeff_init__()
+
                 transformed = True
 
         return model, transformed
+
+
+class SplitGateUpWeightsTransform(PytorchTransform):
+    """
+    split fused Gate+Up weights and copy into the model
+
+    For every transformer layer inside `model`:
+        • expects <PREFIX>.experts.gate_up_proj in the *source* `sd`
+        • copies halves into
+            <PREFIX>.experts.gate_proj   <-- Gate  [E,H,I]
+            <PREFIX>.experts.up_proj     <-- Up    [E,H,I]
+    """
+
+    @classmethod
+    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
+        transformed = False
+        model_class = model.__class__.__name__ if hasattr(model, "model") else model.__class__.__name__
+
+        if model_class not in VLM_SPLIT_GATE_UP_WEIGHTS:
+            return model, transformed
+
+        model_tmp = model.language_model if hasattr(model, "language_model") else model
+
+        num_layers = len(model_tmp.model.layers)
+        delete_fused_key = True
+        sd = model_tmp.state_dict()
+        for layer_idx in range(num_layers):
+            # ---- build the textual prefix once per layer ----------
+            prefix = f"model.layers.{layer_idx}.feed_forward.experts."
+
+            fused_key = prefix + "gate_up_proj"
+            gate_key = prefix + "gate_proj"
+            up_key = prefix + "up_proj"
+
+            # ---- split [E,H,2I] → two [E,H,I] tensors ----------------------
+            fused = sd[fused_key]  # [E, H, 2I]  (no .weight here)
+            E, H, two_I = fused.shape
+            ffn_dim = two_I // 2
+            gate, up = fused.split(ffn_dim, dim=-1)  # views – no copy
+
+            experts = model_tmp.model.layers[layer_idx].feed_forward.experts
+            experts.gate_proj.data.copy_(gate)
+            experts.up_proj.data.copy_(up)
+
+            # ---- update the state-dict so load_state_dict sees the right keys
+            sd[gate_key] = gate
+            sd[up_key] = up
+
+            if delete_fused_key:
+                del sd[fused_key]
+
+            logger.info(f"[layer {layer_idx:02d}] loaded gate_proj & up_proj from fused tensor (shape {fused.shape})")
+            transformed = True
+
+        if hasattr(model, "language_model"):
+            model.language_model = model_tmp
+        else:
+            model = model_tmp
+        return model, transformed
+
+
+VLM_SPLIT_GATE_UP_WEIGHTS = {"QEffLlama4ForConditionalGeneration", "QEffLlama4ForCausalLM"}
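The core of SplitGateUpWeightsTransform is splitting a fused [E, H, 2I] expert projection into gate and up halves of shape [E, H, I]. A self-contained sketch of just that split, using dummy sizes rather than any real checkpoint:

import torch

E, H, I = 4, 8, 16                        # experts, hidden size, intermediate size (dummy values)
fused = torch.randn(E, H, 2 * I)          # stands in for sd["...experts.gate_up_proj"]

ffn_dim = fused.shape[-1] // 2
gate, up = fused.split(ffn_dim, dim=-1)   # two views over the fused tensor, no copy

assert gate.shape == (E, H, I) and up.shape == (E, H, I)
assert torch.equal(torch.cat([gate, up], dim=-1), fused)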

QEfficient/cloud/finetune.py

Lines changed: 12 additions & 57 deletions
@@ -9,7 +9,6 @@
 import warnings
 from typing import Any, Dict, Optional, Union
 
-import fire
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -24,14 +23,11 @@
 from QEfficient.finetune.utils.config_utils import (
     generate_dataset_config,
     generate_peft_config,
-    get_dataloader_kwargs,
     update_config,
 )
-from QEfficient.finetune.utils.dataset_utils import (
-    get_custom_data_collator,
-    get_preprocessed_dataset,
-)
 from QEfficient.finetune.utils.device_map import get_device_map
+from QEfficient.finetune.utils.dataset_utils import get_dataloader
+from QEfficient.finetune.utils.parser import get_finetune_parser
 from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
 from QEfficient.utils._utils import get_num_layers_from_config, login_and_download_hf_lm
 
@@ -66,8 +62,8 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
     torch_device = torch.device(train_config.device)
     assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
-
-    dist.init_process_group(backend=train_config.dist_backend)
+    dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
+    dist.init_process_group(backend=dist_backend_map[torch_device.type])
     if train_config.enable_pp:
         assert dist.get_world_size() * train_config.num_pp_stages == getattr(torch, torch_device.type).device_count(), (
             "Total available devices should be multiple of number of pipeline stages."
@@ -201,7 +197,7 @@ def apply_peft(
         kwargs: Additional arguments to override PEFT config params.
 
     Returns:
-        Union[AutoModel, PeftModel]: If the use_peft in train_config is True
+        Union[AutoModel, PeftModel]: If use_peft in train_config is True
         then PeftModel object is returned else original model object
         (AutoModel) is returned.
     """
@@ -247,58 +243,13 @@ def setup_dataloaders(
     - Applies a custom data collator if provided by get_custom_data_collator.
    - Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits.
     """
-    # Get the dataset utils
-    dataset_processer = tokenizer
-
-    # Load and preprocess the dataset for training and validation
-    dataset_train = get_preprocessed_dataset(
-        dataset_processer, dataset_config, split="train", context_length=train_config.context_length
-    )
-
-    dataset_val = get_preprocessed_dataset(
-        dataset_processer, dataset_config, split="test", context_length=train_config.context_length
-    )
 
-    # TODO: vbaddi, check if its necessary to do this?
-    # dataset_train = ConcatDataset(
-    #     dataset_train, chunk_size=train_config.context_length
-    # )
-    ##
-    train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
-    print("length of dataset_train", len(dataset_train))
-
-    # FIXME (Meet): Add custom data collator registration from the outside by the user.
-    custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
-    if custom_data_collator:
-        print("custom_data_collator is used")
-        train_dl_kwargs["collate_fn"] = custom_data_collator
-
-    # Create DataLoaders for the training and validation dataset
-    train_dataloader = torch.utils.data.DataLoader(
-        dataset_train,
-        num_workers=train_config.num_workers_dataloader,
-        pin_memory=True,
-        **train_dl_kwargs,
-    )
+    train_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="train")
     print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
 
    eval_dataloader = None
    if train_config.run_validation:
-        # if train_config.batching_strategy == "packing":
-        #     dataset_val = ConcatDataset(
-        #         dataset_val, chunk_size=train_config.context_length
-        #     )
-
-        val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, dataset_processer, "val")
-        if custom_data_collator:
-            val_dl_kwargs["collate_fn"] = custom_data_collator
-
-        eval_dataloader = torch.utils.data.DataLoader(
-            dataset_val,
-            num_workers=train_config.num_workers_dataloader,
-            pin_memory=True,
-            **val_dl_kwargs,
-        )
+        eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="val")
         if len(eval_dataloader) == 0:
            raise ValueError(
                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
@@ -337,6 +288,7 @@ def main(peft_config_file: str = None, **kwargs) -> None:
            --model_name "meta-llama/Llama-3.2-1B" \\
            --lr 5e-4
     """
+    # TODO: Remove TrainConfig() and update_config() as all params are passed in kwargs by parser
    train_config = TrainConfig()
    update_config(train_config, **kwargs)
    dataset_config = generate_dataset_config(train_config.dataset)
@@ -380,4 +332,7 @@ def main(peft_config_file: str = None, **kwargs) -> None:
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
+    parser = get_finetune_parser()
+    args = parser.parse_args()
+    args_dict = vars(args)
+    main(**args_dict)
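The entry point above swaps fire.Fire(main) for an explicit argparse parser built by get_finetune_parser, whose parsed Namespace is unpacked into main(**args_dict). A trimmed-down, hypothetical stand-in for that flow is sketched below; the real option set lives in QEfficient.finetune.utils.parser, and the arguments shown are only those from the docstring example above.

import argparse


def build_parser() -> argparse.ArgumentParser:
    # Hypothetical subset of the fine-tuning options.
    parser = argparse.ArgumentParser(description="QEfficient fine-tuning entry point")
    parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.2-1B")
    parser.add_argument("--lr", type=float, default=5e-4)
    parser.add_argument("--peft_config_file", type=str, default=None)
    return parser


def main(**kwargs) -> None:
    print(kwargs)  # stand-in for the real fine-tuning driver


if __name__ == "__main__":
    args = build_parser().parse_args()
    main(**vars(args))  # each CLI flag arrives as a keyword argument, as in the diff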

QEfficient/cloud/infer.py

Lines changed: 9 additions & 0 deletions
@@ -111,6 +111,7 @@ def main(
     allow_mxint8_mdp_io: bool = False,
     enable_qnn: Optional[bool] = False,
     qnn_config: Optional[str] = None,
+    trust_remote_code: Optional[bool] = False,
     **kwargs,
 ) -> None:
     """
@@ -140,6 +141,7 @@ def main(
     :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
     :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
     :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+    :trust_remote_code (bool): Trust remote code execution. ``Defaults to False.``
     :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
         -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
         -qpc_crc=True -> -qpc-crc
@@ -164,6 +166,7 @@ def main(
         hf_token=hf_token,
         full_batch_size=full_batch_size,
         local_model_dir=local_model_dir,
+        trust_remote_code=trust_remote_code,
     )
 
     image_path = kwargs.pop("image_path", None)
@@ -264,6 +267,12 @@ def main(
         action="store_true",
         help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        default=False,
+        help="Enable trusting remote code when loading models. Default is False; set to True by passing this flag.",
+    )
     parser.add_argument(
         "--mxint8",
         "--mxint8_kv_cache",

QEfficient/finetune/configs/peft_config.py

Lines changed: 0 additions & 7 deletions
@@ -30,10 +30,3 @@ class LoraConfig:
     task_type: str = "CAUSAL_LM"
     lora_dropout: float = 0.05
     inference_mode: bool = False  # should be False for finetuning
-
-
-# CAUTION prefix tuning is currently not supported
-@dataclass
-class PrefixConfig:
-    num_virtual_tokens: int = 30
-    task_type: str = "CAUSAL_LM"
