Skip to content

Commit 68bb664

Browse files
authored
Merge branch 'main' into add_gemma3
2 parents c6e5ee5 + 2080052 commit 68bb664

File tree

6 files changed

+38
-26
lines changed

6 files changed

+38
-26
lines changed

QEfficient/base/modeling_qeff.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,12 @@ def _compile(
241241
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
242242
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
243243
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
244-
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
245-
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
244+
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
245+
:compiler_options: Pass any compiler option as input.
246+
Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
246247
- aic_num_cores=16 -> -aic-num-cores=16
247248
- convert_to_fp16=True -> -convert-to-fp16
249+
For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
248250
"""
249251
if onnx_path is None and self.onnx_path is None:
250252
self.export()
@@ -256,6 +258,11 @@ def _compile(
256258
raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
257259

258260
if enable_qnn:
261+
if compiler_options:
262+
logger.warning(
263+
f"Extra arguments to QNN compilation are supported only via qnn_config file. Ignoring {compiler_options}"
264+
)
265+
259266
self.qpc_path = qnn_compile(
260267
onnx_path=onnx_path,
261268
qpc_base_path=compile_dir,

QEfficient/transformers/models/modeling_auto.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,13 @@ def compile(
291291
:num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
292292
:num_cores (int): Number of cores used to compile the model.
293293
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
294-
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
295-
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
294+
:compiler_options (dict, optional): Additional compiler options.
295+
For QAIC Compiler: Extra arguments for qaic-exec can be passed.
296+
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
297+
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
298+
For QNN Compiler: The following arguments can be passed.
299+
:enable_qnn (bool): Enables QNN Compilation.
300+
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file.
296301
Returns:
297302
:str: Path of the compiled ``qpc`` package.
298303
"""
@@ -1576,16 +1581,18 @@ def compile(
15761581
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
15771582
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
15781583
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
1579-
:mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
1580-
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
15811584
:prefill_only (bool): If ``True``, compile for prefill only; if ``False``, compile for decode only. Defaults to None, which compiles for both ``prefill`` and ``decode``.
1582-
:compiler_options (dict, optional): Pass any compiler option as input. ``Defaults to None``.
1583-
Following flag can be passed in compiler_options to enable QNN Compilation path.
1584-
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
1585-
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
1586-
for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
1587-
- aic_num_cores=16 -> -aic-num-cores=16
1588-
- convert_to_fp16=True -> -convert-to-fp16
1585+
:compiler_options (dict, optional): Additional compiler options. ``Defaults to None``.
1586+
For QAIC Compiler: Extra arguments for qaic-exec can be passed.
1587+
:mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
1588+
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
1589+
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
1590+
Params are converted to flags as below:
1591+
- aic_num_cores=16 -> -aic-num-cores=16
1592+
- convert_to_fp16=True -> -convert-to-fp16
1593+
For QNN Compiler: The following arguments can be passed.
1594+
:enable_qnn (bool): Enables QNN Compilation.
1595+
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file.
15891596
15901597
Returns:
15911598
:str: Path of the compiled ``qpc`` package.

QEfficient/transformers/quantizers/quantizer_compressed_tensors.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ def _process_model_after_weight_loading(self, model, **kwargs):
218218
def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]:
219219
return missing_keys
220220

221+
def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]:
222+
return unexpected_keys
223+
221224

222225
class QEffCompressedTensorsConfig(CompressedTensorsConfig):
223226
def __init__(
@@ -395,3 +398,6 @@ def _process_model_after_weight_loading(self, model, **kwargs):
395398

396399
def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]:
397400
return missing_keys
401+
402+
def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]:
403+
return unexpected_keys

QEfficient/utils/generate_qnn_network_specialization_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,10 @@ def generate_data_format_config(
173173

174174
for input in onnx_model.graph.input:
175175
if "past_key" in input.name or "past_value" in input.name:
176-
kv_nodes.append((input.name).replace(".", "_"))
176+
kv_nodes.append(input.name)
177177
for output in onnx_model.graph.output:
178178
if "past_key" in output.name or "past_value" in output.name:
179-
kv_nodes.append((output.name).replace(".", "_"))
179+
kv_nodes.append(output.name)
180180
kv_overrides = {}
181181

182182
kv_overrides["graphs"] = [

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ classifiers = [
1919
]
2020
requires-python = ">=3.8,<3.11"
2121
dependencies = [
22-
"transformers==4.50.0",
23-
"huggingface-hub==0.27.0",
22+
"transformers==4.51.3",
23+
"huggingface-hub==0.30.0",
2424
"hf_transfer==0.1.9",
2525
"peft==0.13.2",
2626
"datasets==2.20.0",

tests/transformers/models/test_image_text_to_text_models.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerVlm
2828
from QEfficient.utils.test_utils import InternProcessor
2929

30-
HF_TOKEN = ""
3130
NEW_GENERATION_TOKENS = 10
3231
test_models_config = [
3332
# CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED
@@ -104,21 +103,18 @@
104103
def load_image_text_to_text_model(model_config):
105104
model_path = hf_download(
106105
repo_id=model_config._name_or_path,
107-
hf_token=HF_TOKEN,
108106
ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
109107
)
110108
try:
111109
model_hf = AutoModelForImageTextToText.from_pretrained(
112110
model_path,
113111
low_cpu_mem_usage=False,
114-
token=HF_TOKEN,
115112
config=model_config,
116113
)
117114
except ValueError:
118115
model_hf = AutoModelForCausalLM.from_pretrained(
119116
model_path,
120117
low_cpu_mem_usage=False,
121-
token=HF_TOKEN,
122118
trust_remote_code=True,
123119
config=model_config,
124120
)
@@ -160,9 +156,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
160156
):
161157
model_config = {"model_name": model_name}
162158
model_config["img_size"] = img_size
163-
config = AutoConfig.from_pretrained(
164-
model_config["model_name"], token=HF_TOKEN, trust_remote_code=True, padding=True
165-
)
159+
config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True)
166160
config = set_num_layers(config, n_layer=n_layer)
167161
model_hf, _ = load_image_text_to_text_model(config)
168162
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
@@ -199,7 +193,6 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
199193
model_config["model_name"],
200194
kv_offload=kv_offload,
201195
config=config,
202-
token=HF_TOKEN,
203196
)
204197

205198
# pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model)
@@ -284,7 +277,6 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
284277
model_config["model_name"],
285278
kv_offload=kv_offload,
286279
config=config,
287-
token=HF_TOKEN,
288280
)
289281
# pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model)
290282
# assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (

0 commit comments

Comments
 (0)