Skip to content

Commit 68bb664

Browse files
authored
Merge branch 'main' into add_gemma3
2 parents c6e5ee5 + 2080052 commit 68bb664

File tree

6 files changed

+38
-26
lines changed

6 files changed

+38
-26
lines changed

QEfficient/base/modeling_qeff.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,12 @@ def _compile(
241241
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
242242
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
243243
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
244-
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
245-
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
244+
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
245+
:compiler_options: Pass any compiler option as input.
246+
Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
246247
- aic_num_cores=16 -> -aic-num-cores=16
247248
- convert_to_fp16=True -> -convert-to-fp16
249+
For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
248250
"""
249251
if onnx_path is None and self.onnx_path is None:
250252
self.export()
@@ -256,6 +258,11 @@ def _compile(
256258
raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
257259

258260
if enable_qnn:
261+
if compiler_options:
262+
logger.warning(
263+
f"Extra arguments to QNN compilation are supported only via qnn_config file. Ignoring {compiler_options}"
264+
)
265+
259266
self.qpc_path = qnn_compile(
260267
onnx_path=onnx_path,
261268
qpc_base_path=compile_dir,

QEfficient/transformers/models/modeling_auto.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,13 @@ def compile(
291291
:num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
292292
:num_cores (int): Number of cores used to compile the model.
293293
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
294-
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
295-
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
294+
:compiler_options (dict, optional): Additional compiler options.
295+
For QAIC Compiler: Extra arguments for qaic-exec can be passed.
296+
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
297+
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
298+
For QNN Compiler: The following arguments can be passed.
299+
:enable_qnn (bool): Enables QNN Compilation.
300+
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file.
296301
Returns:
297302
:str: Path of the compiled ``qpc`` package.
298303
"""
@@ -1576,16 +1581,18 @@ def compile(
15761581
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
15771582
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
15781583
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
1579-
:mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
1580-
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
15811584
:prefill_only (bool): If ``True``, compile for prefill only; if ``False``, compile for decode only. Defaults to None, which compiles for both ``prefill`` and ``decode``.
1582-
:compiler_options (dict, optional): Pass any compiler option as input. ``Defaults to None``.
1583-
Following flag can be passed in compiler_options to enable QNN Compilation path.
1584-
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
1585-
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
1586-
for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
1587-
- aic_num_cores=16 -> -aic-num-cores=16
1588-
- convert_to_fp16=True -> -convert-to-fp16
1585+
:compiler_options (dict, optional): Additional compiler options. ``Defaults to None``.
1586+
For QAIC Compiler: Extra arguments for qaic-exec can be passed.
1587+
:mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
1588+
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
1589+
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
1590+
Params are converted to flags as below:
1591+
- aic_num_cores=16 -> -aic-num-cores=16
1592+
- convert_to_fp16=True -> -convert-to-fp16
1593+
For QNN Compiler: The following arguments can be passed.
1594+
:enable_qnn (bool): Enables QNN Compilation.
1595+
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file.
15891596
15901597
Returns:
15911598
:str: Path of the compiled ``qpc`` package.

QEfficient/transformers/quantizers/quantizer_compressed_tensors.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ def _process_model_after_weight_loading(self, model, **kwargs):
218218
def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]:
219219
return missing_keys
220220

221+
def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]:
222+
return unexpected_keys
223+
221224

222225
class QEffCompressedTensorsConfig(CompressedTensorsConfig):
223226
def __init__(
@@ -395,3 +398,6 @@ def _process_model_after_weight_loading(self, model, **kwargs):
395398

396399
def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]:
397400
return missing_keys
401+
402+
def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]:
403+
return unexpected_keys

QEfficient/utils/generate_qnn_network_specialization_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,10 @@ def generate_data_format_config(
173173

174174
for input in onnx_model.graph.input:
175175
if "past_key" in input.name or "past_value" in input.name:
176-
kv_nodes.append((input.name).replace(".", "_"))
176+
kv_nodes.append(input.name)
177177
for output in onnx_model.graph.output:
178178
if "past_key" in output.name or "past_value" in output.name:
179-
kv_nodes.append((output.name).replace(".", "_"))
179+
kv_nodes.append(output.name)
180180
kv_overrides = {}
181181

182182
kv_overrides["graphs"] = [

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ classifiers = [
1919
]
2020
requires-python = ">=3.8,<3.11"
2121
dependencies = [
22-
"transformers==4.50.0",
23-
"huggingface-hub==0.27.0",
22+
"transformers==4.51.3",
23+
"huggingface-hub==0.30.0",
2424
"hf_transfer==0.1.9",
2525
"peft==0.13.2",
2626
"datasets==2.20.0",

tests/transformers/models/test_image_text_to_text_models.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerVlm
2828
from QEfficient.utils.test_utils import InternProcessor
2929

30-
HF_TOKEN = ""
3130
NEW_GENERATION_TOKENS = 10
3231
test_models_config = [
3332
# CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED
@@ -104,21 +103,18 @@
104103
def load_image_text_to_text_model(model_config):
105104
model_path = hf_download(
106105
repo_id=model_config._name_or_path,
107-
hf_token=HF_TOKEN,
108106
ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
109107
)
110108
try:
111109
model_hf = AutoModelForImageTextToText.from_pretrained(
112110
model_path,
113111
low_cpu_mem_usage=False,
114-
token=HF_TOKEN,
115112
config=model_config,
116113
)
117114
except ValueError:
118115
model_hf = AutoModelForCausalLM.from_pretrained(
119116
model_path,
120117
low_cpu_mem_usage=False,
121-
token=HF_TOKEN,
122118
trust_remote_code=True,
123119
config=model_config,
124120
)
@@ -160,9 +156,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
160156
):
161157
model_config = {"model_name": model_name}
162158
model_config["img_size"] = img_size
163-
config = AutoConfig.from_pretrained(
164-
model_config["model_name"], token=HF_TOKEN, trust_remote_code=True, padding=True
165-
)
159+
config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True)
166160
config = set_num_layers(config, n_layer=n_layer)
167161
model_hf, _ = load_image_text_to_text_model(config)
168162
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
@@ -199,7 +193,6 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
199193
model_config["model_name"],
200194
kv_offload=kv_offload,
201195
config=config,
202-
token=HF_TOKEN,
203196
)
204197

205198
# pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model)
@@ -284,7 +277,6 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
284277
model_config["model_name"],
285278
kv_offload=kv_offload,
286279
config=config,
287-
token=HF_TOKEN,
288280
)
289281
# pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model)
290282
# assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (

0 commit comments

Comments
 (0)