Commit 8a80c89

vbaddi and quic-rishinr authored and committed

nit: QAic changes

Signed-off-by: vbaddi <[email protected]>
Signed-off-by: Rishin <[email protected]>
1 parent 6341be3 commit 8a80c89

5 files changed, +110 −7 lines changed


QEfficient/transformers/models/modeling_auto.py

Lines changed: 29 additions & 3 deletions
@@ -1461,14 +1461,38 @@ def export(self, export_dir: Optional[str] = None) -> str:
             0: "full_batch_size" if self.continuous_batching else "batch_size",
             2: "ctx_len",
         }
+        pkv_dynamic_sliding_axes = {
+            0: "full_batch_size" if self.continuous_batching else "batch_size",
+            2: "chunk_attn",
+        }
+
         output_names = ["logits"]
 
-        for i in range(self.num_layers):
+        is_chunked_attention = torch.tensor(
+            [bool((i + 1) % 4) for i in range(self.model.config.num_hidden_layers)], dtype=torch.bool
+        )
+        global_cache_shape = [1, 8, seq_len, 128]
+        chunked_cache_shape = [
+            1,
+            8,
+            self.model.config.attention_chunk_size,
+            128,
+        ]
+
+        for i in range(self.model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
-                dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes
+                cache_shape = global_cache_shape if not is_chunked_attention[i] else chunked_cache_shape
+                apply_dynamic_axes = pkv_dynamic_axes if not is_chunked_attention[i] else pkv_dynamic_sliding_axes
+                example_inputs["past_key_values"][i].append(torch.zeros(cache_shape, dtype=torch.float32))
+                dynamic_axes[f"past_{kv}.{i}"] = apply_dynamic_axes
                 output_names.append(f"past_{kv}.{i}_RetainedState")
 
+        # for i in range(self.num_layers):
+        #     for kv in ["key", "value"]:
+        #         example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
+        #         dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes
+        #         output_names.append(f"past_{kv}.{i}_RetainedState")
+
         if self.continuous_batching:
             example_inputs["batch_index"] = torch.arange(bs).view(bs, 1)
             dynamic_axes["batch_index"] = {0: "batch_size"}

@@ -1497,6 +1521,7 @@ def build_prefill_specialization(
             "batch_size": 1 if self.continuous_batching else batch_size,
             "seq_len": prefill_seq_len,
             "ctx_len": ctx_len,
+            "chunk_attn": self.model.config.attention_chunk_size,
             "num_logits_to_keep": 1 if self.is_tlm else None,
         }
         if self.continuous_batching:

@@ -1522,6 +1547,7 @@ def build_decode_specialization(
             "batch_size": full_batch_size if self.continuous_batching else batch_size,
             "seq_len": (num_speculative_tokens + 1) if self.is_tlm else 1,
             "ctx_len": ctx_len,
+            "chunk_attn": self.model.config.attention_chunk_size,
             "num_logits_to_keep": (num_speculative_tokens + 1) if self.is_tlm else None,
         }
         if self.continuous_batching:
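
The export hunk above keeps two per-layer KV cache shapes: layers flagged by `bool((i + 1) % 4)` get a cache bounded by `attention_chunk_size` and a `chunk_attn` dynamic axis, while every fourth layer keeps the full `ctx_len` cache; the prefill and decode specializations then bind `chunk_attn` to `config.attention_chunk_size`. A minimal sketch of that per-layer selection follows, using made-up values (8 layers, chunk size 2048, seq_len 4096) together with the literal `[1, 8, ..., 128]` example shapes hardcoded in the hunk; it is illustrative only, not the exported model itself.

import torch

num_hidden_layers = 8          # assumed toy values, not from the commit
attention_chunk_size = 2048
seq_len = 4096

# bool((i + 1) % 4) is True unless (i + 1) is a multiple of 4, so layers
# 3, 7, ... stay global and the remaining layers use chunked (sliding) attention.
is_chunked_attention = torch.tensor(
    [bool((i + 1) % 4) for i in range(num_hidden_layers)], dtype=torch.bool
)

global_cache_shape = [1, 8, seq_len, 128]                 # full-context KV cache
chunked_cache_shape = [1, 8, attention_chunk_size, 128]   # capped at the chunk size

pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
pkv_dynamic_sliding_axes = {0: "batch_size", 2: "chunk_attn"}

for i in range(num_hidden_layers):
    shape = chunked_cache_shape if is_chunked_attention[i] else global_cache_shape
    axes = pkv_dynamic_sliding_axes if is_chunked_attention[i] else pkv_dynamic_axes
    print(f"layer {i}: chunked={bool(is_chunked_attention[i])}, shape={shape}, axis 2 -> {axes[2]}")
# layers 0-2 and 4-6 report chunk_attn on axis 2; layers 3 and 7 report ctx_len.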

QEfficient/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
     get_onnx_dir_name,
     get_padding_shape_from_config,
     get_qpc_dir_path,
+    get_sliding_window_shapes,
     hf_download,
     load_hf_processor,
     load_hf_tokenizer,

QEfficient/utils/_utils.py

Lines changed: 58 additions & 0 deletions
@@ -282,6 +282,64 @@ def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokeni
         tokenizer.pad_token_id = tokenizer.vocab_size - 1
 
 
+def get_sliding_window_shapes(config, batch_size, seq_len):
+    """
+    Gets padding dims from model config - number of kv heads and d_head
+    and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size)
+    required for initialization of past_key_values
+    --------
+
+    :config: AutoConfig from pretrained model.
+    :batch_size: int. number of input prompts used to create inputs
+    :seq_len: int. sequence length to run the model for.
+
+    Return:
+        List[int, int, int, int]
+    """
+
+    if hasattr(config, "n_head"):  # Assuming n_head is a key in the config (GPTs/CodeGen)
+        n_heads = config.n_head
+        d_head = config.n_embd // config.n_head
+    elif hasattr(config, "num_key_value_heads") and hasattr(
+        config, "num_attention_heads"
+    ):  # Check for num_key_value_heads (Llama/Mistral)
+        n_heads = config.num_key_value_heads
+
+        if hasattr(config, "head_dim"):
+            d_head = config.head_dim
+        else:
+            d_head = config.hidden_size // config.num_attention_heads
+
+    elif hasattr(config, "n_heads"):  # Check for n_heads and d_model in the config (MPT Model)
+        n_heads = config.n_heads
+        d_head = config.d_model // config.n_heads
+    elif hasattr(config, "new_decoder_architecture"):  # Check for Falcon
+        new_decoder_architecture = getattr(config, "new_decoder_architecture")
+        if new_decoder_architecture:  # multi_query is ignored when new_decoder_architecture is True
+            n_heads = config.num_attention_heads
+        else:
+            if hasattr(config, "multi_query"):
+                multi_query_value = getattr(config, "multi_query")
+                if multi_query_value:
+                    n_heads = 1  # MQA, multi query is true
+                else:
+                    n_heads = config.num_attention_heads
+        d_head = config.hidden_size // config.num_attention_heads
+    else:
+        raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.")
+
+    # is_chunked_attention = torch.tensor([bool((i + 1) % 4) for i in range(config.num_hidden_layers)], dtype=torch.bool)
+    global_cache_shape = [batch_size, n_heads, seq_len, d_head]
+    chunked_cache_shape = [
+        batch_size,
+        n_heads,
+        seq_len if seq_len < config.attention_chunk_size else config.attention_chunk_size,
+        d_head,
+    ]
+
+    return global_cache_shape, chunked_cache_shape
+
+
 def get_padding_shape_from_config(config, batch_size, seq_len):
     """
     Gets padding dims from model config - number of kv heads and d_head
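
For context, `get_sliding_window_shapes` walks the same config-attribute fallbacks as the existing `get_padding_shape_from_config` and returns a pair of shapes: the full-context KV shape and one capped at `attention_chunk_size`. A hypothetical usage sketch, assuming a QEfficient install that includes this commit; the config values below are illustrative and not taken from the diff:

from types import SimpleNamespace

from QEfficient.utils import get_sliding_window_shapes

# Toy Llama-style config: 8 KV heads, head_dim 128, chunked-attention window 8192.
config = SimpleNamespace(
    num_key_value_heads=8,
    num_attention_heads=32,
    head_dim=128,
    hidden_size=4096,
    attention_chunk_size=8192,
)

global_shape, chunked_shape = get_sliding_window_shapes(config, batch_size=1, seq_len=32768)
print(global_shape)   # [1, 8, 32768, 128] -- full-context cache shape
print(chunked_shape)  # [1, 8, 8192, 128]  -- capped at attention_chunk_size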

QEfficient/utils/generate_inputs.py

Lines changed: 21 additions & 3 deletions
@@ -8,7 +8,12 @@
 import numpy as np
 import torch
 
-from QEfficient.utils import get_num_layers_from_config, get_padding_shape_from_config, padding_check_and_fix
+from QEfficient.utils import (
+    get_num_layers_from_config,
+    get_padding_shape_from_config,
+    get_sliding_window_shapes,
+    padding_check_and_fix,
+)
 
 
 class InputHandler:

@@ -39,6 +44,12 @@ def __init__(self, batch_size, tokenizer, config, prompt, prompt_len, ctx_len, f
         self.past_key_values = get_padding_shape_from_config(
             config=config, batch_size=full_batch_size if full_batch_size else batch_size, seq_len=ctx_len
         )
+        self.is_chunked_attention = torch.tensor(
+            [bool((i + 1) % 4) for i in range(config.num_hidden_layers)], dtype=torch.bool
+        )
+        self.global_shape, self.sliding_shape = get_sliding_window_shapes(
+            config=config, batch_size=full_batch_size if full_batch_size else batch_size, seq_len=ctx_len
+        )
 
     def prepare_pytorch_inputs(self):
         """

@@ -152,9 +163,16 @@ def prepare_ort_inputs(self):
             axis=1,
         ).astype(np.int64)
 
+        # for i in range(self.n_layer):
+        #     inputs["past_key." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)
+        #     inputs["past_value." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)
+
         for i in range(self.n_layer):
-            inputs["past_key." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)
-            inputs["past_value." + str(i)] = np.zeros((self.padding_shape), dtype=np.float32)
+            cache_shape = self.global_shape if not self.is_chunked_attention[i] else self.sliding_shape
+            inputs["past_key." + str(i)] = np.zeros((cache_shape), dtype=np.float32)
+            inputs["past_value." + str(i)] = np.zeros((cache_shape), dtype=np.float32)
+
+        return inputs
 
         return inputs
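
With this change, `prepare_ort_inputs` allocates each layer's ONNX Runtime past-KV buffer from whichever of the two shapes applies. A small standalone sketch of that allocation under assumed values (4 layers, global shape `[1, 8, 4096, 128]`, sliding shape `[1, 8, 2048, 128]`), not the class itself:

import numpy as np
import torch

n_layer = 4
# Same interleaving as the commit: layers where (i + 1) % 4 != 0 use the sliding shape.
is_chunked_attention = torch.tensor([bool((i + 1) % 4) for i in range(n_layer)], dtype=torch.bool)
global_shape, sliding_shape = [1, 8, 4096, 128], [1, 8, 2048, 128]

inputs = {}
for i in range(n_layer):
    cache_shape = sliding_shape if is_chunked_attention[i] else global_shape
    inputs["past_key." + str(i)] = np.zeros(cache_shape, dtype=np.float32)
    inputs["past_value." + str(i)] = np.zeros(cache_shape, dtype=np.float32)

# Layers 0-2 get 2048-token window buffers; layer 3 gets the full 4096-token cache.
print({name: arr.shape for name, arr in inputs.items()})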

QEfficient/utils/run_utils.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def run_hf_model_on_pytorch(self, model_hf):
         input_len = model_inputs["input_ids"].shape[-1]
 
         with torch.inference_mode():
-            generation = model_hf.generate(**model_inputs, max_new_tokens=8, do_sample=False)
+            generation = model_hf.generate(**model_inputs, max_new_tokens=12, do_sample=False)
             generation = generation[0][input_len:]
 
         # generated_ids = input_ids[0][input_ids_len:].detach().numpy()
