
Commit 3909508

Updated max_seq_len_cached to 64k
Signed-off-by: Rishin <[email protected]>
1 parent ac7fb12 commit 3909508

2 files changed (+6, -2 lines)

QEfficient/transformers/models/llama4/modeling_llama4.py

Lines changed: 5 additions & 2 deletions

@@ -311,7 +311,10 @@ def __init__(self, config: Llama4TextConfig, device=None):
         self.rope_type = "llama3" if config.rope_scaling is not None else "default"
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
-        self.max_seq_len_cached = config.max_position_embeddings
+        # self.max_seq_len_cached = config.max_position_embeddings
+        # TODO: vbaddi: Shouldn't the max position_embeddings for rope be the original
+        # embeddings, i.e. chunk size 8192, always? Revisit when >8K chunked attention is enabled.
+        self.max_seq_len_cached = constants.LLAMA4_MAX_POSITION_EMBEDDINGS
 
         # Get inverse frequency and scaling function (handles yarn/etc)
         inv_freq, self.attention_scaling = self.rope_init_fn(config, device)
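
For orientation, here is a minimal, hypothetical sketch (not the actual QEfficient or Hugging Face class) of the role max_seq_len_cached plays in a rotary embedding: it bounds the positions for which cos/sin tables are precomputed, which is why pinning it to a 64K constant instead of config.max_position_embeddings changes the size of the exported cache. The class name and the dim/base defaults below are illustrative assumptions.

import torch

LLAMA4_MAX_POSITION_EMBEDDINGS = 65536  # the constant this commit introduces

class MinimalRotaryEmbedding(torch.nn.Module):
    """Hypothetical sketch; real implementations also handle scaling, dtypes, devices."""

    def __init__(self, dim: int = 64, base: float = 10000.0):
        super().__init__()
        # Fixed cache bound, mirroring the change above, rather than the
        # model config's max_position_embeddings.
        self.max_seq_len_cached = LLAMA4_MAX_POSITION_EMBEDDINGS
        # Standard RoPE inverse frequencies.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        # Precompute cos/sin for every position up to the cache bound.
        t = torch.arange(self.max_seq_len_cached, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos(), persistent=False)
        self.register_buffer("sin_cached", emb.sin(), persistent=False)

    def forward(self, position_ids: torch.LongTensor):
        # Positions beyond max_seq_len_cached would index past the tables,
        # so the bound must cover the longest sequence being exported.
        return self.cos_cached[position_ids], self.sin_cached[position_ids]
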
@@ -1031,7 +1034,7 @@ def get_dummy_inputs(self, kv_offload: bool = False):
         lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
+                lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape[0][0].shape, dtype=torch.float32))
 
         inputs = {}
         if kv_offload:
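
The second change suggests kv_cache_shape is a nested, per-layer structure of tensors rather than a flat size tuple, so torch.zeros needs the .shape of one element. A standalone, hedged illustration of that pattern; the toy dimensions below are assumptions, not values from the commit:

import torch

num_layers = 2
shape = (1, 8, 128, 64)  # assumed (batch, kv_heads, seq_len, head_dim)

# If kv_cache_shape is nested per layer and per key/value tensor ...
kv_cache_shape = [[torch.zeros(shape), torch.zeros(shape)] for _ in range(num_layers)]

# ... then passing it to torch.zeros directly fails, and the fixed line
# instead reads the size off one element:
past_key_values = [[] for _ in range(num_layers)]
for i in range(num_layers):
    for kv in ["key", "value"]:
        past_key_values[i].append(
            torch.zeros(kv_cache_shape[0][0].shape, dtype=torch.float32)
        )

print(past_key_values[0][0].shape)  # torch.Size([1, 8, 128, 64])
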

QEfficient/utils/constants.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ def get_models_dir():
 # Llama4 Constants
 LLAMA4_NUM_PATCHES = 17
 LLAMA4_ATTENTION_CHUNK_SIZE = 8192
+LLAMA4_MAX_POSITION_EMBEDDINGS = 65536  # 2^16 (64K)
 
 
 class Constants:
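
A quick sanity check on the new constant; the relationship to the attention chunk size is an inference from the TODO above, not something the commit states:

LLAMA4_ATTENTION_CHUNK_SIZE = 8192
LLAMA4_MAX_POSITION_EMBEDDINGS = 65536

assert LLAMA4_MAX_POSITION_EMBEDDINGS == 2 ** 16  # 64K exactly
assert LLAMA4_MAX_POSITION_EMBEDDINGS // LLAMA4_ATTENTION_CHUNK_SIZE == 8  # eight full chunks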
