
Commit 177a8de

Value update for mask
Signed-off-by: Amit Raj <[email protected]>
1 parent c04ba8e commit 177a8de

3 files changed: +3 -6 lines changed


QEfficient/transformers/models/gpt2/modeling_gpt2.py

Lines changed: 1 addition & 2 deletions
@@ -31,10 +31,9 @@ def eager_attention_forward(module, query, key, value, attention_mask, head_mask
     # if only "normal" attention layer implements causal mask
     query_length, key_length = query.size(-2), key.size(-2)
     causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length]
-    mask_value = MIN_MASKED_ATTENTION_VALUE
     # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
     # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-    mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+    mask_value = torch.full([], MIN_MASKED_ATTENTION_VALUE, dtype=attn_weights.dtype, device=attn_weights.device)
     attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)

     if attention_mask is not None:
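For readers skimming the hunk, a minimal self-contained sketch of the pattern it touches (my own, with made-up shapes; only MIN_MASKED_ATTENTION_VALUE, torch.full, and torch.where come from the code above): the fill value is built as a 0-dim tensor so it matches the dtype and device of attn_weights, then torch.where applies the causal mask.

# Sketch only -- shapes and the causal mask construction are illustrative,
# not copied from modeling_gpt2.py.
import torch

MIN_MASKED_ATTENTION_VALUE = float("-inf")  # value of the constant after this commit

batch, heads, q_len, k_len = 1, 2, 4, 4
attn_weights = torch.randn(batch, heads, q_len, k_len)

# Stand-in for module.bias sliced to the current query/key lengths.
causal_mask = torch.tril(torch.ones(q_len, k_len)).bool().view(1, 1, q_len, k_len)

# 0-dim tensor so the fill value shares attn_weights' dtype and device,
# avoiding the float/double and device-mismatch RuntimeErrors noted in the comments.
mask_value = torch.full([], MIN_MASKED_ATTENTION_VALUE, dtype=attn_weights.dtype, device=attn_weights.device)

attn_weights = torch.where(causal_mask, attn_weights, mask_value)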

QEfficient/transformers/models/mllama/modeling_mllama.py

Lines changed: 1 addition & 3 deletions
@@ -179,9 +179,7 @@ def forward(
         if attention_mask is not None:  # no matter the length, we just slice it
             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
             attn_weights = attn_weights + causal_mask
-            # attn_weights = torch.where(
-            #     attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights
-            # )
+

         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
         attn_output = torch.matmul(attn_weights, value_states)
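The mllama path masks additively: causal_mask already carries the large-negative fill values, so adding it and softmaxing is enough, and the commented-out torch.where variant deleted here was a redundant leftover. A small sketch of that pattern (my own shapes; in the real code causal_mask is a slice of attention_mask):

# Sketch of additive masking; shapes are illustrative.
import torch
import torch.nn as nn

MIN_MASKED_ATTENTION_VALUE = float("-inf")

attn_weights = torch.randn(1, 1, 4, 4)

# Additive mask: 0 where attention is allowed, the fill value where it is not.
causal_mask = torch.triu(
    torch.full((4, 4), MIN_MASKED_ATTENTION_VALUE), diagonal=1
).view(1, 1, 4, 4)

attn_weights = attn_weights + causal_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)
# Masked positions now carry exactly zero weight.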

QEfficient/utils/constants.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 ONNX_EXPORT_CTX_LEN = 1024

 # Minimum value for causal mask
-MIN_MASKED_ATTENTION_VALUE = -1e4
+MIN_MASKED_ATTENTION_VALUE = float("-inf")


 # Store the qeff_models inside the ~/.cache directory or over-ride with an env variable.
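A quick illustration (mine, not part of the commit) of what the constant change does downstream: used as the fill value for masked attention scores, float("-inf") drives the post-softmax weight of masked positions to exactly zero regardless of the scale of the real scores, whereas a finite sentinel like the old -1e4 only dominates while the unmasked scores sit well above it.

# Illustration only: effect of the mask fill value after softmax.
import torch

MIN_MASKED_ATTENTION_VALUE = float("-inf")

# One attention row with the last position masked out.
scores = torch.tensor([[2.5, 0.5, MIN_MASKED_ATTENTION_VALUE]])
probs = torch.softmax(scores, dim=-1)
print(probs)  # tensor([[0.8808, 0.1192, 0.0000]]) -- masked slot is exactly zero

# Caveat: if every position in a row is -inf, softmax yields NaN,
# while a finite sentinel such as -1e4 would fall back to a uniform row.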
