
Commit f449fd2

Addressed comments
Signed-off-by: Amit Raj <[email protected]>
1 parent 3c76e0b commit f449fd2

4 files changed: +84 −26 lines changed


QEfficient/transformers/embeddings/embedding_utils.py

Lines changed: 40 additions & 0 deletions
@@ -13,22 +13,62 @@


 def mean_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs mean pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The mean pooled last hidden states.
+    """
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
     return torch.sum(last_hidden_states * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


 def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs average pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The average pooled last hidden states.
+    """
     last_hidden = last_hidden_states[0].masked_fill(~attention_mask[..., None].bool(), 0.0)
     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


 def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs max pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The max pooled last hidden states.
+    """
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
     last_hidden_states[input_mask_expanded == 0] = -1e9
     return torch.max(last_hidden_states, 1)[0]


 def cls_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs CLS pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The CLS pooled last hidden states.
+    """
     return last_hidden_states[:, 0]


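All four pooling helpers share one (last_hidden_states, attention_mask) signature, so they can be exercised directly on dummy tensors. A minimal sketch; the tensor shapes and the import path (derived from the file location above) are assumptions:

import torch
from QEfficient.transformers.embeddings.embedding_utils import cls_pooling, mean_pooling

# Dummy batch: 2 sequences of length 4 with hidden size 8; the second sequence has one padding token.
last_hidden_states = torch.randn(2, 4, 8)
attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])

print(mean_pooling(last_hidden_states, attention_mask).shape)  # torch.Size([2, 8]), padding excluded from the mean
print(cls_pooling(last_hidden_states, attention_mask).shape)   # torch.Size([2, 8]), first-token embedding
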
QEfficient/transformers/models/modeling_auto.py

Lines changed: 31 additions & 15 deletions
@@ -157,11 +157,12 @@ class QEFFAutoModel(QEFFTransformersBase):
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]

-    def __init__(self, model: nn.Module, **kwargs):
+    def __init__(self, model: nn.Module, pooling=None, **kwargs):
         super().__init__(model)

-        # Make Embedding specific transforms like pooling
-        self.model, _ = PoolingTransform.apply(self.model, **kwargs)
+        # Make Embedding specific transforms like appending pooling
+        if pooling:
+            self.model, _ = PoolingTransform.apply(self.model, pooling)

         self.model.base_model.config.use_cache = True

@@ -177,20 +178,22 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k
         This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API
         Args:
             pretrained_model_name_or_path (str): The name or path of the pre-trained model.
-            pooling (Optional[str], optional): The pooling method to use. Defaults to None.
-                Options:
-                    - "mean": Mean pooling
-                    - "max": Max pooling
-                    - "cls": CLS token pooling
-                    - "avg": Average pooling
+            pooling (Optional[Union[str, Callable]], optional): The pooling method to use. Defaults to None.
+                Options:
+                    - "mean": Mean pooling
+                    - "max": Max pooling
+                    - "cls": CLS token pooling
+                    - "avg": Average pooling
+                    - Callable: A custom pooling function
+                    - None: No pooling applied

         .. code-block:: python

             from QEfficient import QEFFAutoModel
             from transformers import AutoTokenizer

             # Initialize the model using from_pretrained similar to transformers.AutoModel.
-            model = QEFFAutoModel.from_pretrained("model_name")
+            model = QEFFAutoModel.from_pretrained("model_name", pooling="mean")

             # Now you can directly compile the model for Cloud AI 100
             model.compile(num_cores=16)  # Considering you have a Cloud AI 100 SKU
@@ -308,6 +311,9 @@ def compile(
             :str: Path of the compiled ``qpc`` package.
         """

+        if isinstance(seq_len, list) and len(seq_len) >= 15:
+            warnings.warn("Recommended: `seq_len` should contain fewer than 15 items.")
+
         specializations = [
             {"batch_size": batch_size, "seq_len": sl} for sl in (seq_len if isinstance(seq_len, list) else [seq_len])
         ]
@@ -395,11 +401,21 @@ def cloud_ai_100_feature_generate(

         inputs = dict(input_ids=input_ids, attention_mask=attention_mask)

-        outputs = {
-            "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(np.float32),
-        }
-        self.qpc_session.set_buffers(outputs)
-        outputs = self.qpc_session.run(inputs)
+        # TODO: Remove try and catch after compiler fix
+        try:
+            outputs = {
+                "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(np.float32),
+            }
+            self.qpc_session.set_buffers(outputs)
+            outputs = self.qpc_session.run(inputs)
+        except Exception as e:
+            outputs = {
+                "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype(
+                    np.float32
+                ),
+            }
+            self.qpc_session.set_buffers(outputs)
+            outputs = self.qpc_session.run(inputs)
         return outputs

     def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]:

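Taken together, these changes let pooling be requested at load time and let compile() accept multiple sequence lengths. A short usage sketch based on the docstring above and the example file in this commit (the model name is the one used in examples/embedding_model.py):

from QEfficient import QEFFAutoModel

# pooling is optional: pass "mean", "max", "cls", "avg", or a custom callable.
# With pooling=None the raw hidden states are returned and PoolingTransform is
# never applied (see the `if pooling:` guard in __init__ above).
qeff_model = QEFFAutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")

# seq_len may be a single int or a list; each entry becomes one specialization,
# and lists with 15 or more entries now trigger the warning added in compile().
qeff_model.compile(num_cores=16, seq_len=[32, 64])
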
QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 9 additions & 9 deletions
@@ -7,7 +7,7 @@

 import warnings
 from types import MethodType
-from typing import Optional, Tuple
+from typing import Callable, Optional, Tuple, Union

 from torch import nn
 from transformers.models.codegen.modeling_codegen import (
@@ -498,13 +498,13 @@ class PoolingTransform:
     """

     @classmethod
-    def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
+    def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Module, bool]:
         transformed = False
-        if kwargs.get("pooling") is not None:
-            pooling = kwargs["pooling"]
-            pooling_method = (
-                POOLING_MAP[pooling] if isinstance(pooling, str) else validate_user_pooling_function(pooling)
-            )
-            model = PooledModel(model, pooling_method)
-            warnings.warn(f"Pooling method {pooling.__name__} is applied to the model.")
+        pooling_method = (
+            POOLING_MAP[pooling]
+            if isinstance(pooling, str) and pooling in POOLING_MAP
+            else validate_user_pooling_function(pooling)
+        )
+        model = PooledModel(model, pooling_method)
+        warnings.warn("Pooling is applied to the model.")
         return model, transformed

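With the reworked apply(), a non-string pooling argument is routed through validate_user_pooling_function before the model is wrapped in PooledModel. A hypothetical custom pooling callable would follow the same (last_hidden_states, attention_mask) signature as the built-ins; the function below is illustrative only:

import torch

def last_token_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Hypothetical custom pooling: take the embedding of the last non-padded token in each sequence.
    last_index = attention_mask.sum(dim=1) - 1
    return last_hidden_states[torch.arange(last_hidden_states.size(0)), last_index]

# Passed as `pooling`, this callable is validated and wrapped the same way as the named methods:
# qeff_model = QEFFAutoModel.from_pretrained("model_name", pooling=last_token_pooling)
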
examples/embedding_model.py

Lines changed: 4 additions & 2 deletions
@@ -33,10 +33,12 @@ def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)

 # Example: Using mean pooling by specifying it as a string.
 # This will return sentence embeddings computed using mean pooling.
-# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
+# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

-# Here seq_len can be list seq_len or single int
+# Here seq_len can be list of seq_len or single int
 qeff_model.compile(num_cores=16, seq_len=[32, 64])
+# qeff_model.compile(num_cores=16, seq_len=32)
+

 # Tokenize sentences
 encoded_input = tokenizer(sentences, return_tensors="pt")

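For reference, the mean-pooled embeddings produced by the example can be reproduced in plain PyTorch to sanity-check the Cloud AI 100 output. A sketch using the standard transformers API; the sentence list is illustrative and not part of this commit:

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
ref_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

sentences = ["This is an example sentence.", "Each sentence is converted to an embedding."]
encoded_input = tokenizer(sentences, padding=True, return_tensors="pt")

with torch.no_grad():
    last_hidden_states = ref_model(**encoded_input).last_hidden_state

# Same computation as mean_pooling() in embedding_utils.py above.
mask = encoded_input["attention_mask"].unsqueeze(-1).expand(last_hidden_states.size()).float()
reference_embeddings = torch.sum(last_hidden_states * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
print(reference_embeddings.shape)  # (batch_size, hidden_size)
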