
Added Prompt length check for VLMs #422


Draft · wants to merge 4 commits into base: main
105 changes: 74 additions & 31 deletions QEfficient/cloud/infer.py
@@ -12,25 +12,20 @@

import requests
from PIL import Image
from transformers import PreTrainedModel, TextStreamer
from transformers import PreTrainedModel
from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES

from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer
from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer, load_streamer
from QEfficient.utils.logging_utils import logger


# TODO: Remove after adding support for VLM's compile and execute
def execute_vlm_model(
processor: PreTrainedModel,
qeff_model: PreTrainedModel,
model_name: str,
image_url: str,
image_path: str,
prompt: Optional[str] = None, # type: ignore
inputs: Optional[dict] = None,
device_group: Optional[List[int]] = None,
local_model_dir: Optional[str] = None,
cache_dir: Optional[str] = None,
hf_token: Optional[str] = None,
generation_len: Optional[int] = None,
):
"""
@@ -50,16 +45,43 @@ def execute_vlm_model(
Returns:
:dict: Output from the ``AI_100`` runtime.
"""
streamer = load_streamer(processor.tokenizer)
output = qeff_model.generate(
inputs=inputs,
streamer=streamer,
device_ids=device_group,
generation_len=generation_len,
)
return output


def count_vlm_tokens(
processor: PreTrainedModel,
prompt_len: int = 32,
ctx_len: int = 128,
image_url: Optional[str] = None,
image_path: Optional[str] = None,
prompt: Optional[str] = None, # type: ignore
):
"""
This method counts the number of tokens in the image and updates the prompt length and context length accordingly.
``Mandatory`` Args:
:processor (PreTrainedModel): Hugging Face Processor object.
:image_url (str): Image URL to be used for inference; either ``image_url`` or ``image_path`` must be provided. ``Defaults to None.``
:image_path (str): Image path to be used for inference; either ``image_url`` or ``image_path`` must be provided. ``Defaults to None.``
``Optional`` Args:
:prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
:ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.``
:prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
Returns:
:prompt_len: Updated prompt length for the VLM model to compile.
:ctx_len: Updated context length for the VLM model to compile.
:split_inputs: Tokenized inputs for the VLM model.
"""
if not (image_url or image_path):
raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"')
raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)

processor = load_hf_processor(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)

# Added for the VLM models supported in QEff version 1.20 (mllama and llava)
conversation = [
{
@@ -73,21 +95,31 @@

# Converts a list of dictionaries with `"role"` and `"content"` keys into a chat-formatted prompt string (tokenize=False returns text, not token ids).
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

split_inputs = processor(
text=input_text,
images=raw_image,
return_tensors="pt",
add_special_tokens=False,
)
streamer = TextStreamer(processor.tokenizer)
output = qeff_model.generate(
inputs=split_inputs,
streamer=streamer,
device_ids=device_group,
generation_len=generation_len,
)
return output

# Decode the input ids and count the image placeholder tokens in the input
decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])

total_image_tokens = decoded_tokens.count("<IMG_CONTEXT>") + decoded_tokens.count("<image>")

# Check if the number of tokens in the image is greater than the prompt length
if total_image_tokens > prompt_len:
logger.warning(
f"Prompt length {prompt_len} is less than the number of tokens in the image. "
f"Increasing the prompt length to at least {total_image_tokens + prompt_len}."
)
prompt_len = total_image_tokens + prompt_len

# Update the context length only if it is less than the prompt length
if ctx_len < prompt_len:
ctx_len = prompt_len + ctx_len

return prompt_len, ctx_len, split_inputs


def main(
@@ -176,6 +208,22 @@ def main(
kwargs.pop("img_size", None) or image_path or image_url
):
logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
else:
processor = load_hf_processor(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)

# Count the number of tokens required in the input and update the prompt length and context length accordingly
prompt_len, ctx_len, inputs = count_vlm_tokens(
processor=processor,
prompt_len=prompt_len,
ctx_len=ctx_len,
image_url=image_url,
image_path=image_path,
prompt=prompt,
)

#########
# Compile
@@ -206,15 +254,10 @@
#########
if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
exec_info = execute_vlm_model(
processor=processor,
qeff_model=qeff_model,
model_name=model_name,
prompt=prompt,
image_url=image_url,
image_path=image_path,
inputs=inputs,
device_group=device_group,
local_model_dir=local_model_dir,
cache_dir=cache_dir,
hf_token=hf_token,
generation_len=generation_len,
)
print(exec_info)
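
For reference, the length-adjustment rule that `count_vlm_tokens` introduces can be sketched in isolation. This is a minimal illustration of the arithmetic only; the helper name, the image-token count, and the default `prompt_len`/`ctx_len` values below are assumptions for the example, not taken from a real model run.

```python
# Minimal sketch of the prompt/context length adjustment performed by count_vlm_tokens.
# The function name, image-token count, and defaults are illustrative assumptions.

def adjust_lengths(total_image_tokens: int, prompt_len: int, ctx_len: int):
    # If the image alone needs more tokens than the compiled prompt length,
    # grow the prompt length so it covers the image tokens plus the original text budget.
    if total_image_tokens > prompt_len:
        prompt_len = total_image_tokens + prompt_len
    # Keep room for generation: if the context window is now smaller than the prompt,
    # stack the original context length on top of the new prompt length.
    if ctx_len < prompt_len:
        ctx_len = prompt_len + ctx_len
    return prompt_len, ctx_len

# Example: an image contributing 576 placeholder tokens with the defaults of 32/128
print(adjust_lengths(576, 32, 128))  # -> (608, 736)
```
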
1 change: 1 addition & 0 deletions QEfficient/utils/__init__.py
@@ -21,6 +21,7 @@
hf_download,
load_hf_processor,
load_hf_tokenizer,
load_streamer,
login_and_download_hf_lm,
onnx_exists,
padding_check_and_fix,
17 changes: 17 additions & 0 deletions QEfficient/utils/_utils.py
@@ -23,6 +23,7 @@
AutoTokenizer,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
TextStreamer,
)

from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
@@ -220,6 +221,22 @@ def load_hf_processor(
return processor


def load_streamer(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
):
"""
Loads the streamer for the given tokenizer.
--------

tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to load streamer.

Return:
TextStreamer object for the given tokenizer.
"""
logger.info("Loading Streamer")
return TextStreamer(tokenizer)


def get_qpc_dir_path(
model_card_name,
num_cores,
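
For context, a minimal usage sketch of the new `load_streamer` helper; the tokenizer checkpoint name below is an assumption chosen for illustration.

```python
# Minimal usage sketch for load_streamer; the checkpoint name is an illustrative assumption.
from transformers import AutoTokenizer

from QEfficient.utils import load_streamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
streamer = load_streamer(tokenizer)  # returns a transformers.TextStreamer wrapping the tokenizer

# The streamer can then be passed to qeff_model.generate(..., streamer=streamer)
# so generated tokens are printed as they are decoded.
```
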
17 changes: 17 additions & 0 deletions docs/source/quick_start.md
@@ -194,6 +194,23 @@ qeff_model.generate(prompts=["My name is"])

**Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.**


### VLM Inference

Users can compile and run a VLM model using the commands below.

**CLI Inference Command**

For Llava
```bash
python -m QEfficient.cloud.infer --model_name llava-hf/llava-1.5-7b-hf --batch_size 1 --prompt_len 784 --ctx_len 1024 --mxfp6 --num_cores 16 --device_group [0] --prompt "Describe the image" --mos 1 --aic_enable_depth_first --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg --generation_len 128
```

For Mllama
```bash
python -m QEfficient.cloud.infer --model_name meta-llama/Llama-3.2-11B-Vision-Instruct --batch_size 1 --prompt_len 32 --ctx_len 512 --num_cores 16 --device_group [0] --prompt "Describe the image?" --mos 1 --allocator_dealloc_delay 1 --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg
```

## Python API

### 1. Model download and Optimize for Cloud AI 100