
Added Prompt length check for VLMs #422


Draft · wants to merge 4 commits into base: main
105 changes: 74 additions & 31 deletions QEfficient/cloud/infer.py
@@ -12,25 +12,20 @@

import requests
from PIL import Image
from transformers import PreTrainedModel, TextStreamer
from transformers import PreTrainedModel
from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES

from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer
from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer, load_streamer
from QEfficient.utils.logging_utils import logger


# TODO: Remove after adding support for VLM's compile and execute
def execute_vlm_model(
processor: PreTrainedModel,
qeff_model: PreTrainedModel,
model_name: str,
image_url: str,
image_path: str,
prompt: Optional[str] = None, # type: ignore
inputs: Optional[dict] = None,
device_group: Optional[List[int]] = None,
local_model_dir: Optional[str] = None,
cache_dir: Optional[str] = None,
hf_token: Optional[str] = None,
generation_len: Optional[int] = None,
):
"""
@@ -50,16 +45,43 @@ def execute_vlm_model(
Returns:
:dict: Output from the ``AI_100`` runtime.
"""
streamer = load_streamer(processor.tokenizer)
output = qeff_model.generate(
inputs=inputs,
streamer=streamer,
device_ids=device_group,
generation_len=generation_len,
)
return output


def count_vlm_tokens(
processor: PreTrainedModel,
prompt_len: int = 32,
ctx_len: int = 128,
image_url: Optional[str] = None,
image_path: Optional[str] = None,
prompt: Optional[str] = None, # type: ignore
):
"""
This method counts the number of tokens in the image and updates the prompt length and context length accordingly.
``Mandatory`` Args:
:processor (PreTrainedModel): Hugging Face Processor object.
:image_url (str): Image URL to be used for inference; either ``image_url`` or ``image_path`` must be provided. ``Defaults to None.``
:image_path (str): Image path to be used for inference; either ``image_url`` or ``image_path`` must be provided. ``Defaults to None.``
``Optional`` Args:
:prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
:ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.``
:prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
Returns:
:prompt_len: Updated prompt length for the VLM model to compile.
:ctx_len: Updated context length for the VLM model to compile.
:split_inputs: Tokenized inputs for the VLM model.
"""
if not (image_url or image_path):
raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"')
raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)

processor = load_hf_processor(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)

# Added for the VLM models supported in QEff version 1.20 (mllama and llava)
conversation = [
{
@@ -73,21 +95,31 @@

# Converts a list of dictionaries with `"role"` and `"content"` keys into a chat-formatted prompt string (tokenize=False returns text, not token ids).
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

split_inputs = processor(
text=input_text,
images=raw_image,
return_tensors="pt",
add_special_tokens=False,
)
streamer = TextStreamer(processor.tokenizer)
output = qeff_model.generate(
inputs=split_inputs,
streamer=streamer,
device_ids=device_group,
generation_len=generation_len,
)
return output

# Decode the input ids and count the image placeholder tokens in the input
decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])

total_image_tokens = decoded_tokens.count("<IMG_CONTEXT>") + decoded_tokens.count("<image>")

# Check if the number of tokens in the image is greater than the prompt length
if total_image_tokens > prompt_len:
logger.warning(
f"Prompt length {prompt_len} is less than the number of tokens in the image. "
f"Increasing the prompt length to at least {total_image_tokens + prompt_len}."
)
prompt_len = total_image_tokens + prompt_len

# Update the context length only if it is less than the prompt length
if ctx_len < prompt_len:
ctx_len = prompt_len + ctx_len

return prompt_len, ctx_len, split_inputs


def main(
@@ -176,6 +208,22 @@ def main(
kwargs.pop("img_size", None) or image_path or image_url
):
logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
else:
processor = load_hf_processor(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)

# Count the number of tokens required in the input and update the prompt length and context length accordingly
prompt_len, ctx_len, inputs = count_vlm_tokens(
processor=processor,
prompt_len=prompt_len,
ctx_len=ctx_len,
image_url=image_url,
image_path=image_path,
prompt=prompt,
)

#########
# Compile
@@ -206,15 +254,10 @@
#########
if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
exec_info = execute_vlm_model(
processor=processor,
qeff_model=qeff_model,
model_name=model_name,
prompt=prompt,
image_url=image_url,
image_path=image_path,
inputs=inputs,
device_group=device_group,
local_model_dir=local_model_dir,
cache_dir=cache_dir,
hf_token=hf_token,
generation_len=generation_len,
)
print(exec_info)
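
For reference, the length-adjustment rule that `count_vlm_tokens` introduces can be sketched in isolation. This is a minimal illustration of the arithmetic only; the helper name, the image-token count, and the default `prompt_len`/`ctx_len` values below are assumptions for the example, not taken from a real model run.

```python
# Minimal sketch of the prompt/context length adjustment performed by count_vlm_tokens.
# The function name, image-token count, and defaults are illustrative assumptions.

def adjust_lengths(total_image_tokens: int, prompt_len: int, ctx_len: int):
    # If the image alone needs more tokens than the compiled prompt length,
    # grow the prompt length so it covers the image tokens plus the original text budget.
    if total_image_tokens > prompt_len:
        prompt_len = total_image_tokens + prompt_len
    # Keep room for generation: if the context window is now smaller than the prompt,
    # stack the original context length on top of the new prompt length.
    if ctx_len < prompt_len:
        ctx_len = prompt_len + ctx_len
    return prompt_len, ctx_len

# Example: an image contributing 576 placeholder tokens with the defaults of 32/128
print(adjust_lengths(576, 32, 128))  # -> (608, 736)
```
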
1 change: 1 addition & 0 deletions QEfficient/utils/__init__.py
@@ -21,6 +21,7 @@
hf_download,
load_hf_processor,
load_hf_tokenizer,
load_streamer,
login_and_download_hf_lm,
onnx_exists,
padding_check_and_fix,
17 changes: 17 additions & 0 deletions QEfficient/utils/_utils.py
@@ -23,6 +23,7 @@
AutoTokenizer,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
TextStreamer,
)

from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
@@ -220,6 +221,22 @@ def load_hf_processor(
return processor


def load_streamer(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
):
"""
Loads the streamer for the given tokenizer.
--------

tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to load streamer.

Return:
TextStreamer object for the given tokenizer.
"""
logger.info("Loading Streamer")
return TextStreamer(tokenizer)


def get_qpc_dir_path(
model_card_name,
num_cores,
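
For context, a minimal usage sketch of the new `load_streamer` helper; the tokenizer checkpoint name below is an assumption chosen for illustration.

```python
# Minimal usage sketch for load_streamer; the checkpoint name is an illustrative assumption.
from transformers import AutoTokenizer

from QEfficient.utils import load_streamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
streamer = load_streamer(tokenizer)  # returns a transformers.TextStreamer wrapping the tokenizer

# The streamer can then be passed to qeff_model.generate(..., streamer=streamer)
# so generated tokens are printed as they are decoded.
```
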
17 changes: 17 additions & 0 deletions docs/source/quick_start.md
@@ -194,6 +194,23 @@ qeff_model.generate(prompts=["My name is"])

**Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.**


### VLM Inference

Users can compile and run a VLM model using the commands below.

**CLI Inference Command**

For Llava
```bash
python -m QEfficient.cloud.infer --model_name llava-hf/llava-1.5-7b-hf --batch_size 1 --prompt_len 784 --ctx_len 1024 --mxfp6 --num_cores 16 --device_group [0] --prompt "Describe the image" --mos 1 --aic_enable_depth_first --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg --generation_len 128
```

For Mllama
```bash
python -m QEfficient.cloud.infer --model_name meta-llama/Llama-3.2-11B-Vision-Instruct --batch_size 1 --prompt_len 32 --ctx_len 512 --num_cores 16 --device_group [0] --prompt "Describe the image?" --mos 1 --allocator_dealloc_delay 1 --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg
```

## Python API

### 1. Model download and Optimize for Cloud AI 100