[Qwen2.5-VL] Fix empty string input crash in processor #38421

Open · wants to merge 8 commits into main

85 changes: 74 additions & 11 deletions tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
@@ -44,7 +44,10 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     def setUpClass(cls):
         cls.tmpdirname = tempfile.mkdtemp()
         processor = Qwen2_5_VLProcessor.from_pretrained(
-            "Qwen/Qwen2-VL-7B-Instruct", patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28
+            "Qwen/Qwen2-VL-7B-Instruct",
+            patch_size=4,
+            max_pixels=56 * 56,
+            min_pixels=28 * 28,
         )
         processor.save_pretrained(cls.tmpdirname)
         cls.image_token = processor.image_token
@@ -74,7 +77,9 @@ def test_save_load_pretrained_default(self):
         video_processor = self.get_video_processor()

         processor = Qwen2_5_VLProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            video_processor=video_processor,
         )
         processor.save_pretrained(self.tmpdirname)
         processor = Qwen2_5_VLProcessor.from_pretrained(self.tmpdirname, use_fast=False)
@@ -90,7 +95,9 @@ def test_image_processor(self):
         video_processor = self.get_video_processor()

         processor = Qwen2_5_VLProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            video_processor=video_processor,
         )

         image_input = self.prepare_image_inputs()
@@ -107,7 +114,9 @@ def test_processor(self):
         video_processor = self.get_video_processor()

         processor = Qwen2_5_VLProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            video_processor=video_processor,
         )

         input_str = "lower newer"
@@ -133,7 +142,9 @@ def test_model_input_names(self):
         video_processor = self.get_video_processor()

         processor = Qwen2_5_VLProcessor(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            video_processor=video_processor,
         )

         input_str = "lower newer"
@@ -177,13 +188,18 @@ def _test_apply_chat_template(

         # Test that tokenizing with template and directly with `self.tokenizer` gives same output
         formatted_prompt_tokenized = processor.apply_chat_template(
-            batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_tensors=return_tensors,
         )
         add_special_tokens = True
         if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
             add_special_tokens = False
         tok_output = processor.tokenizer(
-            formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
+            formatted_prompt,
+            return_tensors=return_tensors,
+            add_special_tokens=add_special_tokens,
         )
         expected_output = tok_output.input_ids
         self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
@@ -214,7 +230,10 @@ def _test_apply_chat_template(

         # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
         for idx, url in enumerate(input_data[:batch_size]):
-            batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]
+            batch_messages[idx][0]["content"] = [
+                batch_messages[idx][0]["content"][0],
+                {"type": modality, "url": url},
+            ]

         out_dict = processor.apply_chat_template(
             batch_messages,
@@ -350,7 +369,12 @@ def test_kwargs_overrides_custom_image_processor_kwargs(self):

         input_str = self.prepare_text_inputs()
         image_input = self.prepare_image_inputs()
-        inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            max_pixels=56 * 56 * 4,
+            return_tensors="pt",
+        )
         self.assertEqual(inputs[self.images_input_name].shape[0], 612)
         inputs = processor(text=input_str, images=image_input, return_tensors="pt")
         self.assertEqual(inputs[self.images_input_name].shape[0], 100)
@@ -372,7 +396,9 @@ def test_apply_chat_template_video_special_processing(self):
             self.skipTest("Processor doesn't accept videos at input")

         video_file_path = hf_hub_download(
-            repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
+            repo_id="raushan-testing-hf/videos-test",
+            filename="sample_demo_1.mp4",
+            repo_type="dataset",
         )
         messages = [
             [
@@ -400,7 +426,10 @@ def _process_messages_for_chat_template(
                         "role": "user",
                         "content": [
                             {"type": "video"},  # no need to use path, video is loaded already by this moment
-                            {"type": "text", "text": "Dummy prompt for preprocess testing"},
+                            {
+                                "type": "text",
+                                "text": "Dummy prompt for preprocess testing",
+                            },
                         ],
                     },
                 ]
@@ -421,3 +450,37 @@ def _process_messages_for_chat_template(
         formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
         self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
+
+    @require_torch
+    def test_qwen2_tokenizer_empty_string_regression(self):
+        """
+        Tests that Qwen2Tokenizer returns a torch.long tensor with the correct
+        shape for an empty string input; regression test for issue #38417.
+        """
+        model_id = "Qwen/Qwen2-0.5B"
+
+        try:
+            tokenizer = Qwen2Tokenizer.from_pretrained(model_id, trust_remote_code=True)
+        except OSError as e:
+            self.skipTest(f"Could not load tokenizer {model_id} for testing. Error: {e}")
+            return
+
+        text_inputs = tokenizer([""], return_tensors="pt")
+
+        self.assertIn("input_ids", text_inputs, "Key 'input_ids' not found in tokenizer output.")
+        input_ids_tensor = text_inputs["input_ids"]
+        self.assertIsNotNone(input_ids_tensor, "input_ids tensor is None.")
+
+        expected_shape = torch.Size([1, 0])
+        self.assertEqual(
+            input_ids_tensor.shape,
+            expected_shape,
+            f"Expected shape {expected_shape}, but got {input_ids_tensor.shape}.",
+        )
+
+        expected_dtype = torch.long
+        self.assertEqual(
+            input_ids_tensor.dtype,
+            expected_dtype,
+            f"Expected dtype {expected_dtype}, but got {input_ids_tensor.dtype}.",
+        )
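
For reference, the behavior this regression test pins down can be reproduced standalone, outside the test suite. A minimal sketch, not part of the PR, assuming the Qwen/Qwen2-0.5B tokenizer files are reachable; it simply mirrors the assertions above:

    # Standalone check mirroring test_qwen2_tokenizer_empty_string_regression.
    import torch
    from transformers import Qwen2Tokenizer

    tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2-0.5B")
    enc = tokenizer([""], return_tensors="pt")

    # Post-fix expectation per the regression test: an empty string batch
    # tokenizes to an empty (1, 0) tensor of dtype torch.long instead of
    # crashing as reported in issue #38417.
    assert enc["input_ids"].shape == torch.Size([1, 0])
    assert enc["input_ids"].dtype == torch.long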