
Commit 00a9ee1

Authored Sep 30, 2024
FEAT: add vllm restart check and support internvl multi-image chat (#2384)
1 parent 4c8aae1 commit 00a9ee1

File tree: 5 files changed (+91, -42 lines)
 

xinference/model/llm/llm_family.json

+12, -24 lines

@@ -6483,8 +6483,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-1B",
-      "model_revision": "a9fc14aea824b6ea1d44f8778cad6b35512c4ce1"
+      "model_id": "OpenGVLab/InternVL2-1B"
     },
     {
       "model_format": "pytorch",
@@ -6494,17 +6493,15 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-2B",
-      "model_revision": "422ad7c6335917bfb514958233955512338485a6"
+      "model_id": "OpenGVLab/InternVL2-2B"
     },
     {
       "model_format": "awq",
       "model_size_in_billions": 2,
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-2B-AWQ",
-      "model_revision": "701bc3fc098a8a3b686b3b4135cfb77202be89e0"
+      "model_id": "OpenGVLab/InternVL2-2B-AWQ"
     },
     {
       "model_format": "pytorch",
@@ -6514,8 +6511,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-4B",
-      "model_revision": "b50544dafada6c41e80bfde2f57cc9b0140fc21c"
+      "model_id": "OpenGVLab/InternVL2-4B"
     },
     {
       "model_format": "pytorch",
@@ -6525,17 +6521,15 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-8B",
-      "model_revision": "3bfd3664dea4f3da628785f5125d30f889701253"
+      "model_id": "OpenGVLab/InternVL2-8B"
     },
     {
       "model_format": "awq",
       "model_size_in_billions": 8,
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-8B-AWQ",
-      "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
+      "model_id": "OpenGVLab/InternVL2-8B-AWQ"
     },
     {
       "model_format": "pytorch",
@@ -6545,17 +6539,15 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-26B",
-      "model_revision": "b9f3c7e6d575b0115e076a3ffc46fd20b7586899"
+      "model_id": "OpenGVLab/InternVL2-26B"
     },
     {
       "model_format": "awq",
       "model_size_in_billions": 26,
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-26B-AWQ",
-      "model_revision": "469e0019ffd251e22ff6501a5c2321964e86ef0d"
+      "model_id": "OpenGVLab/InternVL2-26B-AWQ"
     },
     {
       "model_format": "pytorch",
@@ -6565,17 +6557,15 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-40B",
-      "model_revision": "725a12063bb855c966e30a0617d0ccd9e870d772"
+      "model_id": "OpenGVLab/InternVL2-40B"
     },
     {
       "model_format": "awq",
       "model_size_in_billions": 40,
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-40B-AWQ",
-      "model_revision": "d92e140f6dfe8ea9679924c6a31898f42c4e1846"
+      "model_id": "OpenGVLab/InternVL2-40B-AWQ"
     },
     {
       "model_format": "pytorch",
@@ -6585,17 +6575,15 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-Llama3-76B",
-      "model_revision": "cf7914905f78e9e3560ddbd6f5dfc39becac494f"
+      "model_id": "OpenGVLab/InternVL2-Llama3-76B"
     },
     {
       "model_format": "awq",
       "model_size_in_billions": 76,
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
-      "model_revision": "1bc796bf80f2ebc7d6a14c15f55217a4600d50a4"
+      "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ"
     }
   ],
   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",

xinference/model/llm/llm_family_modelscope.json

+2, -10 lines

@@ -4334,16 +4334,8 @@
       }
     ],
     "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
-    "stop_token_ids": [
-      151643,
-      151644,
-      151645
-    ],
-    "stop": [
-      "<|endoftext|>",
-      "<|im_start|>",
-      "<|im_end|>"
-    ]
+    "stop_token_ids": [],
+    "stop": []
   },
   {
     "version": 1,

xinference/model/llm/utils.py

+14, -3 lines

@@ -159,14 +159,25 @@ def get_specific_prompt(model_family: str, messages: List[Dict]):
                 for image_url in image_urls:
                     fut = executor.submit(_decode_image, image_url)
                     image_futures.append(fut)
-                images = [fut.result() for fut in image_futures]
+                images.extend([fut.result() for fut in image_futures])
                 if len(image_futures) == 0:
                     ret += role + "\n" + text + intra_message_sep + "\n"
                 else:
+                    placeholders = "\n".join(
+                        f"Image-{i+1}: <image>\n"
+                        for i in range(
+                            len(images) - len(image_futures), len(images)
+                        )
+                    )
                     ret += (
-                        role + "\n" + f"<image>\n{text}" + intra_message_sep + "\n"
+                        role
+                        + "\n"
+                        + f"{placeholders}\n{text}"
+                        + intra_message_sep
+                        + "\n"
                     )
-
+        if len(images) == 1:
+            ret = ret.replace("Image-1: <image>\n", "<image>\n")
         return ret, images
     else:
         raise ValueError(f"Invalid model family: {model_family}")
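
The change above numbers each image placeholder per conversation ("Image-1: <image>", "Image-2: <image>", ...) instead of always emitting a single "<image>" tag, which is what makes multi-image InternVL2 chats expressible in the prompt. A minimal, self-contained sketch of the same numbering rule (format_image_turn is an illustrative helper, not an xinference API, and the separator handling is simplified):

# Illustrative sketch of the "Image-N: <image>" placeholder logic in isolation.
def format_image_turn(role: str, text: str, start: int, count: int) -> str:
    placeholders = "\n".join(
        f"Image-{i + 1}: <image>\n" for i in range(start, start + count)
    )
    return f"{role}\n{placeholders}\n{text}\n"


prompt = format_image_turn("user", "Compare the two pictures.", start=0, count=2)
# As in the diff, a conversation with exactly one image keeps the old bare tag:
if prompt.count("<image>") == 1:
    prompt = prompt.replace("Image-1: <image>\n", "<image>\n")
print(prompt)
# user
# Image-1: <image>
#
# Image-2: <image>
#
# Compare the two pictures.

Here start stands in for len(images) - len(image_futures) in the real code, so the numbering keeps counting up across earlier turns that already contributed images.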

xinference/model/llm/vllm/core.py

+21, -5 lines

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import json
 import logging
 import multiprocessing
 import os
@@ -47,6 +48,7 @@
     ChatModelMixin,
     generate_completion_chunk,
 )
+from .utils import vllm_check
 
 logger = logging.getLogger(__name__)
 
@@ -65,6 +67,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_num_seqs: int
     quantization: Optional[str]
     max_model_len: Optional[int]
+    limit_mm_per_prompt: Optional[Dict[str, int]]
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -90,9 +93,7 @@ class VLLMGenerateConfig(TypedDict, total=False):
 except ImportError:
     VLLM_INSTALLED = False
 
-VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
-    "internvl2",
-]
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -171,6 +172,9 @@ class VLLMGenerateConfig(TypedDict, total=False):
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -305,6 +309,11 @@ def _sanitize_model_config(
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else None
+        )
 
         return model_config
 
@@ -434,6 +443,7 @@ def _convert_request_output_to_completion(
             usage=usage,
         )
 
+    @vllm_check
     async def async_generate(
         self,
         prompt: Union[str, Dict[str, Any]],
@@ -665,6 +675,7 @@ async def _async_to_tool_completion_chunks(
             yield self._to_chat_completion_chunk(chunk)
             i += 1
 
+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
@@ -741,25 +752,30 @@ def _sanitize_chat_config(
         )
         return generate_config
 
+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        # only support single image, waiting vllm support multi images
         model_family = self.model_family.model_family or self.model_family.model_name
         prompt, images = self.get_specific_prompt(model_family, messages)
 
         if len(images) == 0:
             inputs = {
                 "prompt": prompt,
             }
-        else:
+        elif len(images) == 1:
            inputs = {
                 "prompt": prompt,
                 "multi_modal_data": {"image": images[-1]},  # type: ignore
             }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images},  # type: ignore
+            }
         generate_config = self._sanitize_chat_config(generate_config)
 
         stream = generate_config.get("stream", None)
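
Two details of this file are easy to miss. "internvl2" is only registered as a vLLM vision model when vllm >= 0.6.1 is installed, and the new limit_mm_per_prompt option arrives as a JSON string that _sanitize_model_config decodes before handing it to the engine. A standalone sketch of just that decoding step (the helper name below is an assumption for illustration, not the xinference API):

import json
from typing import Any, Dict, Optional


def parse_limit_mm_per_prompt(model_config: Dict[str, Any]) -> Optional[Dict[str, int]]:
    # Mirrors the logic added to _sanitize_model_config: decode the JSON string
    # (e.g. '{"image": 4}') when present, otherwise leave the option as None.
    raw = model_config.get("limit_mm_per_prompt")
    return json.loads(raw) if raw else None


print(parse_limit_mm_per_prompt({"limit_mm_per_prompt": '{"image": 4}'}))  # {'image': 4}
print(parse_limit_mm_per_prompt({}))  # None

Raising the per-prompt image limit this way is what lets the new multi-image branch of async_chat pass the whole images list through multi_modal_data, since vLLM's default multimodal limit is one item per prompt.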

xinference/model/llm/vllm/utils.py

+42 lines (new file)

@@ -0,0 +1,42 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def vllm_check(fn):
+    try:
+        from vllm.engine.async_llm_engine import AsyncEngineDeadError
+    except:
+        return fn
+
+    @functools.wraps(fn)
+    async def _async_wrapper(self, *args, **kwargs):
+        logger.info("vllm_check")
+        try:
+            return await fn(self, *args, **kwargs)
+        except AsyncEngineDeadError:
+            logger.info("Detecting vLLM is not health, prepare to quit the process")
+            try:
+                self.stop()
+            except:
+                # ignore error when stop
+                pass
+            # Just kill the process and let xinference auto-recover the model
+            os._exit(1)
+
+    return _async_wrapper
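
The decorator degrades gracefully: if AsyncEngineDeadError cannot be imported it returns the wrapped function unchanged, so applying it costs nothing when vLLM is absent. A rough usage sketch (the class below is hypothetical, and it assumes an xinference build containing this module is importable):

import asyncio

from xinference.model.llm.vllm.utils import vllm_check


class DummyModel:
    def stop(self) -> None:
        # vllm_check calls stop() before killing the process; a real model
        # would shut the engine down and release resources here.
        print("stopping engine")

    @vllm_check
    async def async_generate(self, prompt: str) -> str:
        # If this coroutine raised AsyncEngineDeadError, vllm_check would stop
        # the model and call os._exit(1) so xinference can auto-recover it.
        return f"echo: {prompt}"


print(asyncio.run(DummyModel().async_generate("hi")))  # echo: hi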
