
Commit abfc0e6

Score API
1 parent de96811 commit abfc0e6

File tree

7 files changed, +75 -50 lines changed


tests/entrypoints/openai/correctness/test_mteb_score.py

Lines changed: 14 additions & 11 deletions

@@ -6,19 +6,16 @@
 
 # yapf conflicts with isort for this block
 # yapf: disable
-from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
-                                                       MTEB_RERANK_TASKS,
-                                                       MTEB_RERANK_TOL,
-                                                       RerankClientMtebEncoder,
-                                                       ScoreClientMtebEncoder,
-                                                       run_mteb_rerank)
+from tests.models.language.pooling.mteb_utils import (
+    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
+    RerankClientMtebEncoder, ScoreClientMtebEncoder,
+    mteb_test_rerank_models_hf, run_mteb_rerank)
 # yapf: enable
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
-MAIN_SCORE = 0.33437
 
 
 @pytest.fixture(scope="module")
@@ -31,12 +28,19 @@ def server():
         yield remote_server
 
 
-def test_mteb_score(server):
+@pytest.fixture(scope="module")
+def st_main_score(hf_runner):
+    # The main score depends on the version of the dependency,
+    # so we need to recalculate it every time.
+    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
+    return main_score
+
+
+def test_mteb_score(server, st_main_score):
     url = server.url_for("score")
    encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                       MTEB_RERANK_LANGS)
-    st_main_score = MAIN_SCORE
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)
@@ -45,12 +49,11 @@ def test_mteb_score(server):
     assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
 
 
-def test_mteb_rerank(server):
+def test_mteb_rerank(server, st_main_score):
     url = server.url_for("rerank")
     encoder = RerankClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                       MTEB_RERANK_LANGS)
-    st_main_score = MAIN_SCORE
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)

tests/models/language/pooling/mteb_utils.py

Lines changed: 31 additions & 25 deletions

@@ -234,6 +234,35 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
     return main_score
 
 
+def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
+    with hf_runner(model_name, is_cross_encoder=True,
+                   dtype="float32") as hf_model:
+
+        original_predict = hf_model.predict
+
+        def _predict(
+            sentences: list[tuple[str, str,
+                                  Optional[str]]],  # query, corpus, prompt
+            *args,
+            **kwargs,
+        ):
+            # vllm and st both remove the prompt, fair comparison.
+            prompts = [(s[0], s[1]) for s in sentences]
+            return original_predict(prompts, *args, **kwargs, batch_size=8)
+
+        hf_model.predict = _predict
+        hf_model.original_predict = original_predict
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
+        st_main_score = run_mteb_rerank(hf_model,
+                                        tasks=MTEB_RERANK_TASKS,
+                                        languages=MTEB_RERANK_LANGS)
+        st_dtype = next(hf_model.model.model.parameters()).dtype
+        return st_main_score, st_dtype
+
+
 def mteb_test_rerank_models(hf_runner,
                             vllm_runner,
                             model_info: RerankModelInfo,
@@ -261,31 +290,8 @@ def mteb_test_rerank_models(hf_runner,
                                           languages=MTEB_RERANK_LANGS)
         vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
 
-    with hf_runner(model_info.name, is_cross_encoder=True,
-                   dtype="float32") as hf_model:
-
-        original_predict = hf_model.predict
-
-        def _predict(
-            sentences: list[tuple[str, str,
-                                  Optional[str]]],  # query, corpus, prompt
-            *args,
-            **kwargs,
-        ):
-            # vllm and st both remove the prompt, fair comparison.
-            prompts = [(s[0], s[1]) for s in sentences]
-            return original_predict(prompts, *args, **kwargs, batch_size=8)
-
-        hf_model.predict = _predict
-        hf_model.original_predict = original_predict
-
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
-
-        st_main_score = run_mteb_rerank(hf_model,
-                                        tasks=MTEB_RERANK_TASKS,
-                                        languages=MTEB_RERANK_LANGS)
-        st_dtype = next(hf_model.model.model.parameters()).dtype
+    st_main_score, st_dtype = mteb_test_rerank_models_hf(
+        hf_runner, model_info.name, hf_model_callback)
 
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)

vllm/entrypoints/llm.py

Lines changed: 7 additions & 3 deletions

@@ -1273,9 +1273,13 @@ def score(
 
             raise ValueError(" ".join(messages))
 
-        if self.llm_engine.model_config.task not in ("embed", "score"):
-            raise ValueError(
-                "Score API is only enabled for `--task embed or --task score`")
+        if self.llm_engine.model_config.task not in ("embed", "classify"):
+            raise ValueError("Score API is only enabled for "
+                             "`--task embed or --task classify`.")
+
+        if (self.llm_engine.model_config.task == "classify"
+                and self.llm_engine.model_config.hf_config.num_labels != 1):
+            raise ValueError("Score API is only enabled for num_labels == 1.")
 
         # the tokenizer for models such as
         # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 2 deletions

@@ -1219,7 +1219,7 @@ async def init_app_state(
         model_config,
         state.openai_serving_models,
         request_logger=request_logger) if model_config.task in (
-            "score", "embed", "pooling") else None
+            "classify", "embed", "pooling") else None
     state.openai_serving_classification = ServingClassification(
         engine_client,
         model_config,
@@ -1231,7 +1231,7 @@
         model_config,
         state.openai_serving_models,
         request_logger=request_logger
-    ) if model_config.task == "score" else None
+    ) if model_config.task == "classify" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
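
For the online server, the same Score API is reachable once the model is served with the classify task (e.g. vllm serve cross-encoder/ms-marco-MiniLM-L-6-v2 --task classify). A hedged sketch of a request; the endpoint path and field names follow vLLM's score route but should be treated as assumptions here:

import requests

response = requests.post(
    "http://localhost:8000/score",
    json={
        "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
        "text_1": "What is the capital of France?",
        "text_2": ["Paris is the capital of France.",
                   "The Eiffel Tower is in Paris."],
    },
)
print(response.json())  # one score per (text_1, text_2) pair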

vllm/model_executor/layers/pooler.py

Lines changed: 16 additions & 6 deletions

@@ -286,6 +286,7 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
         else:
             pooled_data = pooled_data.to(torch.float32)
 
+        # for matryoshka representation
         if isinstance(pooling_metadata, V0PoolingMetadata):
             dimensions_list = [
                 pooling_param.dimensions
@@ -300,10 +301,15 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
         if any(d is not None for d in dimensions_list):
             # change the output dimension
             assert len(pooled_data) == len(dimensions_list)
-            pooled_data = [
-                vecs if d is None else vecs[..., :d]
-                for vecs, d in zip(pooled_data, dimensions_list)
-            ]
+            if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
+                # if all dimensions are the same
+                d = dimensions_list[0]
+                pooled_data = pooled_data[..., :d]
+            else:
+                pooled_data = [
+                    vecs if d is None else vecs[..., :d]
+                    for vecs, d in zip(pooled_data, dimensions_list)
+                ]
 
         if self.normalize:
             if isinstance(pooled_data, list):
@@ -326,6 +332,10 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
         else:
             pooled_data = F.sigmoid(pooled_data)
 
+        # shape:
+        # classify & score -> (batch_size, num_classes)
+        # embed -> (batch_size, embedding_dim) or list(embedding_dim)
+        #          (batch_size, dimensions) or list(dimensions) if using MRL
         return pooled_data
 
 
@@ -420,7 +430,6 @@ def forward(
             offset += prompt_len
             pooled_data.append(pooled_data_i)
 
-        offset = 0
         pooled_data_lst = []
         for pooled_data_i in pooled_data:
 
@@ -437,7 +446,8 @@
         # apply classifier once on the full batch if possible
         pooled_output = self.classifier(pooled_output)
 
-        scores = self.default_activation_function(pooled_output).squeeze(-1)
+        # shape: (batch_size, num_labels)
+        scores = self.default_activation_function(pooled_output)
 
         pooled_outputs = [PoolingSequenceGroupOutput(data) for data in scores]
         return PoolerOutput(outputs=pooled_outputs)
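
The new branch above special-cases the common situation where every request asks for the same matryoshka dimension and the pooled output is still a single batched tensor, so the truncation can stay one tensor operation. A standalone sketch of that behavior (illustrative only, plain PyTorch, not vLLM code):

import torch
import torch.nn.functional as F

embeddings = torch.randn(4, 768)        # (batch_size, embedding_dim)
dimensions_list = [256, 256, 256, 256]  # requested MRL dimensions per request

if len(set(dimensions_list)) == 1:
    # All requests share one dimension: keep the batch as a single tensor.
    truncated = embeddings[..., :dimensions_list[0]]
else:
    # Mixed dimensions: fall back to a list, one tensor per request.
    truncated = [v if d is None else v[..., :d]
                 for v, d in zip(embeddings, dimensions_list)]

normalized = F.normalize(truncated, p=2, dim=-1)
print(normalized.shape)  # torch.Size([4, 256])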

vllm/model_executor/model_loader/utils.py

Lines changed: 1 addition & 2 deletions

@@ -21,8 +21,7 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.adapters import (as_seq_cls_model,
-                                                 as_embedding_model,
+from vllm.model_executor.models.adapters import (as_embedding_model,
                                                  as_reward_model)
 from vllm.utils import is_pin_memory_available

vllm/outputs.py

Lines changed: 4 additions & 1 deletion

@@ -453,6 +453,7 @@ class ClassificationOutput:
 
     @staticmethod
     def from_base(pooling_output: PoolingOutput):
+        # pooling_output shape: (num_classes)
         pooled_data = pooling_output.data
         if pooled_data.ndim != 1:
             raise ValueError("pooled_data should be a 1-D probability vector")
@@ -490,7 +491,9 @@ class ScoringOutput:
 
     @staticmethod
     def from_base(pooling_output: PoolingOutput):
-        pooled_data = pooling_output.data
+        # pooling_output shape: (num_classes)
+        # num_classes == 1 when using score api.
+        pooled_data = pooling_output.data.squeeze()
        if pooled_data.ndim != 0:
             raise ValueError("pooled_data should be a scalar score")
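
The squeeze in ScoringOutput.from_base exists because the pooler now returns a (num_labels,) vector rather than a pre-squeezed scalar; with num_labels == 1 the extra axis has to be dropped before the ndim check. A tiny illustration (not from the diff):

import torch

pooled = torch.tensor([0.87])  # shape (1,): num_labels == 1 for the Score API
score = pooled.squeeze()       # 0-dim tensor, passes the "ndim != 0" check
assert score.ndim == 0
print(float(score))            # ~0.87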
