
Commit abfc0e6

Score API
1 parent de96811 commit abfc0e6

File tree

7 files changed, +75 -50 lines changed


tests/entrypoints/openai/correctness/test_mteb_score.py

Lines changed: 14 additions & 11 deletions

@@ -6,19 +6,16 @@
 
 # yapf conflicts with isort for this block
 # yapf: disable
-from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
-                                                       MTEB_RERANK_TASKS,
-                                                       MTEB_RERANK_TOL,
-                                                       RerankClientMtebEncoder,
-                                                       ScoreClientMtebEncoder,
-                                                       run_mteb_rerank)
+from tests.models.language.pooling.mteb_utils import (
+    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
+    RerankClientMtebEncoder, ScoreClientMtebEncoder,
+    mteb_test_rerank_models_hf, run_mteb_rerank)
 # yapf: enable
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
-MAIN_SCORE = 0.33437
 
 
 @pytest.fixture(scope="module")
@@ -31,12 +28,19 @@ def server():
         yield remote_server
 
 
-def test_mteb_score(server):
+@pytest.fixture(scope="module")
+def st_main_score(hf_runner):
+    # The main score depends on the version of the dependency,
+    # so we need to recalculate it every time.
+    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
+    return main_score
+
+
+def test_mteb_score(server, st_main_score):
     url = server.url_for("score")
    encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                       MTEB_RERANK_LANGS)
-    st_main_score = MAIN_SCORE
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)
@@ -45,12 +49,11 @@ def test_mteb_score(server):
     assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
 
 
-def test_mteb_rerank(server):
+def test_mteb_rerank(server, st_main_score):
     url = server.url_for("rerank")
     encoder = RerankClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                       MTEB_RERANK_LANGS)
-    st_main_score = MAIN_SCORE
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)

tests/models/language/pooling/mteb_utils.py

Lines changed: 31 additions & 25 deletions

@@ -234,6 +234,35 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
     return main_score
 
 
+def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
+    with hf_runner(model_name, is_cross_encoder=True,
+                   dtype="float32") as hf_model:
+
+        original_predict = hf_model.predict
+
+        def _predict(
+            sentences: list[tuple[str, str,
+                                  Optional[str]]],  # query, corpus, prompt
+            *args,
+            **kwargs,
+        ):
+            # vllm and st both remove the prompt, fair comparison.
+            prompts = [(s[0], s[1]) for s in sentences]
+            return original_predict(prompts, *args, **kwargs, batch_size=8)
+
+        hf_model.predict = _predict
+        hf_model.original_predict = original_predict
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
+        st_main_score = run_mteb_rerank(hf_model,
+                                        tasks=MTEB_RERANK_TASKS,
+                                        languages=MTEB_RERANK_LANGS)
+        st_dtype = next(hf_model.model.model.parameters()).dtype
+        return st_main_score, st_dtype
+
+
 def mteb_test_rerank_models(hf_runner,
                             vllm_runner,
                             model_info: RerankModelInfo,
@@ -261,31 +290,8 @@ def mteb_test_rerank_models(hf_runner,
                                           languages=MTEB_RERANK_LANGS)
         vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
 
-    with hf_runner(model_info.name, is_cross_encoder=True,
-                   dtype="float32") as hf_model:
-
-        original_predict = hf_model.predict
-
-        def _predict(
-            sentences: list[tuple[str, str,
-                                  Optional[str]]],  # query, corpus, prompt
-            *args,
-            **kwargs,
-        ):
-            # vllm and st both remove the prompt, fair comparison.
-            prompts = [(s[0], s[1]) for s in sentences]
-            return original_predict(prompts, *args, **kwargs, batch_size=8)
-
-        hf_model.predict = _predict
-        hf_model.original_predict = original_predict
-
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
-
-        st_main_score = run_mteb_rerank(hf_model,
-                                        tasks=MTEB_RERANK_TASKS,
-                                        languages=MTEB_RERANK_LANGS)
-        st_dtype = next(hf_model.model.model.parameters()).dtype
+    st_main_score, st_dtype = mteb_test_rerank_models_hf(
+        hf_runner, model_info.name, hf_model_callback)
 
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)

vllm/entrypoints/llm.py

Lines changed: 7 additions & 3 deletions

@@ -1273,9 +1273,13 @@ def score(
 
             raise ValueError(" ".join(messages))
 
-        if self.llm_engine.model_config.task not in ("embed", "score"):
-            raise ValueError(
-                "Score API is only enabled for `--task embed or --task score`")
+        if self.llm_engine.model_config.task not in ("embed", "classify"):
+            raise ValueError("Score API is only enabled for "
+                             "`--task embed or --task classify`.")
+
+        if (self.llm_engine.model_config.task == "classify"
+                and self.llm_engine.model_config.hf_config.num_labels != 1):
+            raise ValueError("Score API is only enabled for num_labels == 1.")
 
         # the tokenizer for models such as
         # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 2 deletions

@@ -1219,7 +1219,7 @@ async def init_app_state(
         model_config,
         state.openai_serving_models,
         request_logger=request_logger) if model_config.task in (
-            "score", "embed", "pooling") else None
+            "classify", "embed", "pooling") else None
     state.openai_serving_classification = ServingClassification(
         engine_client,
         model_config,
@@ -1231,7 +1231,7 @@
         model_config,
         state.openai_serving_models,
         request_logger=request_logger
-    ) if model_config.task == "score" else None
+    ) if model_config.task == "classify" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
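
For the online server, the same Score API is reachable once the model is served with the classify task (e.g. vllm serve cross-encoder/ms-marco-MiniLM-L-6-v2 --task classify). A hedged sketch of a request; the endpoint path and field names follow vLLM's score route but should be treated as assumptions here:

import requests

response = requests.post(
    "http://localhost:8000/score",
    json={
        "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
        "text_1": "What is the capital of France?",
        "text_2": ["Paris is the capital of France.",
                   "The Eiffel Tower is in Paris."],
    },
)
print(response.json())  # one score per (text_1, text_2) pair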

vllm/model_executor/layers/pooler.py

Lines changed: 16 additions & 6 deletions

@@ -286,6 +286,7 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
         else:
             pooled_data = pooled_data.to(torch.float32)
 
+        # for matryoshka representation
         if isinstance(pooling_metadata, V0PoolingMetadata):
             dimensions_list = [
                 pooling_param.dimensions
@@ -300,10 +301,15 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
         if any(d is not None for d in dimensions_list):
             # change the output dimension
             assert len(pooled_data) == len(dimensions_list)
-            pooled_data = [
-                vecs if d is None else vecs[..., :d]
-                for vecs, d in zip(pooled_data, dimensions_list)
-            ]
+            if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
+                # if all dimensions are the same
+                d = dimensions_list[0]
+                pooled_data = pooled_data[..., :d]
+            else:
+                pooled_data = [
+                    vecs if d is None else vecs[..., :d]
+                    for vecs, d in zip(pooled_data, dimensions_list)
+                ]
 
         if self.normalize:
             if isinstance(pooled_data, list):
@@ -326,6 +332,10 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
         else:
             pooled_data = F.sigmoid(pooled_data)
 
+        # shape:
+        # classify & score -> (batch_size, num_classes)
+        # embed -> (batch_size, embedding_dim) or list(embedding_dim)
+        #          (batch_size, dimensions) or list(dimensions) if using MRL
         return pooled_data
 
 
@@ -420,7 +430,6 @@ def forward(
             offset += prompt_len
             pooled_data.append(pooled_data_i)
 
-        offset = 0
         pooled_data_lst = []
         for pooled_data_i in pooled_data:
 
@@ -437,7 +446,8 @@
         # apply classifier once on the full batch if possible
         pooled_output = self.classifier(pooled_output)
 
-        scores = self.default_activation_function(pooled_output).squeeze(-1)
+        # shape: (batch_size, num_labels)
+        scores = self.default_activation_function(pooled_output)
 
         pooled_outputs = [PoolingSequenceGroupOutput(data) for data in scores]
         return PoolerOutput(outputs=pooled_outputs)
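
The new branch above special-cases the common situation where every request asks for the same matryoshka dimension and the pooled output is still a single batched tensor, so the truncation can stay one tensor operation. A standalone sketch of that behavior (illustrative only, plain PyTorch, not vLLM code):

import torch
import torch.nn.functional as F

embeddings = torch.randn(4, 768)        # (batch_size, embedding_dim)
dimensions_list = [256, 256, 256, 256]  # requested MRL dimensions per request

if len(set(dimensions_list)) == 1:
    # All requests share one dimension: keep the batch as a single tensor.
    truncated = embeddings[..., :dimensions_list[0]]
else:
    # Mixed dimensions: fall back to a list, one tensor per request.
    truncated = [v if d is None else v[..., :d]
                 for v, d in zip(embeddings, dimensions_list)]

normalized = F.normalize(truncated, p=2, dim=-1)
print(normalized.shape)  # torch.Size([4, 256])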

vllm/model_executor/model_loader/utils.py

Lines changed: 1 addition & 2 deletions

@@ -21,8 +21,7 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.adapters import (as_seq_cls_model,
-                                                 as_embedding_model,
+from vllm.model_executor.models.adapters import (as_embedding_model,
                                                  as_reward_model)
 from vllm.utils import is_pin_memory_available

vllm/outputs.py

Lines changed: 4 additions & 1 deletion

@@ -453,6 +453,7 @@ class ClassificationOutput:
 
     @staticmethod
     def from_base(pooling_output: PoolingOutput):
+        # pooling_output shape: (num_classes)
         pooled_data = pooling_output.data
         if pooled_data.ndim != 1:
             raise ValueError("pooled_data should be a 1-D probability vector")
@@ -490,7 +491,9 @@ class ScoringOutput:
 
     @staticmethod
     def from_base(pooling_output: PoolingOutput):
-        pooled_data = pooling_output.data
+        # pooling_output shape: (num_classes)
+        # num_classes == 1 when using score api.
+        pooled_data = pooling_output.data.squeeze()
        if pooled_data.ndim != 0:
             raise ValueError("pooled_data should be a scalar score")
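
The squeeze in ScoringOutput.from_base exists because the pooler now returns a (num_labels,) vector rather than a pre-squeezed scalar; with num_labels == 1 the extra axis has to be dropped before the ndim check. A tiny illustration (not from the diff):

import torch

pooled = torch.tensor([0.87])  # shape (1,): num_labels == 1 for the Score API
score = pooled.squeeze()       # 0-dim tensor, passes the "ndim != 0" check
assert score.ndim == 0
print(float(score))            # ~0.87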
