
Commit 6e7797f

+ as_seq_cls_model
Signed-off-by: wang.yuqi <[email protected]>
1 parent c6703d1 commit 6e7797f

6 files changed: +78 −61 lines changed

This commit renames the `as_classification_model` adapter to `as_seq_cls_model`, routes the `score` task through it alongside `classify`, and reimplements `Qwen3ForSequenceClassification` on top of the adapter.

docs/models/supported_models.md

Lines changed: 1 addition & 1 deletion
@@ -446,7 +446,7 @@ Specified using `--task classify`.
 | `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
 
 If your model is not in the above list, we will try to automatically convert the model using
-[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
 
 #### Sentence Pair Scoring
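To see what this conversion path looks like from the user side, here is a minimal offline sketch using the `jason9693/Qwen2.5-1.5B-apeach` checkpoint named elsewhere in this commit; `LLM(task="classify")` and `llm.classify()` are existing vLLM APIs, but the exact output field shown is an assumption.

```python
# Minimal sketch: a causal LM checkpoint is auto-converted through
# as_seq_cls_model() when loaded with task="classify".
from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
(output,) = llm.classify(["vLLM is wonderful!"])
# Per-class probabilities: softmax over the score head applied to the
# last token's hidden state (field name assumed from the pooling API).
print(output.outputs.probs)
```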

docs/serving/openai_compatible_server.md

Lines changed: 1 addition & 1 deletion
@@ -379,7 +379,7 @@ Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
 
 Our Classification API directly supports Hugging Face sequence-classification models such as [ai21labs/Jamba-tiny-reward-dev](https://huggingface.co/ai21labs/Jamba-tiny-reward-dev) and [jason9693/Qwen2.5-1.5B-apeach](https://huggingface.co/jason9693/Qwen2.5-1.5B-apeach).
 
-We automatically wrap any other transformer via `as_classification_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
+We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
 
 Code example: <gh-file:examples/online_serving/openai_classification_client.py>
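A hedged HTTP sketch to go with the linked client example; the `/classify` route and the JSON request shape follow `openai_classification_client.py`, so treat the exact fields as assumptions.

```python
# Hedged sketch of the Classification API over HTTP, assuming a server
# started with: vllm serve jason9693/Qwen2.5-1.5B-apeach
# The /classify route and JSON fields follow the linked example client.
import requests

resp = requests.post(
    "http://localhost:8000/classify",
    json={
        "model": "jason9693/Qwen2.5-1.5B-apeach",
        "input": ["vLLM is wonderful!"],
    },
)
resp.raise_for_status()
print(resp.json())  # per-class probabilities for each input
```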

tests/models/test_registry.py

Lines changed: 4 additions & 4 deletions
@@ -9,9 +9,9 @@
 from vllm.model_executor.models import (is_pooling_model,
                                         is_text_generation_model,
                                         supports_multimodal)
-from vllm.model_executor.models.adapters import (as_classification_model,
-                                                 as_embedding_model,
-                                                 as_reward_model)
+from vllm.model_executor.models.adapters import (as_embedding_model,
+                                                 as_reward_model,
+                                                 as_seq_cls_model)
 from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
                                                  _SPECULATIVE_DECODING_MODELS,
                                                  _TEXT_GENERATION_MODELS,
@@ -38,7 +38,7 @@ def test_registry_imports(model_arch):
         assert is_text_generation_model(model_cls)
 
     # All vLLM models should be convertible to a pooling model
-    assert is_pooling_model(as_classification_model(model_cls))
+    assert is_pooling_model(as_seq_cls_model(model_cls))
     assert is_pooling_model(as_embedding_model(model_cls))
     assert is_pooling_model(as_reward_model(model_cls))
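Spelled out as a standalone sketch, this is the invariant the test enforces (the Qwen3 import is an illustrative choice; any registered text-generation model class would do):

```python
# Standalone sketch of the test's invariant: any vLLM model class can be
# wrapped by as_seq_cls_model() into a pooling-capable classifier class.
from vllm.model_executor.models import is_pooling_model
from vllm.model_executor.models.adapters import as_seq_cls_model
from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM  # illustrative

Qwen3SeqCls = as_seq_cls_model(Qwen3ForCausalLM)
assert is_pooling_model(Qwen3SeqCls)
# The adapter renames the generated class via _get_pooling_model_name():
print(Qwen3SeqCls.__name__)  # "Qwen3ForSequenceClassification"
```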

vllm/model_executor/model_loader/utils.py

Lines changed: 5 additions & 5 deletions
@@ -21,9 +21,9 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.adapters import (as_classification_model,
-                                                 as_embedding_model,
-                                                 as_reward_model)
+from vllm.model_executor.models.adapters import (as_embedding_model,
+                                                 as_reward_model,
+                                                 as_seq_cls_model)
 from vllm.utils import is_pin_memory_available
 
 logger = init_logger(__name__)
@@ -244,8 +244,8 @@ def get_model_architecture(
     model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
     if model_config.task == "embed":
         model_cls = as_embedding_model(model_cls)
-    elif model_config.task == "classify":
-        model_cls = as_classification_model(model_cls)
+    elif model_config.task in ["classify", "score"]:
+        model_cls = as_seq_cls_model(model_cls)
     elif model_config.task == "reward":
         model_cls = as_reward_model(model_cls)
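The behavioral change is that `score` now routes through the same adapter as `classify`. A hypothetical helper (not vLLM code) mirroring the dispatch above:

```python
# Hypothetical helper mirroring get_model_architecture()'s dispatch:
# "score" now shares the sequence-classification adapter with "classify".
from vllm.model_executor.models.adapters import (as_embedding_model,
                                                 as_reward_model,
                                                 as_seq_cls_model)

def convert_for_task(model_cls, task: str):
    if task == "embed":
        return as_embedding_model(model_cls)
    if task in ("classify", "score"):
        return as_seq_cls_model(model_cls)
    if task == "reward":
        return as_reward_model(model_cls)
    return model_cls  # generation tasks use the class unchanged
```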

vllm/model_executor/models/adapters.py

Lines changed: 49 additions & 13 deletions
@@ -145,9 +145,9 @@ def as_embedding_model(cls: _T) -> _T:
     return ModelForEmbedding  # type: ignore
 
 
-def as_classification_model(cls: _T) -> _T:
+def as_seq_cls_model(cls: _T) -> _T:
     """
-    Subclass an existing vLLM model to support classification.
+    Subclass an existing vLLM model to support classify and score tasks.
 
     By default, the class probabilities are extracted from the softmaxed
     hidden state corresponding to the last token.
@@ -164,19 +164,23 @@ def as_classification_model(cls: _T) -> _T:
     # Lazy import
     from vllm.config import VllmConfig
     from vllm.model_executor.layers.linear import RowParallelLinear
-    from vllm.model_executor.layers.pooler import PoolingType
+    from vllm.model_executor.layers.pooler import PoolerOutput, PoolingType
+    from vllm.model_executor.models.interfaces import SupportsCrossEncoding
+    from vllm.model_executor.pooling_metadata import PoolingMetadata
     from vllm.sequence import IntermediateTensors
 
     from .utils import maybe_prefix
 
     ModelForPooling = _create_pooling_model_cls(
         cls,
-        default_pooling_type=PoolingType.LAST,
+        default_pooling_type=getattr(cls, "default_pooling_type",
+                                     PoolingType.LAST),
         default_normalize=False,
         default_softmax=True,
     )
 
-    class ModelForClassification(ModelForPooling):
+    class ModelForSequenceClassification(ModelForPooling,
+                                         SupportsCrossEncoding):
 
         def __init__(
             self,
@@ -186,10 +190,18 @@ def __init__(
             **kwargs: Any,
         ) -> None:
             super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
+            self.config_verify(vllm_config)
 
             config = vllm_config.model_config.hf_config
             quant_config = vllm_config.quant_config
 
+            self.task = vllm_config.model_config.task
+            self.pooling_type = (
+                vllm_config.model_config.pooler_config.pooling_type)
+
+            if self.task == "score":
+                assert config.num_labels == 1
+
             self.score = RowParallelLinear(config.hidden_size,
                                            config.num_labels,
                                            quant_config=quant_config,
@@ -198,24 +210,48 @@ def __init__(
                                            prefix=maybe_prefix(
                                                prefix, "score"))
 
+        def config_verify(self, vllm_config):
+            # Leave an interface for validating and modifying model_config
+            # for slightly different models
+            pass
+
         def forward(
             self,
             input_ids: torch.Tensor,
             positions: torch.Tensor,
             intermediate_tensors: Optional[IntermediateTensors] = None,
             inputs_embeds: Optional[torch.Tensor] = None,
         ) -> torch.Tensor:
-            hidden_states = super().forward(input_ids, positions,
-                                            intermediate_tensors,
-                                            inputs_embeds)
-            logits, _ = self.score(hidden_states)
-            return logits
+            return super().forward(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+
+        def pooler(
+            self,
+            hidden_states: torch.Tensor,
+            pooling_metadata: PoolingMetadata,
+        ) -> PoolerOutput:
+            if self.pooling_type == PoolingType.ALL:
+                logits, _ = self.score(hidden_states)
+                return self._pooler(hidden_states, pooling_metadata)
+            else:
+                hidden_states = self._pooler.extract_states(
+                    hidden_states, pooling_metadata)
+                logits, _ = self.score(hidden_states)
+                pooled_data = self._pooler.head(logits, pooling_metadata)
+
+            if self.task == "score":
+                pooled_data = [data.squeeze(-1) for data in pooled_data]
+
+            pooled_outputs = [
+                self._pooler.build_output(data) for data in pooled_data
+            ]
+            return PoolerOutput(outputs=pooled_outputs)
 
 
-    ModelForClassification.__name__ = \
-        _get_pooling_model_name(cls.__name__, "ForClassification")
+    ModelForSequenceClassification.__name__ = \
+        _get_pooling_model_name(cls.__name__, "ForSequenceClassification")
 
-    return ModelForClassification  # type: ignore
+    return ModelForSequenceClassification  # type: ignore
 
 
 def as_reward_model(cls: _T) -> _T:
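To make the default pooling math concrete, here is a plain-PyTorch sketch (illustrative only, not the vLLM pooler) of the classify path the adapter builds: LAST-token pooling, the `score` head, then softmax.

```python
# Illustrative stand-in for PoolingType.LAST + RowParallelLinear("score")
# + default_softmax=True; not the vLLM implementation itself.
import torch

def classify_last_token(hidden_states: torch.Tensor,
                        score_weight: torch.Tensor) -> torch.Tensor:
    # hidden_states: [seq_len, hidden_size]
    # score_weight:  [num_labels, hidden_size]
    pooled = hidden_states[-1]            # LAST-token pooling
    logits = pooled @ score_weight.t()    # the "score" head
    return torch.softmax(logits, dim=-1)  # per-class probabilities

probs = classify_last_token(torch.randn(16, 64), torch.randn(3, 64))
assert torch.isclose(probs.sum(), torch.tensor(1.0))
```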

vllm/model_executor/models/qwen3.py

Lines changed: 18 additions & 37 deletions
@@ -38,15 +38,15 @@
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, PoolerOutput
 
-from .interfaces import SupportsCrossEncoding, SupportsLoRA, SupportsPP
+from .adapters import as_seq_cls_model
+from .interfaces import SupportsLoRA, SupportsPP
 from .qwen2 import Qwen2MLP as Qwen3MLP
 from .qwen2 import Qwen2Model
 from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix
@@ -323,38 +323,31 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loader.load_weights(weights)
 
 
-class Qwen3ForSequenceClassification(nn.Module, SupportsLoRA,
-                                     SupportsCrossEncoding):
+class Qwen3ForSequenceClassification(as_seq_cls_model(Qwen3ForCausalLM)):
 
     def __init__(
         self,
         vllm_config: "VllmConfig",
         prefix: str = "",
     ) -> None:
-        super().__init__()
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
 
+    def config_verify(self, vllm_config: "VllmConfig"):
         config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        pooler_config = vllm_config.model_config.pooler_config
 
+        is_original_qwen3_reranker = getattr(config,
+                                             "is_original_qwen3_reranker",
+                                             False)
+
+        if not is_original_qwen3_reranker:
+            return
+
+        tokens = getattr(config, "classifier_from_token", None)
+        assert tokens is not None and len(tokens) == 2, \
+            ("Try loading the original Qwen3 Reranker?, see: "
+             "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py")
+        config.num_labels = 1
         self.vllm_config = vllm_config
-        self.config = config
-        self.quant_config = quant_config
-        self.prefix = prefix
-        self.model = Qwen3Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        self.score = RowParallelLinear(config.hidden_size,
-                                       config.num_labels,
-                                       quant_config=quant_config,
-                                       input_is_parallel=False,
-                                       bias=False,
-                                       prefix=maybe_prefix(prefix, "score"))
-
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.LAST,
-            normalize=False,
-            softmax=True)
 
     def forward(
         self,
@@ -395,22 +388,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
 
     def load_weights_from_original_qwen3_reranker(
             self, weights: Iterable[tuple[str, torch.Tensor]]):
-        tokens = getattr(self.config, "classifier_from_token", None)
-        assert tokens is not None and len(tokens) == 2, \
-            ("Try loading the original Qwen3 Reranker?, see: "
-             "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py")
 
-        self.config.num_labels = 1
         model_config = self.vllm_config.model_config
-
+        tokens = getattr(self.config, "classifier_from_token", None)
         device = self.score.weight.device
-        self.score = RowParallelLinear(self.config.hidden_size,
-                                       self.config.num_labels,
-                                       quant_config=self.quant_config,
-                                       input_is_parallel=False,
-                                       bias=False,
-                                       prefix=maybe_prefix(
-                                           self.prefix, "score")).to(device)
 
         if self.config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
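Putting the Qwen3 changes together, a hedged sketch of loading the original Qwen3 Reranker through this class; the `hf_overrides` keys mirror the config attributes checked in `config_verify()` and the linked `qwen3_reranker.py` example, so treat the exact model name and values as assumptions.

```python
# Hedged sketch: point vLLM at the original Qwen3 Reranker weights.
# Override keys mirror config_verify() above and the linked example;
# exact values are assumptions, not verified against this commit.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen3-Reranker-0.6B",
    task="score",
    hf_overrides={
        "architectures": ["Qwen3ForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],  # must be exactly 2 tokens
        "is_original_qwen3_reranker": True,
    },
)
scores = llm.score("What is a panda?",
                   "The giant panda is a bear species endemic to China.")
print(scores[0].outputs.score)
```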
