@@ -186,15 +186,21 @@ def __init__(
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
             raise NotImplementedError
 
-        def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        def load_weights(
+            self,
+            weights: Iterable[tuple[str, torch.Tensor]],
+            load_lm_head: bool = False,
+        ):
             # TODO: Support uninitialized params tracking
 
-            # We have deleted this attribute, so don't load it
-            weights = (
-                (name, data)
-                for name, data in weights
-                if not name.startswith("lm_head.")
-            )
+            # For most pooling models: We have deleted this attribute, so don't load it.
+            # For converting an LLM into a seq cls model, we need the lm_head.
+            if not load_lm_head:
+                weights = (
+                    (name, data)
+                    for name, data in weights
+                    if not name.startswith("lm_head.")
+                )
 
             # If `*ForCausalLM` defines `load_weights` on the inner model
             # and there are no other inner modules with parameters,
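A minimal sketch of how the new load_lm_head flag behaves. The _filter_lm_head helper and the demo weights below are hypothetical and not part of the change; only the generator filter mirrors the hunk above.

from collections.abc import Iterable

import torch


def _filter_lm_head(
    weights: Iterable[tuple[str, torch.Tensor]],
    load_lm_head: bool = False,
) -> Iterable[tuple[str, torch.Tensor]]:
    # Default (pooling models): drop checkpoint entries under "lm_head.".
    # Passing load_lm_head=True keeps them, e.g. when converting an LLM
    # into a sequence classification model.
    if not load_lm_head:
        weights = (
            (name, data)
            for name, data in weights
            if not name.startswith("lm_head.")
        )
    return weights


demo = [
    ("model.embed_tokens.weight", torch.empty(1)),
    ("lm_head.weight", torch.empty(1)),
]
assert [n for n, _ in _filter_lm_head(demo)] == ["model.embed_tokens.weight"]
assert [n for n, _ in _filter_lm_head(demo, load_lm_head=True)] == [n for n, _ in demo]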
@@ -431,8 +437,12 @@ def load_weights_using_from_2_way_softmax(
     )
     model.lm_head = model.lm_head.tie_weights(embed_tokens)
 
-    # Skip ModelForSequenceClassification in MRO to avoid infinite recursion
-    loaded_weights = type(model).__mro__[1].load_weights(model, weights)
+    # ModelForPooling is dynamically defined inside the _create_pooling_model_cls
+    # function, so we need to use this hacky method to obtain it.
+    pooling_model_cls = next(
+        x for x in type(model).__mro__ if x.__name__ == "ModelForPooling"
+    )
+    loaded_weights = pooling_model_cls.load_weights(model, weights, load_lm_head=True)
 
     from vllm.transformers_utils.tokenizer import get_tokenizer
 
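For context, a toy reconstruction of why the pooling base is looked up by name in the MRO: ModelForPooling only exists inside the factory function, so there is no module-level name to call directly. The classes below are simplified stand-ins, not the real vLLM adapters.

def _create_pooling_model_cls(orig_cls):
    # The adapter class exists only inside this factory, so callers cannot
    # reference it by a module-level name.
    class ModelForPooling(orig_cls):
        def load_weights(self, weights, load_lm_head=False):
            if not load_lm_head:
                weights = [(n, d) for n, d in weights if not n.startswith("lm_head.")]
            return super().load_weights(weights)

    return ModelForPooling


class FakeCausalLM:
    def load_weights(self, weights):
        return sorted(name for name, _ in weights)


class FakeForSequenceClassification(_create_pooling_model_cls(FakeCausalLM)):
    def load_weights(self, weights):
        # type(self).__mro__[1] happens to resolve to ModelForPooling today,
        # but silently breaks if another base is inserted; finding the class
        # by name is robust and also avoids recursing back into this method.
        pooling_model_cls = next(
            x for x in type(self).__mro__ if x.__name__ == "ModelForPooling"
        )
        return pooling_model_cls.load_weights(self, weights, load_lm_head=True)


model = FakeForSequenceClassification()
print(model.load_weights([("lm_head.weight", None), ("model.norm.weight", None)]))
# -> ['lm_head.weight', 'model.norm.weight']: lm_head weights are kept here.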