
Commit f905250

fix test_mteb_score randomness
Signed-off-by: wang.yuqi <[email protected]>
1 parent 1d0ae26 commit f905250

File tree: 3 files changed (+48, -36 lines)

tests/entrypoints/openai/correctness/test_mteb_score.py

Lines changed: 15 additions & 11 deletions
@@ -6,19 +6,16 @@
 
 # yapf conflicts with isort for this block
 # yapf: disable
-from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS,
-                                                       MTEB_RERANK_TASKS,
-                                                       MTEB_RERANK_TOL,
-                                                       RerankClientMtebEncoder,
-                                                       ScoreClientMtebEncoder,
-                                                       run_mteb_rerank)
+from tests.models.language.pooling.mteb_utils import (
+    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
+    RerankClientMtebEncoder, ScoreClientMtebEncoder,
+    mteb_test_rerank_models_hf, run_mteb_rerank)
 # yapf: enable
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
-MAIN_SCORE = 0.33437
 
 
 @pytest.fixture(scope="module")
@@ -31,12 +28,20 @@ def server():
         yield remote_server
 
 
-def test_mteb_score(server):
+@pytest.fixture(scope="module")
+def st_main_score(hf_runner):
+    # I don't know where the randomness comes from,
+    # but this value changes over time.
+    # Perhaps related to the version of the dependency.
+    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
+    return main_score
+
+
+def test_mteb_score(server, st_main_score):
     url = server.url_for("score")
     encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                       MTEB_RERANK_LANGS)
-    st_main_score = MAIN_SCORE
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)
@@ -45,12 +50,11 @@ def test_mteb_score(server):
     assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
 
 
-def test_mteb_rerank(server):
+def test_mteb_rerank(server, st_main_score):
     url = server.url_for("rerank")
     encoder = RerankClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
                                       MTEB_RERANK_LANGS)
-    st_main_score = MAIN_SCORE
 
     print("VLLM main score: ", vllm_main_score)
     print("SentenceTransformer main score: ", st_main_score)

tests/models/language/pooling/mteb_utils.py

Lines changed: 31 additions & 25 deletions
@@ -234,6 +234,35 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
     return main_score
 
 
+def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
+    with hf_runner(model_name, is_cross_encoder=True,
+                   dtype="float32") as hf_model:
+
+        original_predict = hf_model.predict
+
+        def _predict(
+            sentences: list[tuple[str, str,
+                                  Optional[str]]],  # query, corpus, prompt
+            *args,
+            **kwargs,
+        ):
+            # vllm and st both remove the prompt, fair comparison.
+            prompts = [(s[0], s[1]) for s in sentences]
+            return original_predict(prompts, *args, **kwargs, batch_size=8)
+
+        hf_model.predict = _predict
+        hf_model.original_predict = original_predict
+
+        if hf_model_callback is not None:
+            hf_model_callback(hf_model)
+
+        st_main_score = run_mteb_rerank(hf_model,
+                                        tasks=MTEB_RERANK_TASKS,
+                                        languages=MTEB_RERANK_LANGS)
+        st_dtype = next(hf_model.model.model.parameters()).dtype
+    return st_main_score, st_dtype
+
+
 def mteb_test_rerank_models(hf_runner,
                             vllm_runner,
                             model_info: RerankModelInfo,
@@ -261,31 +290,8 @@ def mteb_test_rerank_models(hf_runner,
                                           languages=MTEB_RERANK_LANGS)
         vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
 
-    with hf_runner(model_info.name, is_cross_encoder=True,
-                   dtype="float32") as hf_model:
-
-        original_predict = hf_model.predict
-
-        def _predict(
-            sentences: list[tuple[str, str,
-                                  Optional[str]]],  # query, corpus, prompt
-            *args,
-            **kwargs,
-        ):
-            # vllm and st both remove the prompt, fair comparison.
-            prompts = [(s[0], s[1]) for s in sentences]
-            return original_predict(prompts, *args, **kwargs, batch_size=8)
-
-        hf_model.predict = _predict
-        hf_model.original_predict = original_predict
-
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
-
-        st_main_score = run_mteb_rerank(hf_model,
-                                        tasks=MTEB_RERANK_TASKS,
-                                        languages=MTEB_RERANK_LANGS)
-        st_dtype = next(hf_model.model.model.parameters()).dtype
+    st_main_score, st_dtype = mteb_test_rerank_models_hf(
+        hf_runner, model_info.name, hf_model_callback)
 
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)

tests/models/language/pooling/test_cross_encoder.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,8 @@
 RERANK_MODELS = [
     RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
                     architecture="BertForSequenceClassification"),
+    RerankModelInfo("cross-encoder/ms-marco-MiniLM-L-6-v2",
+                    architecture="BertForSequenceClassification"),
     RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
                     architecture="Qwen3ForSequenceClassification")
 ]
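
Adding cross-encoder/ms-marco-MiniLM-L-6-v2 here puts the model used by the online score/rerank tests under the offline MTEB rerank comparison as well. A RERANK_MODELS list like this is typically consumed through pytest parametrization, roughly as in the sketch below; the concrete test function and its import path are not shown in this diff and are assumptions:

import pytest

from tests.models.language.pooling.mteb_utils import mteb_test_rerank_models


@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner, model_info) -> None:
    # Runs the vLLM model and the SentenceTransformers reference on the same
    # MTEB rerank task and asserts the main scores agree within tolerance.
    mteb_test_rerank_models(hf_runner, vllm_runner, model_info)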
