Merged
Changes from 6 commits
4 changes: 2 additions & 2 deletions mteb/_evaluators/any_sts_evaluator.py
@@ -31,7 +31,7 @@ def __init__(
hf_split: str,
hf_subset: str,
**kwargs,
):
) -> None:
super().__init__(**kwargs)
self.first_column = create_dataloader(
dataset,
@@ -53,7 +53,7 @@ def __call__(
model: Encoder,
*,
encode_kwargs: dict[str, Any],
):
) -> dict[str, float]:
embeddings1 = model.encode(
self.first_column,
task_metadata=self.task_metadata,
4 changes: 2 additions & 2 deletions mteb/_evaluators/classification_evaluator.py
@@ -111,13 +111,13 @@ def calculate_scores(
)
return scores

def __call__(
def __call__( # type: ignore[override]
self,
model: Encoder,
*,
encode_kwargs: dict[str, Any],
test_cache: np.ndarray | None = None,
) -> tuple[dict[str, float], Any]:
) -> tuple[dict[str, float], np.ndarray | None]:
Contributor: a docstring would be good here

Member Author: Added

dataloader_train, dataloader_test = self.create_dataloaders(
batch_size=encode_kwargs["batch_size"]
)
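Note on the `# type: ignore[override]` added above (and in the regression and retrieval evaluators below): once the base `Evaluator.__call__` is annotated as returning `dict[str, float]`, a subclass whose `__call__` returns something else is flagged by mypy as an incompatible override. A minimal, self-contained sketch of the pattern — class names and the placeholder score are illustrative, not mteb's code:

```python
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any

import numpy as np


class Base(ABC):
    """Stand-in for the Evaluator base class with the new return annotation."""

    @abstractmethod
    def __call__(self, model: Any, *, encode_kwargs: dict[str, Any]) -> dict[str, float]: ...


class CachingEvaluator(Base):
    # The tuple return type is not compatible with the base class's
    # dict[str, float], so mypy reports an [override] error without the ignore.
    # The extra optional keyword-only parameter alone would be accepted.
    def __call__(  # type: ignore[override]
        self,
        model: Any,
        *,
        encode_kwargs: dict[str, Any],
        test_cache: np.ndarray | None = None,
    ) -> tuple[dict[str, float], np.ndarray | None]:
        scores = {"accuracy": 0.0}  # placeholder for the real evaluation
        return scores, test_cache
```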
4 changes: 2 additions & 2 deletions mteb/_evaluators/clustering_evaluator.py
@@ -27,7 +27,7 @@ def __init__(
hf_subset: str,
clustering_batch_size: int = 500,
**kwargs,
):
) -> None:
super().__init__(**kwargs)
self.dataset = dataset
self.clustering_batch_size = clustering_batch_size
@@ -43,7 +43,7 @@ def __call__(
*,
encode_kwargs: dict[str, Any],
v_measure_only: bool = False,
):
) -> dict[str, float]:
data_loader = create_dataloader(
self.dataset,
self.task_metadata,
6 changes: 4 additions & 2 deletions mteb/_evaluators/evaluator.py
@@ -12,12 +12,14 @@ class Evaluator(ABC):
Extend this class and implement __call__ for custom evaluators.
"""

def __init__(self, seed: int = 42, **kwargs: Any):
def __init__(self, seed: int = 42, **kwargs: Any) -> None:
self.seed = seed
self.rng_state, self.np_rng = set_seed(seed)

@abstractmethod
def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]):
def __call__(
self, model: Encoder, *, encode_kwargs: dict[str, Any]
) -> dict[str, float]:
"""This is called during training to evaluate the model.
It returns scores.

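For contrast, a concrete evaluator that follows the newly annotated contract needs no ignore comment. A rough sketch, assuming `Evaluator` is importable from the module path shown in this diff (the class name and toy metric are hypothetical):

```python
from typing import Any

from mteb._evaluators.evaluator import Evaluator  # module path as in this diff


class ConstantScoreEvaluator(Evaluator):
    """Toy evaluator matching the annotated __call__ contract."""

    def __init__(self, seed: int = 42, **kwargs: Any) -> None:
        super().__init__(seed=seed, **kwargs)  # sets self.seed, self.rng_state, self.np_rng

    def __call__(self, model: Any, *, encode_kwargs: dict[str, Any]) -> dict[str, float]:
        # A real evaluator would encode data with `model` here; a fixed score
        # is returned only to illustrate the dict[str, float] return shape.
        return {"main_score": 1.0}
```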
2 changes: 1 addition & 1 deletion mteb/_evaluators/regression_evaluator.py
@@ -52,7 +52,7 @@ def __init__(
self.task_metadata = task_metadata
self.regressor = regressor

def __call__(
def __call__( # type: ignore[override]
self,
model: Encoder,
*,
5 changes: 2 additions & 3 deletions mteb/_evaluators/retrieval_evaluator.py
@@ -34,7 +34,7 @@ def __init__(
top_ranked: TopRankedDocumentsType | None = None,
qid: str | None = None,
**kwargs,
):
) -> None:
super().__init__(**kwargs)
self.corpus = corpus
self.queries = queries
@@ -46,11 +46,10 @@ def __init__(
self.qid = qid
self.top_k = top_k

def __call__(
def __call__( # type: ignore[override]
self,
search_model: SearchProtocol,
encode_kwargs: dict[str, Any],
**kwargs: Any,
) -> RetrievalOutputType:
search_model.index(
corpus=self.corpus,
10 changes: 7 additions & 3 deletions mteb/_evaluators/text/bitext_mining_evaluator.py
@@ -30,7 +30,7 @@ def __init__(
hf_subset: str,
pair_columns: list[tuple[str, str]] = DEFAULT_PAIR,
**kwargs,
):
) -> None:
super().__init__(**kwargs)
self.pairs = pair_columns
self.n = len(sentences)
@@ -45,11 +45,15 @@ def __init__(
self.hf_subset = hf_subset
self.task_metadata = task_metadata

def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]):
def __call__(
self, model: Encoder, *, encode_kwargs: dict[str, Any]
) -> dict[str, float]:
scores = self.compute_metrics(model, encode_kwargs=encode_kwargs)
return scores

def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any]):
def compute_metrics(
self, model: Encoder, encode_kwargs: dict[str, Any]
) -> dict[str, float]:
pair_elements = {p for pair in self.pairs for p in pair}
if isinstance(self.sentences, Dataset):
subsets = [
18 changes: 11 additions & 7 deletions mteb/_evaluators/text/pair_classification_evaluator.py
@@ -49,7 +49,7 @@ def __init__(
hf_split: str,
hf_subset: str,
**kwargs,
):
) -> None:
super().__init__(**kwargs)
self.sentences1 = sentences1
self.sentences2 = sentences2
@@ -67,7 +67,7 @@ def __call__(
self,
model: Encoder,
encode_kwargs: dict[str, Any],
):
) -> dict[str, float]:
scores = self.compute_metrics(model, encode_kwargs=encode_kwargs)

# Main score is the max of Average Precision (AP)
@@ -83,7 +83,7 @@ def _encode_unique_texts(
hf_split: str,
hf_subset: str,
**encode_kwargs: Any,
):
) -> np.ndarray:
index_map, all_unique_texts, all_texts_indexes = {}, [], []
for text in all_texts:
text_hash = hash(text)
@@ -110,7 +110,7 @@ def compute_metrics(
model: Encoder,
*,
encode_kwargs: dict[str, Any],
):
) -> dict[str, float]:
all_sentences = self.sentences1 + self.sentences2
len_sentences1 = len(self.sentences1)
embeddings = self._encode_unique_texts(
@@ -215,7 +215,9 @@ def _compute_metrics(
}

@staticmethod
def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
def find_best_acc_and_threshold(
scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool
) -> tuple[float, float]:
assert len(scores) == len(labels)
rows = list(zip(scores, labels))

@@ -242,7 +244,9 @@ def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
return max_acc, best_threshold

@staticmethod
def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
def find_best_f1_and_threshold(
scores, labels, high_score_more_similar: bool
) -> tuple[float, float, float, float]:
assert len(scores) == len(labels)

scores = np.asarray(scores)
@@ -278,7 +282,7 @@ def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
return best_f1, best_precision, best_recall, threshold

@staticmethod
def ap_score(scores, labels, high_score_more_similar: bool):
def ap_score(scores, labels, high_score_more_similar: bool) -> float:
return average_precision_score(
labels, scores * (1 if high_score_more_similar else -1)
)
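The threshold helpers above now advertise their tuple shapes, `(max_acc, best_threshold)` and `(best_f1, best_precision, best_recall, threshold)`. For readers unfamiliar with them, here is a minimal sketch of the accuracy-style threshold sweep they perform — the general technique, not the file's exact implementation:

```python
import numpy as np


def best_accuracy_threshold(
    scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool
) -> tuple[float, float]:
    """Sweep thresholds between consecutive sorted scores; return (best_acc, best_threshold)."""
    assert len(scores) == len(labels)
    order = np.argsort(scores)
    if high_score_more_similar:
        order = order[::-1]  # most-similar pairs first
    sorted_scores = scores[order]
    sorted_labels = labels[order]

    n = len(labels)
    total_positives = int(sorted_labels.sum())
    best_acc = 0.0
    best_threshold = float(sorted_scores[0])
    positives_seen = 0  # positives among the i+1 pairs currently predicted "similar"
    for i in range(n - 1):
        positives_seen += int(sorted_labels[i])
        # Items 0..i are predicted positive, the rest negative.
        correct = positives_seen + (n - i - 1) - (total_positives - positives_seen)
        acc = correct / n
        if acc > best_acc:
            best_acc = acc
            best_threshold = float((sorted_scores[i] + sorted_scores[i + 1]) / 2)
    return best_acc, best_threshold
```

The F1 variant runs the same sweep while tracking precision and recall at each candidate cut, which is where the four-element return tuple comes from.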
8 changes: 4 additions & 4 deletions mteb/_evaluators/text/summarization_evaluator.py
@@ -36,7 +36,7 @@ def __init__(
hf_split: str,
hf_subset: str,
**kwargs,
):
) -> None:
"""Summarization Evaluator

Args:
@@ -63,7 +63,7 @@ def __call__(
model: Encoder,
*,
encode_kwargs: dict[str, Any],
):
) -> dict[str, float]:
cosine_spearman_scores = []
cosine_pearson_scores = []
dot_spearman_scores = []
@@ -196,7 +196,7 @@ def __init__(
hf_split: str | None = None,
hf_subset: str | None = None,
**kwargs,
):
) -> None:
# human_summaries shape: (None, num_human_summaries)
# machine_summaries shape: (None, num_machine_summaries)
# gold scores shape: (None, num_machine_summaries)
@@ -220,7 +220,7 @@ def __call__(
model: Encoder,
*,
encode_kwargs: dict[str, Any],
):
) -> dict[str, float]:
cosine_spearman_scores = []
cosine_pearson_scores = []
dot_spearman_scores = []
6 changes: 4 additions & 2 deletions mteb/_evaluators/zeroshot_classification_evaluator.py
@@ -31,7 +31,7 @@ def __init__(
hf_split: str,
hf_subset: str,
**kwargs,
):
) -> None:
super().__init__(**kwargs)

self.dataset = dataset
@@ -42,7 +42,9 @@ def __init__(
self.hf_split = hf_split
self.hf_subset = hf_subset

def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]):
def __call__(
self, model: Encoder, *, encode_kwargs: dict[str, Any]
) -> dict[str, float]:
if "image" in self.task_metadata.modalities:
dataloader = create_image_dataloader(
self.dataset,
17 changes: 9 additions & 8 deletions mteb/abstasks/AbsTask.py
@@ -23,8 +23,8 @@
SearchProtocol,
)
from mteb.set_seed import set_seed
from mteb.types import HFSubset, ScoresDict
from mteb.types.statistics import DescriptiveStatistics
from mteb.types import HFSubset, Modalities, ScoresDict
from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics

logger = logging.getLogger(__name__)

@@ -191,8 +191,9 @@ def evaluate(
@abstractmethod
def _evaluate_subset(
self,
model: MTEBModels,
model: Encoder,
data_split: Dataset,
*,
encode_kwargs: dict[str, Any],
hf_split: str,
hf_subset: str,
Expand Down Expand Up @@ -336,7 +337,7 @@ def fast_load(self, **kwargs: Any) -> None:

def calculate_descriptive_statistics(
self, overwrite_results: bool = False
) -> dict[str, DescriptiveStatistics | dict[str, DescriptiveStatistics]]:
) -> dict[str, DescriptiveStatistics]:
"""Calculates descriptive statistics from the dataset."""
from mteb.abstasks import AbsTaskAnyClassification

@@ -347,7 +348,7 @@ def calculate_descriptive_statistics(
if not self.data_loaded:
self.load_data()

descriptive_stats = {}
descriptive_stats: dict[str, DescriptiveStatistics] = {}
hf_subset_stat = "hf_subset_descriptive_stats"
eval_splits = self.metadata.eval_splits
if isinstance(self, AbsTaskAnyClassification):
@@ -387,15 +388,15 @@

def calculate_metadata_metrics(
self, overwrite_results: bool = False
) -> dict[str, DescriptiveStatistics | dict[str, DescriptiveStatistics]]:
) -> dict[str, DescriptiveStatistics]:
return self.calculate_descriptive_statistics(
overwrite_results=overwrite_results
)

@abstractmethod
def _calculate_descriptive_statistics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> DescriptiveStatistics:
) -> SplitDescriptiveStatistics:
raise NotImplementedError

@property
@@ -584,7 +585,7 @@ def eval_splits(self) -> list[str]:
return self.metadata.eval_splits

@property
def modalities(self) -> list[str]:
def modalities(self) -> list[Modalities]:
"""Returns the modalities of the task."""
return self.metadata.modalities

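Typing `modalities` as `list[Modalities]` rather than `list[str]` lets type checkers catch unsupported modality names. A small illustration, assuming `Modalities` is a `Literal` alias along these lines (the exact members are an assumption, not taken from this PR):

```python
from typing import Literal

# Hypothetical stand-in for mteb.types.Modalities; the real alias may differ.
Modalities = Literal["text", "image"]


def first_modality(modalities: list[Modalities]) -> Modalities:
    return modalities[0]


first_modality(["text", "image"])  # accepted by the type checker
first_modality(["txt"])  # runs, but mypy flags the argument: "txt" is not a Literal member
```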