Commit 01a86e9

[v2] Start type checking (#3176)
* start type check integration
* fix imports
* fix validation
* fix required import
* fix comments
1 parent 45114a5 commit 01a86e9

33 files changed (309 additions, 206 deletions)

Makefile

Lines changed: 5 additions & 0 deletions
@@ -80,3 +80,8 @@ format-citations:
 check: ## Run code quality tools.
 	@echo "--- 🧹 Running code quality tools ---"
 	@pre-commit run -a
+
+.PHONY: typecheck
+typecheck:
+	@echo "--- 🔍 Running type checks ---"
+	mypy mteb
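With this target in place, make typecheck runs mypy over the mteb package, separate from the existing make check target, which runs the pre-commit code quality hooks.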

mteb/_evaluators/any_sts_evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ def __init__(
         hf_split: str,
         hf_subset: str,
         **kwargs,
-    ):
+    ) -> None:
         super().__init__(**kwargs)
         self.first_column = create_dataloader(
             dataset,
@@ -53,7 +53,7 @@ def __call__(
         model: Encoder,
         *,
         encode_kwargs: dict[str, Any],
-    ):
+    ) -> dict[str, float]:
         embeddings1 = model.encode(
             self.first_column,
             task_metadata=self.task_metadata,

mteb/_evaluators/classification_evaluator.py

Lines changed: 14 additions & 2 deletions
@@ -111,13 +111,25 @@ def calculate_scores(
         )
         return scores

-    def __call__(
+    def __call__(  # type: ignore[override]
         self,
         model: Encoder,
         *,
         encode_kwargs: dict[str, Any],
         test_cache: np.ndarray | None = None,
-    ) -> tuple[dict[str, float], Any]:
+    ) -> tuple[dict[str, float], np.ndarray]:
+        """Classification evaluation by training a sklearn classifier on the
+        embeddings of the training set and evaluating on the embeddings of the test set.
+
+        Args:
+            model: Encoder
+            encode_kwargs: encode kwargs
+            test_cache: embeddings of the test set, if already computed
+
+        Returns:
+            Tuple of scores and test embeddings
+
+        """
         dataloader_train, dataloader_test = self.create_dataloaders(
             batch_size=encode_kwargs["batch_size"]
         )
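The "# type: ignore[override]" comment added here (and in the regression and retrieval evaluators below) silences mypy's override check: this subclass changes the __call__ signature relative to the abstract Evaluator.__call__. A minimal standalone sketch of that situation, using placeholder class names rather than the actual mteb classes:

from abc import ABC, abstractmethod
from typing import Any


class Base(ABC):
    @abstractmethod
    def __call__(self, model: Any, *, encode_kwargs: dict[str, Any]) -> dict[str, float]: ...


class Sub(Base):
    # Extra keyword parameter and a different return type than Base.__call__,
    # so mypy reports an [override] error unless it is explicitly ignored.
    def __call__(  # type: ignore[override]
        self, model: Any, *, encode_kwargs: dict[str, Any], test_cache: Any = None
    ) -> tuple[dict[str, float], Any]:
        return {"accuracy": 0.0}, test_cache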

mteb/_evaluators/clustering_evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ def __init__(
         hf_subset: str,
         clustering_batch_size: int = 500,
         **kwargs,
-    ):
+    ) -> None:
         super().__init__(**kwargs)
         self.dataset = dataset
         self.clustering_batch_size = clustering_batch_size
@@ -43,7 +43,7 @@ def __call__(
         *,
         encode_kwargs: dict[str, Any],
         v_measure_only: bool = False,
-    ):
+    ) -> dict[str, float]:
         data_loader = create_dataloader(
             self.dataset,
             self.task_metadata,

mteb/_evaluators/evaluator.py

Lines changed: 4 additions & 2 deletions
@@ -12,12 +12,14 @@ class Evaluator(ABC):
     Extend this class and implement __call__ for custom evaluators.
     """

-    def __init__(self, seed: int = 42, **kwargs: Any):
+    def __init__(self, seed: int = 42, **kwargs: Any) -> None:
         self.seed = seed
         self.rng_state, self.np_rng = set_seed(seed)

     @abstractmethod
-    def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]):
+    def __call__(
+        self, model: Encoder, *, encode_kwargs: dict[str, Any]
+    ) -> dict[str, float]:
         """This is called during training to evaluate the model.
         It returns scores.
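Annotating the abstract __call__ with "-> dict[str, float]" lets mypy check call sites of any evaluator. A small sketch of the effect, assuming the import path taken from the file header above and typing the model loosely as Any:

from typing import Any

from mteb._evaluators.evaluator import Evaluator  # import path assumed from the file path above


def main_score(evaluator: Evaluator, model: Any, batch_size: int = 32) -> float:
    scores = evaluator(model, encode_kwargs={"batch_size": batch_size})
    # mypy now knows `scores` is dict[str, float], so indexing yields a float.
    # "main_score" is a hypothetical key used only for illustration here.
    return scores["main_score"]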

mteb/_evaluators/regression_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def __init__(
         self.task_metadata = task_metadata
         self.regressor = regressor

-    def __call__(
+    def __call__(  # type: ignore[override]
         self,
         model: Encoder,
         *,

mteb/_evaluators/retrieval_evaluator.py

Lines changed: 2 additions & 3 deletions
@@ -34,7 +34,7 @@ def __init__(
         top_ranked: TopRankedDocumentsType | None = None,
         qid: str | None = None,
         **kwargs,
-    ):
+    ) -> None:
         super().__init__(**kwargs)
         self.corpus = corpus
         self.queries = queries
@@ -46,11 +46,10 @@ def __init__(
         self.qid = qid
         self.top_k = top_k

-    def __call__(
+    def __call__(  # type: ignore[override]
         self,
         search_model: SearchProtocol,
         encode_kwargs: dict[str, Any],
-        **kwargs: Any,
     ) -> RetrievalOutputType:
         search_model.index(
             corpus=self.corpus,
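Besides the override ignore, this hunk drops the unused "**kwargs: Any" from __call__. A minimal sketch (hypothetical function names, not the mteb API) of why removing a catch-all parameter tightens checking:

from typing import Any


def call_with_catchall(encode_kwargs: dict[str, Any], **kwargs: Any) -> None:
    """Accepts any extra keyword, so a misspelled argument is silently swallowed."""


def call_strict(encode_kwargs: dict[str, Any]) -> None:
    """Only the declared keyword is accepted; typos become errors."""


call_with_catchall(encode_kwargs={}, encode_kwarg={})  # typo goes unnoticed
call_strict(encode_kwargs={})  # fine
# call_strict(encode_kwarg={})  # rejected by mypy (and by Python at runtime)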

mteb/_evaluators/text/bitext_mining_evaluator.py

Lines changed: 7 additions & 3 deletions
@@ -30,7 +30,7 @@ def __init__(
         hf_subset: str,
         pair_columns: list[tuple[str, str]] = DEFAULT_PAIR,
         **kwargs,
-    ):
+    ) -> None:
         super().__init__(**kwargs)
         self.pairs = pair_columns
         self.n = len(sentences)
@@ -45,11 +45,15 @@ def __init__(
         self.hf_subset = hf_subset
         self.task_metadata = task_metadata

-    def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]):
+    def __call__(
+        self, model: Encoder, *, encode_kwargs: dict[str, Any]
+    ) -> dict[str, float]:
         scores = self.compute_metrics(model, encode_kwargs=encode_kwargs)
         return scores

-    def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any]):
+    def compute_metrics(
+        self, model: Encoder, encode_kwargs: dict[str, Any]
+    ) -> dict[str, float]:
         pair_elements = {p for pair in self.pairs for p in pair}
         if isinstance(self.sentences, Dataset):
             subsets = [

mteb/_evaluators/text/pair_classification_evaluator.py

Lines changed: 11 additions & 7 deletions
@@ -49,7 +49,7 @@ def __init__(
         hf_split: str,
         hf_subset: str,
         **kwargs,
-    ):
+    ) -> None:
         super().__init__(**kwargs)
         self.sentences1 = sentences1
         self.sentences2 = sentences2
@@ -67,7 +67,7 @@ def __call__(
         self,
         model: Encoder,
         encode_kwargs: dict[str, Any],
-    ):
+    ) -> dict[str, float]:
         scores = self.compute_metrics(model, encode_kwargs=encode_kwargs)

         # Main score is the max of Average Precision (AP)
@@ -83,7 +83,7 @@ def _encode_unique_texts(
         hf_split: str,
         hf_subset: str,
         **encode_kwargs: Any,
-    ):
+    ) -> np.ndarray:
         index_map, all_unique_texts, all_texts_indexes = {}, [], []
         for text in all_texts:
             text_hash = hash(text)
@@ -110,7 +110,7 @@ def compute_metrics(
         model: Encoder,
         *,
         encode_kwargs: dict[str, Any],
-    ):
+    ) -> dict[str, float]:
         all_sentences = self.sentences1 + self.sentences2
         len_sentences1 = len(self.sentences1)
         embeddings = self._encode_unique_texts(
@@ -215,7 +215,9 @@ def _compute_metrics(
         }

     @staticmethod
-    def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
+    def find_best_acc_and_threshold(
+        scores: np.ndarray, labels: np.ndarray, high_score_more_similar: bool
+    ) -> tuple[float, float]:
         assert len(scores) == len(labels)
         rows = list(zip(scores, labels))

@@ -242,7 +244,9 @@ def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
         return max_acc, best_threshold

     @staticmethod
-    def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
+    def find_best_f1_and_threshold(
+        scores, labels, high_score_more_similar: bool
+    ) -> tuple[float, float, float, float]:
         assert len(scores) == len(labels)

         scores = np.asarray(scores)
@@ -278,7 +282,7 @@ def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
         return best_f1, best_precision, best_recall, threshold

     @staticmethod
-    def ap_score(scores, labels, high_score_more_similar: bool):
+    def ap_score(scores, labels, high_score_more_similar: bool) -> float:
         return average_precision_score(
             labels, scores * (1 if high_score_more_similar else -1)
         )
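The tuple return annotations on the threshold helpers record the arity and element types of their results, so unpacking at call sites is now checked. A tiny illustration with a stub standing in for find_best_acc_and_threshold (the values are made up):

def best_acc_and_threshold_stub() -> tuple[float, float]:
    # Stand-in for find_best_acc_and_threshold; returns (max_acc, best_threshold).
    return 0.83, 0.5


max_acc, best_threshold = best_acc_and_threshold_stub()
print(f"accuracy={max_acc:.2f} at threshold={best_threshold:.2f}")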

mteb/_evaluators/text/summarization_evaluator.py

Lines changed: 4 additions & 4 deletions
@@ -36,7 +36,7 @@ def __init__(
         hf_split: str,
         hf_subset: str,
         **kwargs,
-    ):
+    ) -> None:
         """Summarization Evaluator

         Args:
@@ -63,7 +63,7 @@ def __call__(
         model: Encoder,
         *,
         encode_kwargs: dict[str, Any],
-    ):
+    ) -> dict[str, float]:
         cosine_spearman_scores = []
         cosine_pearson_scores = []
         dot_spearman_scores = []
@@ -196,7 +196,7 @@ def __init__(
         hf_split: str | None = None,
         hf_subset: str | None = None,
         **kwargs,
-    ):
+    ) -> None:
         # human_summaries shape: (None, num_human_summaries)
         # machine_summaries shape: (None, num_machine_summaries)
         # gold scores shape: (None, num_machine_summaries)
@@ -220,7 +220,7 @@ def __call__(
         model: Encoder,
         *,
         encode_kwargs: dict[str, Any],
-    ):
+    ) -> dict[str, float]:
         cosine_spearman_scores = []
         cosine_pearson_scores = []
         dot_spearman_scores = []
