diff --git a/Makefile b/Makefile index e83c85eb77..37e0aa7b4b 100644 --- a/Makefile +++ b/Makefile @@ -80,3 +80,8 @@ format-citations: check: ## Run code quality tools. @echo "--- ๐Ÿงน Running code quality tools ---" @pre-commit run -a + +.PHONY: typecheck +typecheck: + @echo "--- ๐Ÿ” Running type checks ---" + mypy mteb diff --git a/mteb/_evaluators/any_sts_evaluator.py b/mteb/_evaluators/any_sts_evaluator.py index ba20e0b713..bf1f61c64a 100644 --- a/mteb/_evaluators/any_sts_evaluator.py +++ b/mteb/_evaluators/any_sts_evaluator.py @@ -31,7 +31,7 @@ def __init__( hf_split: str, hf_subset: str, **kwargs, - ): + ) -> None: super().__init__(**kwargs) self.first_column = create_dataloader( dataset, @@ -53,7 +53,7 @@ def __call__( model: Encoder, *, encode_kwargs: dict[str, Any], - ): + ) -> dict[str, float]: embeddings1 = model.encode( self.first_column, task_metadata=self.task_metadata, diff --git a/mteb/_evaluators/classification_evaluator.py b/mteb/_evaluators/classification_evaluator.py index 50c62d13f6..306afd2e7b 100644 --- a/mteb/_evaluators/classification_evaluator.py +++ b/mteb/_evaluators/classification_evaluator.py @@ -111,13 +111,25 @@ def calculate_scores( ) return scores - def __call__( + def __call__( # type: ignore[override] self, model: Encoder, *, encode_kwargs: dict[str, Any], test_cache: np.ndarray | None = None, - ) -> tuple[dict[str, float], Any]: + ) -> tuple[dict[str, float], np.ndarray]: + """Classification evaluation by training a sklearn classifier on the + embeddings of the training set and evaluating on the embeddings of the test set. + + Args: + model: Encoder + encode_kwargs: encode kwargs + test_cache: embeddings of the test set, if already computed + + Returns: + Tuple of scores and test embeddings + + """ dataloader_train, dataloader_test = self.create_dataloaders( batch_size=encode_kwargs["batch_size"] ) diff --git a/mteb/_evaluators/clustering_evaluator.py b/mteb/_evaluators/clustering_evaluator.py index 8f45412991..f74d355981 100644 --- a/mteb/_evaluators/clustering_evaluator.py +++ b/mteb/_evaluators/clustering_evaluator.py @@ -27,7 +27,7 @@ def __init__( hf_subset: str, clustering_batch_size: int = 500, **kwargs, - ): + ) -> None: super().__init__(**kwargs) self.dataset = dataset self.clustering_batch_size = clustering_batch_size @@ -43,7 +43,7 @@ def __call__( *, encode_kwargs: dict[str, Any], v_measure_only: bool = False, - ): + ) -> dict[str, float]: data_loader = create_dataloader( self.dataset, self.task_metadata, diff --git a/mteb/_evaluators/evaluator.py b/mteb/_evaluators/evaluator.py index 02797b16ec..858ea28e8a 100644 --- a/mteb/_evaluators/evaluator.py +++ b/mteb/_evaluators/evaluator.py @@ -12,12 +12,14 @@ class Evaluator(ABC): Extend this class and implement __call__ for custom evaluators. """ - def __init__(self, seed: int = 42, **kwargs: Any): + def __init__(self, seed: int = 42, **kwargs: Any) -> None: self.seed = seed self.rng_state, self.np_rng = set_seed(seed) @abstractmethod - def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]): + def __call__( + self, model: Encoder, *, encode_kwargs: dict[str, Any] + ) -> dict[str, float]: """This is called during training to evaluate the model. It returns scores. 
diff --git a/mteb/_evaluators/regression_evaluator.py b/mteb/_evaluators/regression_evaluator.py index c8cda58c8c..a616c68c84 100644 --- a/mteb/_evaluators/regression_evaluator.py +++ b/mteb/_evaluators/regression_evaluator.py @@ -52,7 +52,7 @@ def __init__( self.task_metadata = task_metadata self.regressor = regressor - def __call__( + def __call__( # type: ignore[override] self, model: Encoder, *, diff --git a/mteb/_evaluators/retrieval_evaluator.py b/mteb/_evaluators/retrieval_evaluator.py index dc02947d06..617256a6cb 100644 --- a/mteb/_evaluators/retrieval_evaluator.py +++ b/mteb/_evaluators/retrieval_evaluator.py @@ -34,7 +34,7 @@ def __init__( top_ranked: TopRankedDocumentsType | None = None, qid: str | None = None, **kwargs, - ): + ) -> None: super().__init__(**kwargs) self.corpus = corpus self.queries = queries @@ -46,11 +46,10 @@ def __init__( self.qid = qid self.top_k = top_k - def __call__( + def __call__( # type: ignore[override] self, search_model: SearchProtocol, encode_kwargs: dict[str, Any], - **kwargs: Any, ) -> RetrievalOutputType: search_model.index( corpus=self.corpus, diff --git a/mteb/_evaluators/text/bitext_mining_evaluator.py b/mteb/_evaluators/text/bitext_mining_evaluator.py index a1bfb303b3..f245ecfc21 100644 --- a/mteb/_evaluators/text/bitext_mining_evaluator.py +++ b/mteb/_evaluators/text/bitext_mining_evaluator.py @@ -30,7 +30,7 @@ def __init__( hf_subset: str, pair_columns: list[tuple[str, str]] = DEFAULT_PAIR, **kwargs, - ): + ) -> None: super().__init__(**kwargs) self.pairs = pair_columns self.n = len(sentences) @@ -45,11 +45,15 @@ def __init__( self.hf_subset = hf_subset self.task_metadata = task_metadata - def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]): + def __call__( + self, model: Encoder, *, encode_kwargs: dict[str, Any] + ) -> dict[str, float]: scores = self.compute_metrics(model, encode_kwargs=encode_kwargs) return scores - def compute_metrics(self, model: Encoder, encode_kwargs: dict[str, Any]): + def compute_metrics( + self, model: Encoder, encode_kwargs: dict[str, Any] + ) -> dict[str, float]: pair_elements = {p for pair in self.pairs for p in pair} if isinstance(self.sentences, Dataset): subsets = [ diff --git a/mteb/_evaluators/text/pair_classification_evaluator.py b/mteb/_evaluators/text/pair_classification_evaluator.py index de3acb6949..220708287c 100644 --- a/mteb/_evaluators/text/pair_classification_evaluator.py +++ b/mteb/_evaluators/text/pair_classification_evaluator.py @@ -49,7 +49,7 @@ def __init__( hf_split: str, hf_subset: str, **kwargs, - ): + ) -> None: super().__init__(**kwargs) self.sentences1 = sentences1 self.sentences2 = sentences2 @@ -67,7 +67,7 @@ def __call__( self, model: Encoder, encode_kwargs: dict[str, Any], - ): + ) -> dict[str, float]: scores = self.compute_metrics(model, encode_kwargs=encode_kwargs) # Main score is the max of Average Precision (AP) @@ -83,7 +83,7 @@ def _encode_unique_texts( hf_split: str, hf_subset: str, **encode_kwargs: Any, - ): + ) -> np.ndarray: index_map, all_unique_texts, all_texts_indexes = {}, [], [] for text in all_texts: text_hash = hash(text) @@ -110,7 +110,7 @@ def compute_metrics( model: Encoder, *, encode_kwargs: dict[str, Any], - ): + ) -> dict[str, float]: all_sentences = self.sentences1 + self.sentences2 len_sentences1 = len(self.sentences1) embeddings = self._encode_unique_texts( @@ -215,7 +215,9 @@ def _compute_metrics( } @staticmethod - def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool): + def find_best_acc_and_threshold( + scores: 
np.ndarray, labels: np.ndarray, high_score_more_similar: bool + ) -> tuple[float, float]: assert len(scores) == len(labels) rows = list(zip(scores, labels)) @@ -242,7 +244,9 @@ def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool): return max_acc, best_threshold @staticmethod - def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool): + def find_best_f1_and_threshold( + scores, labels, high_score_more_similar: bool + ) -> tuple[float, float, float, float]: assert len(scores) == len(labels) scores = np.asarray(scores) @@ -278,7 +282,7 @@ def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool): return best_f1, best_precision, best_recall, threshold @staticmethod - def ap_score(scores, labels, high_score_more_similar: bool): + def ap_score(scores, labels, high_score_more_similar: bool) -> float: return average_precision_score( labels, scores * (1 if high_score_more_similar else -1) ) diff --git a/mteb/_evaluators/text/summarization_evaluator.py b/mteb/_evaluators/text/summarization_evaluator.py index 79e01c43f9..cb0966bd44 100644 --- a/mteb/_evaluators/text/summarization_evaluator.py +++ b/mteb/_evaluators/text/summarization_evaluator.py @@ -36,7 +36,7 @@ def __init__( hf_split: str, hf_subset: str, **kwargs, - ): + ) -> None: """Summarization Evaluator Args: @@ -63,7 +63,7 @@ def __call__( model: Encoder, *, encode_kwargs: dict[str, Any], - ): + ) -> dict[str, float]: cosine_spearman_scores = [] cosine_pearson_scores = [] dot_spearman_scores = [] @@ -196,7 +196,7 @@ def __init__( hf_split: str | None = None, hf_subset: str | None = None, **kwargs, - ): + ) -> None: # human_summaries shape: (None, num_human_summaries) # machine_summaries shape: (None, num_machine_summaries) # gold scores shape: (None, num_machine_summaries) @@ -220,7 +220,7 @@ def __call__( model: Encoder, *, encode_kwargs: dict[str, Any], - ): + ) -> dict[str, float]: cosine_spearman_scores = [] cosine_pearson_scores = [] dot_spearman_scores = [] diff --git a/mteb/_evaluators/zeroshot_classification_evaluator.py b/mteb/_evaluators/zeroshot_classification_evaluator.py index e7432ab03a..0a7a242431 100644 --- a/mteb/_evaluators/zeroshot_classification_evaluator.py +++ b/mteb/_evaluators/zeroshot_classification_evaluator.py @@ -31,7 +31,7 @@ def __init__( hf_split: str, hf_subset: str, **kwargs, - ): + ) -> None: super().__init__(**kwargs) self.dataset = dataset @@ -42,7 +42,9 @@ def __init__( self.hf_split = hf_split self.hf_subset = hf_subset - def __call__(self, model: Encoder, *, encode_kwargs: dict[str, Any]): + def __call__( + self, model: Encoder, *, encode_kwargs: dict[str, Any] + ) -> dict[str, float]: if "image" in self.task_metadata.modalities: dataloader = create_image_dataloader( self.dataset, diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index f761932491..d7f583d14d 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -23,8 +23,8 @@ SearchProtocol, ) from mteb.set_seed import set_seed -from mteb.types import HFSubset, ScoresDict -from mteb.types.statistics import DescriptiveStatistics +from mteb.types import HFSubset, Modalities, ScoresDict +from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics logger = logging.getLogger(__name__) @@ -191,8 +191,9 @@ def evaluate( @abstractmethod def _evaluate_subset( self, - model: MTEBModels, + model: Encoder, data_split: Dataset, + *, encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, @@ -336,7 +337,7 @@ def fast_load(self) -> None: 
def calculate_descriptive_statistics( self, overwrite_results: bool = False - ) -> dict[str, DescriptiveStatistics | dict[str, DescriptiveStatistics]]: + ) -> dict[str, DescriptiveStatistics]: """Calculates descriptive statistics from the dataset.""" from mteb.abstasks import AbsTaskAnyClassification @@ -347,7 +348,7 @@ def calculate_descriptive_statistics( if not self.data_loaded: self.load_data() - descriptive_stats = {} + descriptive_stats: dict[str, DescriptiveStatistics] = {} hf_subset_stat = "hf_subset_descriptive_stats" eval_splits = self.metadata.eval_splits if isinstance(self, AbsTaskAnyClassification): @@ -387,7 +388,7 @@ def calculate_descriptive_statistics( def calculate_metadata_metrics( self, overwrite_results: bool = False - ) -> dict[str, DescriptiveStatistics | dict[str, DescriptiveStatistics]]: + ) -> dict[str, DescriptiveStatistics]: return self.calculate_descriptive_statistics( overwrite_results=overwrite_results ) @@ -395,7 +396,7 @@ def calculate_metadata_metrics( @abstractmethod def _calculate_descriptive_statistics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False - ) -> DescriptiveStatistics: + ) -> SplitDescriptiveStatistics: raise NotImplementedError @property @@ -578,7 +579,7 @@ def eval_splits(self) -> list[str]: return self.metadata.eval_splits @property - def modalities(self) -> list[str]: + def modalities(self) -> list[Modalities]: """Returns the modalities of the task.""" return self.metadata.modalities diff --git a/mteb/abstasks/AbsTaskAnyClassification.py b/mteb/abstasks/AbsTaskAnyClassification.py index 8801a3d94f..671f772e00 100644 --- a/mteb/abstasks/AbsTaskAnyClassification.py +++ b/mteb/abstasks/AbsTaskAnyClassification.py @@ -2,6 +2,7 @@ import logging from collections import defaultdict +from pathlib import Path from typing import Any import numpy as np @@ -9,12 +10,12 @@ from PIL import ImageFile from sklearn.linear_model import LogisticRegression -from mteb.models import Encoder +from mteb.models import Encoder, MTEBModels from mteb.types import HFSubset, ScoresDict from mteb.types.statistics import ( - DescriptiveStatistics, ImageStatistics, LabelStatistics, + SplitDescriptiveStatistics, TextStatistics, ) @@ -33,7 +34,7 @@ logger = logging.getLogger(__name__) -class ClassificationDescriptiveStatistics(DescriptiveStatistics): +class ClassificationDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for Classification Attributes: @@ -83,13 +84,19 @@ class AbsTaskAnyClassification(AbsTask): def evaluate( self, - model: Encoder, + model: MTEBModels, split: str = "test", subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any], **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: + if not isinstance(model, Encoder): + raise TypeError( + f"Model {model} is a SearchProtocol, but this task {self.metadata.name} does not support Search. " + "Please use an Encoder model instead."
+ ) + if not self.data_loaded: self.load_data() @@ -127,14 +134,16 @@ def evaluate( def _evaluate_subset( self, model: Encoder, - dataset: DatasetDict, + data_split: DatasetDict, + *, + encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, - encode_kwargs: dict[str, Any], - **kwargs, + prediction_folder: Path | None = None, + **kwargs: Any, ) -> ScoresDict: - train_split = dataset[self.train_split] - eval_split = dataset[hf_split] + train_split = data_split[self.train_split] + eval_split = data_split[hf_split] params = {"k": self.k} params.update(kwargs) @@ -195,7 +204,7 @@ def _undersample_data( rng_state = np.random.RandomState(self.seed) rng_state.shuffle(idxs) - label_counter = defaultdict(int) + label_counter: dict[str, int] = defaultdict(int) sampled_idxs = [] for i in idxs: diff --git a/mteb/abstasks/AbsTaskAnyClustering.py b/mteb/abstasks/AbsTaskAnyClustering.py index ef9b5207a2..992e82978e 100644 --- a/mteb/abstasks/AbsTaskAnyClustering.py +++ b/mteb/abstasks/AbsTaskAnyClustering.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from pathlib import Path from typing import Any import numpy as np @@ -11,9 +12,9 @@ from mteb.models import Encoder from mteb.types import ScoresDict from mteb.types.statistics import ( - DescriptiveStatistics, ImageStatistics, LabelStatistics, + SplitDescriptiveStatistics, TextStatistics, ) @@ -27,7 +28,7 @@ logger = logging.getLogger(__name__) -class ClusteringDescriptiveStatistics(DescriptiveStatistics): +class ClusteringDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for Clustering Attributes: @@ -63,19 +64,18 @@ class AbsTaskAnyClustering(AbsTask): def _evaluate_subset( self, model: Encoder, - dataset: Dataset, + data_split: Dataset, *, + encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, - encode_kwargs: dict[str, Any], - **kwargs, + prediction_folder: Path | None = None, + **kwargs: Any, ) -> ScoresDict: - ## MTEB v1 text clustering requires renaming and eval per subset. - if "sentences" in dataset.column_names and isinstance( - dataset[self.input_column_name][0], list - ): + # MTEB text clustering requires renaming and eval per subset. 
+ if self.metadata.modalities == ["text"]: v_measures = [] - for cluster_set in tqdm.tqdm(dataset, desc="Clustering"): + for cluster_set in tqdm.tqdm(data_split, desc="Clustering"): clustering_dataset = Dataset.from_dict(cluster_set).select_columns( [self.input_column_name, self.label_column_name] ) @@ -103,11 +103,11 @@ def _evaluate_subset( self._add_main_score(scores) return scores - dataset = dataset.select_columns( + data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] ) evaluator = self.evaluator( - dataset, + data_split, input_column_name=self.input_column_name, label_column_name=self.label_column_name, task_metadata=self.metadata, diff --git a/mteb/abstasks/AbsTaskAnySTS.py b/mteb/abstasks/AbsTaskAnySTS.py index fe64001f4e..1a1cba6365 100644 --- a/mteb/abstasks/AbsTaskAnySTS.py +++ b/mteb/abstasks/AbsTaskAnySTS.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Any +from typing import Any, cast from datasets import Dataset @@ -9,9 +9,9 @@ from mteb.models import Encoder from mteb.types import ScoresDict from mteb.types.statistics import ( - DescriptiveStatistics, ImageStatistics, ScoreStatistics, + SplitDescriptiveStatistics, TextStatistics, ) @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -class AnySTSDescriptiveStatistics(DescriptiveStatistics): +class AnySTSDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for STS Attributes: @@ -97,6 +97,8 @@ def _calculate_descriptive_statistics_from_split( self, split: str, hf_subset: str | None = None, compute_overall: bool = False ) -> AnySTSDescriptiveStatistics: first_column, second_column = self.column_names + self.dataset = cast(dict[str, dict[str, Dataset]], self.dataset) + if hf_subset: sentence1 = self.dataset[hf_subset][split][first_column] sentence2 = self.dataset[hf_subset][split][second_column] diff --git a/mteb/abstasks/AbsTaskAnyZeroShotClassification.py b/mteb/abstasks/AbsTaskAnyZeroShotClassification.py index c9a66bb6fd..31f7c8ac17 100644 --- a/mteb/abstasks/AbsTaskAnyZeroShotClassification.py +++ b/mteb/abstasks/AbsTaskAnyZeroShotClassification.py @@ -8,9 +8,9 @@ from mteb._evaluators import ZeroShotClassificationEvaluator from mteb.types import ScoresDict from mteb.types.statistics import ( - DescriptiveStatistics, ImageStatistics, LabelStatistics, + SplitDescriptiveStatistics, TextStatistics, ) @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -class ZeroShotClassificationDescriptiveStatistics(DescriptiveStatistics): +class ZeroShotClassificationDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for ZeroShotClassification Attributes: @@ -100,7 +100,7 @@ def _calculate_descriptive_statistics_from_split( def _evaluate_subset( self, model: Encoder, - dataset: Dataset, + data_split: Dataset, *, hf_split: str, hf_subset: str, @@ -108,11 +108,11 @@ def _evaluate_subset( **kwargs, ) -> ScoresDict: candidate_labels = self.get_candidate_labels() - dataset = dataset.select_columns( + data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] ) evaluator = ZeroShotClassificationEvaluator( - dataset, + data_split, self.input_column_name, self.label_column_name, candidate_labels, diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index fec5a58d4c..0e112e8253 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -7,9 +7,9 @@ from datasets import Dataset, DatasetDict from mteb._evaluators 
import BitextMiningEvaluator -from mteb.models import Encoder +from mteb.models import Encoder, MTEBModels from mteb.types import HFSubset, ScoresDict -from mteb.types.statistics import DescriptiveStatistics, TextStatistics +from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics from ._statistics_calculation import calculate_text_statistics from .AbsTask import AbsTask @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -class BitextDescriptiveStatistics(DescriptiveStatistics): +class BitextDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for Bitext Attributes: @@ -52,7 +52,7 @@ class AbsTaskBitextMining(AbsTask): def evaluate( self, - model: Encoder, + model: MTEBModels, split: str = "test", subsets_to_run: list[HFSubset] | None = None, *, diff --git a/mteb/abstasks/AbsTaskClusteringFast.py b/mteb/abstasks/AbsTaskClusteringFast.py index 7fe1ac07e0..29c46303aa 100644 --- a/mteb/abstasks/AbsTaskClusteringFast.py +++ b/mteb/abstasks/AbsTaskClusteringFast.py @@ -4,6 +4,7 @@ import logging import random from collections import defaultdict +from pathlib import Path from typing import Any import numpy as np @@ -12,8 +13,12 @@ from sklearn.metrics.cluster import v_measure_score from mteb.models import Encoder -from mteb.types import HFSubset -from mteb.types.statistics import DescriptiveStatistics, LabelStatistics, TextStatistics +from mteb.types import HFSubset, ScoresDict +from mteb.types.statistics import ( + LabelStatistics, + SplitDescriptiveStatistics, + TextStatistics, +) from ..create_dataloaders import create_dataloader from ._statistics_calculation import ( @@ -81,7 +86,7 @@ def evaluate_clustering_bootstrapped( return v_measures -class ClusteringFastDescriptiveStatistics(DescriptiveStatistics): +class ClusteringFastDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for ClusteringFast Attributes: @@ -132,13 +137,14 @@ class AbsTaskClusteringFast(AbsTask): def _evaluate_subset( self, model: Encoder, - dataset: Dataset, + data_split: Dataset, *, + encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, - encode_kwargs: dict[str, Any], + prediction_folder: Path | None = None, **kwargs: Any, - ) -> dict[str, float | dict[str, list[float]]]: + ) -> ScoresDict: if ( self.max_document_to_embed is not None and self.max_fraction_of_documents_to_embed is not None @@ -151,20 +157,20 @@ def _evaluate_subset( self.max_document_to_embed is None and self.max_fraction_of_documents_to_embed is None ): - downsampled_dataset = dataset + downsampled_dataset = data_split else: if self.max_fraction_of_documents_to_embed is not None: max_documents_to_embed = int( - self.max_fraction_of_documents_to_embed * len(dataset) + self.max_fraction_of_documents_to_embed * len(data_split) ) else: max_documents_to_embed = self.max_document_to_embed - max_documents_to_embed = min(len(dataset), max_documents_to_embed) # type: ignore + max_documents_to_embed = min(len(data_split), max_documents_to_embed) # type: ignore example_indices = self.rng_state.sample( - range(len(dataset)), k=max_documents_to_embed + range(len(data_split)), k=max_documents_to_embed ) - downsampled_dataset = dataset.select(example_indices) # type: ignore + downsampled_dataset = data_split.select(example_indices) # type: ignore downsampled_dataset = downsampled_dataset.select_columns( [self.input_column_name, self.label_column_name] diff --git a/mteb/abstasks/AbsTaskMultilabelClassification.py b/mteb/abstasks/AbsTaskMultilabelClassification.py index 
fe408adcd6..5322a52b76 100644 --- a/mteb/abstasks/AbsTaskMultilabelClassification.py +++ b/mteb/abstasks/AbsTaskMultilabelClassification.py @@ -3,15 +3,17 @@ import itertools import logging from collections import defaultdict +from pathlib import Path from typing import Any import numpy as np -from datasets import Dataset, DatasetDict +from datasets import DatasetDict from sklearn.base import clone from sklearn.metrics import f1_score, label_ranking_average_precision_score from sklearn.multioutput import MultiOutputClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import MultiLabelBinarizer +from typing_extensions import override from mteb.models import Encoder from mteb.types import ScoresDict @@ -67,22 +69,24 @@ class AbsTaskMultilabelClassification(AbsTaskAnyClassification): input_column_name: str = "text" label_column_name: str = "label" + @override def _evaluate_subset( self, model: Encoder, - dataset: DatasetDict | Dataset, + data_split: DatasetDict, *, + encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, - encode_kwargs: dict[str, Any], + prediction_folder: Path | None = None, **kwargs: Any, ) -> ScoresDict: - if isinstance(dataset, (Dataset, DatasetDict)): - dataset = dataset.select_columns( + if isinstance(data_split, DatasetDict): + data_split = data_split.select_columns( [self.input_column_name, self.label_column_name] ) - train_split = dataset[self.train_split] - eval_split = dataset[hf_split] + train_split = data_split[self.train_split] + eval_split = data_split[hf_split] scores = [] # Bootstrap sample indices from training set for each experiment diff --git a/mteb/abstasks/AbsTaskPairClassification.py b/mteb/abstasks/AbsTaskPairClassification.py index f16cc96eda..6fabb5f21c 100644 --- a/mteb/abstasks/AbsTaskPairClassification.py +++ b/mteb/abstasks/AbsTaskPairClassification.py @@ -7,7 +7,11 @@ from mteb._evaluators import PairClassificationEvaluator from mteb.types import ScoresDict -from mteb.types.statistics import DescriptiveStatistics, LabelStatistics, TextStatistics +from mteb.types.statistics import ( + LabelStatistics, + SplitDescriptiveStatistics, + TextStatistics, +) from ..models.models_protocols import Encoder from ._statistics_calculation import ( @@ -19,7 +23,7 @@ logger = logging.getLogger(__name__) -class PairClassificationDescriptiveStatistics(DescriptiveStatistics): +class PairClassificationDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for PairClassification Attributes: @@ -60,14 +64,14 @@ class AbsTaskPairClassification(AbsTask): def _evaluate_subset( self, model: Encoder, - dataset: Dataset, + data_split: Dataset, *, hf_split: str, hf_subset: str, encode_kwargs: dict[str, str] = {}, **kwargs, ) -> ScoresDict: - data_split = dataset[0] if len(dataset) == 1 else dataset + data_split = data_split[0] if len(data_split) == 1 else data_split logging.getLogger( "sentence_transformers.evaluation.PairClassificationEvaluator" ).setLevel(logging.WARN) diff --git a/mteb/abstasks/AbsTaskRetrieval.py b/mteb/abstasks/AbsTaskRetrieval.py index 2c42db19a8..d67fb54b03 100644 --- a/mteb/abstasks/AbsTaskRetrieval.py +++ b/mteb/abstasks/AbsTaskRetrieval.py @@ -5,7 +5,7 @@ from collections import defaultdict from pathlib import Path from time import time -from typing import Any, Callable +from typing import Any, Callable, Literal from datasets import Dataset, DatasetDict, concatenate_datasets from typing_extensions import Self @@ -20,6 +20,7 @@ from mteb.models import ( CrossEncoderProtocol, 
Encoder, + MTEBModels, SearchCrossEncoderWrapper, SearchEncoderWrapper, SearchProtocol, @@ -32,9 +33,9 @@ ScoresDict, ) from mteb.types.statistics import ( - DescriptiveStatistics, ImageStatistics, RelevantDocsStatistics, + SplitDescriptiveStatistics, TextStatistics, TopRankedStatistics, ) @@ -55,7 +56,7 @@ logger = logging.getLogger(__name__) -class RetrievalDescriptiveStatistics(DescriptiveStatistics): +class RetrievalDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for Retrieval Attributes: @@ -295,7 +296,7 @@ def process_data(split: str, hf_subset: str = "default"): def evaluate( self, - model: Encoder, + model: MTEBModels, split: str = "test", subsets_to_run: list[HFSubset] | None = None, *, @@ -333,7 +334,7 @@ def evaluate( def _evaluate_subset( self, - model: Encoder, + model: MTEBModels, data_split: RetrievalSplitData, encode_kwargs: dict[str, Any], hf_split: str, @@ -387,7 +388,6 @@ def _evaluate_subset( results = retriever( search_model, encode_kwargs=encode_kwargs, - **kwargs, ) end_time = time() logger.debug(f"Time taken to retrieve: {end_time - start_time:.2f} seconds") @@ -500,7 +500,12 @@ def _calculate_descriptive_statistics_from_split( num_documents = len(corpus) num_queries = len(queries) - queries_modalities, corpus_modalities = self.metadata.category.split("2") + if self.metadata.category is None: + queries_modalities = "t" + corpus_modalities = "t" + else: + queries_modalities, corpus_modalities = self.metadata.category.split("2") + number_of_characters = 0 documents_text_statistics = None @@ -554,8 +559,8 @@ def _push_dataset_to_hub(self, repo_name: str) -> None: self.convert_v1_dataset_format_to_v2() def _push_section( - data: dict[str, dict[str, Dataset | dict]], - subset_item: str, + data: dict[str, RetrievalSplitData], + subset_item: Literal["corpus", "queries", "relevant_docs", "top_ranked"], hf_subset_name: str, converter: Callable[[Any, Any], dict[str, Any]] | None = None, ) -> None: diff --git a/mteb/abstasks/AbsTaskSummarization.py b/mteb/abstasks/AbsTaskSummarization.py index 4bcb53d84f..b9091779af 100644 --- a/mteb/abstasks/AbsTaskSummarization.py +++ b/mteb/abstasks/AbsTaskSummarization.py @@ -9,7 +9,11 @@ from mteb._evaluators import SummarizationEvaluator from mteb.models import Encoder from mteb.types import ScoresDict -from mteb.types.statistics import DescriptiveStatistics, ScoreStatistics, TextStatistics +from mteb.types.statistics import ( + ScoreStatistics, + SplitDescriptiveStatistics, + TextStatistics, +) from ._statistics_calculation import ( calculate_score_statistics, @@ -20,7 +24,7 @@ logger = logging.getLogger(__name__) -class SummarizationDescriptiveStatistics(DescriptiveStatistics): +class SummarizationDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for Summarization Attributes: diff --git a/mteb/abstasks/AbsTaskTextRegression.py b/mteb/abstasks/AbsTaskTextRegression.py index 2946630e09..7fee3e2809 100644 --- a/mteb/abstasks/AbsTaskTextRegression.py +++ b/mteb/abstasks/AbsTaskTextRegression.py @@ -19,15 +19,20 @@ calculate_text_statistics, ) from mteb.models import MTEBModels +from mteb.models.models_protocols import Encoder from mteb.types import HFSubset, ScoresDict -from mteb.types.statistics import DescriptiveStatistics, ScoreStatistics, TextStatistics +from mteb.types.statistics import ( + ScoreStatistics, + SplitDescriptiveStatistics, + TextStatistics, +) from .AbsTask import AbsTask logger = logging.getLogger(__name__) -class 
RegressionDescriptiveStatistics(DescriptiveStatistics): +class RegressionDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for Regression Attributes: @@ -70,16 +75,17 @@ class AbsTaskTextRegression(AbsTask): def _evaluate_subset( self, - model: MTEBModels, - dataset: DatasetDict, + model: Encoder, + data_split: DatasetDict, + *, encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, prediction_folder: Path | None = None, **kwargs: Any, ) -> ScoresDict: - train_split = dataset[self.train_split] - eval_split = dataset[hf_split] + train_split = data_split[self.train_split] + eval_split = data_split[hf_split] scores_list, test_cache = [], None for i in range(self.n_experiments): diff --git a/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py index 7ee82b8d4d..8fc7e178ea 100644 --- a/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py +++ b/mteb/abstasks/Image/AbsTaskImageTextPairClassification.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from pathlib import Path from typing import Any from datasets import Dataset @@ -8,14 +9,14 @@ from mteb._evaluators import ImageTextPairClassificationEvaluator from mteb.models.models_protocols import Encoder from mteb.types import ScoresDict -from mteb.types.statistics import DescriptiveStatistics +from ...types.statistics import SplitDescriptiveStatistics from ..AbsTask import AbsTask logger = logging.getLogger(__name__) -class ImageTextPairClassificationDescriptiveStatistics(DescriptiveStatistics): +class ImageTextPairClassificationDescriptiveStatistics(SplitDescriptiveStatistics): """Descriptive statistics for ImageTextPairClassification Attributes: @@ -107,12 +108,13 @@ def _calculate_descriptive_statistics_from_split( def _evaluate_subset( self, model: Encoder, - dataset: Dataset, + data_split: Dataset, *, + encode_kwargs: dict[str, Any], hf_split: str, hf_subset: str, - encode_kwargs: dict[str, Any], - **kwargs, + prediction_folder: Path | None = None, + **kwargs: Any, ) -> ScoresDict: select_columns = [] for columns in (self.images_column_names, self.texts_column_names): @@ -121,9 +123,9 @@ def _evaluate_subset( else: select_columns.extend(columns) - dataset = dataset.select_columns(select_columns) + data_split = data_split.select_columns(select_columns) evaluator = ImageTextPairClassificationEvaluator( - dataset, + data_split, images_column_names=self.images_column_names, texts_column_names=self.texts_column_names, task_metadata=self.metadata, diff --git a/mteb/abstasks/aggregated_task.py b/mteb/abstasks/aggregated_task.py index 182cc0e0fc..79c9ee982e 100644 --- a/mteb/abstasks/aggregated_task.py +++ b/mteb/abstasks/aggregated_task.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np @@ -13,7 +14,8 @@ if TYPE_CHECKING: from datasets import Dataset, DatasetDict - from mteb.models.models_protocols import Encoder + from mteb.load_results.task_results import TaskResult + from mteb.models.models_protocols import MTEBModels from mteb.types import HFSubset, ScoresDict from mteb.types.statistics import DescriptiveStatistics @@ -27,6 +29,7 @@ class AbsTaskAggregate(AbsTask): _eval_splits: list[str] | None = None def __init__(self, **kwargs: Any): + super().__init__(**kwargs) self.tasks = self.metadata.tasks self.taskname_to_task = {task.metadata.name: task for task in self.tasks} @@ -120,11 +123,12 @@ def filter_eval_splits(self, 
eval_splits: list[str] | None) -> AbsTaskAggregate: def evaluate( self, - model: Encoder, + model: MTEBModels, split: str = "test", subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any], + prediction_folder: Path | None = None, **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: # TODO: If we refactor the runner to at least have a subfunction mteb.run_task(model, task) we could use that here @@ -134,7 +138,7 @@ def evaluate( def _evaluate_subset( self, - model: Encoder, + model: MTEBModels, data_split: DatasetDict | Dataset, encode_kwargs: dict[str, Any], **kwargs: Any, diff --git a/mteb/abstasks/retrieval_dataset_loaders.py b/mteb/abstasks/retrieval_dataset_loaders.py index 3b6fda31bc..c0c8595444 100644 --- a/mteb/abstasks/retrieval_dataset_loaders.py +++ b/mteb/abstasks/retrieval_dataset_loaders.py @@ -168,7 +168,7 @@ def _load_qrels(self) -> RelevantDocumentsType: ) # filter queries with no qrels - qrels_dict = defaultdict(dict) + qrels_dict: dict[str, dict[str, int]] = defaultdict(dict) def qrels_dict_init(row): qrels_dict[row["query-id"]][row["corpus-id"]] = int(row["score"]) diff --git a/mteb/abstasks/task_metadata.py b/mteb/abstasks/task_metadata.py index b10e1618f6..693d35f1c2 100644 --- a/mteb/abstasks/task_metadata.py +++ b/mteb/abstasks/task_metadata.py @@ -17,7 +17,7 @@ ConfigDict, field_validator, ) -from typing_extensions import Literal, TypedDict +from typing_extensions import Literal, Required, TypedDict import mteb from mteb.languages import check_language_code @@ -198,8 +198,8 @@ class MetadataDatasetDict(TypedDict, total=False): because datasets `v4` doesn't support this. This parameter is left for compatibility with forks/external usage. """ - path: str - revision: str + path: Required[str] + revision: Required[str] name: str split: str trust_remote_code: bool @@ -266,26 +266,8 @@ class TaskMetadata(BaseModel): adapted_from: list[str] | None = None def _validate_metadata(self) -> None: - self._dataset_path_is_specified(self.dataset) - self._dataset_revision_is_specified(self.dataset) self._eval_langs_are_valid(self.eval_langs) - @field_validator("dataset") - @classmethod - def _check_dataset_path_is_specified( - cls, dataset: dict[str, Any] - ) -> dict[str, Any]: - cls._dataset_path_is_specified(dataset) - return dataset - - @field_validator("dataset") - @classmethod - def _check_dataset_revision_is_specified( - cls, dataset: dict[str, Any] - ) -> dict[str, Any]: - cls._dataset_revision_is_specified(dataset) - return dataset - @field_validator("prompt") @classmethod def _check_prompt_is_valid( @@ -299,27 +281,6 @@ def _check_prompt_is_valid( ) return prompt - @staticmethod - def _dataset_path_is_specified(dataset: dict[str, Any]) -> None: - """This method checks that the dataset path is specified.""" - if "path" not in dataset or dataset["path"] is None: - raise ValueError( - "You must specify the path to the dataset in the dataset dictionary. " - + "See https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset" - ) - - @staticmethod - def _dataset_revision_is_specified(dataset: dict[str, Any]) -> None: - if "revision" not in dataset: - raise ValueError( - "You must explicitly specify a revision for the dataset (either a SHA or None)." - ) - if dataset["revision"] is None: - logger.warning( - "Revision missing for the dataset %s. 
It is encourage to specify a dataset revision for reproducability.", - dataset["path"], - ) - def _eval_langs_are_valid(self, eval_langs: Languages) -> None: """This method checks that the eval_langs are specified as a list of languages.""" if isinstance(eval_langs, dict): @@ -451,7 +412,7 @@ def revision(self) -> str: def _create_dataset_card_data( self, existing_dataset_card_data: DatasetCardData | None = None, - ) -> tuple[DatasetCardData, dict[str, str]]: + ) -> tuple[DatasetCardData, dict[str, Any]]: """Create a DatasetCardData object from the task metadata. Args: @@ -501,12 +462,12 @@ def _create_dataset_card_data( if self.category in ["i2t", "t2i", "it2t", "it2i", "t2it", "i2it", "it2it"]: dataset_type.extend(["image-to-text", "text-to-image"]) + languages: list[str] = [] if self.is_multilingual: - languages: list[str] = [] for val in list(self.eval_langs.values()): languages.extend(val) else: - languages: list[str] = self.eval_langs + languages = self.eval_langs # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters), # or a special value like "code", "multilingual". readme_langs = [] @@ -529,11 +490,7 @@ def _create_dataset_card_data( ] source_datasets.append(self.dataset["path"]) else: - source_datasets = ( - None - if not TaskMetadata.push_dataset_card_to_hub - else [self.dataset["path"]] - ) + source_datasets = None if not self.dataset else [self.dataset["path"]] tags = ["mteb"] + self.modalities diff --git a/mteb/create_dataloaders.py b/mteb/create_dataloaders.py index 8ea184b105..5b4fbd6348 100644 --- a/mteb/create_dataloaders.py +++ b/mteb/create_dataloaders.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import Any, Callable +from typing import Any, Callable, cast import torch from datasets import Dataset @@ -21,19 +21,23 @@ def create_dataloader_from_texts( - text: list[str], **dataloader_kwargs + text: list[str], + batch_size: int = 32, ) -> DataLoader[TextInput]: """Create a dataloader from a list of text. Args: text: A list of text to create a dataloader from. - dataloader_kwargs: Additional arguments to pass to the dataloader. + batch_size: Batch size for the dataloader. Returns: A dataloader with the text. """ dataset = Dataset.from_dict({"text": text}) - return torch.utils.data.DataLoader(dataset, **dataloader_kwargs) + return torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + ) def corpus_to_dict( @@ -56,19 +60,23 @@ def corpus_to_dict( def create_dataloader_for_retrieval_corpus( - dataset: Dataset, **dataloader_kwargs + dataset: Dataset, + batch_size: int = 32, ) -> DataLoader[CorpusInput]: """Create a dataloader from a corpus. Args: dataset: Corpus - dataloader_kwargs: Additional arguments to pass to the dataloader. + batch_size: Batch size for the dataloader. Returns: A dataloader with the corpus. """ new_ds = dataset.map(corpus_to_dict, desc="Converting corpus dict") - return torch.utils.data.DataLoader(new_ds, **dataloader_kwargs) + return torch.utils.data.DataLoader( + new_ds, + batch_size=batch_size, + ) def combine_queries_with_instruction_text(row: dict[str, str]) -> dict[str, str]: @@ -83,13 +91,13 @@ def combine_queries_with_instruction_text(row: dict[str, str]) -> dict[str, str] def create_text_dataloader_for_queries( queries: QueryDatasetType, - **dataloader_kwargs, + batch_size: int = 32, ) -> DataLoader[QueryInput]: """Create a dataloader from a list of queries. Args: queries: A list of queries. - dataloader_kwargs: Additional arguments to pass to the dataloader. 
+ batch_size: Batch size for the dataloader. Returns: A dataloader with the queries. @@ -97,7 +105,10 @@ def create_text_dataloader_for_queries( queries = queries.map( combine_queries_with_instruction_text, desc="Processing queries for dataloading" ) - return torch.utils.data.DataLoader(queries, **dataloader_kwargs) + return torch.utils.data.DataLoader( + queries, + batch_size=batch_size, + ) def convert_conv_history_to_query( @@ -106,6 +117,7 @@ def convert_conv_history_to_query( conversation = row["text"] # if it's a list of strings, just join them if isinstance(conversation, list) and isinstance(conversation[0], str): + conversation = cast(list[str], conversation) conv_str = "; ".join(conversation) current_conversation = [ ConversationTurn(role="user", content=message) for message in conversation @@ -155,13 +167,13 @@ def convert_conv_history_to_query( def create_dataloader_for_queries_conversation( queries: QueryDatasetType, - **dataloader_kwargs, + batch_size: int = 32, ) -> DataLoader[QueryInput]: """Create a dataloader from a list of queries. Args: queries: A list of queries. - dataloader_kwargs: Additional arguments to pass to the dataloader. + batch_size: Batch size for the dataloader. Returns: A dataloader with the queries. @@ -171,7 +183,7 @@ def create_dataloader_for_queries_conversation( convert_conv_history_to_query, desc="Converting conversations to queries" ), collate_fn=custom_collate_fn, - **dataloader_kwargs, + batch_size=batch_size, ) @@ -258,16 +270,16 @@ def create_image_dataloader( def create_text_queries_dataloader( dataset: Dataset, - **dataloader_kwargs: dict[str, Any], + batch_size: int = 32, ) -> DataLoader[BatchedInput]: if not isinstance(dataset["text"][0], list): return create_text_dataloader_for_queries( dataset, - **dataloader_kwargs, + batch_size=batch_size, ) return create_dataloader_for_queries_conversation( dataset, - **dataloader_kwargs, + batch_size=batch_size, ) @@ -275,14 +287,19 @@ def create_queries_dataloader( dataset: Dataset, task_metadata: TaskMetadata, input_column: str | None = None, - **dataloader_kwargs: dict[str, Any], + batch_size: int = 32, ) -> DataLoader[BatchedInput]: queries_type, _ = task_metadata.category.split("2") if queries_type == "t": # text only - return create_text_queries_dataloader(dataset, **dataloader_kwargs) + return create_text_queries_dataloader( + dataset, + batch_size=batch_size, + ) if "i" in queries_type: # contains image return create_image_dataloader( - dataset, image_column_name="image", **dataloader_kwargs + dataset, + image_column_name="image", + batch_size=batch_size, ) raise ValueError(f"Can't handle queries type {queries_type}") @@ -291,14 +308,22 @@ def create_document_dataloader( dataset: Dataset, task_metadata: TaskMetadata, input_column: str | None = None, - **dataloader_kwargs: dict[str, Any], + batch_size: int = 32, ) -> DataLoader[BatchedInput]: - _, document_type = task_metadata.category.split("2") + if task_metadata.category is None: + document_type = "t" + else: + _, document_type = task_metadata.category.split("2") if document_type == "t": # text only - return create_dataloader_for_retrieval_corpus(dataset, **dataloader_kwargs) + return create_dataloader_for_retrieval_corpus( + dataset, + batch_size=batch_size, + ) if "i" in document_type: # contains image return create_image_dataloader( - dataset, image_column_name="image", **dataloader_kwargs + dataset, + image_column_name="image", + batch_size=batch_size, ) raise ValueError(f"Can't handle queries type {document_type}") @@ -308,35 +333,34 @@ 
def create_dataloader( task_metadata: TaskMetadata, prompt_type: PromptType | None = None, input_column: str | None = None, - **dataloader_kwargs: dict[str, Any], + batch_size: int = 32, + **kwargs: dict[str, Any], ) -> DataLoader[BatchedInput]: if prompt_type == PromptType.query: return create_queries_dataloader( dataset, task_metadata, + batch_size=batch_size, input_column=input_column, - **dataloader_kwargs, ) if prompt_type == PromptType.document: return create_document_dataloader( dataset, task_metadata, input_column=input_column, - **dataloader_kwargs, + batch_size=batch_size, ) if "image" in task_metadata.modalities: return create_image_dataloader( dataset, image_column_name=input_column, - **dataloader_kwargs, ) if "text" in task_metadata.modalities and input_column is not None: return create_dataloader_from_texts( dataset[input_column], - **dataloader_kwargs, ) return DataLoader( dataset, - **dataloader_kwargs, + batch_size=batch_size, ) diff --git a/mteb/models/model_implementations/listconranker.py b/mteb/models/model_implementations/listconranker.py index e48a680234..47e5f6c189 100644 --- a/mteb/models/model_implementations/listconranker.py +++ b/mteb/models/model_implementations/listconranker.py @@ -13,7 +13,7 @@ class ListConRanker(RerankerWrapper): - def __init__(self, model_name_or_path: str | None = None, **kwargs) -> None: + def __init__(self, model_name_or_path: str, **kwargs) -> None: from transformers import AutoModelForSequenceClassification, AutoTokenizer super().__init__(model_name_or_path, **kwargs) diff --git a/mteb/models/model_meta.py b/mteb/models/model_meta.py index a3025733b3..4e50a32341 100644 --- a/mteb/models/model_meta.py +++ b/mteb/models/model_meta.py @@ -216,7 +216,7 @@ def is_zero_shot_on(self, tasks: Sequence[AbsTask] | Sequence[str]) -> bool | No intersection = training_datasets & benchmark_datasets return len(intersection) == 0 - def get_training_datasets(self) -> dict[str, list[str]] | None: + def get_training_datasets(self) -> set[str] | None: """Returns all training datasets of the model including similar tasks.""" import mteb diff --git a/mteb/types/_encoder_io.py b/mteb/types/_encoder_io.py index f995459a75..b77a360c47 100644 --- a/mteb/types/_encoder_io.py +++ b/mteb/types/_encoder_io.py @@ -98,7 +98,7 @@ class AudioInput(TypedDict): audio: list[list[bytes]] -class MultimodalInput(TextInput, CorpusInput, QueryInput, ImageInput, AudioInput): +class MultimodalInput(TextInput, CorpusInput, QueryInput, ImageInput, AudioInput): # type: ignore[misc] """The input to the encoder for multimodal data.""" pass diff --git a/mteb/types/statistics.py b/mteb/types/statistics.py index e0a9fa610c..7d6e821f1f 100644 --- a/mteb/types/statistics.py +++ b/mteb/types/statistics.py @@ -1,14 +1,20 @@ from __future__ import annotations -from typing import TypedDict +from typing_extensions import NotRequired, TypedDict +from mteb.types import HFSubset -class DescriptiveStatistics(TypedDict): - """Class for descriptive statistics.""" +class SplitDescriptiveStatistics(TypedDict): pass +class DescriptiveStatistics(TypedDict, SplitDescriptiveStatistics): + """Class for descriptive statistics.""" + + hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]] + + class TextStatistics(TypedDict): """Class for descriptive statistics for texts. 
diff --git a/pyproject.toml b/pyproject.toml index dd7555b4b6..2e4da63816 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,13 +107,28 @@ docs = [ "mkdocs-bibtex>=2.16.2", "mkdocs-exclude>=1.0.2", "mkdocs-include-dir-to-nav>=1.2.0", - "tabulate>=0.9.0" + "tabulate>=0.9.0", +] +typing = [ + "mypy>=1.18.1", + "types-cachetools>=6.2.0.20250827", + "types-pysocks>=1.7.1.20250828", + "types-pyyaml>=6.0.12.20250822", + "types-requests>=2.32.4.20250913", + "types-simplejson>=3.20.0.20250822", + "types-tqdm>=4.67.0.20250809", + "types-tensorflow>=2.18.0.20250809", + # stubs require python >=3.10 + # "pandas-stubs>=2.3.2.250827", + # "scipy-stubs>=1.15.3.0", ] dev = [ {include-group = "lint"}, {include-group = "test"}, + {include-group = "typing"}, ] + [tool.coverage.report] omit = ["tests/*", "mteb/tasks/**/*", "scripts"] @@ -262,3 +277,29 @@ conflicts = [ [{ extra = "colpali-engine" }, { extra = "pylate" }], [{ extra = "colpali-engine" }, { extra = "llm2vec" }] ] + +[tool.mypy] +plugins = ['pydantic.mypy'] + +[[tool.mypy.overrides]] +# these modules not typed and don't have stubs +module = [ + "datasets", + "sklearn", + "sklearn.*", +] +ignore_missing_imports = true + +[[tool.mypy.overrides]] +# don't typecheck these modules (too many issues) +module = [ + "mteb.models.model_implementations.*", + "mteb.tasks.*", + "mteb.leaderboard.*", +] +ignore_errors = true + +[[tool.mypy.overrides]] +# mypy can't resolve dataset dict +module = ["mteb.abstasks.*"] +disable_error_code = ["index"]
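
Note on the DescriptiveStatistics refactor in mteb/types/statistics.py above: the sketch below is a minimal, self-contained illustration (not library code) of how per-split statistics now subclass SplitDescriptiveStatistics while the task-level DescriptiveStatistics optionally nests them under the "hf_subset_descriptive_stats" key used in AbsTask.calculate_descriptive_statistics. The TextClusteringStats class and the HFSubset = str alias are hypothetical stand-ins, and the real DescriptiveStatistics also keeps TypedDict among its bases.

from __future__ import annotations

from typing_extensions import NotRequired, TypedDict

HFSubset = str  # simplified stand-in for mteb.types.HFSubset


class SplitDescriptiveStatistics(TypedDict):
    """Base for per-split statistics; concrete task types subclass this."""


class TextClusteringStats(SplitDescriptiveStatistics):
    # Hypothetical concrete subclass, analogous to ClusteringDescriptiveStatistics.
    num_samples: int
    unique_labels: int


class DescriptiveStatistics(SplitDescriptiveStatistics):
    """Task-level statistics; multilingual tasks may nest per-subset stats."""

    hf_subset_descriptive_stats: NotRequired[dict[HFSubset, SplitDescriptiveStatistics]]


# One split-level entry, and a task-level entry nesting it per HF subset:
split_stats: SplitDescriptiveStatistics = TextClusteringStats(num_samples=100, unique_labels=10)
task_stats: DescriptiveStatistics = {
    "hf_subset_descriptive_stats": {"eng-Latn": split_stats},
}
print(task_stats["hf_subset_descriptive_stats"]["eng-Latn"])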