Commit 45114a5

Reupload datasets with trust remote code (#3161)
* start removing trust remote code
* fix dataset uploading
* upload bitext as parallel
* fix uploading
* try to upload all
* continue reuploading
* fix zeroshot upload
* add comment to trust remote code
* reupload legal bench
* fix datasets
* remove exceptions
* fix tests
* fix bitext loading
* fix bitext
* try to fix retrieval
* fix
* fix revisions
* remove parameter from metadata dataset dict
1 parent 59f6c01 commit 45114a5

143 files changed: +637, -5134 lines
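
Context for the diffs below: after the reupload, the affected task datasets live under the mteb organisation on the Hub and load with datasets v4, which no longer supports trust_remote_code. A minimal sketch of what this means for users (the task name is taken from this commit; the snippet itself is not part of the diff):

import mteb

# BornholmBitextMining is one of the tasks repointed in this commit; loading it
# no longer needs trust_remote_code (which datasets v4 removed).
task = mteb.get_task("BornholmBitextMining")
task.load_data()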


mteb/abstasks/AbsTask.py

Lines changed: 2 additions & 8 deletions
@@ -313,7 +313,7 @@ def load_data(self) -> None:
         self.dataset_transform()
         self.data_loaded = True
 
-    def fast_load(self, **kwargs: Any) -> None:
+    def fast_load(self) -> None:
         """**Deprecated**. Load all subsets at once, then group by language. Using fast loading has two requirements:
 
         - Each row in the dataset should have a 'lang' feature giving the corresponding language/language pair
@@ -546,22 +546,16 @@ def _upload_dataset_to_hub(
     def _push_dataset_to_hub(self, repo_name: str) -> None:
         raise NotImplementedError
 
-    def push_dataset_to_hub(self, repo_name: str, reupload: bool = False) -> None:
+    def push_dataset_to_hub(self, repo_name: str) -> None:
         """Push the dataset to the HuggingFace Hub.
 
         Args:
             repo_name: The name of the repository to push the dataset to.
-            reupload: If true, then `source_datasets` will be added to model card with source dataset.
 
         Examples:
             >>> import mteb
             >>> task = mteb.get_task("Caltech101")
             >>> repo_name = f"myorg/{task.metadata.name}"
-            >>> task.load_data() # ensure that the dataset can load
-            >>>
-            >>> # Create the repo on HuggingFace Hub if it does not exist
-            >>> from huggingface_hub import create_repo
-            >>> create_repo(repo_name, repo_type="dataset")
             >>> # Push the dataset to the Hub
             >>> task.push_dataset_to_hub(repo_name)
         """

mteb/abstasks/AbsTaskAnyZeroShotClassification.py

Lines changed: 11 additions & 0 deletions
@@ -123,6 +123,17 @@ def _evaluate_subset(
         )
         return evaluator(model, encode_kwargs=encode_kwargs)
 
+    def _push_dataset_to_hub(self, repo_name: str) -> None:
+        self._upload_dataset_to_hub(
+            repo_name,
+            [
+                self.input_column_name,
+                self.label_column_name,
+            ],
+        )
+        labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
+        labels_dataset.push_to_hub(repo_name, config_name="labels")
+
     def get_candidate_labels(self) -> list[str]:
         """Return the text candidates for zeroshot classification"""
         raise NotImplementedError("This method should be overridden by subclasses")
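
A hedged sketch of reading back the extra "labels" config pushed above (the repo name is a placeholder; a single Dataset pushed this way normally lands under a default "train" split):

from datasets import load_dataset

labels = load_dataset("myorg/SomeZeroShotTask", "labels", split="train")  # placeholder repo
candidate_labels = labels["labels"]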

mteb/abstasks/AbsTaskBitextMining.py

Lines changed: 11 additions & 19 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+from collections import defaultdict
 from typing import Any
 
 from datasets import Dataset, DatasetDict
@@ -181,34 +182,25 @@ def _calculate_descriptive_statistics_from_split(
 
     def _push_dataset_to_hub(self, repo_name: str) -> None:
         if self.metadata.is_multilingual:
+            dataset = defaultdict(dict)
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
 
-                sentences = {}
                 if self.parallel_subsets:
-                    # If there are parallel subsets, process them
                     for split in self.dataset:
                         sent_1, sent_2 = config.split("-")
-                        sentences[split] = Dataset.from_dict(
-                            {
-                                "sentence1": self.dataset[split][sent_1],
-                                "sentence2": self.dataset[split][sent_2],
-                            }
-                        )
+                        dataset[split][sent_1] = self.dataset[split][sent_1]
+                        dataset[split][sent_2] = self.dataset[split][sent_2]
                 else:
-                    # Handle the non-parallel subset case
                     sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0]
+                    lang_1, lang_2 = config.split("-")
                     for split in self.dataset[config]:
-                        sentences[split] = Dataset.from_dict(
-                            {
-                                "sentence1": self.dataset[config][split][sent_1],
-                                "sentence2": self.dataset[config][split][sent_2],
-                            }
-                        )
-                sentences = DatasetDict(sentences)
-                sentences.push_to_hub(
-                    repo_name, config, commit_message=f"Add {config} subset"
-                )
+                        dataset[split][lang_1] = self.dataset[config][split][sent_1]
+                        dataset[split][lang_2] = self.dataset[config][split][sent_2]
+            for split in dataset:
+                dataset[split] = Dataset.from_dict(dataset[split])
+            dataset = DatasetDict(dataset)
+            dataset.push_to_hub(repo_name)
         else:
             sentences = {}
             for split in self.dataset:
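
A sketch of the layout the new multilingual upload produces, with illustrative language codes: a single default config, one split per original split, and one column per language instead of per-config sentence1/sentence2 datasets.

from datasets import Dataset, DatasetDict

dataset = DatasetDict(
    {
        "test": Dataset.from_dict(
            {
                "eng": ["Hello", "Good morning"],  # illustrative language columns
                "deu": ["Hallo", "Guten Morgen"],
            }
        )
    }
)
dataset.push_to_hub("myorg/SomeBitextTask")  # placeholder repo name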

mteb/abstasks/AbsTaskRetrieval.py

Lines changed: 2 additions & 2 deletions
@@ -554,7 +554,7 @@ def _push_dataset_to_hub(self, repo_name: str) -> None:
            self.convert_v1_dataset_format_to_v2()
 
        def _push_section(
-            data: dict[str, dict[Any, Any]],
+            data: dict[str, dict[str, Dataset | dict]],
            subset_item: str,
            hf_subset_name: str,
            converter: Callable[[Any, Any], dict[str, Any]] | None = None,
@@ -572,7 +572,7 @@ def _push_section(
                # skip empty instructions and top ranked
                if subset_item not in data[split] or data[split][subset_item] is None:
                    continue
-                if isinstance(sections[split], Dataset):
+                if isinstance(data[split][subset_item], Dataset):
                    sections[split] = data[split][subset_item]
                elif converter is not None:
                    sections[split] = Dataset.from_list(

mteb/abstasks/retrieval_dataset_loaders.py

Lines changed: 1 addition & 0 deletions
@@ -155,6 +155,7 @@ def _load_qrels(self) -> RelevantDocumentsType:
        )
 
        qrels_ds = self.load_dataset_split(config)
+        qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"])
 
        qrels_ds = qrels_ds.cast(
            Features(
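
A toy example of the defensive column selection added above (values are made up): stray columns in a qrels split are dropped before the cast to the expected schema.

from datasets import Dataset

qrels = Dataset.from_dict(
    {"query-id": ["q1"], "corpus-id": ["d7"], "score": [1], "extra": ["dropped"]}
)
qrels = qrels.select_columns(["query-id", "corpus-id", "score"])
print(qrels.column_names)  # ['query-id', 'corpus-id', 'score']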

mteb/abstasks/task_metadata.py

Lines changed: 3 additions & 3 deletions
@@ -194,15 +194,15 @@ class MetadataDatasetDict(TypedDict, total=False):
        revision: The revision of the dataset.
        name: The name the dataset config.
        split: The split of the dataset.
-        trust_remote_code: Whether to trust the remote code.
+        trust_remote_code: Whether to use `trust_remote_code`. Datasets shouldn't use this,
+            because datasets `v4` no longer supports it; the parameter is kept only for compatibility with forks/external usage.
    """
 
    path: str
    revision: str
    name: str
    split: str
    trust_remote_code: bool
-    dataset_version: str  # NLPJournalAbsArticleRetrieval.V2
 
 
 class TaskMetadata(BaseModel):
@@ -552,7 +552,7 @@ def _create_dataset_card_data(
        license_mapping = {
            "not specified": "unknown",
            "msr-la-nc": "other",
-            "cc-by-nd-2.1-jp": "cc-by-nd-2.1",
+            "cc-by-nd-2.1-jp": "other",
        }
        dataset_license = license_mapping.get(
            dataset_license,

mteb/create_dataloaders.py

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ def corpus_to_dict(
 ) -> dict[str, str]:
    text = (
        (row["title"] + " " + row["text"]).strip()
-        if "title" in row
+        if "title" in row and len(row["title"]) > 0
        else row["text"].strip()
    )
    new_row = {
@@ -50,7 +50,7 @@ def corpus_to_dict(
        "body": row["text"],
    }
    # dataloders can't handle None
-    if "title" in row and row["title"] is not None:
+    if "title" in row and row["title"] is not None and len(row["title"]) > 0:
        new_row["title"] = row["title"]
    return new_row
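
A toy illustration of the new empty-title guard (the row is made up): an empty title is no longer prepended to the text.

row = {"title": "", "text": "Body only."}
text = (
    (row["title"] + " " + row["text"]).strip()
    if "title" in row and len(row["title"]) > 0
    else row["text"].strip()
)
assert text == "Body only."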

mteb/tasks/BitextMining/dan/BornholmskBitextMining.py

Lines changed: 2 additions & 8 deletions
@@ -8,9 +8,8 @@ class BornholmBitextMining(AbsTaskBitextMining):
    metadata = TaskMetadata(
        name="BornholmBitextMining",
        dataset={
-            "path": "strombergnlp/bornholmsk_parallel",
-            "revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
-            "trust_remote_code": True,
+            "path": "mteb/BornholmBitextMining",
+            "revision": "5b02048bd75e79275aa91a1fce6cdfd3f4a391cb",
        },
        description="Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.",
        reference="https://aclanthology.org/W19-6138/",
@@ -43,8 +42,3 @@ class BornholmBitextMining(AbsTaskBitextMining):
        """,
        prompt="Retrieve parallel sentences.",
    )
-
-    def dataset_transform(self):
-        # Convert to standard format
-        self.dataset = self.dataset.rename_column("da", "sentence1")
-        self.dataset = self.dataset.rename_column("da_bornholm", "sentence2")

mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py

Lines changed: 0 additions & 1 deletion
@@ -865,7 +865,6 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
            "path": "davidstap/biblenlp-corpus-mmteb",
            "revision": "264a18480c529d9e922483839b4b9758e690b762",
            "split": f"train[:{_N}]",
-            "trust_remote_code": True,
        },
        description="Partial Bible translations in 829 languages, aligned by verse.",
        reference="https://arxiv.org/abs/2304.09919",

mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py

Lines changed: 2 additions & 34 deletions
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-import datasets
-
 from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
 from mteb.abstasks.task_metadata import TaskMetadata
 
@@ -10,9 +8,8 @@ class DiaBLaBitextMining(AbsTaskBitextMining):
    metadata = TaskMetadata(
        name="DiaBlaBitextMining",
        dataset={
-            "path": "rbawden/DiaBLa",
-            "revision": "5345895c56a601afe1a98519ce3199be60a27dba",
-            "trust_remote_code": True,
+            "path": "mteb/DiaBlaBitextMining",
+            "revision": "c458e9bf4306d6380604462926a38c34861b4d3b",
        },
        description="English-French Parallel Corpus. DiaBLa is an English-French dataset for the evaluation of Machine Translation (MT) for informal, written bilingual dialogue.",
        reference="https://inria.hal.science/hal-03021633",
@@ -42,32 +39,3 @@ class DiaBLaBitextMining(AbsTaskBitextMining):
        }
        """,
    )
-
-    def load_data(self) -> None:
-        """Load dataset from HuggingFace hub and convert it to the standard format."""
-        if self.data_loaded:
-            return
-
-        self.dataset = {}
-
-        for lang in self.hf_subsets:
-            self.dataset[lang] = datasets.load_dataset(**self.metadata.dataset)
-
-        self.dataset_transform()
-        self.data_loaded = True
-
-    def dataset_transform(self):
-        def create_columns(row):
-            """Put all French texts in column 'sentence1' and English texts in 'sentence2' column"""
-            row["orig_lang"] = row["utterance_meta"]["lang"]
-            row["sentence1"] = (
-                row["orig"] if row["orig_lang"] == "french" else row["ref"]
-            )
-            row["sentence2"] = (
-                row["orig"] if not row["orig_lang"] == "french" else row["ref"]
-            )
-            return row
-
-        # Convert to standard format
-        for lang in self.hf_subsets:
-            self.dataset[lang] = self.dataset[lang].map(create_columns)
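
Likewise, removing the custom load_data/dataset_transform assumes the reuploaded mteb/DiaBlaBitextMining repo already stores the standard format; a way to inspect it without guessing config names:

from datasets import get_dataset_config_names, load_dataset

revision = "c458e9bf4306d6380604462926a38c34861b4d3b"
configs = get_dataset_config_names("mteb/DiaBlaBitextMining", revision=revision)
ds = load_dataset("mteb/DiaBlaBitextMining", configs[0], revision=revision)
print(configs)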
