Merged · Changes from 18 commits
10 changes: 2 additions & 8 deletions mteb/abstasks/AbsTask.py
@@ -321,7 +321,7 @@ def load_data(self) -> None:
self.dataset_transform()
self.data_loaded = True

def fast_load(self, **kwargs: Any) -> None:
def fast_load(self) -> None:
"""**Deprecated**. Load all subsets at once, then group by language. Using fast loading has two requirements:

- Each row in the dataset should have a 'lang' feature giving the corresponding language/language pair
@@ -554,22 +554,16 @@ def _upload_dataset_to_hub(
def _push_dataset_to_hub(self, repo_name: str) -> None:
raise NotImplementedError

def push_dataset_to_hub(self, repo_name: str, reupload: bool = False) -> None:
def push_dataset_to_hub(self, repo_name: str) -> None:
"""Push the dataset to the HuggingFace Hub.

Args:
repo_name: The name of the repository to push the dataset to.
reupload: If true, then `source_datasets` will be added to model card with source dataset.

Examples:
>>> import mteb
>>> task = mteb.get_task("Caltech101")
>>> repo_name = f"myorg/{task.metadata.name}"
>>> task.load_data() # ensure that the dataset can load
>>>
>>> # Create the repo on HuggingFace Hub if it does not exist
>>> from huggingface_hub import create_repo
>>> create_repo(repo_name, repo_type="dataset")
>>> # Push the dataset to the Hub
>>> task.push_dataset_to_hub(repo_name)
"""
17 changes: 11 additions & 6 deletions mteb/abstasks/AbsTaskAnyZeroShotClassification.py
@@ -60,12 +60,6 @@ class AbsTaskAnyZeroShotClassification(AbsTask):
input_column_name: str = "image"
label_column_name: str = "label"

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _calculate_descriptive_statistics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> ZeroShotClassificationDescriptiveStatistics:
@@ -126,6 +120,17 @@ def _evaluate_subset(
)
return evaluator(model, encode_kwargs=encode_kwargs)

def _push_dataset_to_hub(self, repo_name: str) -> None:
self._upload_dataset_to_hub(
repo_name,
[
self.input_column_name,
self.label_column_name,
],
)
labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
labels_dataset.push_to_hub(repo_name, config_name="labels")

def get_candidate_labels(self) -> list[str]:
"""Return the text candidates for zeroshot classification"""
raise NotImplementedError("This method should be overridden by subclasses")
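
Note: `get_candidate_labels` is deliberately left unimplemented in the base class, so each zero-shot task must override it; the new `_push_dataset_to_hub` above then uploads those candidates as a separate `labels` config. A minimal sketch of such an override, assuming a hypothetical task and label set (not part of this PR):

from mteb.abstasks.AbsTaskAnyZeroShotClassification import (
    AbsTaskAnyZeroShotClassification,
)


class PetsZeroShotClassification(AbsTaskAnyZeroShotClassification):
    """Hypothetical task; metadata omitted for brevity."""

    input_column_name = "image"
    label_column_name = "label"

    def get_candidate_labels(self) -> list[str]:
        # One text prompt per class; the evaluator encodes these as candidates.
        return [f"a photo of a {name}." for name in ("cat", "dog", "bird")]
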
30 changes: 11 additions & 19 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import logging
from collections import defaultdict
from typing import Any

from datasets import Dataset, DatasetDict
@@ -181,34 +182,25 @@ def _calculate_descriptive_statistics_from_split(

def _push_dataset_to_hub(self, repo_name: str) -> None:
if self.metadata.is_multilingual:
dataset = defaultdict(dict)
for config in self.metadata.eval_langs:
logger.info(f"Converting {config} of {self.metadata.name}")

sentences = {}
if self.parallel_subsets:
# If there are parallel subsets, process them
for split in self.dataset:
sent_1, sent_2 = config.split("-")
sentences[split] = Dataset.from_dict(
{
"sentence1": self.dataset[split][sent_1],
"sentence2": self.dataset[split][sent_2],
}
)
dataset[split][sent_1] = self.dataset[split][sent_1]
dataset[split][sent_2] = self.dataset[split][sent_2]
else:
# Handle the non-parallel subset case
sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0]
lang_1, lang_2 = config.split("-")
for split in self.dataset[config]:
sentences[split] = Dataset.from_dict(
{
"sentence1": self.dataset[config][split][sent_1],
"sentence2": self.dataset[config][split][sent_2],
}
)
sentences = DatasetDict(sentences)
sentences.push_to_hub(
repo_name, config, commit_message=f"Add {config} subset"
)
dataset[split][lang_1] = self.dataset[config][split][sent_1]
dataset[split][lang_2] = self.dataset[config][split][sent_2]
for split in dataset:
dataset[split] = Dataset.from_dict(dataset[split])
dataset = DatasetDict(dataset)
dataset.push_to_hub(repo_name)
else:
sentences = {}
for split in self.dataset:
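
For context, the multilingual branch above now collects every language-pair config into a single DatasetDict keyed by split, with one column per language, and pushes it in one call instead of one push per config. A rough, self-contained sketch of that grouping (the configs and strings below are placeholders, not corpus content):

from collections import defaultdict

from datasets import Dataset, DatasetDict

# Toy stand-in for self.dataset[config][split][sent]; placeholder data only.
raw = {
    "eng-deu": {"test": {"sentence1": ["hello"], "sentence2": ["hallo"]}},
    "eng-fra": {"test": {"sentence1": ["hello"], "sentence2": ["bonjour"]}},
}

dataset = defaultdict(dict)
for config, splits in raw.items():
    lang_1, lang_2 = config.split("-")
    for split, columns in splits.items():
        dataset[split][lang_1] = columns["sentence1"]
        dataset[split][lang_2] = columns["sentence2"]

dataset = DatasetDict(
    {split: Dataset.from_dict(columns) for split, columns in dataset.items()}
)
# dataset["test"] now has one column per language (eng, deu, fra);
# dataset.push_to_hub(repo_name) would then upload everything in a single call.
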
4 changes: 2 additions & 2 deletions mteb/abstasks/AbsTaskRetrieval.py
@@ -554,7 +554,7 @@ def _push_dataset_to_hub(self, repo_name: str) -> None:
self.convert_v1_dataset_format_to_v2()

def _push_section(
data: dict[str, dict[Any, Any]],
data: dict[str, dict[str, Dataset | dict]],
subset_item: str,
hf_subset_name: str,
converter: Callable[[Any, Any], dict[str, Any]] | None = None,
@@ -572,7 +572,7 @@ def _push_section(
# skip empty instructions and top ranked
if subset_item not in data[split] or data[split][subset_item] is None:
continue
if isinstance(sections[split], Dataset):
if isinstance(data[split][subset_item], Dataset):
sections[split] = data[split][subset_item]
elif converter is not None:
sections[split] = Dataset.from_list(
1 change: 1 addition & 0 deletions mteb/abstasks/retrieval_dataset_loaders.py
@@ -155,6 +155,7 @@ def _load_qrels(self) -> RelevantDocumentsType:
)

qrels_ds = self.load_dataset_split(config)
qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"])

qrels_ds = qrels_ds.cast(
Features(
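
The added select_columns call drops any stray columns before the cast, since Dataset.cast expects the supplied Features to describe exactly the columns present. A small illustrative sketch of the pattern (the extra column and target dtypes are assumptions, not taken from the loader):

from datasets import Dataset, Features, Value

# Illustrative qrels split with a stray column; selecting the expected columns
# first keeps the subsequent cast from failing on unexpected features.
qrels_ds = Dataset.from_dict(
    {
        "query-id": ["q1"],
        "corpus-id": ["d1"],
        "score": [1],
        "extra": ["dropped"],  # hypothetical extra column
    }
)
qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"])
qrels_ds = qrels_ds.cast(
    Features(
        {
            "query-id": Value("string"),
            "corpus-id": Value("string"),
            "score": Value("int64"),  # assumed dtype; the real loader may differ
        }
    )
)
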
5 changes: 3 additions & 2 deletions mteb/abstasks/task_metadata.py
@@ -194,7 +194,8 @@ class MetadataDatasetDict(TypedDict, total=False):
revision: The revision of the dataset.
name: The name the dataset config.
split: The split of the dataset.
trust_remote_code: Whether to trust the remote code.
trust_remote_code: Whether to use `trust_remote_code`. Datasets shouldn't use this,
because datasets `v4` no longer supports it. This parameter is left for compatibility with forks/external usage.
"""

path: str
@@ -552,7 +553,7 @@ def _create_dataset_card_data(
license_mapping = {
"not specified": "unknown",
"msr-la-nc": "other",
"cc-by-nd-2.1-jp": "cc-by-nd-2.1",
"cc-by-nd-2.1-jp": "other",
}
dataset_license = license_mapping.get(
dataset_license,
4 changes: 2 additions & 2 deletions mteb/create_dataloaders.py
@@ -41,7 +41,7 @@ def corpus_to_dict(
) -> dict[str, str]:
text = (
(row["title"] + " " + row["text"]).strip()
if "title" in row
if "title" in row and len(row["title"]) > 0
else row["text"].strip()
)
new_row = {
@@ -50,7 +50,7 @@
"body": row["text"],
}
# dataloaders can't handle None
if "title" in row and row["title"] is not None:
if "title" in row and row["title"] is not None and len(row["title"]) > 0:
new_row["title"] = row["title"]
return new_row

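
With this change an empty title is treated the same as a missing one: it is neither prepended to the text nor kept in the output row. A trimmed-down sketch of the behaviour (the real helper builds additional fields that are collapsed in the diff above; the rows here are illustrative):

def corpus_to_dict_sketch(row: dict[str, str]) -> dict[str, str]:
    # Simplified version of the updated logic; extra fields omitted.
    text = (
        (row["title"] + " " + row["text"]).strip()
        if "title" in row and len(row["title"]) > 0
        else row["text"].strip()
    )
    new_row = {"text": text, "body": row["text"]}
    # dataloaders can't handle None, and empty titles are now dropped as well
    if "title" in row and row["title"] is not None and len(row["title"]) > 0:
        new_row["title"] = row["title"]
    return new_row


print(corpus_to_dict_sketch({"title": "", "text": "no title here"}))
# {'text': 'no title here', 'body': 'no title here'}
print(corpus_to_dict_sketch({"title": "A title", "text": "some text"}))
# {'text': 'A title some text', 'body': 'some text', 'title': 'A title'}
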
10 changes: 2 additions & 8 deletions mteb/tasks/BitextMining/dan/BornholmskBitextMining.py
@@ -8,9 +8,8 @@ class BornholmBitextMining(AbsTaskBitextMining):
metadata = TaskMetadata(
name="BornholmBitextMining",
dataset={
"path": "strombergnlp/bornholmsk_parallel",
"revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
"trust_remote_code": True,
"path": "mteb/BornholmBitextMining",
"revision": "5b02048bd75e79275aa91a1fce6cdfd3f4a391cb",
},
description="Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.",
reference="https://aclanthology.org/W19-6138/",
@@ -43,8 +42,3 @@ class BornholmBitextMining(AbsTaskBitextMining):
""",
prompt="Retrieve parallel sentences.",
)

def dataset_transform(self):
# Convert to standard format
self.dataset = self.dataset.rename_column("da", "sentence1")
self.dataset = self.dataset.rename_column("da_bornholm", "sentence2")
@@ -865,7 +865,6 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
"path": "davidstap/biblenlp-corpus-mmteb",
"revision": "264a18480c529d9e922483839b4b9758e690b762",
"split": f"train[:{_N}]",
"trust_remote_code": True,
},
description="Partial Bible translations in 829 languages, aligned by verse.",
reference="https://arxiv.org/abs/2304.09919",
36 changes: 2 additions & 34 deletions mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py
@@ -1,7 +1,5 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
from mteb.abstasks.task_metadata import TaskMetadata

@@ -10,9 +8,8 @@ class DiaBLaBitextMining(AbsTaskBitextMining):
metadata = TaskMetadata(
name="DiaBlaBitextMining",
dataset={
"path": "rbawden/DiaBLa",
"revision": "5345895c56a601afe1a98519ce3199be60a27dba",
"trust_remote_code": True,
"path": "mteb/DiaBlaBitextMining",
"revision": "c458e9bf4306d6380604462926a38c34861b4d3b",
},
description="English-French Parallel Corpus. DiaBLa is an English-French dataset for the evaluation of Machine Translation (MT) for informal, written bilingual dialogue.",
reference="https://inria.hal.science/hal-03021633",
@@ -42,32 +39,3 @@ class DiaBLaBitextMining(AbsTaskBitextMining):
}
""",
)

def load_data(self) -> None:
"""Load dataset from HuggingFace hub and convert it to the standard format."""
if self.data_loaded:
return

self.dataset = {}

for lang in self.hf_subsets:
self.dataset[lang] = datasets.load_dataset(**self.metadata.dataset)

self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
def create_columns(row):
"""Put all French texts in column 'sentence1' and English texts in 'sentence2' column"""
row["orig_lang"] = row["utterance_meta"]["lang"]
row["sentence1"] = (
row["orig"] if row["orig_lang"] == "french" else row["ref"]
)
row["sentence2"] = (
row["orig"] if not row["orig_lang"] == "french" else row["ref"]
)
return row

# Convert to standard format
for lang in self.hf_subsets:
self.dataset[lang] = self.dataset[lang].map(create_columns)
29 changes: 21 additions & 8 deletions mteb/tasks/BitextMining/multilingual/FloresBitextMining.py
@@ -1,8 +1,9 @@
from __future__ import annotations

from typing import Any
from collections import defaultdict

import datasets
from datasets import Dataset

from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
from mteb.abstasks.task_metadata import TaskMetadata
@@ -235,13 +236,11 @@ def extend_lang_pairs() -> dict[str, list[str]]:


class FloresBitextMining(AbsTaskBitextMining):
parallel_subsets = True
metadata = TaskMetadata(
name="FloresBitextMining",
dataset={
"path": "mteb/flores",
"revision": "e6b647fcb6299a2f686f742f4d4c023e553ea67e",
"trust_remote_code": True,
"path": "mteb/FloresBitextMining",
"revision": "2144d16cc15edd22d4a9237d12bff5f31f5c07fc",
},
description="FLORES is a benchmark dataset for machine translation between English and low-resource languages.",
reference="https://huggingface.co/datasets/facebook/flores",
@@ -269,9 +268,23 @@ class FloresBitextMining(AbsTaskBitextMining):
""",
)

def load_data(self, **kwargs: Any) -> None:
"""Load dataset from HuggingFace hub"""
def load_data(self) -> None:
if self.data_loaded:
return
self.dataset = datasets.load_dataset(**self.metadata.dataset)

dataset = datasets.load_dataset(
**self.metadata.dataset,
split=self.metadata.eval_splits[0],
)
self.dataset = defaultdict(dict)
for lang in self.metadata.eval_langs:
first_lang, second_lang = lang.split("-")
ds = Dataset.from_dict(
{
"sentence1": dataset[first_lang],
"sentence2": dataset[second_lang],
}
)
self.dataset[lang][self.metadata.eval_splits[0]] = ds

self.data_loaded = True
29 changes: 21 additions & 8 deletions mteb/tasks/BitextMining/multilingual/IN22ConvBitextMining.py
@@ -1,8 +1,9 @@
from __future__ import annotations

from typing import Any
from collections import defaultdict

import datasets
from datasets import Dataset

from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
from mteb.abstasks.task_metadata import TaskMetadata
@@ -67,13 +68,11 @@ def check_uniques(example, uniques):


class IN22ConvBitextMining(AbsTaskBitextMining):
parallel_subsets = True
metadata = TaskMetadata(
name="IN22ConvBitextMining",
dataset={
"path": "mteb/IN22-Conv",
"revision": "16f46f059d56eac7c65c3c9581a45e40199eb140",
"trust_remote_code": True,
"path": "mteb/IN22ConvBitextMining",
"revision": "4729cdf8e2c21d5d8e953b2e256ccd5d7a6716cd",
},
description="IN22-Conv is a n-way parallel conversation domain benchmark dataset for machine translation spanning English and 22 Indic languages.",
reference="https://huggingface.co/datasets/ai4bharat/IN22-Conv",
@@ -103,9 +102,23 @@ class IN22ConvBitextMining(AbsTaskBitextMining):
""",
)

def load_data(self, **kwargs: Any) -> None:
"""Load dataset from HuggingFace hub"""
def load_data(self) -> None:
if self.data_loaded:
return
self.dataset = datasets.load_dataset(**self.metadata.dataset)

dataset = datasets.load_dataset(
**self.metadata.dataset,
split=self.metadata.eval_splits[0],
)
self.dataset = defaultdict(dict)
for lang in self.metadata.eval_langs:
first_lang, second_lang = lang.split("-")
ds = Dataset.from_dict(
{
"sentence1": dataset[first_lang],
"sentence2": dataset[second_lang],
}
)
self.dataset[lang][self.metadata.eval_splits[0]] = ds

self.data_loaded = True