Commit 45114a5

Reupload datasets with trust remote code (#3161)
* start removing trust remote code
* fix dataset uploading
* upload bitext as parallel
* fix uploading
* try to upload all
* continue reuploading
* fix zeroshot upload
* add comment to trust remote code
* reupload legal bench
* fix datasets
* remove exceptions
* fix tests
* fix bitext loading
* fix bitext
* try to fix retrieval
* fix
* fix revisions
* remove parameter from metadata dataset dict
1 parent 59f6c01 commit 45114a5

143 files changed: +637, -5134 lines
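
Context for the diffs below: after the reupload, the affected task datasets live under the mteb organisation on the Hub and load with datasets v4, which no longer supports trust_remote_code. A minimal sketch of what this means for users (the task name is taken from this commit; the snippet itself is not part of the diff):

import mteb

# BornholmBitextMining is one of the tasks repointed in this commit; loading it
# no longer needs trust_remote_code (which datasets v4 removed).
task = mteb.get_task("BornholmBitextMining")
task.load_data()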


mteb/abstasks/AbsTask.py

Lines changed: 2 additions & 8 deletions
@@ -313,7 +313,7 @@ def load_data(self) -> None:
         self.dataset_transform()
         self.data_loaded = True
 
-    def fast_load(self, **kwargs: Any) -> None:
+    def fast_load(self) -> None:
         """**Deprecated**. Load all subsets at once, then group by language. Using fast loading has two requirements:
 
         - Each row in the dataset should have a 'lang' feature giving the corresponding language/language pair
@@ -546,22 +546,16 @@ def _upload_dataset_to_hub(
     def _push_dataset_to_hub(self, repo_name: str) -> None:
         raise NotImplementedError
 
-    def push_dataset_to_hub(self, repo_name: str, reupload: bool = False) -> None:
+    def push_dataset_to_hub(self, repo_name: str) -> None:
         """Push the dataset to the HuggingFace Hub.
 
         Args:
             repo_name: The name of the repository to push the dataset to.
-            reupload: If true, then `source_datasets` will be added to model card with source dataset.
 
         Examples:
             >>> import mteb
             >>> task = mteb.get_task("Caltech101")
             >>> repo_name = f"myorg/{task.metadata.name}"
-            >>> task.load_data() # ensure that the dataset can load
-            >>>
-            >>> # Create the repo on HuggingFace Hub if it does not exist
-            >>> from huggingface_hub import create_repo
-            >>> create_repo(repo_name, repo_type="dataset")
             >>> # Push the dataset to the Hub
             >>> task.push_dataset_to_hub(repo_name)
         """

mteb/abstasks/AbsTaskAnyZeroShotClassification.py

Lines changed: 11 additions & 0 deletions
@@ -123,6 +123,17 @@ def _evaluate_subset(
         )
         return evaluator(model, encode_kwargs=encode_kwargs)
 
+    def _push_dataset_to_hub(self, repo_name: str) -> None:
+        self._upload_dataset_to_hub(
+            repo_name,
+            [
+                self.input_column_name,
+                self.label_column_name,
+            ],
+        )
+        labels_dataset = Dataset.from_dict({"labels": self.get_candidate_labels()})
+        labels_dataset.push_to_hub(repo_name, config_name="labels")
+
     def get_candidate_labels(self) -> list[str]:
         """Return the text candidates for zeroshot classification"""
         raise NotImplementedError("This method should be overridden by subclasses")
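
A hedged sketch of reading back the extra "labels" config pushed above (the repo name is a placeholder; a single Dataset pushed this way normally lands under a default "train" split):

from datasets import load_dataset

labels = load_dataset("myorg/SomeZeroShotTask", "labels", split="train")  # placeholder repo
candidate_labels = labels["labels"]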

mteb/abstasks/AbsTaskBitextMining.py

Lines changed: 11 additions & 19 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+from collections import defaultdict
 from typing import Any
 
 from datasets import Dataset, DatasetDict
@@ -181,34 +182,25 @@ def _calculate_descriptive_statistics_from_split(
 
     def _push_dataset_to_hub(self, repo_name: str) -> None:
         if self.metadata.is_multilingual:
+            dataset = defaultdict(dict)
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
 
-                sentences = {}
                 if self.parallel_subsets:
-                    # If there are parallel subsets, process them
                     for split in self.dataset:
                         sent_1, sent_2 = config.split("-")
-                        sentences[split] = Dataset.from_dict(
-                            {
-                                "sentence1": self.dataset[split][sent_1],
-                                "sentence2": self.dataset[split][sent_2],
-                            }
-                        )
+                        dataset[split][sent_1] = self.dataset[split][sent_1]
+                        dataset[split][sent_2] = self.dataset[split][sent_2]
                 else:
-                    # Handle the non-parallel subset case
                     sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0]
+                    lang_1, lang_2 = config.split("-")
                     for split in self.dataset[config]:
-                        sentences[split] = Dataset.from_dict(
-                            {
-                                "sentence1": self.dataset[config][split][sent_1],
-                                "sentence2": self.dataset[config][split][sent_2],
-                            }
-                        )
-                sentences = DatasetDict(sentences)
-                sentences.push_to_hub(
-                    repo_name, config, commit_message=f"Add {config} subset"
-                )
+                        dataset[split][lang_1] = self.dataset[config][split][sent_1]
+                        dataset[split][lang_2] = self.dataset[config][split][sent_2]
+            for split in dataset:
+                dataset[split] = Dataset.from_dict(dataset[split])
+            dataset = DatasetDict(dataset)
+            dataset.push_to_hub(repo_name)
         else:
             sentences = {}
             for split in self.dataset:
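
A sketch of the layout the new multilingual upload produces, with illustrative language codes: a single default config, one split per original split, and one column per language instead of per-config sentence1/sentence2 datasets.

from datasets import Dataset, DatasetDict

dataset = DatasetDict(
    {
        "test": Dataset.from_dict(
            {
                "eng": ["Hello", "Good morning"],  # illustrative language columns
                "deu": ["Hallo", "Guten Morgen"],
            }
        )
    }
)
dataset.push_to_hub("myorg/SomeBitextTask")  # placeholder repo name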

mteb/abstasks/AbsTaskRetrieval.py

Lines changed: 2 additions & 2 deletions
@@ -554,7 +554,7 @@ def _push_dataset_to_hub(self, repo_name: str) -> None:
            self.convert_v1_dataset_format_to_v2()
 
        def _push_section(
-            data: dict[str, dict[Any, Any]],
+            data: dict[str, dict[str, Dataset | dict]],
            subset_item: str,
            hf_subset_name: str,
            converter: Callable[[Any, Any], dict[str, Any]] | None = None,
@@ -572,7 +572,7 @@ def _push_section(
                # skip empty instructions and top ranked
                if subset_item not in data[split] or data[split][subset_item] is None:
                    continue
-                if isinstance(sections[split], Dataset):
+                if isinstance(data[split][subset_item], Dataset):
                    sections[split] = data[split][subset_item]
                elif converter is not None:
                    sections[split] = Dataset.from_list(

mteb/abstasks/retrieval_dataset_loaders.py

Lines changed: 1 addition & 0 deletions
@@ -155,6 +155,7 @@ def _load_qrels(self) -> RelevantDocumentsType:
        )
 
        qrels_ds = self.load_dataset_split(config)
+        qrels_ds = qrels_ds.select_columns(["query-id", "corpus-id", "score"])
 
        qrels_ds = qrels_ds.cast(
            Features(
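
A toy example of the defensive column selection added above (values are made up): stray columns in a qrels split are dropped before the cast to the expected schema.

from datasets import Dataset

qrels = Dataset.from_dict(
    {"query-id": ["q1"], "corpus-id": ["d7"], "score": [1], "extra": ["dropped"]}
)
qrels = qrels.select_columns(["query-id", "corpus-id", "score"])
print(qrels.column_names)  # ['query-id', 'corpus-id', 'score']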

mteb/abstasks/task_metadata.py

Lines changed: 3 additions & 3 deletions
@@ -194,15 +194,15 @@ class MetadataDatasetDict(TypedDict, total=False):
        revision: The revision of the dataset.
        name: The name the dataset config.
        split: The split of the dataset.
-        trust_remote_code: Whether to trust the remote code.
+        trust_remote_code: Whether to use `trust_remote_code`. Datasets shouldn't use this,
+            because datasets `v4` no longer supports it; the parameter is kept only for compatibility with forks/external usage.
    """
 
    path: str
    revision: str
    name: str
    split: str
    trust_remote_code: bool
-    dataset_version: str  # NLPJournalAbsArticleRetrieval.V2
 
 
 class TaskMetadata(BaseModel):
@@ -552,7 +552,7 @@ def _create_dataset_card_data(
        license_mapping = {
            "not specified": "unknown",
            "msr-la-nc": "other",
-            "cc-by-nd-2.1-jp": "cc-by-nd-2.1",
+            "cc-by-nd-2.1-jp": "other",
        }
        dataset_license = license_mapping.get(
            dataset_license,

mteb/create_dataloaders.py

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ def corpus_to_dict(
 ) -> dict[str, str]:
    text = (
        (row["title"] + " " + row["text"]).strip()
-        if "title" in row
+        if "title" in row and len(row["title"]) > 0
        else row["text"].strip()
    )
    new_row = {
@@ -50,7 +50,7 @@ def corpus_to_dict(
        "body": row["text"],
    }
    # dataloders can't handle None
-    if "title" in row and row["title"] is not None:
+    if "title" in row and row["title"] is not None and len(row["title"]) > 0:
        new_row["title"] = row["title"]
    return new_row
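
A toy illustration of the new empty-title guard (the row is made up): an empty title is no longer prepended to the text.

row = {"title": "", "text": "Body only."}
text = (
    (row["title"] + " " + row["text"]).strip()
    if "title" in row and len(row["title"]) > 0
    else row["text"].strip()
)
assert text == "Body only."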

mteb/tasks/BitextMining/dan/BornholmskBitextMining.py

Lines changed: 2 additions & 8 deletions
@@ -8,9 +8,8 @@ class BornholmBitextMining(AbsTaskBitextMining):
    metadata = TaskMetadata(
        name="BornholmBitextMining",
        dataset={
-            "path": "strombergnlp/bornholmsk_parallel",
-            "revision": "3bc5cfb4ec514264fe2db5615fac9016f7251552",
-            "trust_remote_code": True,
+            "path": "mteb/BornholmBitextMining",
+            "revision": "5b02048bd75e79275aa91a1fce6cdfd3f4a391cb",
        },
        description="Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.",
        reference="https://aclanthology.org/W19-6138/",
@@ -43,8 +42,3 @@ class BornholmBitextMining(AbsTaskBitextMining):
        """,
        prompt="Retrieve parallel sentences.",
    )
-
-    def dataset_transform(self):
-        # Convert to standard format
-        self.dataset = self.dataset.rename_column("da", "sentence1")
-        self.dataset = self.dataset.rename_column("da_bornholm", "sentence2")

mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py

Lines changed: 0 additions & 1 deletion
@@ -865,7 +865,6 @@ class BibleNLPBitextMining(AbsTaskBitextMining):
            "path": "davidstap/biblenlp-corpus-mmteb",
            "revision": "264a18480c529d9e922483839b4b9758e690b762",
            "split": f"train[:{_N}]",
-            "trust_remote_code": True,
        },
        description="Partial Bible translations in 829 languages, aligned by verse.",
        reference="https://arxiv.org/abs/2304.09919",

mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py

Lines changed: 2 additions & 34 deletions
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-import datasets
-
 from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
 from mteb.abstasks.task_metadata import TaskMetadata
 
@@ -10,9 +8,8 @@ class DiaBLaBitextMining(AbsTaskBitextMining):
    metadata = TaskMetadata(
        name="DiaBlaBitextMining",
        dataset={
-            "path": "rbawden/DiaBLa",
-            "revision": "5345895c56a601afe1a98519ce3199be60a27dba",
-            "trust_remote_code": True,
+            "path": "mteb/DiaBlaBitextMining",
+            "revision": "c458e9bf4306d6380604462926a38c34861b4d3b",
        },
        description="English-French Parallel Corpus. DiaBLa is an English-French dataset for the evaluation of Machine Translation (MT) for informal, written bilingual dialogue.",
        reference="https://inria.hal.science/hal-03021633",
@@ -42,32 +39,3 @@ class DiaBLaBitextMining(AbsTaskBitextMining):
        }
        """,
    )
-
-    def load_data(self) -> None:
-        """Load dataset from HuggingFace hub and convert it to the standard format."""
-        if self.data_loaded:
-            return
-
-        self.dataset = {}
-
-        for lang in self.hf_subsets:
-            self.dataset[lang] = datasets.load_dataset(**self.metadata.dataset)
-
-        self.dataset_transform()
-        self.data_loaded = True
-
-    def dataset_transform(self):
-        def create_columns(row):
-            """Put all French texts in column 'sentence1' and English texts in 'sentence2' column"""
-            row["orig_lang"] = row["utterance_meta"]["lang"]
-            row["sentence1"] = (
-                row["orig"] if row["orig_lang"] == "french" else row["ref"]
-            )
-            row["sentence2"] = (
-                row["orig"] if not row["orig_lang"] == "french" else row["ref"]
-            )
-            return row
-
-        # Convert to standard format
-        for lang in self.hf_subsets:
-            self.dataset[lang] = self.dataset[lang].map(create_columns)
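
Likewise, removing the custom load_data/dataset_transform assumes the reuploaded mteb/DiaBlaBitextMining repo already stores the standard format; a way to inspect it without guessing config names:

from datasets import get_dataset_config_names, load_dataset

revision = "c458e9bf4306d6380604462926a38c34861b4d3b"
configs = get_dataset_config_names("mteb/DiaBlaBitextMining", revision=revision)
ds = load_dataset("mteb/DiaBlaBitextMining", configs[0], revision=revision)
print(configs)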
