@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+from collections import defaultdict
 from typing import Any
 
 from datasets import Dataset, DatasetDict
@@ -181,34 +182,25 @@ def _calculate_descriptive_statistics_from_split(
 
     def _push_dataset_to_hub(self, repo_name: str) -> None:
         if self.metadata.is_multilingual:
+            dataset = defaultdict(dict)
             for config in self.metadata.eval_langs:
                 logger.info(f"Converting {config} of {self.metadata.name}")
 
-                sentences = {}
                 if self.parallel_subsets:
-                    # If there are parallel subsets, process them
                     for split in self.dataset:
                         sent_1, sent_2 = config.split("-")
-                        sentences[split] = Dataset.from_dict(
-                            {
-                                "sentence1": self.dataset[split][sent_1],
-                                "sentence2": self.dataset[split][sent_2],
-                            }
-                        )
+                        dataset[split][sent_1] = self.dataset[split][sent_1]
+                        dataset[split][sent_2] = self.dataset[split][sent_2]
                 else:
-                    # Handle the non-parallel subset case
                     sent_1, sent_2 = self.get_pairs(self.parallel_subsets)[0]
+                    lang_1, lang_2 = config.split("-")
                     for split in self.dataset[config]:
-                        sentences[split] = Dataset.from_dict(
-                            {
-                                "sentence1": self.dataset[config][split][sent_1],
-                                "sentence2": self.dataset[config][split][sent_2],
-                            }
-                        )
-                sentences = DatasetDict(sentences)
-                sentences.push_to_hub(
-                    repo_name, config, commit_message=f"Add {config} subset"
-                )
+                        dataset[split][lang_1] = self.dataset[config][split][sent_1]
+                        dataset[split][lang_2] = self.dataset[config][split][sent_2]
+            for split in dataset:
+                dataset[split] = Dataset.from_dict(dataset[split])
+            dataset = DatasetDict(dataset)
+            dataset.push_to_hub(repo_name)
         else:
             sentences = {}
             for split in self.dataset:
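The net effect of the change above, seen in isolation: instead of pushing one sentence1/sentence2 config per language pair, the method now accumulates one column per language for every split and pushes a single DatasetDict. A minimal sketch of that pattern with toy data follows; the raw dict, the eng-deu pair, and the variable names are illustrative assumptions, not code from this repository.

from collections import defaultdict

from datasets import Dataset, DatasetDict

# Hypothetical parallel data: one "eng-deu" config with a single "test" split.
raw = {"test": {"eng": ["Hello.", "Good morning."], "deu": ["Hallo.", "Guten Morgen."]}}

dataset = defaultdict(dict)
for config in ["eng-deu"]:
    lang_1, lang_2 = config.split("-")
    for split, columns in raw.items():
        # Accumulate one column per language under each split, mirroring the diff.
        dataset[split][lang_1] = columns[lang_1]
        dataset[split][lang_2] = columns[lang_2]

# One Dataset per split, one column per language; a real upload would end with
# dataset.push_to_hub(repo_name).
dataset = DatasetDict({split: Dataset.from_dict(cols) for split, cols in dataset.items()})
print(dataset["test"].column_names)  # ['eng', 'deu']

Collapsing the per-config push_to_hub calls into a single push at the end also means one hub upload per dataset rather than one commit per language pair.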