From 79045953cc0fcd3e124abaeab08eb3be6c40c599 Mon Sep 17 00:00:00 2001 From: MedAmineYoussef Date: Tue, 5 Aug 2025 18:02:39 -0400 Subject: [PATCH 1/8] fracas added --- mteb/tasks/PairClassification/__init__.py | 2 + mteb/tasks/PairClassification/fra/FraCaS.py | 82 +++++++++++++++++++ mteb/tasks/PairClassification/fra/__init__.py | 0 3 files changed, 84 insertions(+) create mode 100644 mteb/tasks/PairClassification/fra/FraCaS.py create mode 100644 mteb/tasks/PairClassification/fra/__init__.py diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index d3ecd19272..f44cfbdf45 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -14,6 +14,8 @@ from .eng.TwitterURLCorpusPC import * from .fas.FaMTEBPairClassification import * from .fas.FarsTail import * +from .fra.FraCaS import * + from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * from .ita.DisCoTexPairClassification import * diff --git a/mteb/tasks/PairClassification/fra/FraCaS.py b/mteb/tasks/PairClassification/fra/FraCaS.py new file mode 100644 index 0000000000..ce175c84d4 --- /dev/null +++ b/mteb/tasks/PairClassification/fra/FraCaS.py @@ -0,0 +1,82 @@ +from datasets import load_dataset +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + +class FracasTask(AbsTaskPairClassification): + metadata = TaskMetadata( + name="fracas", + description=( + "Natural language inference on FRACAS: " + "prédit si une hypothèse découle (entailment) ou non d'une prémisse." + ), + reference="https://huggingface.co/datasets/maximoss/fracas", + dataset={"path": "maximoss/fracas", "revision": "main"}, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["train"], # FRACAS ne propose que ce split + eval_langs=["fra-Latn"], + main_score="max_accuracy", + date=("2025-08-05", "2025-08-05"), + domains=["Academic"], + task_subtypes=["Textual Entailment"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{fracas2025, + author = {Maxim Oss and Collaborateurs}, + title = {FRACAS: A French NLI dataset}, + booktitle = {Imaginary Conference on French NLP}, + year = {2025}, +} +""", + ) + + def load_data(self, **kwargs): + """Charge le DatasetDict HF puis transforme en self.dataset.""" + if getattr(self, "data_loaded", False): + return + self.dataset = load_dataset( + self.metadata.dataset["path"], + revision=self.metadata.dataset["revision"], + ) + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + """ + Construit self.dataset sous la forme : + { + 'fra-Latn': { + 'train': [ + { + 'sentence1': [...], # liste de prémisses + 'sentence2': [...], # liste d’hypothèses + 'labels': [...], # liste de 0/1 + } + ] + } + } + """ + out: dict[str, dict[str, list[dict[str, list]]]] = {} + for lang in self.hf_subsets: # ['fra-Latn'] + out[lang] = {} + for split in self.metadata.eval_splits: # ['train'] + ds = self.dataset[split] + # Affiche les labels pour debugging + print("FRACAS labels disponibles :", sorted(set(ds["label"]))) + # Filtrer hors 'undef' + ds = ds.filter(lambda x: x["label"] != "undef") + # Remapper '1'→1 (positif), '0' et '2'→0 (négatif) + ds = ds.map(lambda ex: {"label": 1 if ex["label"] == "1" else 0}) + # Construire la liste contenant UN dict de listes alignées + out[lang][split] = [ + { + "sentence1": ds["premises"], 
+ "sentence2": ds["hypothesis"], + "labels": ds["label"], + } + ] + self.dataset = out \ No newline at end of file diff --git a/mteb/tasks/PairClassification/fra/__init__.py b/mteb/tasks/PairClassification/fra/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 3d5596eb14e1d731e094adadea3d6addde5671e4 Mon Sep 17 00:00:00 2001 From: MedAmineYoussef <152202583+MedAmineYoussef@users.noreply.github.com> Date: Wed, 6 Aug 2025 10:49:19 -0400 Subject: [PATCH 2/8] Update mteb/tasks/PairClassification/fra/FraCaS.py Co-authored-by: Roman Solomatin --- mteb/tasks/PairClassification/fra/FraCaS.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/PairClassification/fra/FraCaS.py b/mteb/tasks/PairClassification/fra/FraCaS.py index ce175c84d4..92c8a8d079 100644 --- a/mteb/tasks/PairClassification/fra/FraCaS.py +++ b/mteb/tasks/PairClassification/fra/FraCaS.py @@ -66,7 +66,6 @@ def dataset_transform(self): for split in self.metadata.eval_splits: # ['train'] ds = self.dataset[split] # Affiche les labels pour debugging - print("FRACAS labels disponibles :", sorted(set(ds["label"]))) # Filtrer hors 'undef' ds = ds.filter(lambda x: x["label"] != "undef") # Remapper '1'→1 (positif), '0' et '2'→0 (négatif) From 0282c3967dca6e9d8cd927818dccc8258e50f767 Mon Sep 17 00:00:00 2001 From: MedAmineYoussef Date: Wed, 6 Aug 2025 10:51:28 -0400 Subject: [PATCH 3/8] Update FraCaS.py --- mteb/tasks/PairClassification/fra/FraCaS.py | 77 ++++++++++----------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/mteb/tasks/PairClassification/fra/FraCaS.py b/mteb/tasks/PairClassification/fra/FraCaS.py index 92c8a8d079..9a55998833 100644 --- a/mteb/tasks/PairClassification/fra/FraCaS.py +++ b/mteb/tasks/PairClassification/fra/FraCaS.py @@ -10,11 +10,11 @@ class FracasTask(AbsTaskPairClassification): "prédit si une hypothèse découle (entailment) ou non d'une prémisse." 
), reference="https://huggingface.co/datasets/maximoss/fracas", - dataset={"path": "maximoss/fracas", "revision": "main"}, + dataset={"path": "maximoss/fracas", "revision": "2506e60be409b124bd72336038dea6f9460ea70c"}, type="PairClassification", category="s2s", modalities=["text"], - eval_splits=["train"], # FRACAS ne propose que ce split + eval_splits=["train"], eval_langs=["fra-Latn"], main_score="max_accuracy", date=("2025-08-05", "2025-08-05"), @@ -25,57 +25,56 @@ class FracasTask(AbsTaskPairClassification): dialect=[], sample_creation="found", bibtex_citation=r""" -@inproceedings{fracas2025, - author = {Maxim Oss and Collaborateurs}, - title = {FRACAS: A French NLI dataset}, - booktitle = {Imaginary Conference on French NLP}, - year = {2025}, +@inproceedings{amblard-etal-2020-french, + title = "A {F}rench Version of the {F}ra{C}a{S} Test Suite", + author = "Amblard, Maxime and + Beysson, Cl{\'e}ment and + de Groote, Philippe and + Guillaume, Bruno and + Pogodalla, Sylvain", + editor = "Calzolari, Nicoletta and + B{\'e}chet, Fr{\'e}d{\'e}ric and + Blache, Philippe and + Choukri, Khalid and + Cieri, Christopher and + Declerck, Thierry and + Goggi, Sara and + Isahara, Hitoshi and + Maegaard, Bente and + Mariani, Joseph and + Mazo, H{\'e}l{\`e}ne and + Moreno, Asuncion and + Odijk, Jan and + Piperidis, Stelios", + booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference", + month = may, + year = "2020", + address = "Marseille, France", + publisher = "European Language Resources Association", + url = "https://aclanthology.org/2020.lrec-1.721", + pages = "5887--5895", + abstract = "This paper presents a French version of the FraCaS test suite. This test suite, originally written in English, contains problems illustrating semantic inference in natural language. We describe linguistic choices we had to make when translating the FraCaS test suite in French, and discuss some of the issues that were raised by the translation. We also report an experiment we ran in order to test both the translation and the logical semantics underlying the problems of the test suite. 
This provides a way of checking formal semanticists{'} hypotheses against actual semantic capacity of speakers (in the present case, French speakers), and allow us to compare the results we obtained with the ones of similar experiments that have been conducted for other languages.", + language = "English", + ISBN = "979-10-95546-34-4", } + """, ) - def load_data(self, **kwargs): - """Charge le DatasetDict HF puis transforme en self.dataset.""" - if getattr(self, "data_loaded", False): - return - self.dataset = load_dataset( - self.metadata.dataset["path"], - revision=self.metadata.dataset["revision"], - ) - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): - """ - Construit self.dataset sous la forme : - { - 'fra-Latn': { - 'train': [ - { - 'sentence1': [...], # liste de prémisses - 'sentence2': [...], # liste d’hypothèses - 'labels': [...], # liste de 0/1 - } - ] - } - } - """ + out: dict[str, dict[str, list[dict[str, list]]]] = {} - for lang in self.hf_subsets: # ['fra-Latn'] + for lang in self.hf_subsets: out[lang] = {} - for split in self.metadata.eval_splits: # ['train'] + for split in self.metadata.eval_splits: ds = self.dataset[split] - # Affiche les labels pour debugging - # Filtrer hors 'undef' ds = ds.filter(lambda x: x["label"] != "undef") - # Remapper '1'→1 (positif), '0' et '2'→0 (négatif) ds = ds.map(lambda ex: {"label": 1 if ex["label"] == "1" else 0}) - # Construire la liste contenant UN dict de listes alignées out[lang][split] = [ { "sentence1": ds["premises"], "sentence2": ds["hypothesis"], - "labels": ds["label"], + "labels": ds["label"], } ] self.dataset = out \ No newline at end of file From 7de0d141ae3becba3afe6ccb273b4aae2eabb2f5 Mon Sep 17 00:00:00 2001 From: MedAmineYoussef <152202583+MedAmineYoussef@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:41:09 -0400 Subject: [PATCH 4/8] Update mteb/tasks/PairClassification/fra/FraCaS.py Co-authored-by: Roman Solomatin --- mteb/tasks/PairClassification/fra/FraCaS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/PairClassification/fra/FraCaS.py b/mteb/tasks/PairClassification/fra/FraCaS.py index 9a55998833..540127aeea 100644 --- a/mteb/tasks/PairClassification/fra/FraCaS.py +++ b/mteb/tasks/PairClassification/fra/FraCaS.py @@ -23,7 +23,7 @@ class FracasTask(AbsTaskPairClassification): license="cc-by-4.0", annotations_creators="human-annotated", dialect=[], - sample_creation="found", + sample_creation="translated", bibtex_citation=r""" @inproceedings{amblard-etal-2020-french, title = "A {F}rench Version of the {F}ra{C}a{S} Test Suite", From f9be37ef88b13b92d7bafcb4d87def36a187f0c8 Mon Sep 17 00:00:00 2001 From: MedAmineYoussef Date: Thu, 7 Aug 2025 10:49:14 -0400 Subject: [PATCH 5/8] Update FraCaS.py --- mteb/tasks/PairClassification/fra/FraCaS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/PairClassification/fra/FraCaS.py b/mteb/tasks/PairClassification/fra/FraCaS.py index 540127aeea..d162cae116 100644 --- a/mteb/tasks/PairClassification/fra/FraCaS.py +++ b/mteb/tasks/PairClassification/fra/FraCaS.py @@ -17,7 +17,7 @@ class FracasTask(AbsTaskPairClassification): eval_splits=["train"], eval_langs=["fra-Latn"], main_score="max_accuracy", - date=("2025-08-05", "2025-08-05"), + date = ("2020-01-01", "2020-12-31"), domains=["Academic"], task_subtypes=["Textual Entailment"], license="cc-by-4.0", From 964620554e9c6792c8ab6f8d1a6a472b5af08f1e Mon Sep 17 00:00:00 2001 From: MedAmineYoussef Date: Mon, 11 Aug 2025 10:42:10 -0400 Subject: 
[PATCH 6/8] Update adding_a_dataset.md --- docs/adding_a_dataset.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/adding_a_dataset.md b/docs/adding_a_dataset.md index 6df514c514..dfc3579ec4 100644 --- a/docs/adding_a_dataset.md +++ b/docs/adding_a_dataset.md @@ -237,8 +237,8 @@ The PR will be reviewed by one of the organizers or contributors who might ask y Before you commit, here is a checklist you should complete before submitting: -- [ ] I have outlined why this dataset is filling an existing gap in `mteb` -- [ ] I have tested that the dataset runs with the `mteb` package. +- [x] I have outlined why this dataset is filling an existing gap in `mteb` +- [x] I have tested that the dataset runs with the `mteb` package. An easy way to test it is using: ```python @@ -251,8 +251,8 @@ evaluation = mteb.MTEB(tasks=[task]) evaluation.run(model) ``` -- [ ] I have run the following models on the task (adding the results to the pr). These can be run using the `mteb run -m {model_name} -t {task_name}` command. - - [ ] `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` - - [ ] `intfloat/multilingual-e5-small` -- [ ] I have checked that the performance is neither trivial (both models gain close to perfect scores) nor random (both models gain close to random scores). -- [ ] I have considered the size of the dataset and reduced it if it is too big (2048 examples is typically large enough for most tasks) +- [x] I have run the following models on the task (adding the results to the pr). These can be run using the `mteb run -m {model_name} -t {task_name}` command. + - [x] `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` + - [x] `intfloat/multilingual-e5-small` +- [x] I have checked that the performance is neither trivial (both models gain close to perfect scores) nor random (both models gain close to random scores). +- [x] I have considered the size of the dataset and reduced it if it is too big (2048 examples is typically large enough for most tasks) From e90b6c9aa942666ff93aec685e88940f867cddc5 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 18 Sep 2025 12:31:45 +0200 Subject: [PATCH 7/8] revert --- docs/adding_a_dataset.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/adding_a_dataset.md b/docs/adding_a_dataset.md index dfc3579ec4..6df514c514 100644 --- a/docs/adding_a_dataset.md +++ b/docs/adding_a_dataset.md @@ -237,8 +237,8 @@ The PR will be reviewed by one of the organizers or contributors who might ask y Before you commit, here is a checklist you should complete before submitting: -- [x] I have outlined why this dataset is filling an existing gap in `mteb` -- [x] I have tested that the dataset runs with the `mteb` package. +- [ ] I have outlined why this dataset is filling an existing gap in `mteb` +- [ ] I have tested that the dataset runs with the `mteb` package. An easy way to test it is using: ```python @@ -251,8 +251,8 @@ evaluation = mteb.MTEB(tasks=[task]) evaluation.run(model) ``` -- [x] I have run the following models on the task (adding the results to the pr). These can be run using the `mteb run -m {model_name} -t {task_name}` command. - - [x] `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` - - [x] `intfloat/multilingual-e5-small` -- [x] I have checked that the performance is neither trivial (both models gain close to perfect scores) nor random (both models gain close to random scores). 
-- [x] I have considered the size of the dataset and reduced it if it is too big (2048 examples is typically large enough for most tasks) +- [ ] I have run the following models on the task (adding the results to the pr). These can be run using the `mteb run -m {model_name} -t {task_name}` command. + - [ ] `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` + - [ ] `intfloat/multilingual-e5-small` +- [ ] I have checked that the performance is neither trivial (both models gain close to perfect scores) nor random (both models gain close to random scores). +- [ ] I have considered the size of the dataset and reduced it if it is too big (2048 examples is typically large enough for most tasks) From 079db8fba3863893531df709fea90abc018f8f3f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 18 Sep 2025 12:33:07 +0200 Subject: [PATCH 8/8] fixed comments --- mteb/tasks/PairClassification/__init__.py | 2 +- ...FraCaS.py => fracas_pairclassification.py} | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) rename mteb/tasks/PairClassification/fra/{FraCaS.py => fracas_pairclassification.py} (69%) diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index f44cfbdf45..0608b2f209 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -14,7 +14,7 @@ from .eng.TwitterURLCorpusPC import * from .fas.FaMTEBPairClassification import * from .fas.FarsTail import * -from .fra.FraCaS import * +from .fra.fracas_pairclassification import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * diff --git a/mteb/tasks/PairClassification/fra/FraCaS.py b/mteb/tasks/PairClassification/fra/fracas_pairclassification.py similarity index 69% rename from mteb/tasks/PairClassification/fra/FraCaS.py rename to mteb/tasks/PairClassification/fra/fracas_pairclassification.py index d162cae116..848ae9d81e 100644 --- a/mteb/tasks/PairClassification/fra/FraCaS.py +++ b/mteb/tasks/PairClassification/fra/fracas_pairclassification.py @@ -1,29 +1,34 @@ -from datasets import load_dataset +from __future__ import annotations + from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification from mteb.abstasks.TaskMetadata import TaskMetadata -class FracasTask(AbsTaskPairClassification): + +class FracasPairClassification(AbsTaskPairClassification): metadata = TaskMetadata( - name="fracas", + name="FracasPairClassification", description=( "Natural language inference on FRACAS: " "prédit si une hypothèse découle (entailment) ou non d'une prémisse." 
), reference="https://huggingface.co/datasets/maximoss/fracas", - dataset={"path": "maximoss/fracas", "revision": "2506e60be409b124bd72336038dea6f9460ea70c"}, + dataset={ + "path": "maximoss/fracas", + "revision": "2506e60be409b124bd72336038dea6f9460ea70c", + }, type="PairClassification", category="s2s", modalities=["text"], eval_splits=["train"], eval_langs=["fra-Latn"], main_score="max_accuracy", - date = ("2020-01-01", "2020-12-31"), + date=("2020-01-01", "2020-12-31"), domains=["Academic"], task_subtypes=["Textual Entailment"], license="cc-by-4.0", annotations_creators="human-annotated", dialect=[], - sample_creation="translated", + sample_creation="human-translated", bibtex_citation=r""" @inproceedings{amblard-etal-2020-french, title = "A {F}rench Version of the {F}ra{C}a{S} Test Suite", @@ -53,7 +58,6 @@ class FracasTask(AbsTaskPairClassification): publisher = "European Language Resources Association", url = "https://aclanthology.org/2020.lrec-1.721", pages = "5887--5895", - abstract = "This paper presents a French version of the FraCaS test suite. This test suite, originally written in English, contains problems illustrating semantic inference in natural language. We describe linguistic choices we had to make when translating the FraCaS test suite in French, and discuss some of the issues that were raised by the translation. We also report an experiment we ran in order to test both the translation and the logical semantics underlying the problems of the test suite. This provides a way of checking formal semanticists{'} hypotheses against actual semantic capacity of speakers (in the present case, French speakers), and allow us to compare the results we obtained with the ones of similar experiments that have been conducted for other languages.", language = "English", ISBN = "979-10-95546-34-4", } @@ -62,7 +66,6 @@ class FracasTask(AbsTaskPairClassification): ) def dataset_transform(self): - out: dict[str, dict[str, list[dict[str, list]]]] = {} for lang in self.hf_subsets: out[lang] = {} @@ -77,4 +80,4 @@ def dataset_transform(self): "labels": ds["label"], } ] - self.dataset = out \ No newline at end of file + self.dataset = out
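
For reference, a minimal usage sketch (not part of the patches above) showing how the task added by this series could be evaluated once it is applied. It assumes the task is registered under the `FracasPairClassification` name defined in `TaskMetadata` in PATCH 8/8, and it uses one of the models listed in the contributor checklist from `docs/adding_a_dataset.md`:

```python
import mteb
from sentence_transformers import SentenceTransformer

# One of the two checklist models; any SentenceTransformer-compatible model should work.
model = SentenceTransformer("intfloat/multilingual-e5-small")

# Task name taken from TaskMetadata(name="FracasPairClassification") in PATCH 8/8.
tasks = mteb.get_tasks(tasks=["FracasPairClassification"])

evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model, output_folder="results")
```

The same check can be run from the command line with `mteb run -m intfloat/multilingual-e5-small -t FracasPairClassification`, matching the checklist items touched in PATCH 6/8 and reverted in PATCH 7/8.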