Skip to content
2 changes: 2 additions & 0 deletions tests/test_dataset_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from typing import Callable

import pytest
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

Expand All @@ -23,6 +24,7 @@
from .testing_utils import TrlTestCase


@pytest.mark.filterwarnings("ignore::FutureWarning")
class TestDatasetFormatting(TrlTestCase):
def setup_method(self):
self.llama_tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-MistralForCausalLM-0.1")
Expand Down
32 changes: 32 additions & 0 deletions trl/extras/dataset_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import logging
import warnings
from typing import Callable, Literal, Optional

import datasets
Expand Down Expand Up @@ -41,7 +42,17 @@ def conversations_formatting_function(
r"""
Return a callable that takes in a "messages" dataset and returns a formatted dataset, obtained by applying the
tokenizer's chat template to the dataset along with the schema of the functions in the tools list.

.. deprecated:: 0.24.0
`conversations_formatting_function` is deprecated and will be removed in version 0.27.
Please use `tokenizer.apply_chat_template()` directly instead.
"""
warnings.warn(
"`conversations_formatting_function` is deprecated and will be removed in TRL 0.27. "
"Please use `tokenizer.apply_chat_template()` directly instead.",
DeprecationWarning,
stacklevel=2,
)

def format_dataset(examples):
if isinstance(examples[messages_field][0], list):
Expand All @@ -61,7 +72,17 @@ def instructions_formatting_function(tokenizer: AutoTokenizer):
r"""
Return a callable that takes in an "instructions" dataset and returns a formatted dataset, obtained by applying
the tokenizer's chat template to the dataset.

.. deprecated:: 0.24.0
`instructions_formatting_function` is deprecated and will be removed in version 0.27.
Please use `tokenizer.apply_chat_template()` directly instead.
"""
warnings.warn(
"`instructions_formatting_function` is deprecated and will be removed in TRL 0.27. "
"Please use `tokenizer.apply_chat_template()` directly instead.",
DeprecationWarning,
stacklevel=2,
)

def format_dataset(examples):
if isinstance(examples["prompt"], list):
Expand Down Expand Up @@ -99,7 +120,18 @@ def get_formatting_func_from_dataset(

Returns:
Callable: Formatting function if the dataset format is supported, otherwise `None`.

.. deprecated:: 0.24.0
`get_formatting_func_from_dataset` is deprecated and will be removed in version 0.27.
Please use `tokenizer.apply_chat_template()` directly instead.
"""
warnings.warn(
"`get_formatting_func_from_dataset` is deprecated and will be removed in TRL 0.27. "
"Please use `tokenizer.apply_chat_template()` directly instead.",
DeprecationWarning,
stacklevel=2,
)

if isinstance(dataset, Dataset):
if "messages" in dataset.features:
if dataset.features["messages"] == FORMAT_MAPPING["chatml"]:
Expand Down
Loading