
Commit 54d4f6b

🎁 Reward submodule (#3430)
1 parent 05bc43e commit 54d4f6b

File tree: 6 files changed, +198 −4 lines changed


docs/source/_toctree.yml

Lines changed: 2 additions & 0 deletions

@@ -107,6 +107,8 @@
     title: Callbacks
   - local: data_utils
     title: Data Utilities
+  - local: rewards
+    title: Reward Functions
   - local: script_utils
     title: Script Utilities
   - local: others

docs/source/rewards.md

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# Reward Functions
+
+This module contains some useful reward functions, primarily intended for use with the [`GRPOTrainer`].
+
+## Format rewards
+
+### think_format_reward
+
+[[autodoc]] rewards.think_format_reward
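For orientation, here is a minimal sketch of wiring this reward into [`GRPOTrainer`]; the model id and dataset below are illustrative and not taken from this commit:

```python
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from trl.rewards import think_format_reward

# Any prompt dataset works here; "trl-lib/tldr" is just an example.
dataset = load_dataset("trl-lib/tldr", split="train")

trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",      # illustrative model id
    reward_funcs=think_format_reward,      # reward function from this new module
    args=GRPOConfig(output_dir="grpo-think-format"),
    train_dataset=dataset,
)
trainer.train()
```

`reward_funcs` also accepts a list, so a format reward like this one can be combined with a task-specific reward function or a reward model.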

tests/test_rewards.py

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from trl.rewards import think_format_reward
+
+
+class ThinkFormatRewardTester(unittest.TestCase):
+    def test_valid_format(self):
+        completions = [
+            "<think>This is my reasoning.</think>This is my answer.",  # Simple, one-line reasoning
+            "<think>\nThis is my reasoning.\n</think>\nThis is my answer.",  # Multiline reasoning
+            "<think>\nThis is\nmy reasoning.\n</think>\nThis is my answer.",  # Multiline reasoning
+            "<think>\nThis is <some tag> my reasoning.</think>\nThis is my answer.",  # Reasoning including other tags
+            "<think></think>\nThis is my answer.",  # Empty reasoning
+        ]
+        completions = [[{"content": completion}] for completion in completions]
+        expected_rewards = [1.0, 1.0, 1.0, 1.0, 1.0]  # All should be valid
+        rewards = think_format_reward(completions)
+        self.assertEqual(rewards, expected_rewards)
+
+    def test_invalid_format(self):
+        completions = [
+            "<think>\nThis is my reasoning.\nThis is my answer.",  # No closing </think>
+            "<think>This is my reasoning.\nThis is my answer.",  # No closing </think>
+            "This is my reasoning. This is my answer.",  # No <think> tags
+            "This is my reasoning.\nThis is my answer.",  # No <think> tags
+            "This is my reasoning.</think>\nThis is my answer.",  # No opening <think>
+            "This is my reasoning.</think>This is my answer.",  # No opening <think>
+            "This<think>is my reasoning.</think>\nThis is my answer.",  # <think> tag in the middle
+            "<think>This is<think>my reasoning.</think></think>This is my answer.",  # Nested <think> tags
+            "<think>This is</think>\nmy\n<think>reasoning.</think>\nThis is my answer.",  # Multiline <think>
+        ]
+        completions = [[{"content": completion}] for completion in completions]
+        expected_rewards = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]  # All should be invalid
+        rewards = think_format_reward(completions)
+        self.assertEqual(rewards, expected_rewards)
+
+    def test_mixed_format(self):
+        completions = [
+            "<think>This is my reasoning.</think>This is my answer.",  # Valid
+            "<think>\nThis is my reasoning.\n</think>\nThis is my answer.",  # Valid
+            "<think>This is my reasoning.\nThis is my answer.",  # Invalid
+            "This is my reasoning. This is my answer.",  # Invalid
+        ]
+        completions = [[{"content": completion}] for completion in completions]
+        expected_rewards = [1.0, 1.0, 0.0, 0.0]
+        rewards = think_format_reward(completions)
+        self.assertEqual(rewards, expected_rewards)
+
+
+if __name__ == "__main__":
+    unittest.main()

trl/rewards/__init__.py

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+from typing import TYPE_CHECKING
+
+from ..import_utils import _LazyModule
+
+
+_import_structure = {
+    "format_rewards": ["think_format_reward"],
+}
+
+
+if TYPE_CHECKING:
+    from .format_rewards import think_format_reward
+
+
+else:
+    sys.modules[__name__] = _LazyModule(__name__, __file__, _import_structure, module_spec=__spec__)
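The lazy-module registration above keeps `import trl.rewards` cheap: the `format_rewards` submodule is only imported the first time one of its names is accessed. A small sketch of the resulting behavior (assuming a TRL install that includes this commit):

```python
import trl.rewards

# Attribute access triggers the actual import of trl.rewards.format_rewards
# through the _LazyModule object registered in sys.modules.
reward_fn = trl.rewards.think_format_reward

print(reward_fn([[{"content": "<think>ok</think>done"}]]))  # -> [1.0]
```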

trl/rewards/format_rewards.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def think_format_reward(completions: list[list[dict[str, str]]], **kwargs) -> list[float]:
+    r"""
+    Reward function that checks if the reasoning process is enclosed within `"<think>"` and `"</think>"` tags. The
+    function returns a reward of 1.0 if the format is correct, otherwise 0.0.
+
+    Args:
+        completions (`list[list[dict[str, str]]]`):
+            List of completions to be evaluated. Each completion must be a list of one message, i.e. a dictionary
+            containing the key `"content"` with the value being the text of the completion.
+        **kwargs:
+            Additional keyword arguments. This function does not use them, but they are required in the function
+            signature to ensure compatibility with trainers like [`GRPOTrainer`].
+
+    Returns:
+        `list[float]`:
+            A list of rewards, where each reward is 1.0 if the completion matches the expected format, otherwise 0.0.
+
+    Example:
+    ```python
+    >>> from trl.rewards import think_format_reward
+    >>> completions = [
+    ...     [{"content": "<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
+    ...     [{"content": "<think>\nThis is my reasoning.\nThis is my answer."}],
+    ... ]
+    >>> think_format_reward(completions)
+    [1.0, 0.0]
+    ```
+    """
+    pattern = r"^<think>(?!.*<think>)(.*?)</think>.*$"
+    completion_contents = [completion[0]["content"] for completion in completions]
+    matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents]
+    return [1.0 if match else 0.0 for match in matches]
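To make the pattern's behavior concrete, the same check can be reproduced standalone; this snippet is an illustrative sketch, not part of the commit:

```python
import re

# Same pattern as think_format_reward: the completion must start with <think>,
# the negative lookahead (?!.*<think>) rejects any second opening tag, and the
# lazy (.*?) captures everything up to the first closing </think>.
pattern = r"^<think>(?!.*<think>)(.*?)</think>.*$"


def check(text: str) -> float:
    return 1.0 if re.match(pattern, text, re.DOTALL | re.MULTILINE) else 0.0


print(check("<think>reasoning</think>answer"))          # 1.0: well-formed
print(check("<think></think>answer"))                   # 1.0: empty reasoning is allowed
print(check("<think>a<think>b</think></think>answer"))  # 0.0: nested/repeated <think>
print(check("reasoning</think>answer"))                 # 0.0: no opening tag
```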

trl/scripts/grpo.py

Lines changed: 41 additions & 4 deletions

@@ -13,13 +13,20 @@
 # limitations under the License.
 
 import argparse
+import importlib
 from dataclasses import dataclass, field
 from typing import Optional
 
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 
 from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
+from trl.rewards import think_format_reward
+
+
+reward_funcs_registry = {
+    "think_format_reward": think_format_reward,
+}
 
 
 @dataclass
@@ -28,9 +35,12 @@ class GRPOScriptArguments(ScriptArguments):
     Script arguments for the GRPO training script.
 
     Args:
-        reward_model_name_or_path (`str` or `None`):
+        reward_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
             Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
             directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
+        reward_funcs (`list[str]` or `None`, *optional*, defaults to `None`):
+            Reward functions to use. It can be either one of `"think_format_reward"`; or a dotted import path
+            (e.g., `'my_lib.rewards.custom_reward'`).
     """
 
     reward_model_name_or_path: Optional[str] = field(
@@ -40,6 +50,13 @@ class GRPOScriptArguments(ScriptArguments):
             "local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`."
         },
     )
+    reward_funcs: Optional[list[str]] = field(
+        default=None,
+        metadata={
+            "help": "Reward functions to use. It can be either one of 'think_format_reward'; or a dotted "
+            "import path. (e.g., 'my_lib.rewards.custom_reward')."
+        },
+    )
 
 
 def main(script_args, training_args, model_args):
@@ -50,9 +67,29 @@ def main(script_args, training_args, model_args):
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
     )
-    reward_model = AutoModelForSequenceClassification.from_pretrained(
-        script_args.reward_model_name_or_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
-    )
+
+    # Get the reward models and functions
+    reward_funcs = []
+    if script_args.reward_model_name_or_path:
+        reward_model = AutoModelForSequenceClassification.from_pretrained(
+            script_args.reward_model_name_or_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
+        )
+        reward_funcs.append(reward_model)
+
+    if script_args.reward_funcs:
+        for func_name in script_args.reward_funcs:
+            if func_name in reward_funcs_registry:
+                reward_funcs.append(reward_funcs_registry[func_name])
+            elif "." in func_name:
+                module_path, func_name = func_name.rsplit(".", 1)
+                module = importlib.import_module(module_path)
+                reward_func = getattr(module, func_name)
+                reward_funcs.append(reward_func)
+            else:
+                raise ValueError(
+                    f"Could not load reward function '{func_name}'. Expected one of "
+                    f"{list(reward_funcs_registry.keys())} or a valid import path."
+                )
 
     # Load the dataset
     dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
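A reward function referenced by dotted import path only has to follow the same `(completions, **kwargs) -> list[float]` convention as `think_format_reward`. A hypothetical `my_lib/rewards.py` (module and function names are illustrative only) might look like:

```python
def custom_reward(completions, **kwargs):
    """Toy reward sketch: favor shorter answers, clipped to the range [0, 1]."""
    contents = [completion[0]["content"] for completion in completions]
    return [max(0.0, 1.0 - len(content) / 1000.0) for content in contents]
```

Such a function could then be selected through the new `reward_funcs` script argument (e.g. `--reward_funcs think_format_reward my_lib.rewards.custom_reward`), on its own or alongside a reward model given via `--reward_model_name_or_path`.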
