oumi-ai
diff --git a/‎src/oumi/core/collators/vision_language_sft_collator.py‎
Lines changed: 6 additions & 5 deletions b/‎src/oumi/core/collators/vision_language_sft_collator.py‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎src/oumi/core/feature_generators/vision_language_conversation_feature_generator.py‎
Lines changed: 57 additions & 1 deletion b/‎src/oumi/core/feature_generators/vision_language_conversation_feature_generator.py‎
Lines changed: 57 additions & 1 deletion
diff --git a/‎src/oumi/core/processors/base_processor.py‎
Lines changed: 20 additions & 0 deletions b/‎src/oumi/core/processors/base_processor.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎src/oumi/core/processors/default_processor.py‎
Lines changed: 27 additions & 1 deletion b/‎src/oumi/core/processors/default_processor.py‎
Lines changed: 27 additions & 1 deletion
diff --git a/‎src/oumi/utils/conversation_utils.py‎
Lines changed: 90 additions & 0 deletions b/‎src/oumi/utils/conversation_utils.py‎
Lines changed: 90 additions & 0 deletions
@@ -30,8 +30,9 @@ def __init__(
         tokenizer: BaseTokenizer,
         processor_name: str,
         *,
-        max_length: Optional[int],
+        max_length: Optional[int] = None,
         truncation: bool = False,
+        truncation_side: str = "right",
         label_ignore_index: Optional[int] = None,
         allow_multi_image_inputs: bool = True,
         trust_remote_code: bool = False,
@@ -45,6 +46,7 @@ def __init__(
             truncation: Whether to truncate long inputs to `max_length`.
                 If False, the long inputs are preserved as is even if they exceed
                 `max_length`. Only has effect if `max_length` is specified.
+            truncation_side: The side to truncate the tokens ("right" or "left").
             label_ignore_index:  If set, then label values of tokens that shouldn't
                 contribute to the loss computation will be replaced by
                 this special value.
@@ -53,10 +55,6 @@ def __init__(
         """
         self._allow_multi_image_inputs = allow_multi_image_inputs
 
-        # TODO Consider supporting truncation using these params
-        self._max_length = max_length
-        self._truncation = truncation
-
         if not processor_name:
             raise ValueError("processor_name is required for VisionLanguageSftCollator")
 
@@ -66,6 +64,9 @@ def __init__(
                 processor_name=processor_name,
                 trust_remote_code=trust_remote_code,
                 return_tensors="pt",
+                truncation=truncation,
+                truncation_side=truncation_side,
+                max_length=max_length,
                 label_ignore_index=label_ignore_index,
             )
         )
 
@@ -37,9 +37,14 @@
 from oumi.core.types.conversation import (
     ContentItem,
     Conversation,
+    Message,
+)
+from oumi.utils.conversation_utils import (
+    load_pil_image_from_content_item,
+    truncate_text_in_content_items,
 )
-from oumi.utils.conversation_utils import load_pil_image_from_content_item
 from oumi.utils.logging import logger
+from oumi.utils.str_utils import truncate_text_pieces_to_max_tokens_limit
 from oumi.utils.torch_utils import get_first_dim_len
 
 
@@ -65,12 +70,24 @@ def __init__(
         processor_name: Optional[str] = None,
         trust_remote_code: bool = False,
         return_tensors: Optional[str] = None,
+        max_length: Optional[int] = None,
+        truncation: bool = False,
+        truncation_side: str = "right",
         label_ignore_index: Optional[int] = None,
     ) -> None:
         """Initializes a new instance of VisionLanguageFeatureProcessor."""
         # Importing these here to avoid circular dependencies
         from oumi.builders.processors import build_processor
 
+        if truncation_side not in ("left", "right"):
+            raise ValueError(
+                f"Invalid truncation_side: '{truncation_side}'. "
+                "Expected 'left' or 'right'."
+            )
+
+        self._max_length: Optional[int] = max_length
+        self._truncation: bool = truncation
+        self._truncation_side = truncation_side
         self._return_tensors = return_tensors
 
         if tokenizer is None:
@@ -145,6 +162,9 @@ def _prepare_simple_model(
         last_text_item: ContentItem = text_turns[-1].text_content_items[-1]
 
         prompt = last_text_item.content or ""
+        truncated_texts = self._truncate_text_pieces([prompt])
+        assert len(truncated_texts) == 1
+        prompt = truncated_texts[0]
         image = self._load_image(last_image_item)
 
         return image, prompt
@@ -171,6 +191,8 @@ def _prepare_instruct_model(
                     f"Unsupported message: {turn.id}. Contains no text and no images."
                 )
 
+        messages = self._truncate_text_in_content_items(messages)
+
         text_prompt = self._processor.apply_chat_template(
             messages, add_generation_prompt=False
         )
@@ -361,3 +383,37 @@ def transform_conversations(
                 inputs["labels"] = labels.tolist()
 
         return inputs.data
+
+    def _truncate_text_in_content_items(self, messages: list[Message]) -> list[Message]:
+        """Caps the combined text of `messages` at `max_length` tokens, if enabled.
+
+        Truncation is applied to the plain text items, i.e. *before* the chat
+        template is rendered. Cutting the final processed prompt at an arbitrary
+        offset is generally unsafe: it may break invariants (e.g., the prompt
+        contains `N` image tokens) and lead to runtime errors in the processor.
+
+        Args:
+            messages: The messages whose text content items may be shortened.
+
+        Returns:
+            `messages` itself when truncation is switched off or no positive
+            `max_length` is configured; otherwise the result of
+            `truncate_text_in_content_items()`.
+        """
+        token_limit = self._max_length
+        truncation_enabled = (
+            self._truncation and token_limit is not None and token_limit > 0
+        )
+        if not truncation_enabled:
+            # Nothing to do: hand the caller's list back untouched.
+            return messages
+
+        return truncate_text_in_content_items(
+            messages,
+            tokenizer=self._processor.tokenizer,
+            max_tokens=token_limit,
+            truncation_side=self._truncation_side,
+        )
+
+    def _truncate_text_pieces(self, text_pieces: list[str]) -> list[str]:
+        """Truncates text pieces to total length not exceeding `max_length`.
+
+        Args:
+            text_pieces: Plain-text fragments to truncate.
+
+        Returns:
+            When truncation is disabled, or no positive `max_length` is
+            configured: a new list holding the input strings unchanged.
+            Otherwise: the output of `truncate_text_pieces_to_max_tokens_limit()`.
+        """
+        if not (
+            self._truncation and self._max_length is not None and self._max_length > 0
+        ):
+            # Strings are immutable, so a shallow copy already isolates the
+            # caller's list; a deep copy would add nothing here.
+            return list(text_pieces)
+
+        return truncate_text_pieces_to_max_tokens_limit(
+            text_pieces,
+            tokenizer=self._processor.tokenizer,
+            max_tokens=self._max_length,
+            truncation_side=self._truncation_side,
+        )
@@ -128,3 +128,23 @@ def apply_chat_template(
     def save_config(self, output_dir: Union[Path, str]) -> None:
         """Saves processor config to the directory."""
         raise NotImplementedError
+
+    @abc.abstractmethod
+    def truncate_text(
+        self,
+        text: str,
+        *,
+        max_tokens: int,
+        truncation_side: str = "right",
+    ) -> tuple[str, int]:
+        """Truncates text to at most `max_tokens` tokens.
+
+        Args:
+            text: A text prompt.
+            max_tokens: Maximum number of tokens to keep.
+            truncation_side: The side to truncate the tokens ("right" or "left").
+                Defaults to "right".
+
+        Returns:
+            A tuple containing truncated text prompt and the number of tokens.
+
+        Raises:
+            NotImplementedError: Always, in this abstract declaration.
+        """
+        raise NotImplementedError
@@ -25,6 +25,7 @@
 from oumi.core.tokenizers.base_tokenizer import BaseTokenizer
 from oumi.core.types.conversation import Message
 from oumi.utils.logging import logger
+from oumi.utils.str_utils import truncate_to_max_tokens_limit
 
 
 class DefaultProcessor(BaseProcessor):
@@ -54,7 +55,7 @@ def __init__(
             and callable(worker_processor.apply_chat_template)
         ):
             raise ValueError(
-                "Worker processor doesn't have " "the `apply_chat_template` method"
+                "Worker processor doesn't have the `apply_chat_template` method"
             )
 
         self._processor_name = processor_name
@@ -250,3 +251,28 @@ def save_config(self, output_dir: Union[Path, str]) -> None:
             return
 
         self._worker_processor.save_pretrained(str(output_dir))
+
+    @override
+    def truncate_text(
+        self,
+        text: str,
+        *,
+        max_tokens: int,
+        truncation_side: str = "right",
+    ) -> tuple[str, int]:
+        """Truncates text to at most `max_tokens` tokens.
+
+        Args:
+            text: A text prompt.
+            max_tokens: Maximum number of tokens to keep.
+            truncation_side: The side to truncate the tokens ("right" or "left").
+                Defaults to "right".
+
+        Returns:
+            A tuple containing truncated text prompt and the number of tokens.
+        """
+        # Delegates entirely to the shared helper, using this processor's tokenizer.
+        return truncate_to_max_tokens_limit(
+            text,
+            self._tokenizer,
+            max_tokens=max_tokens,
+            truncation_side=truncation_side,
+        )
@@ -17,6 +17,7 @@
 
 import PIL.Image
 
+from oumi.core.tokenizers.base_tokenizer import BaseTokenizer
 from oumi.core.types.conversation import ContentItem, Conversation, Message, Type
 from oumi.utils.image_utils import (
     DEFAULT_IMAGE_MODE,
@@ -26,6 +27,7 @@
     load_pil_image_from_path,
     load_pil_image_from_url,
 )
+from oumi.utils.str_utils import truncate_text_pieces_to_max_tokens_limit
 
 
 def load_image_bytes_to_content_item(
@@ -343,3 +345,91 @@ def remove_excessive_images_from_conversation(
         messages=filtered_messages,
         metadata=conversation.metadata,
     )
+
+
+def truncate_text_in_content_items(
+    messages: list[Message],
+    tokenizer: BaseTokenizer,
+    *,
+    max_tokens: int,
+    truncation_side: str = "right",
+) -> list[Message]:
+    """Truncates text contents in Messages to `max_tokens` total tokens.
+
+    Note that we have to truncate plain texts *before* we apply chat template
+    as the final processed prompt is generally unsafe to truncate at arbitrary
+    offset: it may break invariants (e.g., prompt contains `N` images tokens)
+    leading to runtime errors in processor.
+
+    Args:
+        messages: A list of messages. The list itself is never mutated.
+        tokenizer: The tokenizer used for encoding the data.
+        max_tokens: Maximum number of tokens to keep in all text pieces combined.
+        truncation_side: The side to truncate the tokens ("right" or "left").
+
+    Returns:
+        A list of messages with potentially truncated text prompts.
+        The returned list contains the same messages as the input list,
+        except that messages whose text was shortened are replaced by new
+        `Message` objects carrying the truncated text.
+
+    Raises:
+        ValueError: If `max_tokens` is not positive, or if `truncation_side`
+            is neither "left" nor "right".
+    """
+    if max_tokens <= 0:
+        raise ValueError("`max_tokens` must be a positive integer")
+    elif truncation_side not in ("left", "right"):
+        raise ValueError(
+            f"Invalid truncation_side: '{truncation_side}'. Expected 'left' or 'right'."
+        )
+
+    result = list(messages)  # shallow copy
+
+    # Gather every text fragment, in message order then item order, so that
+    # the token budget is shared across the whole conversation.
+    text_pieces: list[str] = []
+    for message in messages:
+        text_pieces.extend(
+            item.content or "" for item in message.content_items if item.is_text()
+        )
+
+    if not text_pieces:
+        return result
+
+    truncated_texts = truncate_text_pieces_to_max_tokens_limit(
+        text_pieces,
+        tokenizer=tokenizer,
+        max_tokens=max_tokens,
+        truncation_side=truncation_side,
+    )
+    assert len(text_pieces) == len(truncated_texts)
+
+    # Walk the messages in the same order as above; `idx` tracks the position
+    # of the current text item inside `truncated_texts`.
+    idx = 0
+    for msg_idx, message in enumerate(messages):
+        message_truncated = False
+        items: list[ContentItem] = []
+        for item in message.content_items:
+            if not item.is_text():
+                items.append(item)
+                continue
+
+            new_text = truncated_texts[idx]
+            idx += 1
+            if new_text != (item.content or ""):
+                message_truncated = True
+            items.append(ContentItem(content=new_text, type=item.type))
+
+        if not message_truncated:
+            # Untouched messages are kept as the original objects.
+            continue
+
+        if len(items) == 1 and items[0].is_text() and isinstance(message.content, str):
+            # The message held a plain string: keep that representation
+            # instead of switching it to a list of content items.
+            assert isinstance(items[0].content, str)
+            result[msg_idx] = Message(
+                id=message.id, content=items[0].content, role=message.role
+            )
+        else:
+            result[msg_idx] = Message(id=message.id, content=items, role=message.role)
+
+    return result