Bump version, sync codebase

hauntsaninja · hauntsaninja · commit e1c661edf360 · 2023-03-28T13:45:26.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.3.3]
+- `tiktoken` will now make a best-effort attempt to replace surrogate pairs with the corresponding
+   Unicode character and will replace lone surrogates with the Unicode replacement character.
+
 ## [v0.3.2]
 - Add encoding for GPT-4
 
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.3.2"
+version = "0.3.3"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.3.2"
+version = "0.3.3"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/tiktoken/core.py b/tiktoken/core.py
@@ -65,7 +65,12 @@ def encode_ordinary(self, text: str) -> list[int]:
         >>> enc.encode_ordinary("hello world")
         [31373, 995]
         """
-        return self._core_bpe.encode_ordinary(text)
+        try:
+            return self._core_bpe.encode_ordinary(text)
+        except UnicodeEncodeError:
+            # See comment in encode
+            text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
+            return self._core_bpe.encode_ordinary(text)
 
     def encode(
         self,
@@ -111,7 +116,17 @@ def encode(
             if match := _special_token_regex(disallowed_special).search(text):
                 raise_disallowed_special_token(match.group())
 
-        return self._core_bpe.encode(text, allowed_special)
+        try:
+            return self._core_bpe.encode(text, allowed_special)
+        except UnicodeEncodeError:
+            # BPE operates on bytes, but the regex operates on unicode. If we pass a str that
+            # cannot be encoded as UTF-8 to Rust, it will rightfully complain. Here we do a
+            # quick-and-dirty fixup for any surrogate pairs that may have sneaked into the text.
+            # Technically, this introduces a place where encode + decode doesn't roundtrip a Python
+            # string, but given that this is input we want to support, maybe that's okay.
+            # Also we use errors="replace" to handle weird things like lone surrogates.
+            text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
+            return self._core_bpe.encode(text, allowed_special)
 
     def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
         """Encodes a list of strings into tokens, in parallel, ignoring special tokens.