@@ -65,7 +65,12 @@ def encode_ordinary(self, text: str) -> list[int]:
6565 >>> enc.encode_ordinary("hello world")
6666 [31373, 995]
6767 """
68- return self ._core_bpe .encode_ordinary (text )
68+ try :
69+ return self ._core_bpe .encode_ordinary (text )
70+ except UnicodeEncodeError :
71+ # See comment in encode
72+ text = text .encode ("utf-16" , "surrogatepass" ).decode ("utf-16" , "replace" )
73+ return self ._core_bpe .encode_ordinary (text )
6974
7075 def encode (
7176 self ,
@@ -111,7 +116,17 @@ def encode(
111116 if match := _special_token_regex (disallowed_special ).search (text ):
112117 raise_disallowed_special_token (match .group ())
113118
114- return self ._core_bpe .encode (text , allowed_special )
119+ try :
120+ return self ._core_bpe .encode (text , allowed_special )
121+ except UnicodeEncodeError :
122+ # BPE operates on bytes, but the regex operates on unicode. If we pass a str that is
123+ # invalid UTF-8 to Rust, it will rightfully complain. Here we do a quick and dirty
124+ # fixup for any surrogate pairs that may have sneaked their way into the text.
125+ # Technically, this introduces a place where encode + decode doesn't roundtrip a Python
126+ # string, but given that this is input we want to support, maybe that's okay.
127+ # Also we use errors="replace" to handle weird things like lone surrogates.
128+ text = text .encode ("utf-16" , "surrogatepass" ).decode ("utf-16" , "replace" )
129+ return self ._core_bpe .encode (text , allowed_special )
115130
116131 def encode_ordinary_batch (self , text : list [str ], * , num_threads : int = 8 ) -> list [list [int ]]:
117132 """Encodes a list of strings into tokens, in parallel, ignoring special tokens.
0 commit comments