Commit 56cb683

chenmoneygithub authored and mattdangerw committed
fix regex string (#458)
1 parent c498bb7 commit 56cb683

1 file changed: +7 −3 lines changed


keras_nlp/tokenizers/byte_pair_tokenizer.py

Lines changed: 7 additions & 3 deletions
@@ -39,12 +39,16 @@
 SPECIAL_WHITESPACES = r"\x{a0}\x{2009}\x{202f}\x{3000}"

 # String splitting regex pattern.
-SPLIT_PATTERN_1 = r"""'s|'t|'re|'ve|'m|'ll|'d
-    |[\s{special_spaces}]+[\n\r\t\f६{special_spaces}]| ?\p{L}+
-    | ?[\p{N}]+| ?[^\s\p{L}\p{N}{special_spaces}]+""".replace(
+SPLIT_PATTERN_1 = (
+    r"'s|'t|'re|'ve|'m|'ll|'d"
+    + r"|[\s{special_spaces}]+[\n\r\t\f६{special_spaces}]| ?\p{L}+|"
+    + r" ?[\p{N}]+| ?[^\s\p{L}\p{N}{special_spaces}]+"
+)
+SPLIT_PATTERN_1 = SPLIT_PATTERN_1.replace(
     "{special_spaces}", SPECIAL_WHITESPACES
 )

+
 SPLIT_PATTERN_2 = rf"""[\s६{SPECIAL_WHITESPACES}]$"""

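Why the change matters (a minimal illustrative sketch, not part of the commit): a triple-quoted raw string keeps its literal newlines and indentation, so the old SPLIT_PATTERN_1 silently carried a line break plus leading spaces inside the regex pattern; building the pattern from concatenated single-line raw strings avoids that.

# Hypothetical snippet, not from the repo: shows how a triple-quoted raw
# string leaks the line break and indentation into the pattern, while
# concatenation keeps only the intended alternation.
OLD_STYLE = r"""'s|'t|'re
    | ?\p{L}+"""
NEW_STYLE = r"'s|'t|'re" + r"| ?\p{L}+"

print(repr(OLD_STYLE))  # "'s|'t|'re\n    | ?\\p{L}+"  <- newline and spaces end up in the regex
print(repr(NEW_STYLE))  # "'s|'t|'re| ?\\p{L}+"        <- only the intended alternation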
