We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent c498bb7 · commit 56cb683 — Copy full SHA for 56cb683
keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -39,12 +39,16 @@
39
SPECIAL_WHITESPACES = r"\x{a0}\x{2009}\x{202f}\x{3000}"
40
41
# String splitting regex pattern.
42
-SPLIT_PATTERN_1 = r"""'s|'t|'re|'ve|'m|'ll|'d
43
- |[\s{special_spaces}]+[\n\r\t\f६{special_spaces}]| ?\p{L}+
44
- | ?[\p{N}]+| ?[^\s\p{L}\p{N}{special_spaces}]+""".replace(
+SPLIT_PATTERN_1 = (
+ r"'s|'t|'re|'ve|'m|'ll|'d"
+ + r"|[\s{special_spaces}]+[\n\r\t\f६{special_spaces}]| ?\p{L}+|"
45
+ + r" ?[\p{N}]+| ?[^\s\p{L}\p{N}{special_spaces}]+"
46
+)
47
+SPLIT_PATTERN_1 = SPLIT_PATTERN_1.replace(
48
"{special_spaces}", SPECIAL_WHITESPACES
49
)
50
51
+
52
SPLIT_PATTERN_2 = rf"""[\s६{SPECIAL_WHITESPACES}]$"""
53
54
0 commit comments