Skip to content

Commit 835a0b6

Browse files
committed
Update unicode.cpp
1 parent 7f4e47f commit 835a0b6

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

src/unicode.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -608,7 +608,12 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
608 608
// Pattern 2 & 3: Letter words excluding Han characters with optional contractions
609 609
// [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
610 610
// [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
611-
if (flags.is_letter && !unicode_cpt_is_han(cpt)) {
611+
// Match when the current char is a non-Han letter, OR when it is a valid optional leading char (not \r, \n, a letter, or a number) and the next char is a non-Han letter
612+
bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
613+
(!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
614+
_get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));
615+
616+
if (is_letter_pattern) {
612 617
// Handle optional leading non-letter/non-number character
613 618
bool has_leading_char = false;
614 619
if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {

0 commit comments

Comments
 (0)