fix: prevent panics when some regexp assertions are used in Unicode mode.

plusvic · plusvic · commit 0536ad1b8267 · 2025-09-12T13:50:07.000+02:00
When regexp assertions like `\b` and `\B` were used in a regexp in Unicode mode, a panic occurred because these assertions are not implemented for Unicode regexps.

Now the compiler fails with an error instead of panicking.
diff --git a/lib/src/compiler/ir/ast2ir.rs b/lib/src/compiler/ir/ast2ir.rs
@@ -1891,6 +1891,19 @@ fn re_error_to_compile_error(
                 ),
             )
         }
+        re::parser::Error::UnsupportedInUnicode { span } => {
+            InvalidRegexp::build(
+                report_builder,
+                err.to_string(),
+                report_builder.span_to_code_loc(
+                    regexp
+                        .span()
+                        .subspan(span.start.offset, span.end.offset)
+                        .offset(1),
+                ),
+                None,
+            )
+        }
         re::parser::Error::MixedGreediness {
             is_greedy_1,
             is_greedy_2,
diff --git a/lib/src/compiler/tests/testdata/errors/149.in b/lib/src/compiler/tests/testdata/errors/149.in
@@ -0,0 +1,6 @@
+rule test {
+  strings:
+    $a = /(?u)foo\b/
+  condition:
+    $a
+}
diff --git a/lib/src/compiler/tests/testdata/errors/149.out b/lib/src/compiler/tests/testdata/errors/149.out
@@ -0,0 +1,5 @@
+error[E014]: invalid regular expression
+ --> line:3:18
+  |
+3 |     $a = /(?u)foo\b/
+  |                  ^^ this is unsupported in Unicode regular expressions
diff --git a/lib/src/re/parser.rs b/lib/src/re/parser.rs
@@ -6,7 +6,7 @@ use std::ops::Deref;
 
 use regex_syntax as re;
 use regex_syntax::ast::{
-    AssertionKind, Ast, ErrorKind, Literal, LiteralKind, RepetitionKind,
+    AssertionKind, Ast, ErrorKind, Flag, Literal, LiteralKind, RepetitionKind,
     RepetitionRange,
 };
 use thiserror::Error;
@@ -29,6 +29,9 @@ pub(crate) enum Error {
         span_1: re::ast::Span,
         span_2: re::ast::Span,
     },
+    UnsupportedInUnicode {
+        span: re::ast::Span,
+    },
     ArbitraryPrefix {
         span: re::ast::Span,
     },
@@ -39,6 +42,9 @@ impl Display for Error {
         match self {
             Error::WrongSyntax { msg, .. } => write!(f, "{msg}"),
             Error::MixedGreediness { .. } => write!(f, "mixed greediness"),
+            Error::UnsupportedInUnicode { .. } => {
+                write!(f, "this is unsupported in Unicode regular expressions")
+            }
             Error::ArbitraryPrefix { .. } => {
                 write!(f, "arbitrary prefix")
             }
@@ -313,15 +319,17 @@ enum MatchKind {
 /// compatibility with YARA.
 ///
 /// This includes checks such as disallowing a mix of greedy and non-greedy
-/// quantifiers, and rejecting expressions that begin with patterns capable
-/// of matching arbitrarily long sequences of arbitrary bytes.
+/// quantifiers, rejecting expressions that begin with patterns capable
+/// of matching arbitrarily long sequences of arbitrary bytes, and making
+/// sure that some assertions like \b and \B are not used in Unicode mode.
 struct Validator {
     first_rep: Option<(bool, re::ast::Span)>,
     greedy: Option<bool>,
     match_kind: MatchKind,
     stack: Vec<Vec<MatchKind>>,
     allow_mixed_greediness: bool,
     dot_matches_new_line: bool,
+    unicode_mode_stack: Vec<bool>,
 }
 
 impl Validator {
@@ -333,6 +341,7 @@ impl Validator {
             stack: Vec::new(),
             allow_mixed_greediness: false,
             dot_matches_new_line: false,
+            unicode_mode_stack: Vec::new(),
         }
     }
     fn allow_mixed_greediness(mut self, yes: bool) -> Self {
@@ -348,6 +357,19 @@ impl Validator {
     fn validate(&mut self, ast: &Ast) -> Result<Option<bool>, Error> {
         re::ast::visit(ast, self)
     }
+
+    /// Returns true if we are currently in Unicode mode.
+    ///
+    /// Unicode mode is enabled by using the `(?u)` flag in the regexp, and can
+    /// be disabled by using `(?-u)`. These flags apply to the current regular
+    /// expression group. For instance, in `/((?u)foo)bar/` the `foo` portion
+    /// is in Unicode mode, but `bar` is not, because the flag appears within
+    /// the group that encloses `foo`.
+    fn in_unicode_mode(&self) -> bool {
+        // The current Unicode mode is the value at the top of the stack,
+        // or false if the stack is empty.
+        self.unicode_mode_stack.last().cloned().unwrap_or(false)
+    }
 }
 
 impl re::ast::Visitor for &mut Validator {
@@ -360,6 +382,41 @@ impl re::ast::Visitor for &mut Validator {
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<(), Self::Err> {
         match ast {
+            Ast::Group(_) => {
+                self.unicode_mode_stack.push(self.in_unicode_mode());
+            }
+            Ast::Flags(f) => {
+                if let Some(unicode_flag) = f.flags.flag_state(Flag::Unicode) {
+                    match self.unicode_mode_stack.last_mut() {
+                        Some(u) => *u = unicode_flag,
+                        None => self.unicode_mode_stack.push(unicode_flag),
+                    }
+                }
+            }
+            Ast::Assertion(assertion) => {
+                // The transformer should have removed all WordBoundaryStartAngle
+                // and WordBoundaryEndAngle from the AST. These kinds of assertions
+                // should not be found.
+                debug_assert!(!matches!(
+                    assertion.kind,
+                    AssertionKind::WordBoundaryStartAngle  // \<
+                        | AssertionKind::WordBoundaryEndAngle // \>
+                ));
+
+                if self.in_unicode_mode()
+                    && matches!(
+                        assertion.kind,
+                        AssertionKind::NotWordBoundary  // \B
+                        | AssertionKind::WordBoundary // \b
+                        | AssertionKind::WordBoundaryStart // \b{start}
+                        | AssertionKind::WordBoundaryEnd // \b{end}
+                    )
+                {
+                    return Err(Error::UnsupportedInUnicode {
+                        span: assertion.span.clone(),
+                    });
+                }
+            }
             Ast::Repetition(rep) => {
                 if let Some(first_rep) = self.first_rep {
                     if rep.greedy != first_rep.0 {
@@ -389,7 +446,10 @@ impl re::ast::Visitor for &mut Validator {
 
     fn visit_post(&mut self, ast: &Ast) -> Result<(), Self::Err> {
         match ast {
-            Ast::Flags(_) | Ast::Assertion(_) | Ast::Group(_) => {}
+            Ast::Group(_) => {
+                self.unicode_mode_stack.pop();
+            }
+            Ast::Flags(_) | Ast::Assertion(_) => {}
             Ast::Empty(_)
             | Ast::Literal(_)
             | Ast::ClassUnicode(_)
@@ -540,11 +600,7 @@ impl Transformer {
                 Ast::Flags(_) => {}
                 Ast::Literal(_) => {}
                 Ast::Dot(_) => {}
-                Ast::Assertion(assertion) => match assertion.kind {
-                    AssertionKind::WordBoundaryStartAngle => {}
-                    AssertionKind::WordBoundaryEndAngle => {}
-                    _ => {}
-                },
+                Ast::Assertion(_) => {}
                 Ast::ClassUnicode(_) => {}
                 Ast::ClassPerl(_) => {}
                 Ast::ClassBracketed(_) => {}
diff --git a/lib/src/re/thompson/compiler.rs b/lib/src/re/thompson/compiler.rs
@@ -557,7 +557,7 @@ impl Compiler {
         //     ....
         // lN: ... code for eN ...
         // lEND:
-        debug_assert!(alternatives.len() < 256);
+        debug_assert!(alternatives.len() <= MAX_ALTERNATIVES);
 
         let l0 = self.emit_split_n(alternatives.len().try_into().unwrap())?;
 
diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs
@@ -317,6 +317,9 @@ fn string_operations() {
     // characters.
     condition_false!(r#""🙈🙉🙊" matches /^...$/"#);
     condition_true!(r#""🙈🙉🙊" matches /(?u)^...$/"#);
+    // This doesn't match because unicode support is disabled after
+    // the first dot (.).
+    condition_false!(r#""🙈🙉🙊" matches /(?u)^.(?-u)..$/"#);
 }
 
 #[test]
@@ -1746,6 +1749,29 @@ fn regexp_patterns_5() {
         r#"/🙈🙉🙊/i"#,
         b"\xF0\x9F\x99\x88\xF0\x9F\x99\x89\xF0\x9F\x99\x8A"
     );
+
+    pattern_match!(r"/^abc \bxyz$/", b"abc xyz", b"abc xyz");
+    pattern_match!(r"/^abc\b xyz$/", b"abc xyz", b"abc xyz");
+    pattern_false!(r"/^abc\bxyz$/", b"abcxyz");
+
+    pattern_match!(r"/^abc \b{start}xyz$/", b"abc xyz", b"abc xyz");
+    pattern_false!(r"/^abc\b{start} xyz$/", b"abc xyz");
+
+    pattern_match!(r"/^abc\b{end} xyz$/", b"abc xyz", b"abc xyz");
+    pattern_false!(r"/^abc \b{end}xyz$/", b"abc xyz");
+
+    pattern_match!(r"/^abc\Bxyz$/", b"abcxyz", b"abcxyz");
+
+    // Here the Unicode mode is enabled only for "abc", for the rest of the
+    // regexp Unicode is disabled
+    pattern_match!(r"/(?u)^abc(?-u)\b xyz$/", b"abc xyz", b"abc xyz");
+    pattern_match!(r"/^((?u)abc)\b xyz$/", b"abc xyz", b"abc xyz");
+
+    // TODO: enable if we ever implement unicode support for regexps.
+    //pattern_match!(r"/(?u)^abc \bxyz$/", b"abc xyz", b"abc xyz");
+    //pattern_match!(r"/(?u)^abc\Bxyz$/", b"abcxyz", b"abcxyz");
+    //pattern_match!(r"/(?u)^abc \b{start}xyz$/", b"abc xyz", b"abc xyz");
+    //pattern_match!(r"/(?u)^abc\b{end} xyz$/", b"abc xyz", b"abc xyz");
 }
 
 #[test]