Skip to content

Commit 0536ad1

Browse files
committed
fix: prevent panics when some regexp assertions are used in Unicode mode.
When regexp assertions like `\b` and `\B` were used in a regexp in Unicode mode, a panic occurred because these assertions are not implemented for Unicode regexps. Now the compiler fails with an error instead of panicking.
1 parent 63bec4d commit 0536ad1

File tree

6 files changed

+116
-10
lines changed

6 files changed

+116
-10
lines changed

lib/src/compiler/ir/ast2ir.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1891,6 +1891,19 @@ fn re_error_to_compile_error(
18911891
),
18921892
)
18931893
}
1894+
re::parser::Error::UnsupportedInUnicode { span } => {
1895+
InvalidRegexp::build(
1896+
report_builder,
1897+
err.to_string(),
1898+
report_builder.span_to_code_loc(
1899+
regexp
1900+
.span()
1901+
.subspan(span.start.offset, span.end.offset)
1902+
.offset(1),
1903+
),
1904+
None,
1905+
)
1906+
}
18941907
re::parser::Error::MixedGreediness {
18951908
is_greedy_1,
18961909
is_greedy_2,
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
rule test {
2+
strings:
3+
$a = /(?u)foo\b/
4+
condition:
5+
$a
6+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
error[E014]: invalid regular expression
2+
--> line:3:18
3+
|
4+
3 | $a = /(?u)foo\b/
5+
| ^^ this is unsupported in Unicode regular expressions

lib/src/re/parser.rs

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::ops::Deref;
66

77
use regex_syntax as re;
88
use regex_syntax::ast::{
9-
AssertionKind, Ast, ErrorKind, Literal, LiteralKind, RepetitionKind,
9+
AssertionKind, Ast, ErrorKind, Flag, Literal, LiteralKind, RepetitionKind,
1010
RepetitionRange,
1111
};
1212
use thiserror::Error;
@@ -29,6 +29,9 @@ pub(crate) enum Error {
2929
span_1: re::ast::Span,
3030
span_2: re::ast::Span,
3131
},
32+
UnsupportedInUnicode {
33+
span: re::ast::Span,
34+
},
3235
ArbitraryPrefix {
3336
span: re::ast::Span,
3437
},
@@ -39,6 +42,9 @@ impl Display for Error {
3942
match self {
4043
Error::WrongSyntax { msg, .. } => write!(f, "{msg}"),
4144
Error::MixedGreediness { .. } => write!(f, "mixed greediness"),
45+
Error::UnsupportedInUnicode { .. } => {
46+
write!(f, "this is unsupported in Unicode regular expressions")
47+
}
4248
Error::ArbitraryPrefix { .. } => {
4349
write!(f, "arbitrary prefix")
4450
}
@@ -313,15 +319,17 @@ enum MatchKind {
313319
/// compatibility with YARA.
314320
///
315321
/// This includes checks such as disallowing a mix of greedy and non-greedy
316-
/// quantifiers, and rejecting expressions that begin with patterns capable
317-
/// of matching arbitrarily long sequences of arbitrary bytes.
322+
/// quantifiers, rejecting expressions that begin with patterns capable
323+
/// of matching arbitrarily long sequences of arbitrary bytes, and making
324+
/// sure that some assertions like \b and \B are not used in Unicode mode.
318325
struct Validator {
319326
first_rep: Option<(bool, re::ast::Span)>,
320327
greedy: Option<bool>,
321328
match_kind: MatchKind,
322329
stack: Vec<Vec<MatchKind>>,
323330
allow_mixed_greediness: bool,
324331
dot_matches_new_line: bool,
332+
unicode_mode_stack: Vec<bool>,
325333
}
326334

327335
impl Validator {
@@ -333,6 +341,7 @@ impl Validator {
333341
stack: Vec::new(),
334342
allow_mixed_greediness: false,
335343
dot_matches_new_line: false,
344+
unicode_mode_stack: Vec::new(),
336345
}
337346
}
338347
fn allow_mixed_greediness(mut self, yes: bool) -> Self {
@@ -348,6 +357,19 @@ impl Validator {
348357
fn validate(&mut self, ast: &Ast) -> Result<Option<bool>, Error> {
349358
re::ast::visit(ast, self)
350359
}
360+
361+
/// Returns true if we are currently in Unicode mode.
362+
///
363+
/// Unicode mode is enabled by using the `(?u)` flag in the regexp, and can
364+
/// be disabled by using `(?-u)`. These flags apply to the current regular
365+
/// expression group. For instance, in `/((?u)foo)bar)/` the `foo` portion
366+
/// is in Unicode mode, but `bar` it's not, because the flag appears within
367+
/// the group that encloses `foo`.
368+
fn in_unicode_mode(&self) -> bool {
369+
// The current Unicode mode is the value at the top of the stack,
370+
// or false if the stack is empty.
371+
self.unicode_mode_stack.last().cloned().unwrap_or(false)
372+
}
351373
}
352374

353375
impl re::ast::Visitor for &mut Validator {
@@ -360,6 +382,41 @@ impl re::ast::Visitor for &mut Validator {
360382

361383
fn visit_pre(&mut self, ast: &Ast) -> Result<(), Self::Err> {
362384
match ast {
385+
Ast::Group(_) => {
386+
self.unicode_mode_stack.push(self.in_unicode_mode());
387+
}
388+
Ast::Flags(f) => {
389+
if let Some(unicode_flag) = f.flags.flag_state(Flag::Unicode) {
390+
match self.unicode_mode_stack.last_mut() {
391+
Some(u) => *u = unicode_flag,
392+
None => self.unicode_mode_stack.push(unicode_flag),
393+
}
394+
}
395+
}
396+
Ast::Assertion(assertion) => {
397+
// The transformer should have removed all WordBoundaryStartAngle
398+
// and WordBoundaryEndAngle from the AST. These kinds of assertions
399+
// should not be found.
400+
debug_assert!(!matches!(
401+
assertion.kind,
402+
AssertionKind::WordBoundaryStartAngle // \<
403+
| AssertionKind::WordBoundaryEndAngle // \>
404+
));
405+
406+
if self.in_unicode_mode()
407+
&& matches!(
408+
assertion.kind,
409+
AssertionKind::NotWordBoundary // \B
410+
| AssertionKind::WordBoundary // \b
411+
| AssertionKind::WordBoundaryStart // \b{start}
412+
| AssertionKind::WordBoundaryEnd // \b{end}
413+
)
414+
{
415+
return Err(Error::UnsupportedInUnicode {
416+
span: assertion.span.clone(),
417+
});
418+
}
419+
}
363420
Ast::Repetition(rep) => {
364421
if let Some(first_rep) = self.first_rep {
365422
if rep.greedy != first_rep.0 {
@@ -389,7 +446,10 @@ impl re::ast::Visitor for &mut Validator {
389446

390447
fn visit_post(&mut self, ast: &Ast) -> Result<(), Self::Err> {
391448
match ast {
392-
Ast::Flags(_) | Ast::Assertion(_) | Ast::Group(_) => {}
449+
Ast::Group(_) => {
450+
self.unicode_mode_stack.pop();
451+
}
452+
Ast::Flags(_) | Ast::Assertion(_) => {}
393453
Ast::Empty(_)
394454
| Ast::Literal(_)
395455
| Ast::ClassUnicode(_)
@@ -540,11 +600,7 @@ impl Transformer {
540600
Ast::Flags(_) => {}
541601
Ast::Literal(_) => {}
542602
Ast::Dot(_) => {}
543-
Ast::Assertion(assertion) => match assertion.kind {
544-
AssertionKind::WordBoundaryStartAngle => {}
545-
AssertionKind::WordBoundaryEndAngle => {}
546-
_ => {}
547-
},
603+
Ast::Assertion(_) => {}
548604
Ast::ClassUnicode(_) => {}
549605
Ast::ClassPerl(_) => {}
550606
Ast::ClassBracketed(_) => {}

lib/src/re/thompson/compiler.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ impl Compiler {
557557
// ....
558558
// lN: ... code for eN ...
559559
// lEND:
560-
debug_assert!(alternatives.len() < 256);
560+
debug_assert!(alternatives.len() <= MAX_ALTERNATIVES);
561561

562562
let l0 = self.emit_split_n(alternatives.len().try_into().unwrap())?;
563563

lib/src/tests/mod.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,9 @@ fn string_operations() {
317317
// characters.
318318
condition_false!(r#""🙈🙉🙊" matches /^...$/"#);
319319
condition_true!(r#""🙈🙉🙊" matches /(?u)^...$/"#);
320+
// This doesn't match because unicode support is disabled after
321+
// the first dot (.).
322+
condition_false!(r#""🙈🙉🙊" matches /(?u)^.(?-u)..$/"#);
320323
}
321324

322325
#[test]
@@ -1746,6 +1749,29 @@ fn regexp_patterns_5() {
17461749
r#"/🙈🙉🙊/i"#,
17471750
b"\xF0\x9F\x99\x88\xF0\x9F\x99\x89\xF0\x9F\x99\x8A"
17481751
);
1752+
1753+
pattern_match!(r"/^abc \bxyz$/", b"abc xyz", b"abc xyz");
1754+
pattern_match!(r"/^abc\b xyz$/", b"abc xyz", b"abc xyz");
1755+
pattern_false!(r"/^abc\bxyz$/", b"abcxyz");
1756+
1757+
pattern_match!(r"/^abc \b{start}xyz$/", b"abc xyz", b"abc xyz");
1758+
pattern_false!(r"/^abc\b{start} xyz$/", b"abc xyz");
1759+
1760+
pattern_match!(r"/^abc\b{end} xyz$/", b"abc xyz", b"abc xyz");
1761+
pattern_false!(r"/^abc \b{end}xyz$/", b"abc xyz");
1762+
1763+
pattern_match!(r"/^abc\Bxyz$/", b"abcxyz", b"abcxyz");
1764+
1765+
// Here Unicode mode is enabled only for "abc"; for the rest of the
1766+
// regexp Unicode is disabled
1767+
pattern_match!(r"/(?u)^abc(?-u)\b xyz$/", b"abc xyz", b"abc xyz");
1768+
pattern_match!(r"/^((?u)abc)\b xyz$/", b"abc xyz", b"abc xyz");
1769+
1770+
// TODO: enable if we ever implement word-boundary assertions in Unicode mode.
1771+
//pattern_match!(r"/(?u)^abc \bxyz$/", b"abc xyz", b"abc xyz");
1772+
//pattern_match!(r"/(?u)^abc\Bxyz$/", b"abcxyz", b"abcxyz");
1773+
//pattern_match!(r"/(?u)^abc \b{start}xyz$/", b"abc xyz", b"abc xyz");
1774+
//pattern_match!(r"/(?u)^abc\b{end} xyz$/", b"abc xyz", b"abc xyz");
17491775
}
17501776

17511777
#[test]

0 commit comments

Comments
 (0)