@@ -6,7 +6,7 @@ use std::ops::Deref;
6
6
7
7
use regex_syntax as re;
8
8
use regex_syntax:: ast:: {
9
- AssertionKind , Ast , ErrorKind , Literal , LiteralKind , RepetitionKind ,
9
+ AssertionKind , Ast , ErrorKind , Flag , Literal , LiteralKind , RepetitionKind ,
10
10
RepetitionRange ,
11
11
} ;
12
12
use thiserror:: Error ;
@@ -29,6 +29,9 @@ pub(crate) enum Error {
29
29
span_1 : re:: ast:: Span ,
30
30
span_2 : re:: ast:: Span ,
31
31
} ,
32
+ UnsupportedInUnicode {
33
+ span : re:: ast:: Span ,
34
+ } ,
32
35
ArbitraryPrefix {
33
36
span : re:: ast:: Span ,
34
37
} ,
@@ -39,6 +42,9 @@ impl Display for Error {
39
42
match self {
40
43
Error :: WrongSyntax { msg, .. } => write ! ( f, "{msg}" ) ,
41
44
Error :: MixedGreediness { .. } => write ! ( f, "mixed greediness" ) ,
45
+ Error :: UnsupportedInUnicode { .. } => {
46
+ write ! ( f, "this is unsupported in Unicode regular expressions" )
47
+ }
42
48
Error :: ArbitraryPrefix { .. } => {
43
49
write ! ( f, "arbitrary prefix" )
44
50
}
@@ -313,15 +319,17 @@ enum MatchKind {
313
319
/// compatibility with YARA.
314
320
///
315
321
/// This includes checks such as disallowing a mix of greedy and non-greedy
316
- /// quantifiers, and rejecting expressions that begin with patterns capable
317
- /// of matching arbitrarily long sequences of arbitrary bytes.
322
+ /// quantifiers, rejecting expressions that begin with patterns capable
323
+ /// of matching arbitrarily long sequences of arbitrary bytes, and making
324
+ /// sure that some assertions like \b and \B are not used in Unicode mode.
318
325
struct Validator {
319
326
first_rep : Option < ( bool , re:: ast:: Span ) > ,
320
327
greedy : Option < bool > ,
321
328
match_kind : MatchKind ,
322
329
stack : Vec < Vec < MatchKind > > ,
323
330
allow_mixed_greediness : bool ,
324
331
dot_matches_new_line : bool ,
332
+ unicode_mode_stack : Vec < bool > ,
325
333
}
326
334
327
335
impl Validator {
@@ -333,6 +341,7 @@ impl Validator {
333
341
stack : Vec :: new ( ) ,
334
342
allow_mixed_greediness : false ,
335
343
dot_matches_new_line : false ,
344
+ unicode_mode_stack : Vec :: new ( ) ,
336
345
}
337
346
}
338
347
fn allow_mixed_greediness ( mut self , yes : bool ) -> Self {
@@ -348,6 +357,19 @@ impl Validator {
348
357
fn validate ( & mut self , ast : & Ast ) -> Result < Option < bool > , Error > {
349
358
re:: ast:: visit ( ast, self )
350
359
}
360
+
361
+ /// Returns true if we are currently in Unicode mode.
362
+ ///
363
+ /// Unicode mode is enabled by using the `(?u)` flag in the regexp, and can
364
+ /// be disabled by using `(?-u)`. These flags apply to the current regular
365
+ /// expression group. For instance, in `/((?u)foo)bar/` the `foo` portion
366
+ /// is in Unicode mode, but `bar` is not, because the flag appears within
367
+ /// the group that encloses `foo`.
368
+ fn in_unicode_mode ( & self ) -> bool {
369
+ // The current Unicode mode is the value at the top of the stack,
370
+ // or false if the stack is empty.
371
+ self . unicode_mode_stack . last ( ) . cloned ( ) . unwrap_or ( false )
372
+ }
351
373
}
352
374
353
375
impl re:: ast:: Visitor for & mut Validator {
@@ -360,6 +382,41 @@ impl re::ast::Visitor for &mut Validator {
360
382
361
383
fn visit_pre ( & mut self , ast : & Ast ) -> Result < ( ) , Self :: Err > {
362
384
match ast {
385
+ Ast :: Group ( _) => {
386
+ self . unicode_mode_stack . push ( self . in_unicode_mode ( ) ) ;
387
+ }
388
+ Ast :: Flags ( f) => {
389
+ if let Some ( unicode_flag) = f. flags . flag_state ( Flag :: Unicode ) {
390
+ match self . unicode_mode_stack . last_mut ( ) {
391
+ Some ( u) => * u = unicode_flag,
392
+ None => self . unicode_mode_stack . push ( unicode_flag) ,
393
+ }
394
+ }
395
+ }
396
+ Ast :: Assertion ( assertion) => {
397
+ // The transformer should have removed all WordBoundaryStartAngle
398
+ // and WordBoundaryEndAngle from the AST. These kinds of assertions
399
+ // should not be found.
400
+ debug_assert ! ( !matches!(
401
+ assertion. kind,
402
+ AssertionKind :: WordBoundaryStartAngle // \<
403
+ | AssertionKind :: WordBoundaryEndAngle // \>
404
+ ) ) ;
405
+
406
+ if self . in_unicode_mode ( )
407
+ && matches ! (
408
+ assertion. kind,
409
+ AssertionKind :: NotWordBoundary // \B
410
+ | AssertionKind :: WordBoundary // \b
411
+ | AssertionKind :: WordBoundaryStart // \b{start}
412
+ | AssertionKind :: WordBoundaryEnd // \b{end}
413
+ )
414
+ {
415
+ return Err ( Error :: UnsupportedInUnicode {
416
+ span : assertion. span . clone ( ) ,
417
+ } ) ;
418
+ }
419
+ }
363
420
Ast :: Repetition ( rep) => {
364
421
if let Some ( first_rep) = self . first_rep {
365
422
if rep. greedy != first_rep. 0 {
@@ -389,7 +446,10 @@ impl re::ast::Visitor for &mut Validator {
389
446
390
447
fn visit_post ( & mut self , ast : & Ast ) -> Result < ( ) , Self :: Err > {
391
448
match ast {
392
- Ast :: Flags ( _) | Ast :: Assertion ( _) | Ast :: Group ( _) => { }
449
+ Ast :: Group ( _) => {
450
+ self . unicode_mode_stack . pop ( ) ;
451
+ }
452
+ Ast :: Flags ( _) | Ast :: Assertion ( _) => { }
393
453
Ast :: Empty ( _)
394
454
| Ast :: Literal ( _)
395
455
| Ast :: ClassUnicode ( _)
@@ -540,11 +600,7 @@ impl Transformer {
540
600
Ast :: Flags ( _) => { }
541
601
Ast :: Literal ( _) => { }
542
602
Ast :: Dot ( _) => { }
543
- Ast :: Assertion ( assertion) => match assertion. kind {
544
- AssertionKind :: WordBoundaryStartAngle => { }
545
- AssertionKind :: WordBoundaryEndAngle => { }
546
- _ => { }
547
- } ,
603
+ Ast :: Assertion ( _) => { }
548
604
Ast :: ClassUnicode ( _) => { }
549
605
Ast :: ClassPerl ( _) => { }
550
606
Ast :: ClassBracketed ( _) => { }
0 commit comments