From 1c246d355baa186468dc0ba6fb519df574b878e7 Mon Sep 17 00:00:00 2001
From: CPunisher <1343316114@qq.com>
Date: Mon, 27 Oct 2025 21:38:21 +0800
Subject: [PATCH 1/2] Scan regex in parser

---
 crates/swc_ecma_parser/src/lexer/capturing.rs | 12 ++--
 crates/swc_ecma_parser/src/lexer/mod.rs       | 11 +---
 crates/swc_ecma_parser/src/lexer/state.rs     | 44 +++++++++----
 crates/swc_ecma_parser/src/lexer/token.rs     | 24 +------
 crates/swc_ecma_parser/src/parser/expr.rs     | 63 ++++++++-----------
 crates/swc_ecma_parser/src/parser/input.rs    | 30 +++------
 6 files changed, 78 insertions(+), 106 deletions(-)

diff --git a/crates/swc_ecma_parser/src/lexer/capturing.rs b/crates/swc_ecma_parser/src/lexer/capturing.rs
index 1d408368a033..3b1369700ddc 100644
--- a/crates/swc_ecma_parser/src/lexer/capturing.rs
+++ b/crates/swc_ecma_parser/src/lexer/capturing.rs
@@ -1,5 +1,7 @@
 use std::mem;
 
+use swc_atoms::Atom;
+
 use crate::{
     error::Error,
     input::Tokens,
@@ -116,10 +118,6 @@ impl<I: Tokens> Tokens for Capturing<I> {
         self.inner.set_expr_allowed(allow);
     }
 
-    fn set_next_regexp(&mut self, start: Option<BytePos>) {
-        self.inner.set_next_regexp(start);
-    }
-
     fn add_error(&mut self, error: Error) {
         self.inner.add_error(error);
     }
@@ -164,6 +162,12 @@ impl<I: Tokens> Tokens for Capturing<I> {
         self.inner.set_token_value(token_value);
     }
 
+    fn scan_regex(&mut self) -> (TokenAndSpan, Option<(Atom, Atom)>) {
+        let result = self.inner.scan_regex();
+        self.capture(result.0);
+        result
+    }
+
     fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> TokenAndSpan {
         self.inner.scan_jsx_token(allow_multiline_jsx_text)
     }
diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index f6928a95e3a1..cd1b02123035 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -1785,12 +1785,7 @@ impl<'a> Lexer<'a> {
     }
 
     /// Expects current char to be '/'
-    fn read_regexp(&mut self, start: BytePos) -> LexResult<Token> {
-        unsafe {
-            // Safety: start is valid position, and cur() is Some('/')
-            self.input_mut().reset_to(start);
-        }
-
+    pub(crate) fn read_regexp(&mut self) -> LexResult<(Atom, Atom)> {
         debug_assert_eq!(self.cur(), Some('/'));
 
         let start = self.cur_pos();
@@ -1830,7 +1825,7 @@ impl<'a> Lexer<'a> {
             self.bump();
         }
 
-        let content = {
+        let exp = {
             let s = unsafe { self.input_slice_to_cur(slice_start) };
             self.atom(s)
         };
@@ -1863,7 +1858,7 @@ impl<'a> Lexer<'a> {
         }?
        .unwrap_or_default();
 
-        Ok(Token::regexp(content, flags, self))
+        Ok((exp, flags))
     }
 
     /// This method is optimized for texts without escape sequences.
diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs
index 80354010b5ae..708b8dfa905d 100644
--- a/crates/swc_ecma_parser/src/lexer/state.rs
+++ b/crates/swc_ecma_parser/src/lexer/state.rs
@@ -1,6 +1,6 @@
 use std::mem::take;
 
-use swc_atoms::wtf8::CodePoint;
+use swc_atoms::{wtf8::CodePoint, Atom};
 use swc_common::BytePos;
 use swc_ecma_ast::EsVersion;
 
@@ -35,7 +35,6 @@ pub struct State {
     pub had_line_break_before_last: bool,
     /// TODO: Remove this field.
     is_first: bool,
-    pub next_regexp: Option<BytePos>,
 
     pub start: BytePos,
     pub prev_hi: BytePos,
@@ -111,11 +110,6 @@ impl crate::input::Tokens for Lexer<'_> {
     #[inline]
     fn set_expr_allowed(&mut self, _: bool) {}
 
-    #[inline]
-    fn set_next_regexp(&mut self, start: Option<BytePos>) {
-        self.state.next_regexp = start;
-    }
-
     fn add_error(&mut self, error: Error) {
         self.errors.push(error);
     }
@@ -169,6 +163,36 @@ impl crate::input::Tokens for Lexer<'_> {
         self.state.token_value.take()
     }
 
+    fn scan_regex(&mut self) -> (TokenAndSpan, Option<(Atom, Atom)>) {
+        let start = self.cur_pos();
+        let (token, ret) = match self.read_regexp() {
+            Ok(ret) => (Token::Regex, Some(ret)),
+            Err(error) => {
+                self.state.set_token_value(TokenValue::Error(error));
+                (Token::Error, None)
+            }
+        };
+
+        let span = self.span(start);
+        if token != Token::Eof {
+            if let Some(comments) = self.comments_buffer.as_mut() {
+                comments.pending_to_comment(BufferedCommentKind::Leading, start);
+            }
+
+            self.state.set_token_type(token);
+            self.state.prev_hi = self.last_pos();
+            self.state.had_line_break_before_last = self.had_line_break_before_last();
+        }
+
+        // Attach span to token.
+        let token = TokenAndSpan {
+            token,
+            had_line_break: self.had_line_break_before_last(),
+            span,
+        };
+        (token, ret)
+    }
+
     fn rescan_jsx_token(&mut self, allow_multiline_jsx_text: bool, reset: BytePos) -> TokenAndSpan {
         unsafe {
             self.input.reset_to(reset);
         }
@@ -373,11 +397,6 @@ impl crate::input::Tokens for Lexer<'_> {
 
 impl Lexer<'_> {
     fn next_token(&mut self, start: &mut BytePos) -> Result<Token, Error> {
-        if let Some(next_regexp) = self.state.next_regexp {
-            *start = next_regexp;
-            return self.read_regexp(next_regexp);
-        }
-
         if self.state.is_first {
             if let Some(shebang) = self.read_shebang()? {
                 self.state.set_token_value(TokenValue::Word(shebang));
@@ -593,7 +612,6 @@ impl State {
             had_line_break: false,
             had_line_break_before_last: false,
             is_first: true,
-            next_regexp: None,
             start: BytePos(0),
             prev_hi: start_pos,
             token_value: None,
diff --git a/crates/swc_ecma_parser/src/lexer/token.rs b/crates/swc_ecma_parser/src/lexer/token.rs
index a36880935a5c..9b12f54a6a76 100644
--- a/crates/swc_ecma_parser/src/lexer/token.rs
+++ b/crates/swc_ecma_parser/src/lexer/token.rs
@@ -23,11 +23,6 @@ pub enum TokenValue {
         value: Wtf8Atom,
         raw: Atom,
     },
-    // regexp
-    Regex {
-        value: Atom,
-        flags: Atom,
-    },
     Num {
         value: f64,
         raw: Atom,
@@ -356,15 +351,6 @@ impl<'a> Token {
         Token::Template
     }
 
-    #[inline(always)]
-    pub fn regexp(content: Atom, flags: Atom, lexer: &mut crate::Lexer<'a>) -> Self {
-        lexer.set_token_value(Some(TokenValue::Regex {
-            value: content,
-            flags,
-        }));
-        Token::Regex
-    }
-
     #[inline(always)]
     pub fn num(value: f64, raw: Atom, lexer: &mut crate::Lexer<'a>) -> Self {
         lexer.set_token_value(Some(TokenValue::Num { value, raw }));
@@ -457,11 +443,6 @@ impl<'a> Token {
         (value.as_atom().cloned().unwrap(), raw)
     }
 
-    #[inline(always)]
-    pub fn take_regexp(self, buffer: &mut Buffer<impl Tokens>) -> (Atom, Atom) {
-        buffer.expect_regex_token_value()
-    }
-
     #[inline(always)]
     pub fn shebang(value: Atom, lexer: &mut Lexer) -> Self {
         lexer.set_token_value(Some(TokenValue::Word(value)));
@@ -651,10 +632,7 @@ impl Token {
                 return format!("bigint literal ({value}, {raw})");
             }
             Token::Regex => {
-                let Some(TokenValue::Regex { value, flags, .. }) = value else {
-                    unreachable!("{:#?}", value)
-                };
-                return format!("regexp literal ({value}, {flags})");
+                return "regexp literal".to_string();
             }
             Token::Template => {
                 let Some(TokenValue::Template { raw, .. }) = value else {
diff --git a/crates/swc_ecma_parser/src/parser/expr.rs b/crates/swc_ecma_parser/src/parser/expr.rs
index 9ad64f6ea1ba..028727210fec 100644
--- a/crates/swc_ecma_parser/src/parser/expr.rs
+++ b/crates/swc_ecma_parser/src/parser/expr.rs
@@ -340,11 +340,7 @@ impl<I: Tokens> Parser<I> {
                 return self.parse_lit().map(|lit| lit.into());
             }
             // Regexp
-            Token::Slash | Token::DivEq => {
-                if let Some(res) = self.try_parse_regexp(start) {
-                    return Ok(res);
-                }
-            }
+            Token::Slash | Token::DivEq => return self.parse_regexp(start),
             Token::LParen => return self.parse_paren_expr_or_arrow_fn(can_be_arrow, None),
             Token::NoSubstitutionTemplateLiteral => {
                 return Ok(self.parse_no_substitution_template_literal(false)?.into())
@@ -2592,45 +2588,38 @@ impl<I: Tokens> Parser<I> {
         }
     }
 
-    fn try_parse_regexp(&mut self, start: BytePos) -> Option<Box<Expr>> {
+    fn parse_regexp(&mut self, start: BytePos) -> PResult<Box<Expr>> {
         // Regexp
         debug_assert!(self.input().cur() == Token::Slash || self.input().cur() == Token::DivEq);
 
-        self.input_mut().set_next_regexp(Some(start));
-
-        self.bump(); // `/` or `/=`
-
-        let cur = self.input().cur();
-        if cur == Token::Regex {
-            self.input_mut().set_next_regexp(None);
-            let (exp, flags) = self.input_mut().expect_regex_token_and_bump();
-            let span = self.span(start);
-
-            let mut flags_count =
-                flags
-                    .chars()
-                    .fold(FxHashMap::<char, u32>::default(), |mut map, flag| {
-                        let key = match flag {
-                            // https://tc39.es/ecma262/#sec-isvalidregularexpressionliteral
-                            'd' | 'g' | 'i' | 'm' | 's' | 'u' | 'v' | 'y' => flag,
-                            _ => '\u{0000}', // special marker for unknown flags
-                        };
-                        map.entry(key).and_modify(|count| *count += 1).or_insert(1);
-                        map
-                    });
+        let Some((exp, flags)) = self.input_mut().scan_regex() else {
+            let error = self.input_mut().expect_error_token_and_bump();
+            return Err(error);
+        };
 
-            if flags_count.remove(&'\u{0000}').is_some() {
-                self.emit_err(span, SyntaxError::UnknownRegExpFlags);
-            }
+        let span = self.span(start);
+        let mut flags_count =
+            flags
+                .chars()
+                .fold(FxHashMap::<char, u32>::default(), |mut map, flag| {
+                    let key = match flag {
+                        // https://tc39.es/ecma262/#sec-isvalidregularexpressionliteral
+                        'd' | 'g' | 'i' | 'm' | 's' | 'u' | 'v' | 'y' => flag,
+                        _ => '\u{0000}', // special marker for unknown flags
+                    };
+                    map.entry(key).and_modify(|count| *count += 1).or_insert(1);
+                    map
+                });
 
-            if let Some((flag, _)) = flags_count.iter().find(|(_, count)| **count > 1) {
-                self.emit_err(span, SyntaxError::DuplicatedRegExpFlags(*flag));
-            }
+        if flags_count.remove(&'\u{0000}').is_some() {
+            self.emit_err(span, SyntaxError::UnknownRegExpFlags);
+        }
 
-            Some(Lit::Regex(Regex { span, exp, flags }).into())
-        } else {
-            None
+        if let Some((flag, _)) = flags_count.iter().find(|(_, count)| **count > 1) {
+            self.emit_err(span, SyntaxError::DuplicatedRegExpFlags(*flag));
         }
+
+        Ok(Lit::Regex(Regex { span, exp, flags }).into())
     }
 
     fn try_parse_async_start(&mut self, can_be_arrow: bool) -> Option<PResult<Box<Expr>>> {
diff --git a/crates/swc_ecma_parser/src/parser/input.rs b/crates/swc_ecma_parser/src/parser/input.rs
index 1ffa28353533..305c6731f1cd 100644
--- a/crates/swc_ecma_parser/src/parser/input.rs
+++ b/crates/swc_ecma_parser/src/parser/input.rs
@@ -28,7 +28,6 @@ pub trait Tokens: Clone + Iterator<Item = TokenAndSpan> {
     }
 
     fn set_expr_allowed(&mut self, allow: bool);
-    fn set_next_regexp(&mut self, start: Option<BytePos>);
 
     /// Implementors should use Rc<RefCell<Vec<Error>>>.
     ///
@@ -60,6 +59,7 @@ pub trait Tokens: Clone + Iterator<Item = TokenAndSpan> {
     fn get_token_value(&self) -> Option<&TokenValue>;
     fn set_token_value(&mut self, token_value: Option<TokenValue>);
 
+    fn scan_regex(&mut self) -> (TokenAndSpan, Option<(Atom, Atom)>);
     fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> TokenAndSpan;
     fn scan_jsx_open_el_terminal_token(&mut self) -> TokenAndSpan;
     fn rescan_jsx_open_el_terminal_token(&mut self, reset: BytePos) -> TokenAndSpan;
@@ -120,14 +120,6 @@ impl<I: Tokens> Buffer<I> {
         (value, raw)
     }
 
-    pub fn expect_regex_token_value(&mut self) -> (Atom, Atom) {
-        let Some(crate::lexer::TokenValue::Regex { value, flags }) = self.iter.take_token_value()
-        else {
-            unreachable!()
-        };
-        (value, flags)
-    }
-
     pub fn expect_template_token_value(&mut self) -> (LexResult<Wtf8Atom>, Atom) {
         let Some(crate::lexer::TokenValue::Template { cooked, raw }) = self.iter.take_token_value()
         else {
@@ -147,6 +139,14 @@ impl<I: Tokens> Buffer<I> {
         self.iter.get_token_value()
     }
 
+    pub(crate) fn scan_regex(&mut self) -> Option<(Atom, Atom)> {
+        let prev = self.cur;
+        let (t, ret) = self.iter.scan_regex();
+        self.prev_span = prev.span;
+        self.set_cur(t);
+        ret
+    }
+
     pub fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) {
         let prev = self.cur;
         let t = self.iter.scan_jsx_token(allow_multiline_jsx_text);
@@ -346,13 +346,6 @@ impl<I: Tokens> Buffer<I> {
         ret
     }
 
-    pub fn expect_regex_token_and_bump(&mut self) -> (Atom, Atom) {
-        let cur = self.cur();
-        let ret = cur.take_regexp(self);
-        self.bump();
-        ret
-    }
-
     pub fn expect_template_token_and_bump(&mut self) -> (LexResult<Wtf8Atom>, Atom) {
         let cur = self.cur();
         let ret = cur.take_template(self);
@@ -522,11 +515,6 @@ impl<I: Tokens> Buffer<I> {
         self.iter_mut().set_expr_allowed(allow)
     }
 
-    #[inline]
-    pub fn set_next_regexp(&mut self, start: Option<BytePos>) {
-        self.iter_mut().set_next_regexp(start);
-    }
-
     #[inline]
     pub fn end_pos(&self) -> BytePos {
         self.iter().end_pos()

From 3676fdabcb8c241914d8662fd2b8ada1f1046abc Mon Sep 17 00:00:00 2001
From: CPunisher <1343316114@qq.com>
Date: Mon, 27 Oct 2025 21:38:29 +0800
Subject: [PATCH 2/2] Reset to start

---
 crates/swc_ecma_parser/src/lexer/capturing.rs | 5 +++--
 crates/swc_ecma_parser/src/lexer/mod.rs       | 9 ++++++---
 crates/swc_ecma_parser/src/lexer/state.rs     | 5 ++---
 crates/swc_ecma_parser/src/parser/expr.rs     | 2 +-
 crates/swc_ecma_parser/src/parser/input.rs    | 6 +++---
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/crates/swc_ecma_parser/src/lexer/capturing.rs b/crates/swc_ecma_parser/src/lexer/capturing.rs
index 3b1369700ddc..3a0778bc79eb 100644
--- a/crates/swc_ecma_parser/src/lexer/capturing.rs
+++ b/crates/swc_ecma_parser/src/lexer/capturing.rs
@@ -1,6 +1,7 @@
 use std::mem;
 
 use swc_atoms::Atom;
+use swc_common::BytePos;
 
 use crate::{
     error::Error,
@@ -162,8 +163,8 @@ impl<I: Tokens> Tokens for Capturing<I> {
         self.inner.set_token_value(token_value);
     }
 
-    fn scan_regex(&mut self) -> (TokenAndSpan, Option<(Atom, Atom)>) {
-        let result = self.inner.scan_regex();
+    fn scan_regex(&mut self, start: BytePos) -> (TokenAndSpan, Option<(Atom, Atom)>) {
+        let result = self.inner.scan_regex(start);
         self.capture(result.0);
         result
     }
diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index cd1b02123035..47fff9871f02 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -1785,10 +1785,13 @@ impl<'a> Lexer<'a> {
     }
 
     /// Expects current char to be '/'
-    pub(crate) fn read_regexp(&mut self) -> LexResult<(Atom, Atom)> {
-        debug_assert_eq!(self.cur(), Some('/'));
+    pub(crate) fn read_regexp(&mut self, start: BytePos) -> LexResult<(Atom, Atom)> {
+        unsafe {
+            // Safety: start is valid position, and cur() is Some('/')
+            self.input_mut().reset_to(start);
+        }
 
-        let start = self.cur_pos();
+        debug_assert_eq!(self.cur(), Some('/'));
 
         self.bump(); // bump '/'
 
diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs
index 708b8dfa905d..a3b71b64b545 100644
--- a/crates/swc_ecma_parser/src/lexer/state.rs
+++ b/crates/swc_ecma_parser/src/lexer/state.rs
@@ -163,9 +163,8 @@ impl crate::input::Tokens for Lexer<'_> {
         self.state.token_value.take()
     }
 
-    fn scan_regex(&mut self) -> (TokenAndSpan, Option<(Atom, Atom)>) {
-        let start = self.cur_pos();
-        let (token, ret) = match self.read_regexp() {
+    fn scan_regex(&mut self, start: BytePos) -> (TokenAndSpan, Option<(Atom, Atom)>) {
+        let (token, ret) = match self.read_regexp(start) {
             Ok(ret) => (Token::Regex, Some(ret)),
             Err(error) => {
                 self.state.set_token_value(TokenValue::Error(error));
diff --git a/crates/swc_ecma_parser/src/parser/expr.rs b/crates/swc_ecma_parser/src/parser/expr.rs
index 028727210fec..90445c4d34e8 100644
--- a/crates/swc_ecma_parser/src/parser/expr.rs
+++ b/crates/swc_ecma_parser/src/parser/expr.rs
@@ -2592,7 +2592,7 @@ impl<I: Tokens> Parser<I> {
         // Regexp
         debug_assert!(self.input().cur() == Token::Slash || self.input().cur() == Token::DivEq);
 
-        let Some((exp, flags)) = self.input_mut().scan_regex() else {
+        let Some((exp, flags)) = self.input_mut().scan_regex(start) else {
             let error = self.input_mut().expect_error_token_and_bump();
             return Err(error);
         };
diff --git a/crates/swc_ecma_parser/src/parser/input.rs b/crates/swc_ecma_parser/src/parser/input.rs
index 305c6731f1cd..2321f17b3493 100644
--- a/crates/swc_ecma_parser/src/parser/input.rs
+++ b/crates/swc_ecma_parser/src/parser/input.rs
@@ -59,7 +59,7 @@ pub trait Tokens: Clone + Iterator<Item = TokenAndSpan> {
     fn get_token_value(&self) -> Option<&TokenValue>;
     fn set_token_value(&mut self, token_value: Option<TokenValue>);
 
-    fn scan_regex(&mut self) -> (TokenAndSpan, Option<(Atom, Atom)>);
+    fn scan_regex(&mut self, start: BytePos) -> (TokenAndSpan, Option<(Atom, Atom)>);
     fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> TokenAndSpan;
     fn scan_jsx_open_el_terminal_token(&mut self) -> TokenAndSpan;
     fn rescan_jsx_open_el_terminal_token(&mut self, reset: BytePos) -> TokenAndSpan;
@@ -139,9 +139,9 @@ impl<I: Tokens> Buffer<I> {
         self.iter.get_token_value()
     }
 
-    pub(crate) fn scan_regex(&mut self) -> Option<(Atom, Atom)> {
+    pub(crate) fn scan_regex(&mut self, start: BytePos) -> Option<(Atom, Atom)> {
         let prev = self.cur;
-        let (t, ret) = self.iter.scan_regex();
+        let (t, ret) = self.iter.scan_regex(start);
         self.prev_span = prev.span;
         self.set_cur(t);
         ret