Skip to content

Commit 5016d92

Browse files
committed
perf(estree/tokens): handle regex tokens separately (#19796)
Remove the `regex` field from `EstreeToken` and serialize regex tokens via their own dedicated type `EstreeRegExpToken`. This removes a branch and logic from `emit_token`, which is the path that processes all tokens except identifiers. This PR actually shows a small perf degradation (-1% on ESTree tokens benchmark), but it enables a much bigger optimization later on in #19814.
1 parent 780a68e commit 5016d92

File tree

1 file changed

+56
-53
lines changed
  • crates/oxc_estree_tokens/src

1 file changed

+56
-53
lines changed

crates/oxc_estree_tokens/src/lib.rs

Lines changed: 56 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -46,33 +46,55 @@ type PrettyTokenSerializer = ESTreeSerializer<TokenConfig, PrettyFormatter>;
4646
pub struct EstreeToken<'a> {
4747
pub token_type: TokenType,
4848
pub value: &'a str,
49-
pub regex: Option<EstreeRegExpToken<'a>>,
5049
pub span: Span,
5150
}
5251

53-
pub struct EstreeRegExpToken<'a> {
54-
pub pattern: &'a str,
55-
pub flags: &'a str,
56-
}
57-
5852
impl ESTree for EstreeToken<'_> {
5953
fn serialize<S: Serializer>(&self, serializer: S) {
6054
let mut state = serializer.serialize_struct();
6155
state.serialize_field("type", &JsonSafeString(self.token_type.as_str()));
6256
state.serialize_field("value", &self.value);
63-
if let Some(regex) = &self.regex {
64-
state.serialize_field("regex", regex);
65-
}
6657
state.serialize_span(self.span);
6758
state.end();
6859
}
6960
}
7061

62+
/// Token type for RegExps.
63+
///
64+
/// This is a separate type from `EstreeToken` because RegExp tokens have a nested `regex` object
65+
/// containing `flags` and `pattern`, and the token type is always `"RegularExpression"`.
66+
/// Pattern is taken from the AST node (`RegExpLiteral.regex.pattern.text`), and flags are sliced
67+
/// from source text to preserve the original order (the AST stores flags as a bitfield which
68+
/// would alphabetize them).
69+
struct EstreeRegExpToken<'a> {
70+
value: &'a str,
71+
regex: RegExpData<'a>,
72+
span: Span,
73+
}
74+
75+
/// The `regex` sub-object inside a `RegularExpression` token.
76+
struct RegExpData<'a> {
77+
pattern: &'a str,
78+
flags: &'a str,
79+
}
80+
7181
impl ESTree for EstreeRegExpToken<'_> {
82+
fn serialize<S: Serializer>(&self, serializer: S) {
83+
let mut state = serializer.serialize_struct();
84+
state.serialize_field("type", &JsonSafeString("RegularExpression"));
85+
state.serialize_field("value", &self.value);
86+
state.serialize_field("regex", &self.regex);
87+
state.serialize_span(self.span);
88+
state.end();
89+
}
90+
}
91+
92+
impl ESTree for RegExpData<'_> {
7293
fn serialize<S: Serializer>(&self, serializer: S) {
7394
let mut state = serializer.serialize_struct();
7495
state.serialize_field("pattern", &self.pattern);
75-
state.serialize_field("flags", &self.flags);
96+
// Flags are single ASCII letters (d, g, i, m, s, u, v, y) — always JSON-safe
97+
state.serialize_field("flags", &JsonSafeString(self.flags));
7698
state.end();
7799
}
78100
}
@@ -397,33 +419,21 @@ impl<'b, S: SequenceSerializer> EstreeTokenContext<'b, S> {
397419
unreachable!("Expected token at position {start}");
398420
}
399421

400-
/// Serialize a single token.
422+
/// Serialize a single token using its raw source text as the value.
401423
fn emit_token(&mut self, token: &Token, token_type: TokenType) {
402424
let value = &self.source_text[token.start() as usize..token.end() as usize];
403-
let regex = if token.kind() == Kind::RegExp {
404-
regex_parts(value).map(|(pattern, flags)| EstreeRegExpToken { pattern, flags })
405-
} else {
406-
None
407-
};
408-
self.serialize_token(token, token_type, value, regex);
425+
self.serialize_token(token, token_type, value);
409426
}
410427

411428
/// Convert span to UTF-16 and serialize token.
412-
fn serialize_token(
413-
&mut self,
414-
token: &Token,
415-
token_type: TokenType,
416-
value: &str,
417-
regex: Option<EstreeRegExpToken<'_>>,
418-
) {
429+
fn serialize_token(&mut self, token: &Token, token_type: TokenType, value: &str) {
419430
// Convert offsets to UTF-16
420431
let mut span = Span::new(token.start(), token.end());
421432
if let Some(converter) = self.span_converter.as_mut() {
422433
converter.convert_span(&mut span);
423434
}
424435

425-
let estree_token = EstreeToken { token_type, value, regex, span };
426-
self.seq.serialize_element(&estree_token);
436+
self.seq.serialize_element(&EstreeToken { token_type, value, span });
427437
}
428438

429439
/// Serialize a token whose value is guaranteed JSON-safe, skipping escape-checking.
@@ -561,12 +571,31 @@ impl<'a, S: SequenceSerializer> Visit<'a> for EstreeTokenContext<'_, S> {
561571
fn emit<S: SequenceSerializer>(ctx: &mut EstreeTokenContext<'_, S>, token: &Token) {
562572
// Strip leading `#`
563573
let value = &ctx.source_text[token.start() as usize + 1..token.end() as usize];
564-
ctx.serialize_token(token, TokenType::new("PrivateIdentifier"), value, None);
574+
ctx.serialize_token(token, TokenType::new("PrivateIdentifier"), value);
565575
}
566576
emit(self, token);
567577
}
568578
}
569579

580+
fn visit_reg_exp_literal(&mut self, regexp: &RegExpLiteral<'a>) {
581+
let token = self.advance_to(regexp.span.start);
582+
583+
let value = regexp.raw.as_deref().unwrap();
584+
let pattern = regexp.regex.pattern.text.as_str();
585+
586+
// Flags start after opening `/`, pattern, and closing `/`
587+
let flags = &value[pattern.len() + 2..];
588+
let regex = RegExpData { pattern, flags };
589+
590+
// Convert offsets to UTF-16
591+
let mut span = Span::new(token.start(), token.end());
592+
if let Some(converter) = self.span_converter.as_mut() {
593+
converter.convert_span(&mut span);
594+
}
595+
596+
self.seq.serialize_element(&EstreeRegExpToken { value, regex, span });
597+
}
598+
570599
fn visit_ts_this_parameter(&mut self, parameter: &TSThisParameter<'a>) {
571600
self.emit_token_at(parameter.this_span.start, TokenType::new("Identifier"));
572601
walk::walk_ts_this_parameter(self, parameter);
@@ -708,29 +737,3 @@ fn get_token_type(kind: Kind) -> TokenType {
708737
_ => TokenType::new("Punctuator"),
709738
}
710739
}
711-
712-
fn regex_parts(raw: &str) -> Option<(&str, &str)> {
713-
let bytes = raw.as_bytes();
714-
if bytes.first() != Some(&b'/') {
715-
return None;
716-
}
717-
718-
let mut escaped = false;
719-
let mut in_character_class = false;
720-
for index in 1..bytes.len() {
721-
let byte = bytes[index];
722-
if escaped {
723-
escaped = false;
724-
continue;
725-
}
726-
match byte {
727-
b'\\' => escaped = true,
728-
b'[' if !in_character_class => in_character_class = true,
729-
b']' if in_character_class => in_character_class = false,
730-
b'/' if !in_character_class => return Some((&raw[1..index], &raw[index + 1..])),
731-
_ => {}
732-
}
733-
}
734-
735-
None
736-
}

0 commit comments

Comments
 (0)