22use std:: sync:: Arc ;
33
44use napi:: bindgen_prelude:: { Either , This } ;
5+ use napi:: { Env , JsString } ;
56use napi_derive:: napi;
67use rolldown_sourcemap:: { JSONSourceMap , SourceMap } ;
78use rolldown_utils:: base64:: to_standard_base64;
@@ -22,43 +23,87 @@ struct SerializableSourceMap<'a> {
2223 mappings : & ' a String ,
2324}
2425
/// One slot of the UTF-16 → UTF-8 lookup table.
///
/// Each slot corresponds to a single UTF-16 code unit position (i.e. a JS string
/// index) and records where that position lands in the UTF-8 encoded source.
#[derive(Clone, Copy)]
struct Utf16Mapping {
    /// UTF-8 byte offset associated with this UTF-16 position.
    byte_offset: u32,
    /// Surrogate code unit stored at this position, or `0` when there is none.
    ///
    /// BMP characters and the end sentinel store `0`. Positions that belong to a
    /// surrogate pair store the real code unit (high: 0xD800–0xDBFF, low:
    /// 0xDC00–0xDFFF) so that `slice` can emit a lone surrogate when a boundary
    /// splits the pair.
    surrogate: u16,
}

impl Utf16Mapping {
    /// Returns `true` when this slot holds the low (trailing) half of a surrogate
    /// pair, i.e. the position sits in the middle of an astral character.
    #[inline]
    fn is_low_surrogate(self) -> bool {
        matches!(self.surrogate, 0xDC00..=0xDFFF)
    }
}
43+
2544#[ derive( Clone ) ]
2645struct CharToByteMapper {
27- char_to_byte : Vec < u32 > ,
46+ /// One entry per UTF-16 code unit, plus a sentinel at the end.
47+ /// Length = utf16_len + 1. Indexed directly by JS string index.
48+ entries : Vec < Utf16Mapping > ,
2849}
2950
3051impl CharToByteMapper {
52+ /// Builds a mapping from UTF-16 code unit positions (JS string indices) to UTF-8 byte offsets.
53+ ///
54+ /// JavaScript strings are UTF-16 encoded, so all indices from JS are UTF-16 code unit positions.
55+ /// Characters outside the BMP (e.g. emoji `🤷`) use 2 UTF-16 code units (a surrogate pair) but
56+ /// are a single Rust `char`. This mapper accounts for that by pushing one entry per UTF-16 code
57+ /// unit, so the array is indexed directly by JS string index.
3158 #[ expect( clippy:: cast_possible_truncation) ]
3259 fn new ( s : & str ) -> Self {
33- let mut char_to_byte = Vec :: with_capacity ( s. chars ( ) . count ( ) + 1 ) ;
34- char_to_byte. push ( 0 ) ; // char 0 is at byte 0
60+ // UTF-16 length <= UTF-8 byte length for all strings, so s.len() + 1
61+ // is always a valid upper-bound capacity, avoiding a second pass over chars.
62+ let mut entries = Vec :: with_capacity ( s. len ( ) + 1 ) ;
3563
3664 let mut byte_offset = 0u32 ;
3765 for ch in s. chars ( ) {
38- byte_offset += ch. len_utf16 ( ) as u32 ;
39- char_to_byte. push ( byte_offset) ;
66+ if ch. len_utf16 ( ) == 2 {
67+ let mut buf = [ 0u16 ; 2 ] ;
68+ ch. encode_utf16 ( & mut buf) ;
69+ // High surrogate: byte offset *before* the character.
70+ entries. push ( Utf16Mapping { byte_offset, surrogate : buf[ 0 ] } ) ;
71+ byte_offset += ch. len_utf8 ( ) as u32 ;
72+ // Low surrogate: byte offset *after* the character.
73+ entries. push ( Utf16Mapping { byte_offset, surrogate : buf[ 1 ] } ) ;
74+ } else {
75+ entries. push ( Utf16Mapping { byte_offset, surrogate : 0 } ) ;
76+ byte_offset += ch. len_utf8 ( ) as u32 ;
77+ }
4078 }
79+ // End sentinel.
80+ entries. push ( Utf16Mapping { byte_offset, surrogate : 0 } ) ;
4181
42- Self { char_to_byte }
82+ Self { entries }
83+ }
84+
85+ #[ inline]
86+ fn get ( & self , utf16_index : u32 ) -> Option < Utf16Mapping > {
87+ self . entries . get ( utf16_index as usize ) . copied ( )
4388 }
4489
4590 #[ inline]
4691 fn char_to_byte ( & self , char_offset : u32 ) -> Option < u32 > {
47- self . char_to_byte . get ( char_offset as usize ) . copied ( )
92+ self . get ( char_offset) . map ( |e| e . byte_offset )
4893 }
4994
50- /// Returns the character count (number of characters in the string).
95+ /// Returns the UTF-16 code unit count of the original string.
96+ /// This matches JavaScript's `String.prototype.length`.
5197 fn char_count ( & self ) -> i64 {
52- // The vector has N+1 elements for N characters (stores byte offset after each char)
5398 #[ expect( clippy:: cast_possible_wrap) ]
54- let count = ( self . char_to_byte . len ( ) - 1 ) as i64 ;
99+ let count = ( self . entries . len ( ) - 1 ) as i64 ;
55100 count
56101 }
57102
58- /// Returns the total accumulated length (in the same units as `char_to_byte` entries) .
103+ /// Returns the total UTF-8 byte length of the original string .
59104 /// This is the correct sentinel for out-of-bounds index clamping in `slice`.
60105 fn total_len ( & self ) -> u32 {
61- self . char_to_byte . last ( ) . copied ( ) . unwrap_or ( 0 )
106+ self . entries . last ( ) . map_or ( 0 , |e| e . byte_offset )
62107 }
63108
64109 /// Normalizes a potentially negative index to a positive index.
@@ -635,8 +680,17 @@ impl BindingMagicString<'_> {
635680
636681 /// Returns the content between the specified original character positions.
637682 /// Supports negative indices (counting from the end).
683+ ///
684+ /// When an index falls in the middle of a surrogate pair, the lone surrogate is
685+ /// included in the result (matching the original magic-string / JS behavior).
686+ /// This is done by returning a UTF-16 encoded JS string via `napi_create_string_utf16`.
638687 #[ napi]
639- pub fn slice ( & self , start : Option < i64 > , end : Option < i64 > ) -> napi:: Result < String > {
688+ pub fn slice < ' env > (
689+ & self ,
690+ env : & ' env Env ,
691+ start : Option < i64 > ,
692+ end : Option < i64 > ,
693+ ) -> napi:: Result < JsString < ' env > > {
640694 // Apply offset to both start and end (including defaults), then normalize negatives
641695 let start = self . apply_offset_i64 ( start. unwrap_or ( 0 ) ) ;
642696
@@ -652,17 +706,73 @@ impl BindingMagicString<'_> {
652706 let start = self . char_to_byte_mapper . normalize_index ( start) ;
653707 let end = self . char_to_byte_mapper . normalize_index ( end) ;
654708
655- // Convert character indices to byte indices.
656- // indices are non-negative after normalize_index and files are < 4GB.
657- // Use total_len() (in the mapper's own units) as the out-of-bounds sentinel instead of
658- // source().len() (UTF-8 bytes), which would be wrong for non-ASCII strings.
659- let total_len = self . char_to_byte_mapper . total_len ( ) ;
660709 #[ expect( clippy:: cast_sign_loss, clippy:: cast_possible_truncation) ]
661- let start_byte = self . char_to_byte_mapper . char_to_byte ( start as u32 ) . unwrap_or ( total_len ) ;
710+ let start_u32 = start as u32 ;
662711 #[ expect( clippy:: cast_sign_loss, clippy:: cast_possible_truncation) ]
663- let end_byte = self . char_to_byte_mapper . char_to_byte ( end as u32 ) . unwrap_or ( total_len) ;
712+ let end_u32 = end as u32 ;
713+
714+ // Fetch the mapping entries once. If start/end fall on a low surrogate (middle
715+ // of a surrogate pair), we need special handling:
716+ // - start at LOW: prepend the lone low surrogate, UTF-8 slice starts after the char.
717+ // - end at LOW: use the previous entry's byte offset (before the char) and append
718+ // the lone high surrogate.
719+ // - HIGH surrogate positions already have the correct byte offset (before the char).
720+ let total_len = self . char_to_byte_mapper . total_len ( ) ;
721+ let start_entry = self . char_to_byte_mapper . get ( start_u32) ;
722+ let end_entry = self . char_to_byte_mapper . get ( end_u32) ;
723+
724+ // When start == end, the result is always empty regardless of surrogate position.
725+ // Only check surrogates when the range is non-empty.
726+ let ( start_is_low, end_prev_entry) = if start_u32 < end_u32 {
727+ let start_is_low = start_entry. is_some_and ( Utf16Mapping :: is_low_surrogate) ;
728+ let end_is_low = end_entry. is_some_and ( Utf16Mapping :: is_low_surrogate) ;
729+ // When end is a low surrogate, look up the preceding high surrogate entry once
730+ // (used for both the byte offset and the surrogate value to append).
731+ let end_prev = if end_is_low {
732+ debug_assert ! ( end_u32 > 0 , "low surrogate cannot appear at index 0" ) ;
733+ self . char_to_byte_mapper . get ( end_u32 - 1 )
734+ } else {
735+ None
736+ } ;
737+ ( start_is_low, end_prev)
738+ } else {
739+ ( false , None )
740+ } ;
741+
742+ let start_byte = start_entry. map_or ( total_len, |e| e. byte_offset ) ;
743+ let end_byte = if let Some ( prev) = end_prev_entry {
744+ // End falls on a low surrogate — use the high surrogate's byte_offset
745+ // (before the character) so the UTF-8 slice excludes it.
746+ prev. byte_offset
747+ } else {
748+ end_entry. map_or ( total_len, |e| e. byte_offset )
749+ } ;
750+ // Clamp reversed ranges (e.g. slice(2, 1) on 'a🤷b') to empty.
751+ let end_byte = end_byte. max ( start_byte) ;
752+
753+ let utf8_result =
754+ self . inner . slice ( start_byte, Some ( end_byte) ) . map_err ( napi:: Error :: from_reason) ?;
755+
756+ // Fast path: no lone surrogates involved — return the UTF-8 string directly,
757+ // avoiding the UTF-16 transcoding and allocation.
758+ if !start_is_low && end_prev_entry. is_none ( ) {
759+ return env. create_string ( & utf8_result) ;
760+ }
761+
762+ // Slow path: build UTF-16 buffer with lone surrogates at the boundaries.
763+ let mut utf16_buf: Vec < u16 > = Vec :: new ( ) ;
764+
765+ if let Some ( entry) = start_entry. filter ( |e| e. is_low_surrogate ( ) ) {
766+ utf16_buf. push ( entry. surrogate ) ;
767+ }
768+
769+ utf16_buf. extend ( utf8_result. encode_utf16 ( ) ) ;
770+
771+ if let Some ( high_entry) = end_prev_entry {
772+ utf16_buf. push ( high_entry. surrogate ) ;
773+ }
664774
665- self . inner . slice ( start_byte , Some ( end_byte ) ) . map_err ( napi :: Error :: from_reason )
775+ env . create_string_utf16 ( & utf16_buf )
666776 }
667777
668778 /// Generates a source map for the transformations applied to this MagicString.
0 commit comments