fix: 8685

IWANABETHATGUY · IWANABETHATGUY · commit e03a0d14d2ab · 2026-03-15T00:38:10.000+08:00
diff --git a/crates/rolldown_binding/src/types/binding_magic_string.rs b/crates/rolldown_binding/src/types/binding_magic_string.rs
@@ -2,6 +2,7 @@
 use std::sync::Arc;
 
 use napi::bindgen_prelude::{Either, This};
+use napi::{Env, JsString};
 use napi_derive::napi;
 use rolldown_sourcemap::{JSONSourceMap, SourceMap};
 use rolldown_utils::base64::to_standard_base64;
@@ -22,43 +23,87 @@ struct SerializableSourceMap<'a> {
   mappings: &'a String,
 }
 
+/// Per-UTF-16-index mapping entry: byte offset + surrogate code unit.
+#[derive(Clone, Copy)]
+struct Utf16Mapping {
+  /// UTF-8 byte offset at this UTF-16 position.
+  byte_offset: u32,
+  /// Surrogate code unit at this position; 0 for BMP characters and the end sentinel.
+  /// High surrogates (0xD800–0xDBFF) and low surrogates (0xDC00–0xDFFF)
+  /// store their actual code unit value, used to emit lone surrogates in `slice`.
+  surrogate: u16,
+}
+
+impl Utf16Mapping {
+  #[inline]
+  fn is_low_surrogate(self) -> bool {
+    (0xDC00..=0xDFFF).contains(&self.surrogate)
+  }
+}
+
 #[derive(Clone)]
 struct CharToByteMapper {
-  char_to_byte: Vec<u32>,
+  /// One entry per UTF-16 code unit, plus a sentinel at the end.
+  /// Length = utf16_len + 1. Indexed directly by JS string index.
+  entries: Vec<Utf16Mapping>,
 }
 
 impl CharToByteMapper {
+  /// Builds a mapping from UTF-16 code unit positions (JS string indices) to UTF-8 byte offsets.
+  ///
+  /// JavaScript strings are UTF-16 encoded, so all indices from JS are UTF-16 code unit positions.
+  /// Characters outside the BMP (e.g. emoji `🤷`) use 2 UTF-16 code units (a surrogate pair) but
+  /// are a single Rust `char`. This mapper accounts for that by pushing one entry per UTF-16 code
+  /// unit, so the array is indexed directly by JS string index.
   #[expect(clippy::cast_possible_truncation)]
   fn new(s: &str) -> Self {
-    let mut char_to_byte = Vec::with_capacity(s.chars().count() + 1);
-    char_to_byte.push(0); // char 0 is at byte 0
+    // UTF-16 length <= UTF-8 byte length for all strings, so s.len() + 1
+    // is always a valid upper-bound capacity, avoiding a second pass over chars.
+    let mut entries = Vec::with_capacity(s.len() + 1);
 
     let mut byte_offset = 0u32;
     for ch in s.chars() {
-      byte_offset += ch.len_utf16() as u32;
-      char_to_byte.push(byte_offset);
+      if ch.len_utf16() == 2 {
+        let mut buf = [0u16; 2];
+        ch.encode_utf16(&mut buf);
+        // High surrogate: byte offset *before* the character.
+        entries.push(Utf16Mapping { byte_offset, surrogate: buf[0] });
+        byte_offset += ch.len_utf8() as u32;
+        // Low surrogate: byte offset *after* the character.
+        entries.push(Utf16Mapping { byte_offset, surrogate: buf[1] });
+      } else {
+        entries.push(Utf16Mapping { byte_offset, surrogate: 0 });
+        byte_offset += ch.len_utf8() as u32;
+      }
     }
+    // End sentinel.
+    entries.push(Utf16Mapping { byte_offset, surrogate: 0 });
 
-    Self { char_to_byte }
+    Self { entries }
+  }
+
+  #[inline]
+  fn get(&self, utf16_index: u32) -> Option<Utf16Mapping> {
+    self.entries.get(utf16_index as usize).copied()
   }
 
   #[inline]
   fn char_to_byte(&self, char_offset: u32) -> Option<u32> {
-    self.char_to_byte.get(char_offset as usize).copied()
+    self.get(char_offset).map(|e| e.byte_offset)
   }
 
-  /// Returns the character count (number of characters in the string).
+  /// Returns the UTF-16 code unit count of the original string.
+  /// This matches JavaScript's `String.prototype.length`.
   fn char_count(&self) -> i64 {
-    // The vector has N+1 elements for N characters (stores byte offset after each char)
     #[expect(clippy::cast_possible_wrap)]
-    let count = (self.char_to_byte.len() - 1) as i64;
+    let count = (self.entries.len() - 1) as i64;
     count
   }
 
-  /// Returns the total accumulated length (in the same units as `char_to_byte` entries).
+  /// Returns the total UTF-8 byte length of the original string.
   /// This is the correct sentinel for out-of-bounds index clamping in `slice`.
   fn total_len(&self) -> u32 {
-    self.char_to_byte.last().copied().unwrap_or(0)
+    self.entries.last().map_or(0, |e| e.byte_offset)
   }
 
   /// Normalizes a potentially negative index to a positive index.
@@ -635,8 +680,17 @@ impl BindingMagicString<'_> {
 
   /// Returns the content between the specified original character positions.
   /// Supports negative indices (counting from the end).
+  ///
+  /// When an index falls in the middle of a surrogate pair, the lone surrogate is
+  /// included in the result (matching the original magic-string / JS behavior).
+  /// This is done by returning a UTF-16 encoded JS string via `napi_create_string_utf16`.
   #[napi]
-  pub fn slice(&self, start: Option<i64>, end: Option<i64>) -> napi::Result<String> {
+  pub fn slice<'env>(
+    &self,
+    env: &'env Env,
+    start: Option<i64>,
+    end: Option<i64>,
+  ) -> napi::Result<JsString<'env>> {
     // Apply offset to both start and end (including defaults), then normalize negatives
     let start = self.apply_offset_i64(start.unwrap_or(0));
 
@@ -652,17 +706,73 @@ impl BindingMagicString<'_> {
     let start = self.char_to_byte_mapper.normalize_index(start);
     let end = self.char_to_byte_mapper.normalize_index(end);
 
-    // Convert character indices to byte indices.
-    // indices are non-negative after normalize_index and files are < 4GB.
-    // Use total_len() (in the mapper's own units) as the out-of-bounds sentinel instead of
-    // source().len() (UTF-8 bytes), which would be wrong for non-ASCII strings.
-    let total_len = self.char_to_byte_mapper.total_len();
     #[expect(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
-    let start_byte = self.char_to_byte_mapper.char_to_byte(start as u32).unwrap_or(total_len);
+    let start_u32 = start as u32;
     #[expect(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
-    let end_byte = self.char_to_byte_mapper.char_to_byte(end as u32).unwrap_or(total_len);
+    let end_u32 = end as u32;
+
+    // Fetch the mapping entries once. If start/end fall on a low surrogate (middle
+    // of a surrogate pair), we need special handling:
+    // - start at LOW: prepend the lone low surrogate, UTF-8 slice starts after the char.
+    // - end at LOW: use the previous entry's byte offset (before the char) and append
+    //   the lone high surrogate.
+    // - HIGH surrogate positions already have the correct byte offset (before the char).
+    let total_len = self.char_to_byte_mapper.total_len();
+    let start_entry = self.char_to_byte_mapper.get(start_u32);
+    let end_entry = self.char_to_byte_mapper.get(end_u32);
+
+    // When start >= end, the result is always empty regardless of surrogate position.
+    // Only check surrogates when the range is non-empty.
+    let (start_is_low, end_prev_entry) = if start_u32 < end_u32 {
+      let start_is_low = start_entry.is_some_and(Utf16Mapping::is_low_surrogate);
+      let end_is_low = end_entry.is_some_and(Utf16Mapping::is_low_surrogate);
+      // When end is a low surrogate, look up the preceding high surrogate entry once
+      // (used for both the byte offset and the surrogate value to append).
+      let end_prev = if end_is_low {
+        debug_assert!(end_u32 > 0, "low surrogate cannot appear at index 0");
+        self.char_to_byte_mapper.get(end_u32 - 1)
+      } else {
+        None
+      };
+      (start_is_low, end_prev)
+    } else {
+      (false, None)
+    };
+
+    let start_byte = start_entry.map_or(total_len, |e| e.byte_offset);
+    let end_byte = if let Some(prev) = end_prev_entry {
+      // End falls on a low surrogate — use the high surrogate's byte_offset
+      // (before the character) so the UTF-8 slice excludes it.
+      prev.byte_offset
+    } else {
+      end_entry.map_or(total_len, |e| e.byte_offset)
+    };
+    // Clamp reversed ranges (e.g. slice(2, 1) on 'a🤷b') to empty.
+    let end_byte = end_byte.max(start_byte);
+
+    let utf8_result =
+      self.inner.slice(start_byte, Some(end_byte)).map_err(napi::Error::from_reason)?;
+
+    // Fast path: no lone surrogates involved — return the UTF-8 string directly,
+    // avoiding the UTF-16 transcoding and allocation.
+    if !start_is_low && end_prev_entry.is_none() {
+      return env.create_string(&utf8_result);
+    }
+
+    // Slow path: build UTF-16 buffer with lone surrogates at the boundaries.
+    let mut utf16_buf: Vec<u16> = Vec::new();
+
+    if let Some(entry) = start_entry.filter(|e| e.is_low_surrogate()) {
+      utf16_buf.push(entry.surrogate);
+    }
+
+    utf16_buf.extend(utf8_result.encode_utf16());
+
+    if let Some(high_entry) = end_prev_entry {
+      utf16_buf.push(high_entry.surrogate);
+    }
 
-    self.inner.slice(start_byte, Some(end_byte)).map_err(napi::Error::from_reason)
+    env.create_string_utf16(&utf16_buf)
   }
 
   /// Generates a source map for the transformations applied to this MagicString.
diff --git a/packages/rolldown/src/binding.d.cts b/packages/rolldown/src/binding.d.cts
@@ -1535,6 +1535,10 @@ export declare class BindingMagicString {
   /**
    * Returns the content between the specified original character positions.
    * Supports negative indices (counting from the end).
+   *
+   * When an index falls in the middle of a surrogate pair, the lone surrogate is
+   * included in the result (matching the original magic-string / JS behavior).
+   * This is done by returning a UTF-16 encoded JS string via `napi_create_string_utf16`.
    */
   slice(start?: number | undefined | null, end?: number | undefined | null): string
   /**
diff --git a/packages/rolldown/tests/magic-string/magic-string-unicode.test.ts b/packages/rolldown/tests/magic-string/magic-string-unicode.test.ts
@@ -0,0 +1,89 @@
+import assert from 'node:assert';
+import { RolldownMagicString as MagicString } from 'rolldown';
+import { describe, it } from 'vitest';
+
+describe('MagicString unicode handling', () => {
+  // Exact repro from issue #8685
+  it('should slice strings with emoji (surrogate pairs)', () => {
+    const s = new MagicString('some 🤷‍♂️ string');
+    // '🤷‍♂️' is composed of: 🤷 (U+1F937, 2 UTF-16 units) + ZWJ (U+200D, 1) + ♂ (U+2642, 1) + VS16 (U+FE0F, 1) = 5 UTF-16 units
+    // 'some ' = 5 UTF-16 units, so emoji sequence ends at index 10
+    assert.strictEqual(s.slice(0, 5), 'some ');
+    assert.strictEqual(s.slice(10), ' string');
+  });
+
+  it('should overwrite across emoji boundaries', () => {
+    const s = new MagicString('a🤷b');
+    // 'a' = index 0-1, '🤷' = index 1-3 (2 UTF-16 units), 'b' = index 3-4
+    s.overwrite(0, 3, 'replaced');
+    assert.strictEqual(s.toString(), 'replacedb');
+  });
+
+  it('should remove emoji characters', () => {
+    const s = new MagicString('hello🌍world');
+    // 'hello' = 0-5, '🌍' = 5-7 (2 UTF-16 units), 'world' = 7-12
+    s.remove(5, 7);
+    assert.strictEqual(s.toString(), 'helloworld');
+  });
+
+  it('should handle CJK characters (3-byte UTF-8, 1 UTF-16 unit)', () => {
+    const s = new MagicString('你好世界');
+    // Each CJK character is 1 UTF-16 unit
+    assert.strictEqual(s.slice(0, 2), '你好');
+    assert.strictEqual(s.slice(2, 4), '世界');
+  });
+
+  it('should handle mixed ASCII, CJK, and emoji', () => {
+    const s = new MagicString('hi你好🌍ok');
+    // 'h'=0, 'i'=1, '你'=2, '好'=3, '🌍'=4-5 (surrogate pair), 'o'=6, 'k'=7
+    assert.strictEqual(s.slice(0, 2), 'hi');
+    assert.strictEqual(s.slice(2, 4), '你好');
+    assert.strictEqual(s.slice(6, 8), 'ok');
+  });
+
+  it('should handle negative indices with multi-byte characters', () => {
+    const s = new MagicString('abc🤷def');
+    // Total length: 3 + 2 + 3 = 8 UTF-16 units
+    // -3 should map to index 5 => 'def'
+    assert.strictEqual(s.slice(-3), 'def');
+  });
+
+  it('should handle update with emoji', () => {
+    const s = new MagicString('hello🌍world');
+    // Replace '🌍' (indices 5-7) with ' '
+    s.update(5, 7, ' ');
+    assert.strictEqual(s.toString(), 'hello world');
+  });
+
+  it('should handle prepend/append left/right with emoji', () => {
+    const s = new MagicString('a🤷b');
+    // '🤷' starts at index 1, ends at index 3
+    s.appendLeft(3, '!');
+    assert.strictEqual(s.toString(), 'a🤷!b');
+  });
+
+  it('should return lone surrogates when indexing middle of surrogate pair', () => {
+    const s = new MagicString('a🤷b');
+    // In JS: 'a'=0, high surrogate (0xD83E)=1, low surrogate (0xDD37)=2, 'b'=3
+    // slice(1) starts at high surrogate — includes the full emoji
+    assert.strictEqual(s.slice(1), '🤷b');
+    // slice(2) starts at low surrogate — returns lone low surrogate + 'b'
+    // matching JS behavior: 'a🤷b'.slice(2) === '\uDD37b'
+    assert.strictEqual(s.slice(2), '\uDD37b');
+    // slice(0, 2) ends at low surrogate — returns 'a' + lone high surrogate
+    // matching JS behavior: 'a🤷b'.slice(0, 2) === 'a\uD83E'
+    assert.strictEqual(s.slice(0, 2), 'a\uD83E');
+    // slice(3) is 'b'
+    assert.strictEqual(s.slice(3), 'b');
+  });
+
+  it('should return an empty string for slice(i, i) at a low-surrogate index', () => {
+    const s = new MagicString('a🤷b');
+    assert.strictEqual(s.slice(2, 2), '');
+  });
+
+  it('should return an empty string for reversed ranges across a surrogate boundary', () => {
+    const s = new MagicString('a🤷b');
+    assert.strictEqual(s.slice(2, 1), '');
+  });
+});