Skip to content

Commit e03a0d1

Browse files
fix: 8685
1 parent 68f6274 commit e03a0d1

File tree

3 files changed

+224
-21
lines changed

3 files changed

+224
-21
lines changed

crates/rolldown_binding/src/types/binding_magic_string.rs

Lines changed: 131 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
use std::sync::Arc;
33

44
use napi::bindgen_prelude::{Either, This};
5+
use napi::{Env, JsString};
56
use napi_derive::napi;
67
use rolldown_sourcemap::{JSONSourceMap, SourceMap};
78
use rolldown_utils::base64::to_standard_base64;
@@ -22,43 +23,87 @@ struct SerializableSourceMap<'a> {
2223
mappings: &'a String,
2324
}
2425

26+
/// Per-UTF-16-index mapping entry: UTF-8 byte offset plus surrogate code unit.
///
/// One of these is stored for every UTF-16 code unit of the source string so that a
/// JavaScript string index can be translated to a Rust byte offset in O(1).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Utf16Mapping {
  /// UTF-8 byte offset at this UTF-16 position.
  byte_offset: u32,
  /// Surrogate code unit stored at this position, or 0 when the position is not half
  /// of a surrogate pair (BMP characters and the end sentinel).
  /// High surrogates (0xD800–0xDBFF) and low surrogates (0xDC00–0xDFFF)
  /// store their actual code unit value, used to emit lone surrogates in `slice`.
  surrogate: u16,
}

impl Utf16Mapping {
  /// Returns `true` when this entry is the second (low) half of a surrogate pair,
  /// i.e. the index points into the middle of a non-BMP character.
  #[inline]
  fn is_low_surrogate(self) -> bool {
    matches!(self.surrogate, 0xDC00..=0xDFFF)
  }
}
43+
2544
#[derive(Clone)]
2645
struct CharToByteMapper {
27-
char_to_byte: Vec<u32>,
46+
/// One entry per UTF-16 code unit, plus a sentinel at the end.
47+
/// Length = utf16_len + 1. Indexed directly by JS string index.
48+
entries: Vec<Utf16Mapping>,
2849
}
2950

3051
impl CharToByteMapper {
52+
/// Builds a mapping from UTF-16 code unit positions (JS string indices) to UTF-8 byte offsets.
53+
///
54+
/// JavaScript strings are UTF-16 encoded, so all indices from JS are UTF-16 code unit positions.
55+
/// Characters outside the BMP (e.g. emoji `🤷`) use 2 UTF-16 code units (a surrogate pair) but
56+
/// are a single Rust `char`. This mapper accounts for that by pushing one entry per UTF-16 code
57+
/// unit, so the array is indexed directly by JS string index.
3158
#[expect(clippy::cast_possible_truncation)]
3259
fn new(s: &str) -> Self {
33-
let mut char_to_byte = Vec::with_capacity(s.chars().count() + 1);
34-
char_to_byte.push(0); // char 0 is at byte 0
60+
// UTF-16 length <= UTF-8 byte length for all strings, so s.len() + 1
61+
// is always a valid upper-bound capacity, avoiding a second pass over chars.
62+
let mut entries = Vec::with_capacity(s.len() + 1);
3563

3664
let mut byte_offset = 0u32;
3765
for ch in s.chars() {
38-
byte_offset += ch.len_utf16() as u32;
39-
char_to_byte.push(byte_offset);
66+
if ch.len_utf16() == 2 {
67+
let mut buf = [0u16; 2];
68+
ch.encode_utf16(&mut buf);
69+
// High surrogate: byte offset *before* the character.
70+
entries.push(Utf16Mapping { byte_offset, surrogate: buf[0] });
71+
byte_offset += ch.len_utf8() as u32;
72+
// Low surrogate: byte offset *after* the character.
73+
entries.push(Utf16Mapping { byte_offset, surrogate: buf[1] });
74+
} else {
75+
entries.push(Utf16Mapping { byte_offset, surrogate: 0 });
76+
byte_offset += ch.len_utf8() as u32;
77+
}
4078
}
79+
// End sentinel.
80+
entries.push(Utf16Mapping { byte_offset, surrogate: 0 });
4181

42-
Self { char_to_byte }
82+
Self { entries }
83+
}
84+
85+
#[inline]
86+
fn get(&self, utf16_index: u32) -> Option<Utf16Mapping> {
87+
self.entries.get(utf16_index as usize).copied()
4388
}
4489

4590
#[inline]
4691
fn char_to_byte(&self, char_offset: u32) -> Option<u32> {
47-
self.char_to_byte.get(char_offset as usize).copied()
92+
self.get(char_offset).map(|e| e.byte_offset)
4893
}
4994

50-
/// Returns the character count (number of characters in the string).
95+
/// Returns the UTF-16 code unit count of the original string.
96+
/// This matches JavaScript's `String.prototype.length`.
5197
fn char_count(&self) -> i64 {
52-
// The vector has N+1 elements for N characters (stores byte offset after each char)
5398
#[expect(clippy::cast_possible_wrap)]
54-
let count = (self.char_to_byte.len() - 1) as i64;
99+
let count = (self.entries.len() - 1) as i64;
55100
count
56101
}
57102

58-
/// Returns the total accumulated length (in the same units as `char_to_byte` entries).
103+
/// Returns the total UTF-8 byte length of the original string.
59104
/// This is the correct sentinel for out-of-bounds index clamping in `slice`.
60105
fn total_len(&self) -> u32 {
61-
self.char_to_byte.last().copied().unwrap_or(0)
106+
self.entries.last().map_or(0, |e| e.byte_offset)
62107
}
63108

64109
/// Normalizes a potentially negative index to a positive index.
@@ -635,8 +680,17 @@ impl BindingMagicString<'_> {
635680

636681
/// Returns the content between the specified original character positions.
637682
/// Supports negative indices (counting from the end).
683+
///
684+
/// When an index falls in the middle of a surrogate pair, the lone surrogate is
685+
/// included in the result (matching the original magic-string / JS behavior).
686+
/// This is done by returning a UTF-16 encoded JS string via `napi_create_string_utf16`.
638687
#[napi]
639-
pub fn slice(&self, start: Option<i64>, end: Option<i64>) -> napi::Result<String> {
688+
pub fn slice<'env>(
689+
&self,
690+
env: &'env Env,
691+
start: Option<i64>,
692+
end: Option<i64>,
693+
) -> napi::Result<JsString<'env>> {
640694
// Apply offset to both start and end (including defaults), then normalize negatives
641695
let start = self.apply_offset_i64(start.unwrap_or(0));
642696

@@ -652,17 +706,73 @@ impl BindingMagicString<'_> {
652706
let start = self.char_to_byte_mapper.normalize_index(start);
653707
let end = self.char_to_byte_mapper.normalize_index(end);
654708

655-
// Convert character indices to byte indices.
656-
// indices are non-negative after normalize_index and files are < 4GB.
657-
// Use total_len() (in the mapper's own units) as the out-of-bounds sentinel instead of
658-
// source().len() (UTF-8 bytes), which would be wrong for non-ASCII strings.
659-
let total_len = self.char_to_byte_mapper.total_len();
660709
#[expect(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
661-
let start_byte = self.char_to_byte_mapper.char_to_byte(start as u32).unwrap_or(total_len);
710+
let start_u32 = start as u32;
662711
#[expect(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
663-
let end_byte = self.char_to_byte_mapper.char_to_byte(end as u32).unwrap_or(total_len);
712+
let end_u32 = end as u32;
713+
714+
// Fetch the mapping entries once. If start/end fall on a low surrogate (middle
715+
// of a surrogate pair), we need special handling:
716+
// - start at LOW: prepend the lone low surrogate, UTF-8 slice starts after the char.
717+
// - end at LOW: use the previous entry's byte offset (before the char) and append
718+
// the lone high surrogate.
719+
// - HIGH surrogate positions already have the correct byte offset (before the char).
720+
let total_len = self.char_to_byte_mapper.total_len();
721+
let start_entry = self.char_to_byte_mapper.get(start_u32);
722+
let end_entry = self.char_to_byte_mapper.get(end_u32);
723+
724+
// When start == end, the result is always empty regardless of surrogate position.
725+
// Only check surrogates when the range is non-empty.
726+
let (start_is_low, end_prev_entry) = if start_u32 < end_u32 {
727+
let start_is_low = start_entry.is_some_and(Utf16Mapping::is_low_surrogate);
728+
let end_is_low = end_entry.is_some_and(Utf16Mapping::is_low_surrogate);
729+
// When end is a low surrogate, look up the preceding high surrogate entry once
730+
// (used for both the byte offset and the surrogate value to append).
731+
let end_prev = if end_is_low {
732+
debug_assert!(end_u32 > 0, "low surrogate cannot appear at index 0");
733+
self.char_to_byte_mapper.get(end_u32 - 1)
734+
} else {
735+
None
736+
};
737+
(start_is_low, end_prev)
738+
} else {
739+
(false, None)
740+
};
741+
742+
let start_byte = start_entry.map_or(total_len, |e| e.byte_offset);
743+
let end_byte = if let Some(prev) = end_prev_entry {
744+
// End falls on a low surrogate — use the high surrogate's byte_offset
745+
// (before the character) so the UTF-8 slice excludes it.
746+
prev.byte_offset
747+
} else {
748+
end_entry.map_or(total_len, |e| e.byte_offset)
749+
};
750+
// Clamp reversed ranges (e.g. slice(2, 1) on 'a🤷b') to empty.
751+
let end_byte = end_byte.max(start_byte);
752+
753+
let utf8_result =
754+
self.inner.slice(start_byte, Some(end_byte)).map_err(napi::Error::from_reason)?;
755+
756+
// Fast path: no lone surrogates involved — return the UTF-8 string directly,
757+
// avoiding the UTF-16 transcoding and allocation.
758+
if !start_is_low && end_prev_entry.is_none() {
759+
return env.create_string(&utf8_result);
760+
}
761+
762+
// Slow path: build UTF-16 buffer with lone surrogates at the boundaries.
763+
let mut utf16_buf: Vec<u16> = Vec::new();
764+
765+
if let Some(entry) = start_entry.filter(|e| e.is_low_surrogate()) {
766+
utf16_buf.push(entry.surrogate);
767+
}
768+
769+
utf16_buf.extend(utf8_result.encode_utf16());
770+
771+
if let Some(high_entry) = end_prev_entry {
772+
utf16_buf.push(high_entry.surrogate);
773+
}
664774

665-
self.inner.slice(start_byte, Some(end_byte)).map_err(napi::Error::from_reason)
775+
env.create_string_utf16(&utf16_buf)
666776
}
667777

668778
/// Generates a source map for the transformations applied to this MagicString.

packages/rolldown/src/binding.d.cts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1535,6 +1535,10 @@ export declare class BindingMagicString {
15351535
/**
15361536
* Returns the content between the specified original character positions.
15371537
* Supports negative indices (counting from the end).
1538+
*
1539+
* When an index falls in the middle of a surrogate pair, the lone surrogate is
1540+
* included in the result (matching the original magic-string / JS behavior).
1541+
* This is done by returning a UTF-16 encoded JS string via `napi_create_string_utf16`.
15381542
*/
15391543
slice(start?: number | undefined | null, end?: number | undefined | null): string
15401544
/**
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import assert from 'node:assert';
import { RolldownMagicString as MagicString } from 'rolldown';
import { describe, it } from 'vitest';

// All indices passed to MagicString are JavaScript string indices, i.e. UTF-16 code
// unit positions. These tests pin that behavior for characters that need more than
// one UTF-8 byte and/or more than one UTF-16 code unit.
describe('MagicString unicode handling', () => {
  // Exact repro from issue #8685
  it('should slice strings with emoji (surrogate pairs)', () => {
    const s = new MagicString('some 🤷‍♂️ string');
    // '🤷‍♂️' is composed of: 🤷 (U+1F937, 2 UTF-16 units) + ZWJ (U+200D, 1) + ♂ (U+2642, 1) + VS16 (U+FE0F, 1) = 5 UTF-16 units
    // 'some ' = 5 UTF-16 units, so emoji sequence ends at index 10
    assert.strictEqual(s.slice(0, 5), 'some ');
    assert.strictEqual(s.slice(10), ' string');
  });

  it('should overwrite across emoji boundaries', () => {
    const s = new MagicString('a🤷b');
    // 'a' = index 0-1, '🤷' = index 1-3 (2 UTF-16 units), 'b' = index 3-4
    s.overwrite(0, 3, 'replaced');
    assert.strictEqual(s.toString(), 'replacedb');
  });

  it('should remove emoji characters', () => {
    const s = new MagicString('hello🌍world');
    // 'hello' = 0-5, '🌍' = 5-7 (2 UTF-16 units), 'world' = 7-12
    s.remove(5, 7);
    assert.strictEqual(s.toString(), 'helloworld');
  });

  it('should handle CJK characters (3-byte UTF-8, 1 UTF-16 unit)', () => {
    const s = new MagicString('你好世界');
    // Each CJK character is 1 UTF-16 unit
    assert.strictEqual(s.slice(0, 2), '你好');
    assert.strictEqual(s.slice(2, 4), '世界');
  });

  it('should handle mixed ASCII, CJK, and emoji', () => {
    const s = new MagicString('hi你好🌍ok');
    // 'h'=0, 'i'=1, '你'=2, '好'=3, '🌍'=4-5 (surrogate pair), 'o'=6, 'k'=7
    assert.strictEqual(s.slice(0, 2), 'hi');
    assert.strictEqual(s.slice(2, 4), '你好');
    assert.strictEqual(s.slice(6, 8), 'ok');
  });

  it('should handle negative indices with multi-byte characters', () => {
    const s = new MagicString('abc🤷def');
    // Total length: 3 + 2 + 3 = 8 UTF-16 units
    // -3 should map to index 5 => 'def'
    assert.strictEqual(s.slice(-3), 'def');
  });

  it('should handle update with emoji', () => {
    const s = new MagicString('hello🌍world');
    // Replace '🌍' (indices 5-7) with ' '
    s.update(5, 7, ' ');
    assert.strictEqual(s.toString(), 'hello world');
  });

  it('should handle prepend/append left/right with emoji', () => {
    const s = new MagicString('a🤷b');
    // '🤷' starts at index 1, ends at index 3
    s.appendLeft(3, '!');
    assert.strictEqual(s.toString(), 'a🤷!b');
  });

  it('should return lone surrogates when indexing middle of surrogate pair', () => {
    const s = new MagicString('a🤷b');
    // In JS: 'a'=0, high surrogate (0xD83E)=1, low surrogate (0xDD37)=2, 'b'=3
    // slice(1) starts at high surrogate — includes the full emoji
    assert.strictEqual(s.slice(1), '🤷b');
    // slice(2) starts at low surrogate — returns lone low surrogate + 'b'
    // matching JS behavior: 'a🤷b'.slice(2) === '\uDD37b'
    assert.strictEqual(s.slice(2), '\uDD37b');
    // slice(0, 2) ends at low surrogate — returns 'a' + lone high surrogate
    // matching JS behavior: 'a🤷b'.slice(0, 2) === 'a\uD83E'
    assert.strictEqual(s.slice(0, 2), 'a\uD83E');
    // slice(3) is 'b'
    assert.strictEqual(s.slice(3), 'b');
  });

  it('should return an empty string for slice(i, i) at a low-surrogate index', () => {
    const s = new MagicString('a🤷b');
    assert.strictEqual(s.slice(2, 2), '');
  });

  it('should return an empty string for reversed ranges across a surrogate boundary', () => {
    const s = new MagicString('a🤷b');
    assert.strictEqual(s.slice(2, 1), '');
  });
});

0 commit comments

Comments
 (0)