@@ -89,6 +89,24 @@ impl CodePoint {
89
89
self . value
90
90
}
91
91
92
+ /// Returns the numeric value of the code point if it is a leading surrogate.
93
+ #[ inline]
94
+ pub fn to_lead_surrogate ( & self ) -> Option < u16 > {
95
+ match self . value {
96
+ lead @ 0xD800 ..=0xDBFF => Some ( lead as u16 ) ,
97
+ _ => None ,
98
+ }
99
+ }
100
+
101
+ /// Returns the numeric value of the code point if it is a trailing surrogate.
102
+ #[ inline]
103
+ pub fn to_trail_surrogate ( & self ) -> Option < u16 > {
104
+ match self . value {
105
+ trail @ 0xDC00 ..=0xDFFF => Some ( trail as u16 ) ,
106
+ _ => None ,
107
+ }
108
+ }
109
+
92
110
/// Optionally returns a Unicode scalar value for the code point.
93
111
///
94
112
/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
@@ -117,6 +135,14 @@ impl CodePoint {
117
135
// Only `Clone` is derived. The comparison traits are implemented by hand
// below so that they look at `bytes` alone: `is_known_utf8` is a conservative
// cache, not part of the string's value, and two buffers holding the same
// bytes must compare equal regardless of how each one was built.
#[derive(Clone)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,

    /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
    /// know this if we're constructed from a `String` or `&str`.
    ///
    /// It is possible for `bytes` to have valid UTF-8 without this being
    /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
    /// paired, as we don't bother to rescan the entire string.
    is_known_utf8: bool,
}

impl PartialEq for Wtf8Buf {
    /// Two buffers are equal exactly when their bytes are equal.
    #[inline]
    fn eq(&self, other: &Wtf8Buf) -> bool {
        self.bytes == other.bytes
    }
}

impl Eq for Wtf8Buf {}

impl PartialOrd for Wtf8Buf {
    /// Delegates to `Ord::cmp`, so the ordering is always total.
    #[inline]
    fn partial_cmp(&self, other: &Wtf8Buf) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Wtf8Buf {
    /// Orders buffers lexicographically by their bytes.
    #[inline]
    fn cmp(&self, other: &Wtf8Buf) -> core::cmp::Ordering {
        self.bytes.cmp(&other.bytes)
    }
}
121
147
122
148
impl ops:: Deref for Wtf8Buf {
@@ -147,13 +173,13 @@ impl Wtf8Buf {
147
173
/// Creates a new, empty WTF-8 string.
148
174
#[ inline]
149
175
pub fn new ( ) -> Wtf8Buf {
150
- Wtf8Buf { bytes : Vec :: new ( ) }
176
+ Wtf8Buf { bytes : Vec :: new ( ) , is_known_utf8 : true }
151
177
}
152
178
153
179
/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
154
180
#[ inline]
155
181
pub fn with_capacity ( capacity : usize ) -> Wtf8Buf {
156
- Wtf8Buf { bytes : Vec :: with_capacity ( capacity) }
182
+ Wtf8Buf { bytes : Vec :: with_capacity ( capacity) , is_known_utf8 : true }
157
183
}
158
184
159
185
/// Creates a WTF-8 string from a UTF-8 `String`.
@@ -163,7 +189,7 @@ impl Wtf8Buf {
163
189
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
164
190
#[ inline]
165
191
pub fn from_string ( string : String ) -> Wtf8Buf {
166
- Wtf8Buf { bytes : string. into_bytes ( ) }
192
+ Wtf8Buf { bytes : string. into_bytes ( ) , is_known_utf8 : true }
167
193
}
168
194
169
195
/// Creates a WTF-8 string from a UTF-8 `&str` slice.
@@ -173,11 +199,12 @@ impl Wtf8Buf {
173
199
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
174
200
#[ inline]
175
201
pub fn from_str ( str : & str ) -> Wtf8Buf {
176
- Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) }
202
+ Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) , is_known_utf8 : true }
177
203
}
178
204
179
205
pub fn clear ( & mut self ) {
180
- self . bytes . clear ( )
206
+ self . bytes . clear ( ) ;
207
+ self . is_known_utf8 = true ;
181
208
}
182
209
183
210
/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
@@ -193,17 +220,19 @@ impl Wtf8Buf {
193
220
let surrogate = surrogate. unpaired_surrogate ( ) ;
194
221
// Surrogates are known to be in the code point range.
195
222
let code_point = unsafe { CodePoint :: from_u32_unchecked ( surrogate as u32 ) } ;
223
+ // The string will now contain an unpaired surrogate.
224
+ string. is_known_utf8 = false ;
196
225
// Skip the WTF-8 concatenation check,
197
226
// surrogate pairs are already decoded by decode_utf16
198
- string. push_code_point_unchecked ( code_point)
227
+ string. push_code_point_unchecked ( code_point) ;
199
228
}
200
229
}
201
230
}
202
231
string
203
232
}
204
233
205
234
/// Copied from String::push
206
- /// This does **not** include the WTF-8 concatenation check.
235
+ /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check .
207
236
fn push_code_point_unchecked ( & mut self , code_point : CodePoint ) {
208
237
let mut bytes = [ 0 ; 4 ] ;
209
238
let bytes = char:: encode_utf8_raw ( code_point. value , & mut bytes) ;
@@ -217,6 +246,9 @@ impl Wtf8Buf {
217
246
218
247
#[ inline]
219
248
pub fn as_mut_slice ( & mut self ) -> & mut Wtf8 {
249
+ // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
250
+ // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
251
+ // which would break the assumptions of the `is_known_utf8` field.
220
252
unsafe { Wtf8 :: from_mut_bytes_unchecked ( & mut self . bytes ) }
221
253
}
222
254
@@ -314,7 +346,15 @@ impl Wtf8Buf {
314
346
self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
315
347
self . bytes . extend_from_slice ( other_without_trail_surrogate) ;
316
348
}
317
- _ => self . bytes . extend_from_slice ( & other. bytes ) ,
349
+ _ => {
350
+ // If we'll be pushing a string containing a surrogate, we may
351
+ // no longer have UTF-8.
352
+ if other. next_surrogate ( 0 ) . is_some ( ) {
353
+ self . is_known_utf8 = false ;
354
+ }
355
+
356
+ self . bytes . extend_from_slice ( & other. bytes ) ;
357
+ }
318
358
}
319
359
}
320
360
@@ -331,13 +371,19 @@ impl Wtf8Buf {
331
371
/// like concatenating ill-formed UTF-16 strings effectively would.
332
372
#[ inline]
333
373
pub fn push ( & mut self , code_point : CodePoint ) {
334
- if let trail @ 0xDC00 ..= 0xDFFF = code_point. to_u32 ( ) {
374
+ if let Some ( trail) = code_point. to_trail_surrogate ( ) {
335
375
if let Some ( lead) = ( & * self ) . final_lead_surrogate ( ) {
336
376
let len_without_lead_surrogate = self . len ( ) - 3 ;
337
377
self . bytes . truncate ( len_without_lead_surrogate) ;
338
- self . push_char ( decode_surrogate_pair ( lead, trail as u16 ) ) ;
378
+ self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
339
379
return ;
340
380
}
381
+
382
+ // We're pushing a trailing surrogate.
383
+ self . is_known_utf8 = false ;
384
+ } else if code_point. to_lead_surrogate ( ) . is_some ( ) {
385
+ // We're pushing a leading surrogate.
386
+ self . is_known_utf8 = false ;
341
387
}
342
388
343
389
// No newly paired surrogates at the boundary.
@@ -364,9 +410,10 @@ impl Wtf8Buf {
364
410
/// (that is, if the string contains surrogates),
365
411
/// the original WTF-8 string is returned instead.
366
412
pub fn into_string ( self ) -> Result < String , Wtf8Buf > {
367
- match self . next_surrogate ( 0 ) {
368
- None => Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } ) ,
369
- Some ( _) => Err ( self ) ,
413
+ if self . is_known_utf8 || self . next_surrogate ( 0 ) . is_none ( ) {
414
+ Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } )
415
+ } else {
416
+ Err ( self )
370
417
}
371
418
}
372
419
@@ -376,6 +423,11 @@ impl Wtf8Buf {
376
423
///
377
424
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
378
425
pub fn into_string_lossy ( mut self ) -> String {
426
+ // Fast path: If we already have UTF-8, we can return it immediately.
427
+ if self . is_known_utf8 {
428
+ return unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
429
+ }
430
+
379
431
let mut pos = 0 ;
380
432
loop {
381
433
match self . next_surrogate ( pos) {
@@ -398,7 +450,7 @@ impl Wtf8Buf {
398
450
/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
399
451
pub fn from_box ( boxed : Box < Wtf8 > ) -> Wtf8Buf {
400
452
let bytes: Box < [ u8 ] > = unsafe { mem:: transmute ( boxed) } ;
401
- Wtf8Buf { bytes : bytes. into_vec ( ) }
453
+ Wtf8Buf { bytes : bytes. into_vec ( ) , is_known_utf8 : false }
402
454
}
403
455
}
404
456
@@ -576,6 +628,11 @@ impl Wtf8 {
576
628
}
577
629
}
578
630
631
+ /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
632
+ pub fn to_owned ( & self ) -> Wtf8Buf {
633
+ Wtf8Buf { bytes : self . bytes . to_vec ( ) , is_known_utf8 : false }
634
+ }
635
+
579
636
/// Lossily converts the string to UTF-8.
580
637
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
581
638
///
@@ -665,7 +722,8 @@ impl Wtf8 {
665
722
}
666
723
667
724
pub fn clone_into ( & self , buf : & mut Wtf8Buf ) {
668
- self . bytes . clone_into ( & mut buf. bytes )
725
+ buf. is_known_utf8 = false ;
726
+ self . bytes . clone_into ( & mut buf. bytes ) ;
669
727
}
670
728
671
729
/// Boxes this `Wtf8`.
@@ -705,12 +763,12 @@ impl Wtf8 {
705
763
706
764
#[ inline]
707
765
pub fn to_ascii_lowercase ( & self ) -> Wtf8Buf {
708
- Wtf8Buf { bytes : self . bytes . to_ascii_lowercase ( ) }
766
+ Wtf8Buf { bytes : self . bytes . to_ascii_lowercase ( ) , is_known_utf8 : false }
709
767
}
710
768
711
769
#[ inline]
712
770
pub fn to_ascii_uppercase ( & self ) -> Wtf8Buf {
713
- Wtf8Buf { bytes : self . bytes . to_ascii_uppercase ( ) }
771
+ Wtf8Buf { bytes : self . bytes . to_ascii_uppercase ( ) , is_known_utf8 : false }
714
772
}
715
773
716
774
#[ inline]
0 commit comments