@@ -61,8 +61,8 @@ use core::hash;
61
61
use core:: iter:: { FromIterator , FusedIterator } ;
62
62
use core:: ops:: { self , Add , AddAssign , Index , IndexMut } ;
63
63
use core:: ptr;
64
- use core:: str as core_str;
65
64
use core:: str:: pattern:: Pattern ;
65
+ use std_unicode:: lossy;
66
66
use std_unicode:: char:: { decode_utf16, REPLACEMENT_CHARACTER } ;
67
67
68
68
use borrow:: { Cow , ToOwned } ;
@@ -533,111 +533,34 @@ impl String {
533
533
/// ```
534
534
#[ stable( feature = "rust1" , since = "1.0.0" ) ]
535
535
pub fn from_utf8_lossy < ' a > ( v : & ' a [ u8 ] ) -> Cow < ' a , str > {
536
- let mut i;
537
- match str:: from_utf8 ( v) {
538
- Ok ( s) => return Cow :: Borrowed ( s) ,
539
- Err ( e) => i = e. valid_up_to ( ) ,
540
- }
536
+ let mut iter = lossy:: Utf8Lossy :: from_bytes ( v) . chunks ( ) ;
541
537
542
- const TAG_CONT_U8 : u8 = 128 ;
543
- const REPLACEMENT : & ' static [ u8 ] = b"\xEF \xBF \xBD " ; // U+FFFD in UTF-8
544
- let total = v. len ( ) ;
545
- fn unsafe_get ( xs : & [ u8 ] , i : usize ) -> u8 {
546
- unsafe { * xs. get_unchecked ( i) }
547
- }
548
- fn safe_get ( xs : & [ u8 ] , i : usize , total : usize ) -> u8 {
549
- if i >= total { 0 } else { unsafe_get ( xs, i) }
550
- }
538
+ let ( first_valid, first_broken) = if let Some ( chunk) = iter. next ( ) {
539
+ let lossy:: Utf8LossyChunk { valid, broken } = chunk;
540
+ if valid. len ( ) == v. len ( ) {
541
+ debug_assert ! ( broken. is_empty( ) ) ;
542
+ return Cow :: Borrowed ( valid) ;
543
+ }
544
+ ( valid, broken)
545
+ } else {
546
+ return Cow :: Borrowed ( "" ) ;
547
+ } ;
551
548
552
- let mut res = String :: with_capacity ( total ) ;
549
+ const REPLACEMENT : & ' static str = " \u{FFFD} " ;
553
550
554
- if i > 0 {
555
- unsafe { res. as_mut_vec ( ) . extend_from_slice ( & v[ ..i] ) } ;
551
+ let mut res = String :: with_capacity ( v. len ( ) ) ;
552
+ res. push_str ( first_valid) ;
553
+ if !first_broken. is_empty ( ) {
554
+ res. push_str ( REPLACEMENT ) ;
556
555
}
557
556
558
- // subseqidx is the index of the first byte of the subsequence we're
559
- // looking at. It's used to copy a bunch of contiguous good codepoints
560
- // at once instead of copying them one by one.
561
- let mut subseqidx = i;
562
-
563
- while i < total {
564
- let i_ = i;
565
- let byte = unsafe_get ( v, i) ;
566
- i += 1 ;
567
-
568
- macro_rules! error { ( ) => ( {
569
- unsafe {
570
- if subseqidx != i_ {
571
- res. as_mut_vec( ) . extend_from_slice( & v[ subseqidx..i_] ) ;
572
- }
573
- subseqidx = i;
574
- res. as_mut_vec( ) . extend_from_slice( REPLACEMENT ) ;
575
- }
576
- } ) }
577
-
578
- if byte < 128 {
579
- // subseqidx handles this
580
- } else {
581
- let w = core_str:: utf8_char_width ( byte) ;
582
-
583
- match w {
584
- 2 => {
585
- if safe_get ( v, i, total) & 192 != TAG_CONT_U8 {
586
- error ! ( ) ;
587
- continue ;
588
- }
589
- i += 1 ;
590
- }
591
- 3 => {
592
- match ( byte, safe_get ( v, i, total) ) {
593
- ( 0xE0 , 0xA0 ...0xBF ) => ( ) ,
594
- ( 0xE1 ...0xEC , 0x80 ...0xBF ) => ( ) ,
595
- ( 0xED , 0x80 ...0x9F ) => ( ) ,
596
- ( 0xEE ...0xEF , 0x80 ...0xBF ) => ( ) ,
597
- _ => {
598
- error ! ( ) ;
599
- continue ;
600
- }
601
- }
602
- i += 1 ;
603
- if safe_get ( v, i, total) & 192 != TAG_CONT_U8 {
604
- error ! ( ) ;
605
- continue ;
606
- }
607
- i += 1 ;
608
- }
609
- 4 => {
610
- match ( byte, safe_get ( v, i, total) ) {
611
- ( 0xF0 , 0x90 ...0xBF ) => ( ) ,
612
- ( 0xF1 ...0xF3 , 0x80 ...0xBF ) => ( ) ,
613
- ( 0xF4 , 0x80 ...0x8F ) => ( ) ,
614
- _ => {
615
- error ! ( ) ;
616
- continue ;
617
- }
618
- }
619
- i += 1 ;
620
- if safe_get ( v, i, total) & 192 != TAG_CONT_U8 {
621
- error ! ( ) ;
622
- continue ;
623
- }
624
- i += 1 ;
625
- if safe_get ( v, i, total) & 192 != TAG_CONT_U8 {
626
- error ! ( ) ;
627
- continue ;
628
- }
629
- i += 1 ;
630
- }
631
- _ => {
632
- error ! ( ) ;
633
- continue ;
634
- }
635
- }
557
+ for lossy:: Utf8LossyChunk { valid, broken } in iter {
558
+ res. push_str ( valid) ;
559
+ if !broken. is_empty ( ) {
560
+ res. push_str ( REPLACEMENT ) ;
636
561
}
637
562
}
638
- if subseqidx < total {
639
- unsafe { res. as_mut_vec ( ) . extend_from_slice ( & v[ subseqidx..total] ) } ;
640
- }
563
+
641
564
Cow :: Owned ( res)
642
565
}
643
566
0 commit comments