@@ -9,6 +9,8 @@ use std::char::from_digit;
99use std:: ffi:: OsStr ;
1010use std:: fmt;
1111
12+ use crate :: os_str_as_bytes;
13+
1214// These are characters with special meaning in the shell (e.g. bash).
1315// The first const contains characters that only have a special meaning when they appear at the beginning of a name.
1416const SPECIAL_SHELL_CHARS_START : & [ char ] = & [ '~' , '#' ] ;
@@ -73,7 +75,7 @@ enum EscapeState {
7375}
7476
7577struct EscapeOctal {
76- c : char ,
78+ c : u32 ,
7779 state : EscapeOctalState ,
7880 idx : usize ,
7981}
@@ -95,7 +97,7 @@ impl Iterator for EscapeOctal {
9597 Some ( '\\' )
9698 }
9799 EscapeOctalState :: Value => {
98- let octal_digit = ( ( self . c as u32 ) >> ( self . idx * 3 ) ) & 0o7 ;
100+ let octal_digit = ( ( self . c ) >> ( self . idx * 3 ) ) & 0o7 ;
99101 if self . idx == 0 {
100102 self . state = EscapeOctalState :: Done ;
101103 } else {
@@ -108,9 +110,17 @@ impl Iterator for EscapeOctal {
108110}
109111
110112impl EscapeOctal {
111- fn from ( c : char ) -> Self {
113+ fn from_char ( c : char ) -> Self {
114+ Self {
115+ c : c as u32 ,
116+ idx : 2 ,
117+ state : EscapeOctalState :: Backslash ,
118+ }
119+ }
120+
121+ fn from_byte ( c : u8 ) -> Self {
112122 Self {
113- c,
123+ c : c as u32 ,
114124 idx : 2 ,
115125 state : EscapeOctalState :: Backslash ,
116126 }
@@ -148,7 +158,7 @@ impl EscapedChar {
148158 _ => Char ( ' ' ) ,
149159 } ,
150160 ':' if dirname => Backslash ( ':' ) ,
151- _ if c. is_ascii_control ( ) => Octal ( EscapeOctal :: from ( c) ) ,
161+ _ if c. is_ascii_control ( ) => Octal ( EscapeOctal :: from_char ( c) ) ,
152162 _ => Char ( c) ,
153163 } ;
154164 Self { state : init_state }
@@ -165,7 +175,7 @@ impl EscapedChar {
165175 '\x0B' => Backslash ( 'v' ) ,
166176 '\x0C' => Backslash ( 'f' ) ,
167177 '\r' => Backslash ( 'r' ) ,
168- '\x00' ..='\x1F' | '\x7F' => Octal ( EscapeOctal :: from ( c) ) ,
178+ '\x00' ..='\x1F' | '\x7F' => Octal ( EscapeOctal :: from_char ( c) ) ,
169179 '\'' => match quotes {
170180 Quotes :: Single => Backslash ( '\'' ) ,
171181 _ => Char ( '\'' ) ,
@@ -176,6 +186,15 @@ impl EscapedChar {
176186 Self { state : init_state }
177187 }
178188
189+ fn new_byte ( b : u8 , escape : bool ) -> Self {
190+ let init_state = if escape {
191+ EscapeState :: Octal ( EscapeOctal :: from_byte ( b) )
192+ } else {
193+ EscapeState :: Char ( '?' )
194+ } ;
195+ Self { state : init_state }
196+ }
197+
179198 fn hide_control ( self ) -> Self {
180199 match self . state {
181200 EscapeState :: Char ( c) if c. is_control ( ) => Self {
@@ -205,18 +224,92 @@ impl Iterator for EscapedChar {
205224 }
206225}
207226
208- fn shell_without_escape ( name : & str , quotes : Quotes , show_control_chars : bool ) -> ( String , bool ) {
209- let mut must_quote = false ;
210- let mut escaped_str = String :: with_capacity ( name. len ( ) ) ;
/// One contiguous piece of a byte string, classified by UTF-8 validity.
///
/// Both variants only borrow from the original bytes, so the type is cheap to
/// copy and can be compared and debug-printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NonUtf8StringPart<'a> {
    /// A run of bytes that is valid UTF-8, exposed as text.
    Valid(&'a str),
    /// A run of bytes that is not valid UTF-8.
    Invalid(&'a [u8]),
}
211231
212- for c in name. chars ( ) {
213- let escaped = {
214- let ec = EscapedChar :: new_shell ( c, false , quotes) ;
215- if show_control_chars {
216- ec
232+ impl < ' a > NonUtf8StringPart < ' a > {
233+ fn valid ( & self ) -> Option < & ' a str > {
234+ match self {
235+ NonUtf8StringPart :: Valid ( s) => Some ( s) ,
236+ NonUtf8StringPart :: Invalid ( _) => None ,
237+ }
238+ }
239+ }
240+
241+ /// Represent a string which might contains non UTF-8 characters.
242+ struct MaybeNonUtf8String < ' a > {
243+ source : Vec < NonUtf8StringPart < ' a > > ,
244+ }
245+
246+ impl < ' a > MaybeNonUtf8String < ' a > {
247+ fn new ( source : & ' a [ u8 ] ) -> Self {
248+ Self {
249+ source : source
250+ . utf8_chunks ( )
251+ . flat_map ( |chunk| {
252+ let mut parts = vec ! [ ] ;
253+ if !chunk. valid ( ) . is_empty ( ) {
254+ parts. push ( NonUtf8StringPart :: Valid ( chunk. valid ( ) ) ) ;
255+ }
256+ if !chunk. invalid ( ) . is_empty ( ) {
257+ parts. push ( NonUtf8StringPart :: Invalid ( chunk. invalid ( ) ) ) ;
258+ }
259+ parts
260+ } )
261+ . collect ( ) ,
262+ }
263+ }
264+
265+ fn contains_chars ( & self , s : & [ char ] ) -> bool {
266+ self . source
267+ . iter ( )
268+ . any ( |chunk| chunk. valid ( ) . is_some_and ( |valid| valid. contains ( s) ) )
269+ }
270+
271+ fn contains_char ( & self , c : char ) -> bool {
272+ self . source
273+ . iter ( )
274+ . any ( |chunk| chunk. valid ( ) . is_some_and ( |valid| valid. contains ( c) ) )
275+ }
276+
277+ fn starts_with ( & self , chars : & [ char ] ) -> bool {
278+ self . source . first ( ) . is_some_and ( |chunk| {
279+ if let NonUtf8StringPart :: Valid ( s) = chunk {
280+ s. starts_with ( chars)
217281 } else {
218- ec . hide_control ( )
282+ false
219283 }
284+ } )
285+ }
286+
287+ fn estimated_len ( & self ) -> usize {
288+ self . source . iter ( ) . fold ( 0 , |i, chunk| match chunk {
289+ NonUtf8StringPart :: Valid ( s) => i + s. len ( ) ,
290+ NonUtf8StringPart :: Invalid ( b) => i + b. len ( ) ,
291+ } )
292+ }
293+
294+ fn iter ( & self ) -> impl Iterator < Item = & NonUtf8StringPart < ' a > > {
295+ self . source . iter ( )
296+ }
297+ }
298+
299+ fn shell_without_escape (
300+ name : & MaybeNonUtf8String < ' _ > ,
301+ quotes : Quotes ,
302+ show_control_chars : bool ,
303+ ) -> ( String , bool ) {
304+ let mut must_quote = false ;
305+ let mut escaped_str = String :: with_capacity ( name. estimated_len ( ) ) ;
306+ let chunks = name. iter ( ) ;
307+
308+ let mut push_to_str = |ec : EscapedChar | {
309+ let escaped = if show_control_chars {
310+ ec
311+ } else {
312+ ec. hide_control ( )
220313 } ;
221314
222315 match escaped. state {
@@ -231,53 +324,85 @@ fn shell_without_escape(name: &str, quotes: Quotes, show_control_chars: bool) ->
231324 }
232325 }
233326 }
327+ } ;
328+
329+ for chunk in chunks {
330+ match chunk {
331+ NonUtf8StringPart :: Valid ( s) => {
332+ for c in s. chars ( ) {
333+ let escaped = EscapedChar :: new_shell ( c, false , quotes) ;
334+ push_to_str ( escaped)
335+ }
336+ }
337+ NonUtf8StringPart :: Invalid ( bytes) => {
338+ for b in * bytes {
339+ let escaped = EscapedChar :: new_byte ( * b, false ) ;
340+ push_to_str ( escaped)
341+ }
342+ }
343+ }
234344 }
235345
236346 must_quote = must_quote || name. starts_with ( SPECIAL_SHELL_CHARS_START ) ;
237347 ( escaped_str, must_quote)
238348}
239349
240- fn shell_with_escape ( name : & str , quotes : Quotes ) -> ( String , bool ) {
350+ fn shell_with_escape ( name : & MaybeNonUtf8String < ' _ > , quotes : Quotes ) -> ( String , bool ) {
241351 // We need to keep track of whether we are in a dollar expression
242352 // because e.g. \b\n is escaped as $'\b\n' and not like $'b'$'n'
243353 let mut in_dollar = false ;
244354 let mut must_quote = false ;
245- let mut escaped_str = String :: with_capacity ( name. len ( ) ) ;
355+ let mut escaped_str = String :: with_capacity ( name. estimated_len ( ) ) ;
356+ let chunks = name. iter ( ) ;
246357
247- for c in name. chars ( ) {
248- let escaped = EscapedChar :: new_shell ( c, true , quotes) ;
249- match escaped. state {
250- EscapeState :: Char ( x) => {
251- if in_dollar {
252- escaped_str. push_str ( "''" ) ;
253- in_dollar = false ;
254- }
255- escaped_str. push ( x) ;
256- }
257- EscapeState :: ForceQuote ( x) => {
258- if in_dollar {
259- escaped_str. push_str ( "''" ) ;
260- in_dollar = false ;
261- }
262- must_quote = true ;
263- escaped_str. push ( x) ;
358+ let mut push_to_string = |escaped : EscapedChar | match escaped. state {
359+ EscapeState :: Char ( x) => {
360+ if in_dollar {
361+ escaped_str. push_str ( "''" ) ;
362+ in_dollar = false ;
264363 }
265- // Single quotes are not put in dollar expressions, but are escaped
266- // if the string also contains double quotes. In that case, they must
267- // be handled separately.
268- EscapeState :: Backslash ( '\'' ) => {
269- must_quote = true ;
364+ escaped_str . push ( x ) ;
365+ }
366+ EscapeState :: ForceQuote ( x ) => {
367+ if in_dollar {
368+ escaped_str . push_str ( "''" ) ;
270369 in_dollar = false ;
271- escaped_str. push_str ( "'\\ ''" ) ;
272370 }
273- _ => {
274- if !in_dollar {
275- escaped_str. push_str ( "'$'" ) ;
276- in_dollar = true ;
371+ must_quote = true ;
372+ escaped_str. push ( x) ;
373+ }
374+ // Single quotes are not put in dollar expressions, but are escaped
375+ // if the string also contains double quotes. In that case, they must
376+ // be handled separately.
377+ EscapeState :: Backslash ( '\'' ) => {
378+ must_quote = true ;
379+ in_dollar = false ;
380+ escaped_str. push_str ( "'\\ ''" ) ;
381+ }
382+ _ => {
383+ if !in_dollar {
384+ escaped_str. push_str ( "'$'" ) ;
385+ in_dollar = true ;
386+ }
387+ must_quote = true ;
388+ for char in escaped {
389+ escaped_str. push ( char) ;
390+ }
391+ }
392+ } ;
393+
394+ for chunk in chunks {
395+ match chunk {
396+ NonUtf8StringPart :: Valid ( s) => {
397+ for c in s. chars ( ) {
398+ let escaped = EscapedChar :: new_shell ( c, true , quotes) ;
399+ push_to_string ( escaped)
277400 }
278- must_quote = true ;
279- for char in escaped {
280- escaped_str. push ( char) ;
401+ }
402+ NonUtf8StringPart :: Invalid ( bytes) => {
403+ for b in * bytes {
404+ let escaped = EscapedChar :: new_byte ( * b, true ) ;
405+ push_to_string ( escaped)
281406 }
282407 }
283408 }
@@ -309,6 +434,12 @@ fn shell_escaped_char_set(is_dirname: bool) -> &'static [char] {
309434/// This inner function provides an additional flag `dirname` which
310435/// is meant for ls' directory name display.
311436fn escape_name_inner ( name : & OsStr , style : & QuotingStyle , dirname : bool ) -> String {
437+ // utf8_chunks separates good from bad UTF8 in a byte sequence.
438+ let name_bytes = os_str_as_bytes ( name)
439+ . map ( ToOwned :: to_owned)
440+ . unwrap_or_else ( |_| name. to_string_lossy ( ) . as_bytes ( ) . to_vec ( ) ) ;
441+ let name_chunks = MaybeNonUtf8String :: new ( & name_bytes) ;
442+
312443 match style {
313444 QuotingStyle :: Literal { show_control } => {
314445 if * show_control {
@@ -321,10 +452,21 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
321452 }
322453 }
323454 QuotingStyle :: C { quotes } => {
324- let escaped_str: String = name
325- . to_string_lossy ( )
326- . chars ( )
327- . flat_map ( |c| EscapedChar :: new_c ( c, * quotes, dirname) )
455+ let escaped_str: String = name_chunks
456+ . iter ( )
457+ . flat_map ( |chunk| {
458+ let x: Box < dyn Iterator < Item = char > > = match chunk {
459+ NonUtf8StringPart :: Valid ( s) => Box :: new (
460+ s. chars ( )
461+ . flat_map ( |c| EscapedChar :: new_c ( c, * quotes, dirname) ) ,
462+ ) ,
463+ NonUtf8StringPart :: Invalid ( bytes) => {
464+ Box :: new ( bytes. iter ( ) . flat_map ( |b| EscapedChar :: new_byte ( * b, true ) ) )
465+ }
466+ } ;
467+
468+ x
469+ } )
328470 . collect ( ) ;
329471
330472 match quotes {
@@ -338,11 +480,11 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
338480 always_quote,
339481 show_control,
340482 } => {
341- let name = name . to_string_lossy ( ) ;
483+ let escaped_char_set = shell_escaped_char_set ( dirname ) ;
342484
343- let ( quotes, must_quote) = if name . contains ( shell_escaped_char_set ( dirname ) ) {
485+ let ( quotes, must_quote) = if name_chunks . contains_chars ( escaped_char_set ) {
344486 ( Quotes :: Single , true )
345- } else if name . contains ( '\'' ) {
487+ } else if name_chunks . contains_char ( '\'' ) {
346488 ( Quotes :: Double , true )
347489 } else if * always_quote {
348490 ( Quotes :: Single , true )
@@ -351,9 +493,9 @@ fn escape_name_inner(name: &OsStr, style: &QuotingStyle, dirname: bool) -> Strin
351493 } ;
352494
353495 let ( escaped_str, contains_quote_chars) = if * escape {
354- shell_with_escape ( & name , quotes)
496+ shell_with_escape ( & name_chunks , quotes)
355497 } else {
356- shell_without_escape ( & name , quotes, * show_control)
498+ shell_without_escape ( & name_chunks , quotes, * show_control)
357499 } ;
358500
359501 match ( must_quote | contains_quote_chars, quotes) {
0 commit comments