Skip to content

Commit e894e57

Browse files
jtracey authored and sylvestre committed
quoting_style: fix multi-byte control characters
1 parent 2f0072e commit e894e57

File tree

1 file changed

+99
-28
lines changed

1 file changed

+99
-28
lines changed

src/uucore/src/lib/features/quoting_style.rs

Lines changed: 99 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,24 @@ enum EscapeState {
7676
Octal(EscapeOctal),
7777
}
7878

79-
/// Byte we need to present as escaped octal, in the form of `\nnn`
79+
/// Bytes we need to present as escaped octal, in the form of `\nnn` per byte.
80+
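/// Only supports characters up to 2 bytes long in UTF-8, which covers every `char` for which `char::is_control` is true (U+0000..=U+001F and U+007F..=U+009F).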
8081
struct EscapeOctal {
81-
c: u8,
82+
c: [u8; 2],
8283
state: EscapeOctalState,
83-
idx: usize,
84+
idx: u8,
8485
}
8586

8687
enum EscapeOctalState {
8788
Done,
88-
Backslash,
89-
Value,
89+
FirstBackslash,
90+
FirstValue,
91+
LastBackslash,
92+
LastValue,
93+
}
94+
95+
fn byte_to_octal_digit(byte: u8, idx: u8) -> u8 {
96+
(byte >> (idx * 3)) & 0o7
9097
}
9198

9299
impl Iterator for EscapeOctal {
@@ -95,12 +102,26 @@ impl Iterator for EscapeOctal {
95102
fn next(&mut self) -> Option<char> {
96103
match self.state {
97104
EscapeOctalState::Done => None,
98-
EscapeOctalState::Backslash => {
99-
self.state = EscapeOctalState::Value;
105+
EscapeOctalState::FirstBackslash => {
106+
self.state = EscapeOctalState::FirstValue;
100107
Some('\\')
101108
}
102-
EscapeOctalState::Value => {
103-
let octal_digit = ((self.c) >> (self.idx * 3)) & 0o7;
109+
EscapeOctalState::LastBackslash => {
110+
self.state = EscapeOctalState::LastValue;
111+
Some('\\')
112+
}
113+
EscapeOctalState::FirstValue => {
114+
let octal_digit = byte_to_octal_digit(self.c[0], self.idx);
115+
if self.idx == 0 {
116+
self.state = EscapeOctalState::LastBackslash;
117+
self.idx = 2;
118+
} else {
119+
self.idx -= 1;
120+
}
121+
Some(from_digit(octal_digit.into(), 8).unwrap())
122+
}
123+
EscapeOctalState::LastValue => {
124+
let octal_digit = byte_to_octal_digit(self.c[1], self.idx);
104125
if self.idx == 0 {
105126
self.state = EscapeOctalState::Done;
106127
} else {
@@ -113,11 +134,25 @@ impl Iterator for EscapeOctal {
113134
}
114135

115136
impl EscapeOctal {
116-
fn from(c: u8) -> Self {
137+
fn from_char(c: char) -> Self {
138+
if c.len_utf8() == 1 {
139+
return Self::from_byte(c as u8);
140+
}
141+
142+
let mut buf = [0; 2];
143+
let _s = c.encode_utf8(&mut buf);
144+
Self {
145+
c: buf,
146+
idx: 2,
147+
state: EscapeOctalState::FirstBackslash,
148+
}
149+
}
150+
151+
fn from_byte(b: u8) -> Self {
117152
Self {
118-
c,
153+
c: [0, b],
119154
idx: 2,
120-
state: EscapeOctalState::Backslash,
155+
state: EscapeOctalState::LastBackslash,
121156
}
122157
}
123158
}
@@ -131,7 +166,7 @@ impl EscapedChar {
131166

132167
fn new_octal(b: u8) -> Self {
133168
Self {
134-
state: EscapeState::Octal(EscapeOctal::from(b)),
169+
state: EscapeState::Octal(EscapeOctal::from_byte(b)),
135170
}
136171
}
137172

@@ -159,7 +194,7 @@ impl EscapedChar {
159194
_ => Char(' '),
160195
},
161196
':' if dirname => Backslash(':'),
162-
_ if c.is_ascii_control() => Octal(EscapeOctal::from(c as u8)),
197+
_ if c.is_control() => Octal(EscapeOctal::from_char(c)),
163198
_ => Char(c),
164199
};
165200
Self { state: init_state }
@@ -176,11 +211,11 @@ impl EscapedChar {
176211
'\x0B' => Backslash('v'),
177212
'\x0C' => Backslash('f'),
178213
'\r' => Backslash('r'),
179-
'\x00'..='\x1F' | '\x7F' => Octal(EscapeOctal::from(c as u8)),
180214
'\'' => match quotes {
181215
Quotes::Single => Backslash('\''),
182216
_ => Char('\''),
183217
},
218+
_ if c.is_control() => Octal(EscapeOctal::from_char(c)),
184219
_ if SPECIAL_SHELL_CHARS.contains(c) => ForceQuote(c),
185220
_ => Char(c),
186221
};
@@ -559,10 +594,10 @@ mod tests {
559594
("\"one_two\"", "c"),
560595
("one_two", "shell"),
561596
("one_two", "shell-show"),
562-
("\'one_two\'", "shell-always"),
563-
("\'one_two\'", "shell-always-show"),
597+
("'one_two'", "shell-always"),
598+
("'one_two'", "shell-always-show"),
564599
("one_two", "shell-escape"),
565-
("\'one_two\'", "shell-escape-always"),
600+
("'one_two'", "shell-escape-always"),
566601
],
567602
);
568603
}
@@ -576,12 +611,12 @@ mod tests {
576611
("one two", "literal-show"),
577612
("one\\ two", "escape"),
578613
("\"one two\"", "c"),
579-
("\'one two\'", "shell"),
580-
("\'one two\'", "shell-show"),
581-
("\'one two\'", "shell-always"),
582-
("\'one two\'", "shell-always-show"),
583-
("\'one two\'", "shell-escape"),
584-
("\'one two\'", "shell-escape-always"),
614+
("'one two'", "shell"),
615+
("'one two'", "shell-show"),
616+
("'one two'", "shell-always"),
617+
("'one two'", "shell-always-show"),
618+
("'one two'", "shell-escape"),
619+
("'one two'", "shell-escape-always"),
585620
],
586621
);
587622

@@ -623,7 +658,7 @@ mod tests {
623658

624659
// One single quote
625660
check_names(
626-
"one\'two",
661+
"one'two",
627662
&[
628663
("one'two", "literal"),
629664
("one'two", "literal-show"),
@@ -709,7 +744,7 @@ mod tests {
709744
],
710745
);
711746

712-
// The first 16 control characters. NUL is also included, even though it is of
747+
// The first 16 ASCII control characters. NUL is also included, even though it is of
713748
// no importance for file names.
714749
check_names(
715750
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
@@ -748,7 +783,7 @@ mod tests {
748783
],
749784
);
750785

751-
// The last 16 control characters.
786+
// The last 16 ASCII control characters.
752787
check_names(
753788
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
754789
&[
@@ -802,6 +837,42 @@ mod tests {
802837
("''$'\\177'", "shell-escape-always"),
803838
],
804839
);
840+
841+
// The first 16 non-ASCII (C1) control characters, U+0080..=U+008F, each encoded as two bytes in UTF-8.
842+
let test_str = std::str::from_utf8(b"\xC2\x80\xC2\x81\xC2\x82\xC2\x83\xC2\x84\xC2\x85\xC2\x86\xC2\x87\xC2\x88\xC2\x89\xC2\x8A\xC2\x8B\xC2\x8C\xC2\x8D\xC2\x8E\xC2\x8F").unwrap();
843+
check_names(
844+
test_str,
845+
&[
846+
("????????????????", "literal"),
847+
(test_str, "literal-show"),
848+
("\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217", "escape"),
849+
("\"\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217\"", "c"),
850+
("????????????????", "shell"),
851+
(test_str, "shell-show"),
852+
("'????????????????'", "shell-always"),
853+
(&format!("'{}'", test_str), "shell-always-show"),
854+
("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape"),
855+
("''$'\\302\\200\\302\\201\\302\\202\\302\\203\\302\\204\\302\\205\\302\\206\\302\\207\\302\\210\\302\\211\\302\\212\\302\\213\\302\\214\\302\\215\\302\\216\\302\\217'", "shell-escape-always"),
856+
],
857+
);
858+
859+
// The last 16 non-ASCII (C1) control characters, U+0090..=U+009F, each encoded as two bytes in UTF-8.
860+
let test_str = std::str::from_utf8(b"\xC2\x90\xC2\x91\xC2\x92\xC2\x93\xC2\x94\xC2\x95\xC2\x96\xC2\x97\xC2\x98\xC2\x99\xC2\x9A\xC2\x9B\xC2\x9C\xC2\x9D\xC2\x9E\xC2\x9F").unwrap();
861+
check_names(
862+
test_str,
863+
&[
864+
("????????????????", "literal"),
865+
(test_str, "literal-show"),
866+
("\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237", "escape"),
867+
("\"\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237\"", "c"),
868+
("????????????????", "shell"),
869+
(test_str, "shell-show"),
870+
("'????????????????'", "shell-always"),
871+
(&format!("'{}'", test_str), "shell-always-show"),
872+
("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape"),
873+
("''$'\\302\\220\\302\\221\\302\\222\\302\\223\\302\\224\\302\\225\\302\\226\\302\\227\\302\\230\\302\\231\\302\\232\\302\\233\\302\\234\\302\\235\\302\\236\\302\\237'", "shell-escape-always"),
874+
],
875+
);
805876
}
806877

807878
#[test]
@@ -1060,7 +1131,7 @@ mod tests {
10601131
("one\\\\two", "escape"),
10611132
("\"one\\\\two\"", "c"),
10621133
("'one\\two'", "shell"),
1063-
("\'one\\two\'", "shell-always"),
1134+
("'one\\two'", "shell-always"),
10641135
("'one\\two'", "shell-escape"),
10651136
("'one\\two'", "shell-escape-always"),
10661137
],

0 commit comments

Comments
 (0)