Skip to content

Commit d4e0e52

Browse files
committed
Accept underscores in unicode escapes
Fixes #43692.
1 parent dd39ecf commit d4e0e52

8 files changed

+101
-56
lines changed

src/libsyntax/parse/lexer/mod.rs

+47-40
Original file line numberDiff line numberDiff line change
@@ -963,60 +963,67 @@ impl<'a> StringReader<'a> {
963963
true
964964
}
965965

966-
/// Scan over a \u{...} escape
966+
/// Scan over a `\u{...}` escape
967967
///
968-
/// At this point, we have already seen the \ and the u, the { is the current character. We
969-
/// will read at least one digit, and up to 6, and pass over the }.
968+
/// At this point, we have already seen the `\` and the `u`; the `{` is the current character.
969+
/// We will read a hex number (with `_` separators), with 1 to 6 actual digits,
970+
/// and pass over the `}`.
970971
fn scan_unicode_escape(&mut self, delim: char) -> bool {
971972
self.bump(); // past the {
972973
let start_bpos = self.pos;
973-
let mut count = 0;
974-
let mut accum_int = 0;
975974
let mut valid = true;
976975

977-
while !self.ch_is('}') && count <= 6 {
978-
let c = match self.ch {
979-
Some(c) => c,
980-
None => {
981-
panic!(self.fatal_span_(start_bpos,
982-
self.pos,
983-
"unterminated unicode escape (found EOF)"));
984-
}
985-
};
986-
accum_int *= 16;
987-
accum_int += c.to_digit(16).unwrap_or_else(|| {
988-
if c == delim {
989-
panic!(self.fatal_span_(self.pos,
990-
self.next_pos,
991-
"unterminated unicode escape (needed a `}`)"));
992-
} else {
993-
self.err_span_char(self.pos,
994-
self.next_pos,
995-
"invalid character in unicode escape",
996-
c);
997-
}
998-
valid = false;
999-
0
1000-
});
1001-
self.bump();
1002-
count += 1;
976+
if let Some('_') = self.ch {
977+
// disallow leading `_`
978+
self.err_span_(self.pos,
979+
self.next_pos,
980+
"invalid start of unicode escape");
981+
valid = false;
1003982
}
1004983

984+
let count = self.scan_digits(16, 16);
985+
1005986
if count > 6 {
1006987
self.err_span_(start_bpos,
1007988
self.pos,
1008-
"overlong unicode escape (can have at most 6 hex digits)");
989+
"overlong unicode escape (must have at most 6 hex digits)");
1009990
valid = false;
1010991
}
1011-
1012-
if valid && (char::from_u32(accum_int).is_none() || count == 0) {
1013-
self.err_span_(start_bpos,
1014-
self.pos,
1015-
"invalid unicode character escape");
1016-
valid = false;
992+
loop {
993+
match self.ch {
994+
Some('}') => {
995+
if valid && count == 0 {
996+
self.err_span_(start_bpos,
997+
self.pos,
998+
"empty unicode escape (must have at least 1 hex digit)");
999+
valid = false;
1000+
}
1001+
self.bump(); // past the ending `}`
1002+
break;
1003+
},
1004+
Some(c) => {
1005+
if c == delim {
1006+
self.err_span_(self.pos,
1007+
self.pos,
1008+
"unterminated unicode escape (needed a `}`)");
1009+
valid = false;
1010+
break;
1011+
} else if valid {
1012+
self.err_span_char(start_bpos,
1013+
self.pos,
1014+
"invalid character in unicode escape",
1015+
c);
1016+
valid = false;
1017+
}
1018+
},
1019+
None => {
1020+
panic!(self.fatal_span_(start_bpos,
1021+
self.pos,
1022+
"unterminated unicode escape (found EOF)"));
1023+
}
1024+
}
1025+
self.bump();
10171026
}
1018-
1019-
self.bump(); // past the ending }
10201027
valid
10211028
}
10221029

src/libsyntax/parse/mod.rs

+18-7
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser {
230230
/// Rather than just accepting/rejecting a given literal, unescapes it as
231231
/// well. Can take any slice prefixed by a character escape. Returns the
232232
/// character and the number of characters consumed.
233-
pub fn char_lit(lit: &str) -> (char, isize) {
233+
pub fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
234234
use std::char;
235235

236236
// Handle non-escaped chars first.
@@ -258,8 +258,19 @@ pub fn char_lit(lit: &str) -> (char, isize) {
258258
'u' => {
259259
assert_eq!(lit.as_bytes()[2], b'{');
260260
let idx = lit.find('}').unwrap();
261-
let v = u32::from_str_radix(&lit[3..idx], 16).unwrap();
262-
let c = char::from_u32(v).unwrap();
261+
let s = &lit[3..idx].chars().filter(|&c| c != '_').collect::<String>();
262+
let v = u32::from_str_radix(&s, 16).unwrap();
263+
let c = char::from_u32(v).unwrap_or_else(|| {
264+
if let Some((span, diag)) = diag {
265+
let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
266+
if v > 0x10FFFF {
267+
diag.help("unicode escape must be at most 10FFFF").emit();
268+
} else {
269+
diag.help("unicode escape must not be a surrogate").emit();
270+
}
271+
}
272+
'\u{FFFD}'
273+
});
263274
(c, (idx + 1) as isize)
264275
}
265276
_ => panic!("lexer should have rejected a bad character escape {}", lit)
@@ -272,7 +283,7 @@ pub fn escape_default(s: &str) -> String {
272283

273284
/// Parse a string representing a string literal into its final form. Does
274285
/// unescaping.
275-
pub fn str_lit(lit: &str) -> String {
286+
pub fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
276287
debug!("parse_str_lit: given {}", escape_default(lit));
277288
let mut res = String::with_capacity(lit.len());
278289

@@ -313,7 +324,7 @@ pub fn str_lit(lit: &str) -> String {
313324
eat(&mut chars);
314325
} else {
315326
// otherwise, a normal escape
316-
let (c, n) = char_lit(&lit[i..]);
327+
let (c, n) = char_lit(&lit[i..], diag);
317328
for _ in 0..n - 1 { // we don't need to move past the first \
318329
chars.next();
319330
}
@@ -385,15 +396,15 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand
385396

386397
match lit {
387398
token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
388-
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str()).0))),
399+
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),
389400

390401
// There are some valid suffixes for integer and float literals,
391402
// so all the handling is done internally.
392403
token::Integer(s) => (false, integer_lit(&s.as_str(), suf, diag)),
393404
token::Float(s) => (false, float_lit(&s.as_str(), suf, diag)),
394405

395406
token::Str_(s) => {
396-
let s = Symbol::intern(&str_lit(&s.as_str()));
407+
let s = Symbol::intern(&str_lit(&s.as_str(), diag));
397408
(true, Some(LitKind::Str(s, ast::StrStyle::Cooked)))
398409
}
399410
token::StrRaw(s, n) => {

src/test/parse-fail/issue-23620-invalid-escapes.rs

+4-5
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,8 @@ fn main() {
4141
//~^^^ ERROR incorrect unicode escape sequence
4242
//~^^^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string
4343

44-
let _ = "\u{ffffff} \xf \u";
45-
//~^ ERROR invalid unicode character escape
46-
//~^^ ERROR invalid character in numeric character escape:
47-
//~^^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
48-
//~^^^^ ERROR incorrect unicode escape sequence
44+
let _ = "\xf \u";
45+
//~^ ERROR invalid character in numeric character escape:
46+
//~^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
47+
//~^^^ ERROR incorrect unicode escape sequence
4948
}

src/test/parse-fail/issue-43692.rs

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// compile-flags: -Z parse-only
12+
13+
fn main() {
14+
'\u{_10FFFF}'; //~ ERROR invalid start of unicode escape
15+
}

src/test/parse-fail/new-unicode-escapes-2.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@
1111
// compile-flags: -Z parse-only
1212

1313
pub fn main() {
14-
let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (can have at most 6 hex digits)
14+
let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (must have at most 6 hex digits)
1515
}

src/test/parse-fail/new-unicode-escapes-3.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,6 @@
1111
// compile-flags: -Z parse-only
1212

1313
pub fn main() {
14-
let s = "\u{d805}"; //~ ERROR invalid unicode character escape
14+
let s1 = "\u{d805}"; //~ ERROR invalid unicode character escape
15+
let s2 = "\u{ffffff}"; //~ ERROR invalid unicode character escape
1516
}

src/test/parse-fail/new-unicode-escapes-4.rs

-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,4 @@
1313
pub fn main() {
1414
let s = "\u{lol}";
1515
//~^ ERROR invalid character in unicode escape: l
16-
//~^^ ERROR invalid character in unicode escape: o
17-
//~^^^ ERROR invalid character in unicode escape: l
1816
}

src/test/run-pass/issue-43692.rs

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
fn main() {
12+
assert_eq!('\u{10__FFFF}', '\u{10FFFF}');
13+
assert_eq!("\u{10_F0FF__}foo\u{1_0_0_0__}", "\u{10F0FF}foo\u{1000}");
14+
}

0 commit comments

Comments
 (0)