Skip to content

Commit 8b7c3f2

Browse files
committed
Auto merge of #29734 - Ryman:whitespace_consistency, r=Aatch
libsyntax: be more accepting of whitespace in lexer. Fixes #29590. Perhaps this may need more thorough testing? r? @Aatch
2 parents 4352a85 + 24578e0 commit 8b7c3f2

File tree

9 files changed

+111
-64
lines changed

9 files changed

+111
-64
lines changed

mk/crates.mk

+1-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ DEPS_serialize := std log
8787
DEPS_term := std log
8888
DEPS_test := std getopts serialize rbml term native:rust_test_helpers
8989

90-
DEPS_syntax := std term serialize log arena libc rustc_bitflags
90+
DEPS_syntax := std term serialize log arena libc rustc_bitflags rustc_unicode
9191
DEPS_syntax_ext := syntax fmt_macros
9292

9393
DEPS_rustc := syntax fmt_macros flate arena serialize getopts rbml rustc_front\

src/etc/unicode.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,7 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
398398
derived = load_properties("DerivedCoreProperties.txt", want_derived)
399399
scripts = load_properties("Scripts.txt", [])
400400
props = load_properties("PropList.txt",
401-
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
401+
["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"])
402402
norm_props = load_properties("DerivedNormalizationProps.txt",
403403
["Full_Composition_Exclusion"])
404404

@@ -408,7 +408,7 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
408408
# category tables
409409
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
410410
("derived_property", derived, want_derived), \
411-
("property", props, ["White_Space"]):
411+
("property", props, ["White_Space", "Pattern_White_Space"]):
412412
emit_property_module(rf, name, cat, pfuns)
413413

414414
# normalizations and conversions module

src/librustc_unicode/lib.rs

+5
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,8 @@ pub mod str {
5151
pub mod derived_property {
5252
pub use tables::derived_property::{Cased, Case_Ignorable};
5353
}
54+
55+
// For use in libsyntax
56+
pub mod property {
57+
pub use tables::property::Pattern_White_Space;
58+
}

src/librustc_unicode/tables.rs

+9
Original file line numberDiff line numberDiff line change
@@ -1180,6 +1180,15 @@ pub mod derived_property {
11801180
}
11811181

11821182
pub mod property {
1183+
pub const Pattern_White_Space_table: &'static [(char, char)] = &[
1184+
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{200e}', '\u{200f}'),
1185+
('\u{2028}', '\u{2029}')
1186+
];
1187+
1188+
pub fn Pattern_White_Space(c: char) -> bool {
1189+
super::bsearch_range_table(c, Pattern_White_Space_table)
1190+
}
1191+
11831192
pub const White_Space_table: &'static [(char, char)] = &[
11841193
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'),
11851194
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), ('\u{202f}',

src/libsyntax/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ extern crate term;
3838
extern crate libc;
3939
#[macro_use] extern crate log;
4040
#[macro_use] #[no_link] extern crate rustc_bitflags;
41+
extern crate rustc_unicode;
4142

4243
extern crate serialize as rustc_serialize; // used by deriving
4344

src/libsyntax/parse/lexer/comments.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
1515
use errors;
1616
use parse::lexer::is_block_doc_comment;
1717
use parse::lexer::{StringReader, TokenAndSpan};
18-
use parse::lexer::{is_whitespace, Reader};
18+
use parse::lexer::{is_pattern_whitespace, Reader};
1919
use parse::lexer;
2020
use print::pprust;
2121
use str::char_at;
@@ -153,7 +153,7 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut Vec<Comment>) {
153153
}
154154

155155
fn consume_whitespace_counting_blank_lines(rdr: &mut StringReader, comments: &mut Vec<Comment>) {
156-
while is_whitespace(rdr.curr) && !rdr.is_eof() {
156+
while is_pattern_whitespace(rdr.curr) && !rdr.is_eof() {
157157
if rdr.col == CharPos(0) && rdr.curr_is('\n') {
158158
push_blank_line_comment(rdr, &mut *comments);
159159
}

src/libsyntax/parse/lexer/mod.rs

+10-10
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use ext::tt::transcribe::tt_next_token;
1616
use parse::token::str_to_ident;
1717
use parse::token;
1818
use str::char_at;
19+
use rustc_unicode::property::Pattern_White_Space;
1920

2021
use std::borrow::Cow;
2122
use std::char;
@@ -546,10 +547,10 @@ impl<'a> StringReader<'a> {
546547
let c = self.scan_comment();
547548
debug!("scanning a comment {:?}", c);
548549
c
549-
}
550-
c if is_whitespace(Some(c)) => {
550+
},
551+
c if is_pattern_whitespace(Some(c)) => {
551552
let start_bpos = self.last_pos;
552-
while is_whitespace(self.curr) {
553+
while is_pattern_whitespace(self.curr) {
553554
self.bump();
554555
}
555556
let c = Some(TokenAndSpan {
@@ -1440,7 +1441,7 @@ impl<'a> StringReader<'a> {
14401441
}
14411442

14421443
fn consume_whitespace(&mut self) {
1443-
while is_whitespace(self.curr) && !self.is_eof() {
1444+
while is_pattern_whitespace(self.curr) && !self.is_eof() {
14441445
self.bump();
14451446
}
14461447
}
@@ -1465,7 +1466,7 @@ impl<'a> StringReader<'a> {
14651466
}
14661467

14671468
fn consume_non_eol_whitespace(&mut self) {
1468-
while is_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
1469+
while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
14691470
self.bump();
14701471
}
14711472
}
@@ -1596,11 +1597,10 @@ impl<'a> StringReader<'a> {
15961597
}
15971598
}
15981599

1599-
pub fn is_whitespace(c: Option<char>) -> bool {
1600-
match c.unwrap_or('\x00') { // None can be null for now... it's not whitespace
1601-
' ' | '\n' | '\t' | '\r' => true,
1602-
_ => false,
1603-
}
1600+
// This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1601+
// is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1602+
pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1603+
c.map_or(false, Pattern_White_Space)
16041604
}
16051605

16061606
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {

src/libsyntax/util/parser_testing.rs

+59-49
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@
1010

1111
use ast;
1212
use parse::{ParseSess,PResult,filemap_to_tts};
13-
use parse::new_parser_from_source_str;
13+
use parse::{lexer, new_parser_from_source_str};
1414
use parse::parser::Parser;
1515
use parse::token;
1616
use ptr::P;
17-
use str::char_at;
17+
use std::iter::Peekable;
1818

1919
/// Map a string to tts, using a made-up filename:
2020
pub fn string_to_tts(source_str: String) -> Vec<ast::TokenTree> {
@@ -87,69 +87,62 @@ pub fn strs_to_idents(ids: Vec<&str> ) -> Vec<ast::Ident> {
8787

8888
/// Does the given string match the pattern? Whitespace in the first string
8989
/// may be deleted or replaced with other whitespace to match the pattern.
90-
/// this function is Unicode-ignorant; fortunately, the careful design of
91-
/// UTF-8 mitigates this ignorance. In particular, this function only collapses
92-
/// sequences of \n, \r, ' ', and \t, but it should otherwise tolerate Unicode
93-
/// chars. Unsurprisingly, it doesn't do NKF-normalization(?).
90+
/// This function is relatively Unicode-ignorant; fortunately, the careful design
91+
/// of UTF-8 mitigates this ignorance. It doesn't do any Unicode normalization (e.g. NFKC).
9492
pub fn matches_codepattern(a : &str, b : &str) -> bool {
95-
let mut idx_a = 0;
96-
let mut idx_b = 0;
93+
let mut a_iter = a.chars().peekable();
94+
let mut b_iter = b.chars().peekable();
95+
9796
loop {
98-
if idx_a == a.len() && idx_b == b.len() {
99-
return true;
100-
}
101-
else if idx_a == a.len() {return false;}
102-
else if idx_b == b.len() {
103-
// maybe the stuff left in a is all ws?
104-
if is_whitespace(char_at(a, idx_a)) {
105-
return scan_for_non_ws_or_end(a,idx_a) == a.len();
106-
} else {
107-
return false;
97+
let (a, b) = match (a_iter.peek(), b_iter.peek()) {
98+
(None, None) => return true,
99+
(None, _) => return false,
100+
(Some(&a), None) => {
101+
if is_pattern_whitespace(a) {
102+
break // trailing whitespace check is out of loop for borrowck
103+
} else {
104+
return false
105+
}
108106
}
109-
}
110-
// ws in both given and pattern:
111-
else if is_whitespace(char_at(a, idx_a))
112-
&& is_whitespace(char_at(b, idx_b)) {
113-
idx_a = scan_for_non_ws_or_end(a,idx_a);
114-
idx_b = scan_for_non_ws_or_end(b,idx_b);
115-
}
116-
// ws in given only:
117-
else if is_whitespace(char_at(a, idx_a)) {
118-
idx_a = scan_for_non_ws_or_end(a,idx_a);
119-
}
120-
// *don't* silently eat ws in expected only.
121-
else if char_at(a, idx_a) == char_at(b, idx_b) {
122-
idx_a += 1;
123-
idx_b += 1;
124-
}
125-
else {
126-
return false;
107+
(Some(&a), Some(&b)) => (a, b)
108+
};
109+
110+
if is_pattern_whitespace(a) && is_pattern_whitespace(b) {
111+
// skip whitespace for a and b
112+
scan_for_non_ws_or_end(&mut a_iter);
113+
scan_for_non_ws_or_end(&mut b_iter);
114+
} else if is_pattern_whitespace(a) {
115+
// skip whitespace for a
116+
scan_for_non_ws_or_end(&mut a_iter);
117+
} else if a == b {
118+
a_iter.next();
119+
b_iter.next();
120+
} else {
121+
return false
127122
}
128123
}
124+
125+
// check if a has *only* trailing whitespace
126+
a_iter.all(is_pattern_whitespace)
129127
}
130128

131-
/// Given a string and an index, return the first usize >= idx
132-
/// that is a non-ws-char or is outside of the legal range of
133-
/// the string.
134-
fn scan_for_non_ws_or_end(a : &str, idx: usize) -> usize {
135-
let mut i = idx;
136-
let len = a.len();
137-
while (i < len) && (is_whitespace(char_at(a, i))) {
138-
i += 1;
129+
/// Advances the given peekable `Iterator` until it reaches a non-whitespace character
130+
fn scan_for_non_ws_or_end<I: Iterator<Item= char>>(iter: &mut Peekable<I>) {
131+
while lexer::is_pattern_whitespace(iter.peek().cloned()) {
132+
iter.next();
139133
}
140-
i
141134
}
142135

143-
/// Copied from lexer.
144-
pub fn is_whitespace(c: char) -> bool {
145-
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
136+
pub fn is_pattern_whitespace(c: char) -> bool {
137+
lexer::is_pattern_whitespace(Some(c))
146138
}
147139

148140
#[cfg(test)]
149141
mod tests {
150142
use super::*;
151143

152-
#[test] fn eqmodws() {
144+
#[test]
145+
fn eqmodws() {
153146
assert_eq!(matches_codepattern("",""),true);
154147
assert_eq!(matches_codepattern("","a"),false);
155148
assert_eq!(matches_codepattern("a",""),false);
@@ -160,5 +153,22 @@ mod tests {
160153
assert_eq!(matches_codepattern("a b","a b"),true);
161154
assert_eq!(matches_codepattern("ab","a b"),false);
162155
assert_eq!(matches_codepattern("a b","ab"),true);
156+
assert_eq!(matches_codepattern(" a b","ab"),true);
157+
}
158+
159+
#[test]
160+
fn pattern_whitespace() {
161+
assert_eq!(matches_codepattern("","\x0C"), false);
162+
assert_eq!(matches_codepattern("a b ","a \u{0085}\n\t\r b"),true);
163+
assert_eq!(matches_codepattern("a b","a \u{0085}\n\t\r b "),false);
164+
}
165+
166+
#[test]
167+
fn non_pattern_whitespace() {
168+
// These have the property 'White_Space' but not 'Pattern_White_Space'
169+
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
170+
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
171+
assert_eq!(matches_codepattern("\u{205F}a b","ab"), false);
172+
assert_eq!(matches_codepattern("a \u{3000}b","ab"), false);
163173
}
164174
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
// Beware editing: it has numerous whitespace characters which are important.
13+
// It contains characters from each of the ranges of the 'PATTERN_WHITE_SPACE' property outlined in
14+
// http://unicode.org/Public/UNIDATA/PropList.txt
15+
//
16+
// The characters in the first expression of the assertion can be generated
17+
// from: "4\u{0C}+\n\t\r7\t*\u{20}2\u{85}/\u{200E}3\u{200F}*\u{2028}2\u{2029}"
18+
pub fn main() {
19+
assert_eq!(4 +
20+
21+
7 * 2…/‎3*2, 4 + 7 * 2 / 3 * 2);
22+
}

0 commit comments

Comments
 (0)