Skip to content

Commit 24578e0

Browse files
committed
libsyntax: accept only whitespace with the PATTERN_WHITE_SPACE property
This aligns with Unicode recommendations and should be stable for all future Unicode releases. See http://unicode.org/reports/tr31/#R3. This renames `libsyntax::lexer::is_whitespace` to `is_pattern_whitespace`, so it potentially breaks users of libsyntax.
1 parent 9e3e43f commit 24578e0

File tree

9 files changed

+57
-36
lines changed

9 files changed

+57
-36
lines changed

mk/crates.mk

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ DEPS_serialize := std log
8686
DEPS_term := std log
8787
DEPS_test := std getopts serialize rbml term native:rust_test_helpers
8888

89-
DEPS_syntax := std term serialize log arena libc rustc_bitflags
89+
DEPS_syntax := std term serialize log arena libc rustc_bitflags rustc_unicode
9090
DEPS_syntax_ext := syntax fmt_macros
9191

9292
DEPS_rustc := syntax fmt_macros flate arena serialize getopts rbml rustc_front\

src/etc/unicode.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,7 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
398398
derived = load_properties("DerivedCoreProperties.txt", want_derived)
399399
scripts = load_properties("Scripts.txt", [])
400400
props = load_properties("PropList.txt",
401-
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
401+
["White_Space", "Join_Control", "Noncharacter_Code_Point", "Pattern_White_Space"])
402402
norm_props = load_properties("DerivedNormalizationProps.txt",
403403
["Full_Composition_Exclusion"])
404404

@@ -408,7 +408,7 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
408408
# category tables
409409
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
410410
("derived_property", derived, want_derived), \
411-
("property", props, ["White_Space"]):
411+
("property", props, ["White_Space", "Pattern_White_Space"]):
412412
emit_property_module(rf, name, cat, pfuns)
413413

414414
# normalizations and conversions module

src/librustc_unicode/lib.rs

+5
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,8 @@ pub mod str {
5050
pub mod derived_property {
5151
pub use tables::derived_property::{Cased, Case_Ignorable};
5252
}
53+
54+
// For use in libsyntax
55+
pub mod property {
56+
pub use tables::property::Pattern_White_Space;
57+
}

src/librustc_unicode/tables.rs

+9
Original file line numberDiff line numberDiff line change
@@ -1180,6 +1180,15 @@ pub mod derived_property {
11801180
}
11811181

11821182
pub mod property {
1183+
pub const Pattern_White_Space_table: &'static [(char, char)] = &[
1184+
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{200e}', '\u{200f}'),
1185+
('\u{2028}', '\u{2029}')
1186+
];
1187+
1188+
pub fn Pattern_White_Space(c: char) -> bool {
1189+
super::bsearch_range_table(c, Pattern_White_Space_table)
1190+
}
1191+
11831192
pub const White_Space_table: &'static [(char, char)] = &[
11841193
('\u{9}', '\u{d}'), ('\u{20}', '\u{20}'), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'),
11851194
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), ('\u{202f}',

src/libsyntax/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ extern crate term;
3737
extern crate libc;
3838
#[macro_use] extern crate log;
3939
#[macro_use] #[no_link] extern crate rustc_bitflags;
40+
extern crate rustc_unicode;
4041

4142
extern crate serialize as rustc_serialize; // used by deriving
4243

src/libsyntax/parse/lexer/comments.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
1515
use errors;
1616
use parse::lexer::is_block_doc_comment;
1717
use parse::lexer::{StringReader, TokenAndSpan};
18-
use parse::lexer::{is_whitespace, Reader};
18+
use parse::lexer::{is_pattern_whitespace, Reader};
1919
use parse::lexer;
2020
use print::pprust;
2121
use str::char_at;
@@ -153,7 +153,7 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut Vec<Comment>) {
153153
}
154154

155155
fn consume_whitespace_counting_blank_lines(rdr: &mut StringReader, comments: &mut Vec<Comment>) {
156-
while is_whitespace(rdr.curr) && !rdr.is_eof() {
156+
while is_pattern_whitespace(rdr.curr) && !rdr.is_eof() {
157157
if rdr.col == CharPos(0) && rdr.curr_is('\n') {
158158
push_blank_line_comment(rdr, &mut *comments);
159159
}

src/libsyntax/parse/lexer/mod.rs

+10-7
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use ext::tt::transcribe::tt_next_token;
1616
use parse::token::str_to_ident;
1717
use parse::token;
1818
use str::char_at;
19+
use rustc_unicode::property::Pattern_White_Space;
1920

2021
use std::borrow::Cow;
2122
use std::char;
@@ -546,10 +547,10 @@ impl<'a> StringReader<'a> {
546547
let c = self.scan_comment();
547548
debug!("scanning a comment {:?}", c);
548549
c
549-
}
550-
c if is_whitespace(Some(c)) => {
550+
},
551+
c if is_pattern_whitespace(Some(c)) => {
551552
let start_bpos = self.last_pos;
552-
while is_whitespace(self.curr) {
553+
while is_pattern_whitespace(self.curr) {
553554
self.bump();
554555
}
555556
let c = Some(TokenAndSpan {
@@ -1435,7 +1436,7 @@ impl<'a> StringReader<'a> {
14351436
}
14361437

14371438
fn consume_whitespace(&mut self) {
1438-
while is_whitespace(self.curr) && !self.is_eof() {
1439+
while is_pattern_whitespace(self.curr) && !self.is_eof() {
14391440
self.bump();
14401441
}
14411442
}
@@ -1460,7 +1461,7 @@ impl<'a> StringReader<'a> {
14601461
}
14611462

14621463
fn consume_non_eol_whitespace(&mut self) {
1463-
while is_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
1464+
while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
14641465
self.bump();
14651466
}
14661467
}
@@ -1591,8 +1592,10 @@ impl<'a> StringReader<'a> {
15911592
}
15921593
}
15931594

1594-
pub fn is_whitespace(c: Option<char>) -> bool {
1595-
c.map_or(false, char::is_whitespace)
1595+
// This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1596+
// is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1597+
pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1598+
c.map_or(false, Pattern_White_Space)
15961599
}
15971600

15981601
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {

src/libsyntax/util/parser_testing.rs

+19-20
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
use ast;
1212
use parse::{ParseSess,PResult,filemap_to_tts};
13-
use parse::new_parser_from_source_str;
13+
use parse::{lexer, new_parser_from_source_str};
1414
use parse::parser::Parser;
1515
use parse::token;
1616
use ptr::P;
@@ -97,8 +97,8 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool {
9797
let (a, b) = match (a_iter.peek(), b_iter.peek()) {
9898
(None, None) => return true,
9999
(None, _) => return false,
100-
(Some(a), None) => {
101-
if a.is_whitespace() {
100+
(Some(&a), None) => {
101+
if is_pattern_whitespace(a) {
102102
break // trailing whitespace check is out of loop for borrowck
103103
} else {
104104
return false
@@ -107,11 +107,11 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool {
107107
(Some(&a), Some(&b)) => (a, b)
108108
};
109109

110-
if a.is_whitespace() && b.is_whitespace() {
110+
if is_pattern_whitespace(a) && is_pattern_whitespace(b) {
111111
// skip whitespace for a and b
112112
scan_for_non_ws_or_end(&mut a_iter);
113113
scan_for_non_ws_or_end(&mut b_iter);
114-
} else if a.is_whitespace() {
114+
} else if is_pattern_whitespace(a) {
115115
// skip whitespace for a
116116
scan_for_non_ws_or_end(&mut a_iter);
117117
} else if a == b {
@@ -123,23 +123,18 @@ pub fn matches_codepattern(a : &str, b : &str) -> bool {
123123
}
124124

125125
// check if a has *only* trailing whitespace
126-
a_iter.all(|c| c.is_whitespace())
126+
a_iter.all(is_pattern_whitespace)
127127
}
128128

129129
/// Advances the given peekable `Iterator` until it reaches a non-whitespace character
130130
fn scan_for_non_ws_or_end<I: Iterator<Item= char>>(iter: &mut Peekable<I>) {
131-
loop {
132-
match iter.peek() {
133-
Some(c) if c.is_whitespace() => {} // fall through; borrowck
134-
_ => return
135-
}
136-
131+
while lexer::is_pattern_whitespace(iter.peek().cloned()) {
137132
iter.next();
138133
}
139134
}
140135

141-
pub fn is_whitespace(c: char) -> bool {
142-
c.is_whitespace()
136+
pub fn is_pattern_whitespace(c: char) -> bool {
137+
lexer::is_pattern_whitespace(Some(c))
143138
}
144139

145140
#[cfg(test)]
@@ -162,14 +157,18 @@ mod tests {
162157
}
163158

164159
#[test]
165-
fn more_whitespace() {
160+
fn pattern_whitespace() {
166161
assert_eq!(matches_codepattern("","\x0C"), false);
167-
assert_eq!(matches_codepattern("a b","a\u{2002}b"),true);
168162
assert_eq!(matches_codepattern("a b ","a \u{0085}\n\t\r b"),true);
169163
assert_eq!(matches_codepattern("a b","a \u{0085}\n\t\r b "),false);
170-
assert_eq!(matches_codepattern("a b","a\u{2002}b"),true);
171-
assert_eq!(matches_codepattern("ab","a\u{2003}b"),false);
172-
assert_eq!(matches_codepattern("a \u{3000}b","ab"),true);
173-
assert_eq!(matches_codepattern("\u{205F}a b","ab"),true);
164+
}
165+
166+
#[test]
167+
fn non_pattern_whitespace() {
168+
// These have the property 'White_Space' but not 'Pattern_White_Space'
169+
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
170+
assert_eq!(matches_codepattern("a b","a\u{2002}b"), false);
171+
assert_eq!(matches_codepattern("\u{205F}a b","ab"), false);
172+
assert_eq!(matches_codepattern("a \u{3000}b","ab"), false);
174173
}
175174
}

src/test/run-pass/parser-unicode-whitespace.rs

+8-4
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,14 @@
99
// except according to those terms.
1010

1111

12-
// Beware editing: it has numerous whitespace characters which are important
12+
// Beware editing: it has numerous whitespace characters which are important.
13+
// It contains characters from every range of the 'PATTERN_WHITE_SPACE' property outlined in
14+
// http://unicode.org/Public/UNIDATA/PropList.txt
15+
//
16+
// The characters in the first expression of the assertion can be generated
17+
// from: "4\u{0C}+\n\t\r7\t*\u{20}2\u{85}/\u{200E}3\u{200F}*\u{2028}2\u{2029}"
1318
pub fn main() {
14-
assert_eq!(4 +  7 * 2
15-
19+
assert_eq!(4 +
1620

17-
/ 3*2, 4 + 7 * 2 / 3 * 2);
21+
7 * 2…/‎3*2, 4 + 7 * 2 / 3 * 2);
1822
}

0 commit comments

Comments
 (0)