Skip to content

Commit aadb1c8

Browse files
committed
Merged: Return kBadChar for longest subpart of incomplete utf-8 character.
Revision: fd40ebb BUG=chromium:662822 LOG=N NOTRY=true NOPRESUBMIT=true NOTREECHECKS=true [email protected] Review URL: https://codereview.chromium.org/2526453002 . Cr-Commit-Position: refs/branch-heads/5.5@{#52} Cr-Branched-From: 3cbd583-refs/heads/5.5.372@{#1} Cr-Branched-From: b3c8b0c-refs/heads/master@{#40015}
1 parent 028acde commit aadb1c8

2 files changed

Lines changed: 270 additions & 300 deletions

File tree

src/unicode.cc

Lines changed: 53 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -228,80 +228,56 @@ static inline bool IsContinuationCharacter(byte chr) {
228228
// This method decodes an UTF-8 value according to RFC 3629.
229229
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
230230
size_t length = NonASCIISequenceLength(str[0]);
231-
if (length == 0 || max_length < length) {
232-
*cursor += 1;
233-
return kBadChar;
234-
}
235-
if (length == 2) {
236-
if (!IsContinuationCharacter(str[1])) {
237-
*cursor += 1;
238-
return kBadChar;
239-
}
240-
*cursor += 2;
241-
return ((str[0] << 6) + str[1]) - 0x00003080;
231+
232+
// Check continuation characters.
233+
size_t max_count = std::min(length, max_length);
234+
size_t count = 1;
235+
while (count < max_count && IsContinuationCharacter(str[count])) {
236+
count++;
242237
}
238+
239+
// Check overlong sequences & other conditions. Use length as error
240+
// indicator.
243241
if (length == 3) {
244-
switch (str[0]) {
245-
case 0xE0:
246-
// Overlong three-byte sequence.
247-
if (str[1] < 0xA0 || str[1] > 0xBF) {
248-
*cursor += 1;
249-
return kBadChar;
250-
}
251-
break;
252-
case 0xED:
253-
// High and low surrogate halves.
254-
if (str[1] < 0x80 || str[1] > 0x9F) {
255-
*cursor += 1;
256-
return kBadChar;
257-
}
258-
break;
259-
default:
260-
if (!IsContinuationCharacter(str[1])) {
261-
*cursor += 1;
262-
return kBadChar;
263-
}
264-
}
265-
if (!IsContinuationCharacter(str[2])) {
266-
*cursor += 1;
267-
return kBadChar;
242+
if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
243+
// Overlong three-byte sequence?
244+
length = 0;
245+
} else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
246+
// High and low surrogate halves?
247+
length = 0;
268248
}
269-
*cursor += 3;
270-
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
271-
}
272-
DCHECK(length == 4);
273-
switch (str[0]) {
274-
case 0xF0:
249+
} else if (length == 4) {
250+
if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
275251
// Overlong four-byte sequence.
276-
if (str[1] < 0x90 || str[1] > 0xBF) {
277-
*cursor += 1;
278-
return kBadChar;
279-
}
280-
break;
281-
case 0xF4:
252+
length = 0;
253+
} else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
282254
// Code points outside of the unicode range.
283-
if (str[1] < 0x80 || str[1] > 0x8F) {
284-
*cursor += 1;
285-
return kBadChar;
286-
}
287-
break;
288-
default:
289-
if (!IsContinuationCharacter(str[1])) {
290-
*cursor += 1;
291-
return kBadChar;
292-
}
255+
length = 0;
256+
}
293257
}
294-
if (!IsContinuationCharacter(str[2])) {
295-
*cursor += 1;
258+
259+
if (count != length) {
260+
// All invalid encodings should land here.
261+
*cursor += count;
296262
return kBadChar;
297263
}
298-
if (!IsContinuationCharacter(str[3])) {
299-
*cursor += 1;
300-
return kBadChar;
264+
265+
// All errors have been handled, so we only have to assemble the result.
266+
*cursor += length;
267+
switch (length) {
268+
case 1:
269+
return str[0];
270+
case 2:
271+
return ((str[0] << 6) + str[1]) - 0x00003080;
272+
case 3:
273+
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
274+
case 4:
275+
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
276+
0x03C82080;
301277
}
302-
*cursor += 4;
303-
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
304-
0x03C82080;
278+
279+
UNREACHABLE();
280+
return kBadChar;
305281
}
306282

307283
uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
@@ -323,9 +299,10 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
323299
// with one shift.
324300
uint8_t mask = 0x7f >> kind;
325301

326-
// Store the kind - 1 (i.e., remaining bytes) in the top byte, value
327-
// in the bottom three.
328-
*buffer = (kind - 1) << 24 | (next & mask);
302+
// Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
303+
// in the 2nd nibble, and the value in the bottom three bytes. The 2nd nibble is
304+
// intended as a counter of how many bytes are still needed.
305+
*buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
329306
return kIncomplete;
330307
} else {
331308
// No buffer, and not the start of a 1-byte char (handled at the
@@ -354,15 +331,19 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
354331
// We're inside of a character, as described by buffer.
355332

356333
// How many bytes (excluding this one) do we still expect?
357-
uint8_t count = (*buffer >> 24) - 1;
334+
uint8_t bytes_expected = *buffer >> 28;
335+
uint8_t bytes_left = (*buffer >> 24) & 0x0f;
336+
bytes_left--;
358337
// Update the value.
359338
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
360-
if (count) {
361-
*buffer = count << 24 | value;
339+
if (bytes_left) {
340+
*buffer = (bytes_expected << 28 | bytes_left << 24 | value);
362341
return kIncomplete;
363342
} else {
364343
*buffer = 0;
365-
return value;
344+
bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
345+
(bytes_expected == 3 && value < 0x800);
346+
return sequence_was_too_long ? kBadChar : value;
366347
}
367348
} else {
368349
// Within a character, but not a continuation character? Then the

0 commit comments

Comments
 (0)