@@ -228,80 +228,56 @@ static inline bool IsContinuationCharacter(byte chr) {
228228// This method decodes an UTF-8 value according to RFC 3629.
229229uchar Utf8::CalculateValue (const byte* str, size_t max_length, size_t * cursor) {
230230 size_t length = NonASCIISequenceLength (str[0 ]);
231- if (length == 0 || max_length < length) {
232- *cursor += 1 ;
233- return kBadChar ;
234- }
235- if (length == 2 ) {
236- if (!IsContinuationCharacter (str[1 ])) {
237- *cursor += 1 ;
238- return kBadChar ;
239- }
240- *cursor += 2 ;
241- return ((str[0 ] << 6 ) + str[1 ]) - 0x00003080 ;
231+
232+ // Check continuation characters.
233+ size_t max_count = std::min (length, max_length);
234+ size_t count = 1 ;
235+ while (count < max_count && IsContinuationCharacter (str[count])) {
236+ count++;
242237 }
238+
239+ // Check overly long sequences & other conditions. Use length as error
240+ // indicator.
243241 if (length == 3 ) {
244- switch (str[0 ]) {
245- case 0xE0 :
246- // Overlong three-byte sequence.
247- if (str[1 ] < 0xA0 || str[1 ] > 0xBF ) {
248- *cursor += 1 ;
249- return kBadChar ;
250- }
251- break ;
252- case 0xED :
253- // High and low surrogate halves.
254- if (str[1 ] < 0x80 || str[1 ] > 0x9F ) {
255- *cursor += 1 ;
256- return kBadChar ;
257- }
258- break ;
259- default :
260- if (!IsContinuationCharacter (str[1 ])) {
261- *cursor += 1 ;
262- return kBadChar ;
263- }
264- }
265- if (!IsContinuationCharacter (str[2 ])) {
266- *cursor += 1 ;
267- return kBadChar ;
242+ if (str[0 ] == 0xE0 && (str[1 ] < 0xA0 || str[1 ] > 0xBF )) {
243+ // Overlong three-byte sequence?
244+ length = 0 ;
245+ } else if (str[0 ] == 0xED && (str[1 ] < 0x80 || str[1 ] > 0x9F )) {
246+ // High and low surrogate halves?
247+ length = 0 ;
268248 }
269- *cursor += 3 ;
270- return ((str[0 ] << 12 ) + (str[1 ] << 6 ) + str[2 ]) - 0x000E2080 ;
271- }
272- DCHECK (length == 4 );
273- switch (str[0 ]) {
274- case 0xF0 :
249+ } else if (length == 4 ) {
250+ if (str[0 ] == 0xF0 && (str[1 ] < 0x90 || str[1 ] > 0xBF )) {
275251 // Overlong four-byte sequence.
276- if (str[1 ] < 0x90 || str[1 ] > 0xBF ) {
277- *cursor += 1 ;
278- return kBadChar ;
279- }
280- break ;
281- case 0xF4 :
252+ length = 0 ;
253+ } else if (str[0 ] == 0xF4 && (str[1 ] < 0x80 || str[1 ] > 0x8F )) {
282254 // Code points outside of the unicode range.
283- if (str[1 ] < 0x80 || str[1 ] > 0x8F ) {
284- *cursor += 1 ;
285- return kBadChar ;
286- }
287- break ;
288- default :
289- if (!IsContinuationCharacter (str[1 ])) {
290- *cursor += 1 ;
291- return kBadChar ;
292- }
255+ length = 0 ;
256+ }
293257 }
294- if (!IsContinuationCharacter (str[2 ])) {
295- *cursor += 1 ;
258+
259+ if (count != length) {
260+ // All invalid encodings should land here.
261+ *cursor += count;
296262 return kBadChar ;
297263 }
298- if (!IsContinuationCharacter (str[3 ])) {
299- *cursor += 1 ;
300- return kBadChar ;
264+
265+ // All errors have been handled, so we only have to assemble the result.
266+ *cursor += length;
267+ switch (length) {
268+ case 1 :
269+ return str[0 ];
270+ case 2 :
271+ return ((str[0 ] << 6 ) + str[1 ]) - 0x00003080 ;
272+ case 3 :
273+ return ((str[0 ] << 12 ) + (str[1 ] << 6 ) + str[2 ]) - 0x000E2080 ;
274+ case 4 :
275+ return ((str[0 ] << 18 ) + (str[1 ] << 12 ) + (str[2 ] << 6 ) + str[3 ]) -
276+ 0x03C82080 ;
301277 }
302- *cursor += 4 ;
303- return ((str[ 0 ] << 18 ) + (str[ 1 ] << 12 ) + (str[ 2 ] << 6 ) + str[ 3 ]) -
304- 0x03C82080 ;
278+
279+ UNREACHABLE ();
280+ return kBadChar ;
305281}
306282
307283uchar Utf8::ValueOfIncremental (byte next, Utf8IncrementalBuffer* buffer) {
@@ -323,9 +299,10 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
323299 // with one shift.
324300 uint8_t mask = 0x7f >> kind;
325301
326- // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
327- // in the bottom three.
328- *buffer = (kind - 1 ) << 24 | (next & mask);
302+ // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
303+ // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
304+ // intended as a counter about how many bytes are still needed.
305+ *buffer = kind << 28 | (kind - 1 ) << 24 | (next & mask);
329306 return kIncomplete ;
330307 } else {
331308 // No buffer, and not the start of a 1-byte char (handled at the
@@ -354,15 +331,19 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
354331 // We're inside of a character, as described by buffer.
355332
356333 // How many bytes (excluding this one) do we still expect?
357- uint8_t count = (*buffer >> 24 ) - 1 ;
334+ uint8_t bytes_expected = *buffer >> 28 ;
335+ uint8_t bytes_left = (*buffer >> 24 ) & 0x0f ;
336+ bytes_left--;
358337 // Update the value.
359338 uint32_t value = ((*buffer & 0xffffff ) << 6 ) | (next & 0x3F );
360- if (count ) {
361- *buffer = count << 24 | value;
339+ if (bytes_left ) {
340+ *buffer = (bytes_expected << 28 | bytes_left << 24 | value) ;
362341 return kIncomplete ;
363342 } else {
364343 *buffer = 0 ;
365- return value;
344+ bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80 ) ||
345+ (bytes_expected == 3 && value < 0x800 );
346+ return sequence_was_too_long ? kBadChar : value;
366347 }
367348 } else {
368349 // Within a character, but not a continuation character? Then the
0 commit comments