Skip to content

Commit df598e1

Browse files
committed
HTML API: Optimize low-level parsing details in Tag Processor.
Introduces a number of micro-level optimizations in the Tag Processor to improve token-scanning performance. This commit should contain no functional changes. Benchmarked against a list of the 100 most-visited websites, these changes improve the Tag Processor's tag-scanning performance by between 3.5% and 7.5% on average. Developed in WordPress/wordpress-develop#6890. Discussed in https://core.trac.wordpress.org/ticket/61545. Follow-up to [55203]. See #61545. Built from https://develop.svn.wordpress.org/trunk@58613. git-svn-id: http://core.svn.wordpress.org/trunk@58046 1a063a9b-81f0-0310-95a4-ce76da25c4cd
1 parent 8a4deae commit df598e1

File tree

3 files changed

+53
-93
lines changed

3 files changed

+53
-93
lines changed

wp-includes/html-api/class-wp-html-decoder.php

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ public static function decode( $context, $text ) {
141141

142142
while ( $at < $end ) {
143143
$next_character_reference_at = strpos( $text, '&', $at );
144-
if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
144+
if ( false === $next_character_reference_at ) {
145145
break;
146146
}
147147

@@ -436,26 +436,26 @@ public static function code_point_to_utf8_bytes( $code_point ) {
436436
}
437437

438438
if ( $code_point <= 0x7FF ) {
439-
$byte1 = ( $code_point >> 6 ) | 0xC0;
440-
$byte2 = $code_point & 0x3F | 0x80;
439+
$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
440+
$byte2 = chr( $code_point & 0x3F | 0x80 );
441441

442-
return pack( 'CC', $byte1, $byte2 );
442+
return "{$byte1}{$byte2}";
443443
}
444444

445445
if ( $code_point <= 0xFFFF ) {
446-
$byte1 = ( $code_point >> 12 ) | 0xE0;
447-
$byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
448-
$byte3 = $code_point & 0x3F | 0x80;
446+
$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
447+
$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
448+
$byte3 = chr( $code_point & 0x3F | 0x80 );
449449

450-
return pack( 'CCC', $byte1, $byte2, $byte3 );
450+
return "{$byte1}{$byte2}{$byte3}";
451451
}
452452

453453
// Any values above U+10FFFF are eliminated above in the pre-check.
454-
$byte1 = ( $code_point >> 18 ) | 0xF0;
455-
$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
456-
$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
457-
$byte4 = $code_point & 0x3F | 0x80;
454+
$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
455+
$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
456+
$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
457+
$byte4 = chr( $code_point & 0x3F | 0x80 );
458458

459-
return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
459+
return "{$byte1}{$byte2}{$byte3}{$byte4}";
460460
}
461461
}

wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 39 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1524,21 +1524,10 @@ private function parse_next_tag() {
15241524
$was_at = $this->bytes_already_parsed;
15251525
$at = $was_at;
15261526

1527-
while ( false !== $at && $at < $doc_length ) {
1527+
while ( $at < $doc_length ) {
15281528
$at = strpos( $html, '<', $at );
1529-
1530-
/*
1531-
* This does not imply an incomplete parse; it indicates that there
1532-
* can be nothing left in the document other than a #text node.
1533-
*/
15341529
if ( false === $at ) {
1535-
$this->parser_state = self::STATE_TEXT_NODE;
1536-
$this->token_starts_at = $was_at;
1537-
$this->token_length = strlen( $html ) - $was_at;
1538-
$this->text_starts_at = $was_at;
1539-
$this->text_length = $this->token_length;
1540-
$this->bytes_already_parsed = strlen( $html );
1541-
return true;
1530+
break;
15421531
}
15431532

15441533
if ( $at > $was_at ) {
@@ -1554,19 +1543,9 @@ private function parse_next_tag() {
15541543
*
15551544
* @see https://html.spec.whatwg.org/#tag-open-state
15561545
*/
1557-
if ( strlen( $html ) > $at + 1 ) {
1558-
$next_character = $html[ $at + 1 ];
1559-
$at_another_node = (
1560-
'!' === $next_character ||
1561-
'/' === $next_character ||
1562-
'?' === $next_character ||
1563-
( 'A' <= $next_character && $next_character <= 'Z' ) ||
1564-
( 'a' <= $next_character && $next_character <= 'z' )
1565-
);
1566-
if ( ! $at_another_node ) {
1567-
++$at;
1568-
continue;
1569-
}
1546+
if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
1547+
++$at;
1548+
continue;
15701549
}
15711550

15721551
$this->parser_state = self::STATE_TEXT_NODE;
@@ -1630,11 +1609,7 @@ private function parse_next_tag() {
16301609
* `<!--` transitions to a comment state – apply further comment rules.
16311610
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
16321611
*/
1633-
if (
1634-
$doc_length > $at + 3 &&
1635-
'-' === $html[ $at + 2 ] &&
1636-
'-' === $html[ $at + 3 ]
1637-
) {
1612+
if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
16381613
$closer_at = $at + 4;
16391614
// If it's not possible to close the comment then there is nothing more to scan.
16401615
if ( $doc_length <= $closer_at ) {
@@ -1911,7 +1886,17 @@ private function parse_next_tag() {
19111886
++$at;
19121887
}
19131888

1914-
return false;
1889+
/*
1890+
* This does not imply an incomplete parse; it indicates that there
1891+
* can be nothing left in the document other than a #text node.
1892+
*/
1893+
$this->parser_state = self::STATE_TEXT_NODE;
1894+
$this->token_starts_at = $was_at;
1895+
$this->token_length = $doc_length - $was_at;
1896+
$this->text_starts_at = $was_at;
1897+
$this->text_length = $this->token_length;
1898+
$this->bytes_already_parsed = $doc_length;
1899+
return true;
19151900
}
19161901

19171902
/**
@@ -1922,9 +1907,11 @@ private function parse_next_tag() {
19221907
* @return bool Whether an attribute was found before the end of the document.
19231908
*/
19241909
private function parse_next_attribute() {
1910+
$doc_length = strlen( $this->html );
1911+
19251912
// Skip whitespace and slashes.
19261913
$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
1927-
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
1914+
if ( $this->bytes_already_parsed >= $doc_length ) {
19281915
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
19291916

19301917
return false;
@@ -1941,21 +1928,21 @@ private function parse_next_attribute() {
19411928
: strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed );
19421929

19431930
// No attribute, just tag closer.
1944-
if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
1931+
if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) {
19451932
return false;
19461933
}
19471934

19481935
$attribute_start = $this->bytes_already_parsed;
19491936
$attribute_name = substr( $this->html, $attribute_start, $name_length );
19501937
$this->bytes_already_parsed += $name_length;
1951-
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
1938+
if ( $this->bytes_already_parsed >= $doc_length ) {
19521939
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
19531940

19541941
return false;
19551942
}
19561943

19571944
$this->skip_whitespace();
1958-
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
1945+
if ( $this->bytes_already_parsed >= $doc_length ) {
19591946
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
19601947

19611948
return false;
@@ -1965,7 +1952,7 @@ private function parse_next_attribute() {
19651952
if ( $has_value ) {
19661953
++$this->bytes_already_parsed;
19671954
$this->skip_whitespace();
1968-
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
1955+
if ( $this->bytes_already_parsed >= $doc_length ) {
19691956
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
19701957

19711958
return false;
@@ -1976,8 +1963,10 @@ private function parse_next_attribute() {
19761963
case '"':
19771964
$quote = $this->html[ $this->bytes_already_parsed ];
19781965
$value_start = $this->bytes_already_parsed + 1;
1979-
$value_length = strcspn( $this->html, $quote, $value_start );
1980-
$attribute_end = $value_start + $value_length + 1;
1966+
$end_quote_at = strpos( $this->html, $quote, $value_start );
1967+
$end_quote_at = false === $end_quote_at ? $doc_length : $end_quote_at;
1968+
$value_length = $end_quote_at - $value_start;
1969+
$attribute_end = $end_quote_at + 1;
19811970
$this->bytes_already_parsed = $attribute_end;
19821971
break;
19831972

@@ -1993,7 +1982,7 @@ private function parse_next_attribute() {
19931982
$attribute_end = $attribute_start + $name_length;
19941983
}
19951984

1996-
if ( $attribute_end >= strlen( $this->html ) ) {
1985+
if ( $attribute_end >= $doc_length ) {
19971986
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
19981987

19991988
return false;
@@ -2014,7 +2003,7 @@ private function parse_next_attribute() {
20142003
$comparable_name = strtolower( $attribute_name );
20152004

20162005
// If an attribute is listed many times, only use the first declaration and ignore the rest.
2017-
if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
2006+
if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
20182007
$this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
20192008
$attribute_name,
20202009
$value_start,
@@ -2038,7 +2027,7 @@ private function parse_next_attribute() {
20382027
$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
20392028
if ( null === $this->duplicate_attributes ) {
20402029
$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
2041-
} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
2030+
} elseif ( ! isset( $this->duplicate_attributes[ $comparable_name ] ) ) {
20422031
$this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
20432032
} else {
20442033
$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;
@@ -3110,14 +3099,12 @@ public function remove_attribute( $name ) {
31103099
);
31113100

31123101
// Removes any duplicated attributes if they were also present.
3113-
if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
3114-
foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
3115-
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
3116-
$attribute_token->start,
3117-
$attribute_token->length,
3118-
''
3119-
);
3120-
}
3102+
foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) {
3103+
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
3104+
$attribute_token->start,
3105+
$attribute_token->length,
3106+
''
3107+
);
31213108
}
31223109

31233110
return true;
@@ -3317,35 +3304,8 @@ private function matches() {
33173304
}
33183305

33193306
// Does the tag name match the requested tag name in a case-insensitive manner?
3320-
if ( null !== $this->sought_tag_name ) {
3321-
/*
3322-
* String (byte) length lookup is fast. If they aren't the
3323-
* same length then they can't be the same string values.
3324-
*/
3325-
if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
3326-
return false;
3327-
}
3328-
3329-
/*
3330-
* Check each character to determine if they are the same.
3331-
* Defer calls to `strtoupper()` to avoid them when possible.
3332-
* Calling `strcasecmp()` here tested slower than comparing each
3333-
* character, so unless benchmarks show otherwise, it should
3334-
* not be used.
3335-
*
3336-
* It's expected that most of the time that this runs, a
3337-
* lower-case tag name will be supplied and the input will
3338-
* contain lower-case tag names, thus normally bypassing
3339-
* the case comparison code.
3340-
*/
3341-
for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
3342-
$html_char = $this->html[ $this->tag_name_starts_at + $i ];
3343-
$tag_char = $this->sought_tag_name[ $i ];
3344-
3345-
if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
3346-
return false;
3347-
}
3348-
}
3307+
if ( isset( $this->sought_tag_name ) && 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) ) {
3308+
return false;
33493309
}
33503310

33513311
if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {

wp-includes/version.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
*
1717
* @global string $wp_version
1818
*/
19-
$wp_version = '6.7-alpha-58612';
19+
$wp_version = '6.7-alpha-58613';
2020

2121
/**
2222
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.

0 commit comments

Comments
 (0)