@@ -1524,21 +1524,10 @@ private function parse_next_tag() {
15241524 $ was_at = $ this ->bytes_already_parsed ;
15251525 $ at = $ was_at ;
15261526
1527- while ( false !== $ at && $ at < $ doc_length ) {
1527+ while ( $ at < $ doc_length ) {
15281528 $ at = strpos ( $ html , '< ' , $ at );
1529-
1530- /*
1531- * This does not imply an incomplete parse; it indicates that there
1532- * can be nothing left in the document other than a #text node.
1533- */
15341529 if ( false === $ at ) {
1535- $ this ->parser_state = self ::STATE_TEXT_NODE ;
1536- $ this ->token_starts_at = $ was_at ;
1537- $ this ->token_length = strlen ( $ html ) - $ was_at ;
1538- $ this ->text_starts_at = $ was_at ;
1539- $ this ->text_length = $ this ->token_length ;
1540- $ this ->bytes_already_parsed = strlen ( $ html );
1541- return true ;
1530+ break ;
15421531 }
15431532
15441533 if ( $ at > $ was_at ) {
@@ -1554,19 +1543,9 @@ private function parse_next_tag() {
15541543 *
15551544 * @see https://html.spec.whatwg.org/#tag-open-state
15561545 */
1557- if ( strlen ( $ html ) > $ at + 1 ) {
1558- $ next_character = $ html [ $ at + 1 ];
1559- $ at_another_node = (
1560- '! ' === $ next_character ||
1561- '/ ' === $ next_character ||
1562- '? ' === $ next_character ||
1563- ( 'A ' <= $ next_character && $ next_character <= 'Z ' ) ||
1564- ( 'a ' <= $ next_character && $ next_character <= 'z ' )
1565- );
1566- if ( ! $ at_another_node ) {
1567- ++$ at ;
1568- continue ;
1569- }
1546+ if ( 1 !== strspn ( $ html , '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ' , $ at + 1 , 1 ) ) { // Not "!", "/", "?", or an ASCII letter (a-z, A-Z): this "<" does not start another node, so keep scanning.
1547+ ++$ at ;
1548+ continue ;
15701549 }
15711550
15721551 $ this ->parser_state = self ::STATE_TEXT_NODE ;
@@ -1630,11 +1609,7 @@ private function parse_next_tag() {
16301609 * `<!--` transitions to a comment state – apply further comment rules.
16311610 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
16321611 */
1633- if (
1634- $ doc_length > $ at + 3 &&
1635- '- ' === $ html [ $ at + 2 ] &&
1636- '- ' === $ html [ $ at + 3 ]
1637- ) {
1612+ if ( 0 === substr_compare ( $ html , '-- ' , $ at + 2 , 2 ) ) {
16381613 $ closer_at = $ at + 4 ;
16391614 // If it's not possible to close the comment then there is nothing more to scan.
16401615 if ( $ doc_length <= $ closer_at ) {
@@ -1911,7 +1886,17 @@ private function parse_next_tag() {
19111886 ++$ at ;
19121887 }
19131888
1914- return false ;
1889+ /*
1890+ * This does not imply an incomplete parse; it indicates that there
1891+ * can be nothing left in the document other than a #text node.
1892+ */
1893+ $ this ->parser_state = self ::STATE_TEXT_NODE ;
1894+ $ this ->token_starts_at = $ was_at ;
1895+ $ this ->token_length = $ doc_length - $ was_at ;
1896+ $ this ->text_starts_at = $ was_at ;
1897+ $ this ->text_length = $ this ->token_length ;
1898+ $ this ->bytes_already_parsed = $ doc_length ;
1899+ return true ;
19151900 }
19161901
19171902 /**
@@ -1922,9 +1907,11 @@ private function parse_next_tag() {
19221907 * @return bool Whether an attribute was found before the end of the document.
19231908 */
19241909 private function parse_next_attribute () {
1910+ $ doc_length = strlen ( $ this ->html );
1911+
19251912 // Skip whitespace and slashes.
19261913 $ this ->bytes_already_parsed += strspn ( $ this ->html , " \t\f\r\n/ " , $ this ->bytes_already_parsed );
1927- if ( $ this ->bytes_already_parsed >= strlen ( $ this -> html ) ) {
1914+ if ( $ this ->bytes_already_parsed >= $ doc_length ) {
19281915 $ this ->parser_state = self ::STATE_INCOMPLETE_INPUT ;
19291916
19301917 return false ;
@@ -1941,21 +1928,21 @@ private function parse_next_attribute() {
19411928 : strcspn ( $ this ->html , "=/> \t\f\r\n" , $ this ->bytes_already_parsed );
19421929
19431930 // No attribute, just tag closer.
1944- if ( 0 === $ name_length || $ this ->bytes_already_parsed + $ name_length >= strlen ( $ this -> html ) ) {
1931+ if ( 0 === $ name_length || $ this ->bytes_already_parsed + $ name_length >= $ doc_length ) {
19451932 return false ;
19461933 }
19471934
19481935 $ attribute_start = $ this ->bytes_already_parsed ;
19491936 $ attribute_name = substr ( $ this ->html , $ attribute_start , $ name_length );
19501937 $ this ->bytes_already_parsed += $ name_length ;
1951- if ( $ this ->bytes_already_parsed >= strlen ( $ this -> html ) ) {
1938+ if ( $ this ->bytes_already_parsed >= $ doc_length ) {
19521939 $ this ->parser_state = self ::STATE_INCOMPLETE_INPUT ;
19531940
19541941 return false ;
19551942 }
19561943
19571944 $ this ->skip_whitespace ();
1958- if ( $ this ->bytes_already_parsed >= strlen ( $ this -> html ) ) {
1945+ if ( $ this ->bytes_already_parsed >= $ doc_length ) {
19591946 $ this ->parser_state = self ::STATE_INCOMPLETE_INPUT ;
19601947
19611948 return false ;
@@ -1965,7 +1952,7 @@ private function parse_next_attribute() {
19651952 if ( $ has_value ) {
19661953 ++$ this ->bytes_already_parsed ;
19671954 $ this ->skip_whitespace ();
1968- if ( $ this ->bytes_already_parsed >= strlen ( $ this -> html ) ) {
1955+ if ( $ this ->bytes_already_parsed >= $ doc_length ) {
19691956 $ this ->parser_state = self ::STATE_INCOMPLETE_INPUT ;
19701957
19711958 return false ;
@@ -1976,8 +1963,10 @@ private function parse_next_attribute() {
19761963 case '" ' :
19771964 $ quote = $ this ->html [ $ this ->bytes_already_parsed ];
19781965 $ value_start = $ this ->bytes_already_parsed + 1 ;
1979- $ value_length = strcspn ( $ this ->html , $ quote , $ value_start );
1980- $ attribute_end = $ value_start + $ value_length + 1 ;
1966+ $ end_quote_at = strpos ( $ this ->html , $ quote , $ value_start );
1967+ $ end_quote_at = false === $ end_quote_at ? $ doc_length : $ end_quote_at ;
1968+ $ value_length = $ end_quote_at - $ value_start ;
1969+ $ attribute_end = $ end_quote_at + 1 ;
19811970 $ this ->bytes_already_parsed = $ attribute_end ;
19821971 break ;
19831972
@@ -1993,7 +1982,7 @@ private function parse_next_attribute() {
19931982 $ attribute_end = $ attribute_start + $ name_length ;
19941983 }
19951984
1996- if ( $ attribute_end >= strlen ( $ this -> html ) ) {
1985+ if ( $ attribute_end >= $ doc_length ) {
19971986 $ this ->parser_state = self ::STATE_INCOMPLETE_INPUT ;
19981987
19991988 return false ;
@@ -2014,7 +2003,7 @@ private function parse_next_attribute() {
20142003 $ comparable_name = strtolower ( $ attribute_name );
20152004
20162005 // If an attribute is listed many times, only use the first declaration and ignore the rest.
2017- if ( ! array_key_exists ( $ comparable_name , $ this ->attributes ) ) {
2006+ if ( ! isset ( $ this ->attributes [ $ comparable_name ] ) ) {
20182007 $ this ->attributes [ $ comparable_name ] = new WP_HTML_Attribute_Token (
20192008 $ attribute_name ,
20202009 $ value_start ,
@@ -2038,7 +2027,7 @@ private function parse_next_attribute() {
20382027 $ duplicate_span = new WP_HTML_Span ( $ attribute_start , $ attribute_end - $ attribute_start );
20392028 if ( null === $ this ->duplicate_attributes ) {
20402029 $ this ->duplicate_attributes = array ( $ comparable_name => array ( $ duplicate_span ) );
2041- } elseif ( ! array_key_exists ( $ comparable_name , $ this ->duplicate_attributes ) ) {
2030+ } elseif ( ! isset ( $ this ->duplicate_attributes [ $ comparable_name ] ) ) {
20422031 $ this ->duplicate_attributes [ $ comparable_name ] = array ( $ duplicate_span );
20432032 } else {
20442033 $ this ->duplicate_attributes [ $ comparable_name ][] = $ duplicate_span ;
@@ -3110,14 +3099,12 @@ public function remove_attribute( $name ) {
31103099 );
31113100
31123101 // Removes any duplicated attributes if they were also present.
3113- if ( null !== $ this ->duplicate_attributes && array_key_exists ( $ name , $ this ->duplicate_attributes ) ) {
3114- foreach ( $ this ->duplicate_attributes [ $ name ] as $ attribute_token ) {
3115- $ this ->lexical_updates [] = new WP_HTML_Text_Replacement (
3116- $ attribute_token ->start ,
3117- $ attribute_token ->length ,
3118- ''
3119- );
3120- }
3102+ foreach ( $ this ->duplicate_attributes [ $ name ] ?? array () as $ attribute_token ) {
3103+ $ this ->lexical_updates [] = new WP_HTML_Text_Replacement (
3104+ $ attribute_token ->start ,
3105+ $ attribute_token ->length ,
3106+ ''
3107+ );
31213108 }
31223109
31233110 return true ;
@@ -3317,35 +3304,8 @@ private function matches() {
33173304 }
33183305
33193306 // Does the tag name match the requested tag name in a case-insensitive manner?
3320- if ( null !== $ this ->sought_tag_name ) {
3321- /*
3322- * String (byte) length lookup is fast. If they aren't the
3323- * same length then they can't be the same string values.
3324- */
3325- if ( strlen ( $ this ->sought_tag_name ) !== $ this ->tag_name_length ) {
3326- return false ;
3327- }
3328-
3329- /*
3330- * Check each character to determine if they are the same.
3331- * Defer calls to `strtoupper()` to avoid them when possible.
3332- * Calling `strcasecmp()` here tested slowed than comparing each
3333- * character, so unless benchmarks show otherwise, it should
3334- * not be used.
3335- *
3336- * It's expected that most of the time that this runs, a
3337- * lower-case tag name will be supplied and the input will
3338- * contain lower-case tag names, thus normally bypassing
3339- * the case comparison code.
3340- */
3341- for ( $ i = 0 ; $ i < $ this ->tag_name_length ; $ i ++ ) {
3342- $ html_char = $ this ->html [ $ this ->tag_name_starts_at + $ i ];
3343- $ tag_char = $ this ->sought_tag_name [ $ i ];
3344-
3345- if ( $ html_char !== $ tag_char && strtoupper ( $ html_char ) !== $ tag_char ) {
3346- return false ;
3347- }
3348- }
3307+ if ( isset ( $ this ->sought_tag_name ) && ( strlen ( $ this ->sought_tag_name ) !== $ this ->tag_name_length || 0 !== substr_compare ( $ this ->html , $ this ->sought_tag_name , $ this ->tag_name_starts_at , $ this ->tag_name_length , true ) ) ) { // Compare byte lengths first: substr_compare() with an explicit length only examines that many bytes, so a longer sought name sharing the same prefix would otherwise be accepted.
3308+ return false ;
33493309 }
33503310
33513311 if ( null !== $ this ->sought_class_name && ! $ this ->has_class ( $ this ->sought_class_name ) ) {
0 commit comments