1111
1212namespace Symfony \Component \DomCrawler ;
1313
14- use Masterminds \HTML5 ;
1514use Symfony \Component \CssSelector \CssSelectorConverter ;
1615
1716/**
@@ -53,23 +52,15 @@ class Crawler implements \Countable, \IteratorAggregate
5352 */
5453 private bool $ isHtml = true ;
5554
56- private ?HTML5 $ html5Parser = null ;
57-
5855 /**
5956 * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
6057 */
6158 public function __construct (
6259 \DOMNodeList |\DOMNode |array |string |null $ node = null ,
6360 protected ?string $ uri = null ,
6461 ?string $ baseHref = null ,
65- private bool $ useHtml5Parser = true ,
6662 ) {
67- if (\PHP_VERSION_ID >= 80400 && !$ useHtml5Parser ) {
68- trigger_deprecation ('symfony/dom-crawler ' , '7.4 ' , 'Disabling HTML5 parsing is deprecated. Symfony 8 will unconditionally use the native HTML5 parser. ' );
69- }
70-
7163 $ this ->baseHref = $ baseHref ?: $ uri ;
72- $ this ->html5Parser = \PHP_VERSION_ID < 80400 && $ useHtml5Parser ? new HTML5 (['disable_html_ns ' => true ]) : null ;
7364 $ this ->cachedNamespaces = new \ArrayObject ();
7465
7566 $ this ->add ($ node );
@@ -175,7 +166,7 @@ public function addContent(string $content, ?string $type = null): void
175166 */
176167 public function addHtmlContent (string $ content , string $ charset = 'UTF-8 ' ): void
177168 {
178- $ dom = $ this ->parseHtmlString ($ content , $ charset );
169+ $ dom = $ this ->parseXhtml ($ content , $ charset );
179170 $ this ->addDocument ($ dom );
180171
181172 $ base = $ this ->filterRelativeXPath ('descendant-or-self::base ' )->extract (['href ' ]);
@@ -609,10 +600,6 @@ public function html(?string $default = null): string
609600 $ node = $ this ->getNode (0 );
610601 $ owner = $ node ->ownerDocument ;
611602
612- if ($ this ->html5Parser && '<!DOCTYPE html> ' === $ owner ->saveXML ($ owner ->childNodes [0 ])) {
613- $ owner = $ this ->html5Parser ;
614- }
615-
616603 $ html = '' ;
617604 foreach ($ node ->childNodes as $ child ) {
618605 $ html .= $ owner ->saveHTML ($ child );
@@ -630,10 +617,6 @@ public function outerHtml(): string
630617 $ node = $ this ->getNode (0 );
631618 $ owner = $ node ->ownerDocument ;
632619
633- if ($ this ->html5Parser && '<!DOCTYPE html> ' === $ owner ->saveXML ($ owner ->childNodes [0 ])) {
634- $ owner = $ this ->html5Parser ;
635- }
636-
637620 return $ owner ->saveHTML ($ node );
638621 }
639622
@@ -1064,48 +1047,8 @@ protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'):
10641047 return $ nodes ;
10651048 }
10661049
1067- private function parseHtml5 (string $ htmlContent , string $ charset = 'UTF-8 ' ): \DOMDocument
1068- {
1069- if (!$ this ->supportsEncoding ($ charset )) {
1070- $ htmlContent = $ this ->convertToHtmlEntities ($ htmlContent , $ charset );
1071- $ charset = 'UTF-8 ' ;
1072- }
1073-
1074- return $ this ->html5Parser ->parse ($ htmlContent , ['encoding ' => $ charset ]);
1075- }
1076-
1077- private function supportsEncoding (string $ encoding ): bool
1078- {
1079- try {
1080- return '' === @mb_convert_encoding ('' , $ encoding , 'UTF-8 ' );
1081- } catch (\Throwable $ e ) {
1082- return false ;
1083- }
1084- }
1085-
10861050 private function parseXhtml (string $ htmlContent , string $ charset = 'UTF-8 ' ): \DOMDocument
10871051 {
1088- if (\PHP_VERSION_ID < 80400 || !$ this ->useHtml5Parser ) {
1089- if ('UTF-8 ' === $ charset && preg_match ('//u ' , $ htmlContent )) {
1090- $ htmlContent = '<?xml encoding="UTF-8"> ' .$ htmlContent ;
1091- } else {
1092- $ htmlContent = $ this ->convertToHtmlEntities ($ htmlContent , $ charset );
1093- }
1094-
1095- $ internalErrors = libxml_use_internal_errors (true );
1096-
1097- $ dom = new \DOMDocument ('1.0 ' , $ charset );
1098- $ dom ->validateOnParse = true ;
1099-
1100- if ('' !== trim ($ htmlContent )) {
1101- @$ dom ->loadHTML ($ htmlContent );
1102- }
1103-
1104- libxml_use_internal_errors ($ internalErrors );
1105-
1106- return $ dom ;
1107- }
1108-
11091052 $ document = @\Dom \HTMLDocument::createFromString ($ htmlContent , \Dom \HTML_NO_DEFAULT_NS , $ charset );
11101053 $ htmlContent = $ document ->saveXml ();
11111054 $ charset = $ document ->inputEncoding ;
@@ -1202,7 +1145,6 @@ private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes
12021145 $ crawler ->document = $ this ->document ;
12031146 $ crawler ->namespaces = $ this ->namespaces ;
12041147 $ crawler ->cachedNamespaces = $ this ->cachedNamespaces ;
1205- $ crawler ->html5Parser = $ this ->html5Parser ;
12061148
12071149 return $ crawler ;
12081150 }
@@ -1219,39 +1161,6 @@ private function createCssSelectorConverter(): CssSelectorConverter
12191161 return new CssSelectorConverter ($ this ->isHtml );
12201162 }
12211163
1222- /**
1223- * Parse string into DOMDocument object using HTML5 parser if the content is HTML5 and the library is available.
1224- * Use libxml parser otherwise.
1225- */
1226- private function parseHtmlString (string $ content , string $ charset ): \DOMDocument
1227- {
1228- if ($ this ->canParseHtml5String ($ content )) {
1229- return $ this ->parseHtml5 ($ content , $ charset );
1230- }
1231-
1232- return $ this ->parseXhtml ($ content , $ charset );
1233- }
1234-
1235- private function canParseHtml5String (string $ content ): bool
1236- {
1237- if (!$ this ->html5Parser ) {
1238- return false ;
1239- }
1240-
1241- if (false === $ pos = stripos ($ content , '<!doctype html> ' )) {
1242- return false ;
1243- }
1244-
1245- $ header = substr ($ content , 0 , $ pos );
1246-
1247- return '' === $ header || $ this ->isValidHtml5Heading ($ header );
1248- }
1249-
1250- private function isValidHtml5Heading (string $ heading ): bool
1251- {
1252- return 1 === preg_match ('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u ' , $ heading );
1253- }
1254-
12551164 private function normalizeWhitespace (string $ string ): string
12561165 {
12571166 return trim (preg_replace ("/(?:[ \n\r\t\x0C]{2,}+|[ \n\r\t\x0C])/ " , ' ' , $ string ), " \n\r\t\x0C" );
0 commit comments