From 9bbdab68efa4e5a995f775eea320de53d0522c17 Mon Sep 17 00:00:00 2001 From: Titouan Galopin Date: Sat, 6 Apr 2019 14:57:24 +0200 Subject: [PATCH] [DomCrawler] Improve Crawler HTML5 parser need detection --- src/Symfony/Component/DomCrawler/Crawler.php | 25 +++---- .../DomCrawler/Tests/AbstractCrawlerTest.php | 73 +++++++++---------- .../Tests/Html5ParserCrawlerTest.php | 14 +++- .../Tests/NativeParserCrawlerTest.php | 10 +-- 4 files changed, 59 insertions(+), 63 deletions(-) diff --git a/src/Symfony/Component/DomCrawler/Crawler.php b/src/Symfony/Component/DomCrawler/Crawler.php index 590462238b4be..4efbbf9b2d880 100644 --- a/src/Symfony/Component/DomCrawler/Crawler.php +++ b/src/Symfony/Component/DomCrawler/Crawler.php @@ -61,24 +61,15 @@ class Crawler implements \Countable, \IteratorAggregate private $html5Parser; /** - * @param mixed $node A Node to use as the base for the crawling - * @param string $uri The current URI - * @param string $baseHref The base href value - * @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser + * @param mixed $node A Node to use as the base for the crawling + * @param string $uri The current URI + * @param string $baseHref The base href value */ - public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null) + public function __construct($node = null, string $uri = null, string $baseHref = null) { $this->uri = $uri; $this->baseHref = $baseHref ?: $uri; - if ($useHtml5Parser && !class_exists(HTML5::class)) { - throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".'); - } - - if ($useHtml5Parser ?? class_exists(HTML5::class)) { - $this->html5Parser = new HTML5(['disable_html_ns' => true]); - } - $this->add($node); } @@ -198,6 +189,13 @@ public function addContent($content, $type = null) */ public function addHtmlContent($content, $charset = 'UTF-8') { + // Use HTML5 parser if the content is HTML5 and the library is available + if (!$this->html5Parser + && class_exists(HTML5::class) + && '' === strtolower(substr(ltrim($content), 0, 15))) { + $this->html5Parser = new HTML5(['disable_html_ns' => true]); + } + $dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset); $this->addDocument($dom); @@ -1219,6 +1217,7 @@ private function createSubCrawler($nodes) $crawler->isHtml = $this->isHtml; $crawler->document = $this->document; $crawler->namespaces = $this->namespaces; + $crawler->html5Parser = $this->html5Parser; return $crawler; } diff --git a/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php b/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php index e77cb8cdf87ae..bc9777235c11a 100644 --- a/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php +++ b/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php @@ -16,14 +16,12 @@ abstract class AbstractCrawlerTest extends TestCase { - /** - * @param mixed $node - * @param string|null $uri - * @param string|null $baseHref - * - * @return Crawler - */ - abstract public function createCrawler($node = null, string $uri = null, string $baseHref = null); + abstract public function getDoctype(): string; + + protected function createCrawler($node = null, string $uri = null, string $baseHref = null) + { + return new Crawler($node, $uri, $baseHref); + } public function testConstructor() { @@ -74,7 +72,7 @@ public function testAdd() $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNode'); $crawler = $this->createCrawler(); - $crawler->add('Foo'); + $crawler->add($this->getDoctype().'Foo'); $this->assertEquals('Foo', $crawler->filterXPath('//body')->text(), '->add() adds nodes from a string'); } @@ -94,13 +92,13 @@ public function testAddInvalidType() public function testAddMultipleDocumentNode() { $crawler = $this->createTestCrawler(); - $crawler->addHtmlContent('
', 'UTF-8'); + $crawler->addHtmlContent($this->getDoctype().'
', 'UTF-8'); } public function testAddHtmlContent() { $crawler = $this->createCrawler(); - $crawler->addHtmlContent('
', 'UTF-8'); + $crawler->addHtmlContent($this->getDoctype().'
', 'UTF-8'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addHtmlContent() adds nodes from an HTML string'); } @@ -108,8 +106,7 @@ public function testAddHtmlContent() public function testAddHtmlContentWithBaseTag() { $crawler = $this->createCrawler(); - - $crawler->addHtmlContent('', 'UTF-8'); + $crawler->addHtmlContent($this->getDoctype().'', 'UTF-8'); $this->assertEquals('http://symfony.com', $crawler->filterXPath('//base')->attr('href'), '->addHtmlContent() adds nodes from an HTML string'); $this->assertEquals('http://symfony.com/contact', $crawler->filterXPath('//a')->link()->getUri(), '->addHtmlContent() adds nodes from an HTML string'); @@ -121,7 +118,7 @@ public function testAddHtmlContentWithBaseTag() public function testAddHtmlContentCharset() { $crawler = $this->createCrawler(); - $crawler->addHtmlContent('
Tiếng Việt', 'UTF-8'); + $crawler->addHtmlContent($this->getDoctype().'
Tiếng Việt', 'UTF-8'); $this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text()); } @@ -129,7 +126,7 @@ public function testAddHtmlContentCharset() public function testAddHtmlContentInvalidBaseTag() { $crawler = $this->createCrawler(null, 'http://symfony.com'); - $crawler->addHtmlContent('', 'UTF-8'); + $crawler->addHtmlContent($this->getDoctype().'', 'UTF-8'); $this->assertEquals('http://symfony.com/contact', current($crawler->filterXPath('//a')->links())->getUri(), '->addHtmlContent() correctly handles a non-existent base tag href attribute'); } @@ -141,7 +138,7 @@ public function testAddHtmlContentCharsetGbk() { $crawler = $this->createCrawler(); //gbk encode of

中文

- $crawler->addHtmlContent(base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk'); + $crawler->addHtmlContent($this->getDoctype().base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk'); $this->assertEquals('中文', $crawler->filterXPath('//p')->text()); } @@ -149,7 +146,7 @@ public function testAddHtmlContentCharsetGbk() public function testAddXmlContent() { $crawler = $this->createCrawler(); - $crawler->addXmlContent('
', 'UTF-8'); + $crawler->addXmlContent($this->getDoctype().'
', 'UTF-8'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string'); } @@ -157,7 +154,7 @@ public function testAddXmlContent() public function testAddXmlContentCharset() { $crawler = $this->createCrawler(); - $crawler->addXmlContent('
Tiếng Việt
', 'UTF-8'); + $crawler->addXmlContent($this->getDoctype().'
Tiếng Việt
', 'UTF-8'); $this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text()); } @@ -165,23 +162,23 @@ public function testAddXmlContentCharset() public function testAddContent() { $crawler = $this->createCrawler(); - $crawler->addContent('
', 'text/html; charset=UTF-8'); + $crawler->addContent($this->getDoctype().'
', 'text/html; charset=UTF-8'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string'); $crawler = $this->createCrawler(); - $crawler->addContent('
', 'text/html; charset=UTF-8; dir=RTL'); + $crawler->addContent($this->getDoctype().'
', 'text/html; charset=UTF-8; dir=RTL'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string with extended content type'); $crawler = $this->createCrawler(); - $crawler->addContent('
'); + $crawler->addContent($this->getDoctype().'
'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() uses text/html as the default type'); $crawler = $this->createCrawler(); - $crawler->addContent('
', 'text/xml; charset=UTF-8'); + $crawler->addContent($this->getDoctype().'
', 'text/xml; charset=UTF-8'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string'); $crawler = $this->createCrawler(); - $crawler->addContent('
', 'text/xml'); + $crawler->addContent($this->getDoctype().'
', 'text/xml'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string'); $crawler = $this->createCrawler(); @@ -189,7 +186,7 @@ public function testAddContent() $this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml'); $crawler = $this->createCrawler(); - $crawler->addContent('中文'); + $crawler->addContent($this->getDoctype().'中文'); $this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset'); } @@ -199,7 +196,7 @@ public function testAddContent() public function testAddContentNonUtf8() { $crawler = $this->createCrawler(); - $crawler->addContent(iconv('UTF-8', 'SJIS', '日本語')); + $crawler->addContent(iconv('UTF-8', 'SJIS', $this->getDoctype().'日本語')); $this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag'); } @@ -314,7 +311,7 @@ public function testAttr() public function testMissingAttrValueIsNull() { $crawler = $this->createCrawler(); - $crawler->addContent('
', 'text/html; charset=UTF-8'); + $crawler->addContent($this->getDoctype().'
', 'text/html; charset=UTF-8'); $div = $crawler->filterXPath('//div'); $this->assertEquals('sample value', $div->attr('non-empty-attr'), '->attr() reads non-empty attributes correctly'); @@ -670,7 +667,6 @@ public function testSelectButton() public function testSelectButtonWithSingleQuotesInNameAttribute() { $html = <<<'HTML' -
@@ -683,7 +679,7 @@ public function testSelectButtonWithSingleQuotesInNameAttribute() HTML; - $crawler = $this->createCrawler($html); + $crawler = $this->createCrawler($this->getDoctype().$html); $this->assertCount(1, $crawler->selectButton('Click \'Here\'')); } @@ -691,7 +687,6 @@ public function testSelectButtonWithSingleQuotesInNameAttribute() public function testSelectButtonWithDoubleQuotesInNameAttribute() { $html = <<<'HTML' -
@@ -704,7 +699,7 @@ public function testSelectButtonWithDoubleQuotesInNameAttribute() HTML; - $crawler = $this->createCrawler($html); + $crawler = $this->createCrawler($this->getDoctype().$html); $this->assertCount(1, $crawler->selectButton('Click "Here"')); } @@ -763,7 +758,6 @@ public function testImage() public function testSelectLinkAndLinkFiltered() { $html = <<<'HTML' -
@@ -776,7 +770,7 @@ public function testSelectLinkAndLinkFiltered() HTML; - $crawler = $this->createCrawler($html); + $crawler = $this->createCrawler($this->getDoctype().$html); $filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']"); $this->assertCount(0, $filtered->selectLink('Login')); @@ -793,7 +787,7 @@ public function testSelectLinkAndLinkFiltered() public function testChaining() { - $crawler = $this->createCrawler('
'); + $crawler = $this->createCrawler($this->getDoctype().'
'); $this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name')); } @@ -965,7 +959,6 @@ public function testChildren() public function testFilteredChildren() { $html = <<<'HTML' -
@@ -981,7 +974,7 @@ public function testFilteredChildren() HTML; - $crawler = $this->createCrawler($html); + $crawler = $this->createCrawler($this->getDoctype().$html); $foo = $crawler->filter('#foo'); $this->assertEquals(3, $foo->children()->count()); @@ -1018,7 +1011,7 @@ public function testParents() */ public function testBaseTag($baseValue, $linkValue, $expectedUri, $currentUri = null, $description = '') { - $crawler = $this->createCrawler('', $currentUri); + $crawler = $this->createCrawler($this->getDoctype().'', $currentUri); $this->assertEquals($expectedUri, $crawler->filterXPath('//a')->link()->getUri(), $description); } @@ -1038,7 +1031,7 @@ public function getBaseTagData() */ public function testBaseTagWithForm($baseValue, $actionValue, $expectedUri, $currentUri = null, $description = null) { - $crawler = $this->createCrawler('