merged branch bronze1man/pr-2.2-crawler (PR #9074) · symfony/symfony@f73aa37 · GitHub
[go: up one dir, main page]

Skip to content

Commit f73aa37

Browse files
committed
merged branch bronze1man/pr-2.2-crawler (PR #9074)
This PR was squashed before being merged into the 2.2 branch (closes #9074).

Discussion
----------

[DomCrawler]Crawler guess charset from html

| Q             | A
| ------------- | ---
| Bug fix?      | no
| New feature?  | yes
| BC breaks?    | no
| Deprecations? | no
| Tests pass?   | yes
| Fixed tickets | #9061
| License       | MIT
| Doc PR        | n/a

Commits
-------

e5282e8 [DomCrawler]Crawler guess charset from html
2 parents 8552aa4 + e5282e8 commit f73aa37

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,19 +92,28 @@ public function addContent($content, $type = null)
9292
}
9393

9494
// DOM only for HTML/XML content
95-
if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
95+
if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
9696
return null;
9797
}
9898

99-
$charset = 'ISO-8859-1';
99+
$charset = null;
100100
if (false !== $pos = strpos($type, 'charset=')) {
101101
$charset = substr($type, $pos + 8);
102102
if (false !== $pos = strpos($charset, ';')) {
103103
$charset = substr($charset, 0, $pos);
104104
}
105105
}
106106

107-
if ('x' === $matches[1]) {
107+
if (null === $charset &&
108+
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) {
109+
$charset = $matches[1];
110+
}
111+
112+
if (null === $charset) {
113+
$charset = 'ISO-8859-1';
114+
}
115+
116+
if ('x' === $xmlMatches[1]) {
108117
$this->addXmlContent($content, $charset);
109118
} else {
110119
$this->addHtmlContent($content, $charset);

src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ public function testAddContent()
207207
$crawler = new Crawler();
208208
$crawler->addContent('foo bar', 'text/plain');
209209
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
210+
211+
$crawler = new Crawler();
212+
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
213+
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
210214
}
211215

212216
/**

0 commit comments

Comments
 (0)
0