bug #45256 [DomCrawler] ignore bad charsets (nicolas-grekas) · symfony/symfony@2119155 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2119155

Browse files
bug #45256 [DomCrawler] ignore bad charsets (nicolas-grekas)
This PR was merged into the 4.4 branch.

Discussion
----------

[DomCrawler] ignore bad charsets

| Q             | A
| ------------- | ---
| Branch?       | 4.4
| Bug fix?      | yes
| New feature?  | no
| Deprecations? | no
| Tickets       | Fix #42255
| License       | MIT
| Doc PR        | -

Commits
-------

7802c1f [DomCrawler] ignore bad charsets
2 parents b7da2a0 + 7802c1f commit 2119155

File tree

2 files changed

+11
-14
lines changed

2 files changed

+11
-14
lines changed

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -156,24 +156,17 @@ public function addContent($content, $type = null)
156156
return;
157157
}
158158

159-
$charset = null;
160-
if (false !== $pos = stripos($type, 'charset=')) {
161-
$charset = substr($type, $pos + 8);
162-
if (false !== $pos = strpos($charset, ';')) {
163-
$charset = substr($charset, 0, $pos);
164-
}
165-
}
159+
$charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';
166160

167161
// http://www.w3.org/TR/encoding/#encodings
168162
// http://www.w3.org/TR/REC-xml/#NT-EncName
169-
if (null === $charset &&
170-
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
171-
$charset = $matches[1];
172-
}
163+
$content = preg_replace_callback('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', function ($m) use (&$charset) {
164+
if ('charset=' === $this->convertToHtmlEntities('charset=', $m[2])) {
165+
$charset = $m[2];
166+
}
173167

174-
if (null === $charset) {
175-
$charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';
176-
}
168+
return $m[1].$charset;
169+
}, $content, 1);
177170

178171
if ('x' === $xmlMatches[1]) {
179172
$this->addXmlContent($content, $charset);

src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,10 @@ public function testAddContent()
187187
$crawler = $this->createCrawler();
188188
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
189189
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
190+
191+
$crawler = $this->createCrawler();
192+
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=unicode" /><div class="foo"></html></html>');
193+
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() ignores bad charset');
190194
}
191195

192196
/**

0 commit comments

Comments
 (0)
0