8000 feature #17585 [DomCrawler] Abstract URI logic and crawl images (val… · symfony/symfony@1183aca · GitHub
[go: up one dir, main page]

Skip to content

Commit 1183aca

Browse files
committed
feature #17585 [DomCrawler] Abstract URI logic and crawl images (valeriangalliat)
This PR was squashed before being merged into the 3.1-dev branch (closes #17585). Discussion ---------- [DomCrawler] Abstract URI logic and crawl images | Q | A | ------------- | --- | Bug fix? | no | New feature? | yes | BC breaks? | no | Deprecations? | no | Tests pass? | yes | Fixed tickets | #12429 | License | MIT | Doc PR | symfony/symfony-docs#4971 This is a backward-compatible version of #13620, and a rebase of #13649 on current `master`. Commits ------- 1553b07 [DomCrawler] Abstract URI logic and crawl images
2 parents ba25521 + 1553b07 commit 1183aca

File tree

7 files changed

+395
-193
lines changed

7 files changed

+395
-193
lines changed
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\DomCrawler;
13+
14+
/**
15+
* Any HTML element that can link to a URI.
16+
*
17+
* @author Fabien Potencier <fabien@symfony.com>
18+
*/
19+
abstract class AbstractUriElement
20+
{
21+
/**
22+
* @var \DOMElement
23+
*/
24+
protected $node;
25+
26+
/**
27+
* @var string The method to use for the element
28+
*/
29+
protected $method;
30+
31+
/**
32+
* @var string The URI of the page where the element is embedded (or the base href)
33+
*/
34+
protected $currentUri;
35+
36+
/**
37+
* @param \DOMElement $node A \DOMElement instance
38+
* @param string $currentUri The URI of the page where the link is embedded (or the base href)
39+
* @param string $method The method to use for the link (get by default)
40+
*
41+
* @throws \InvalidArgumentException if the current URI is not an absolute URL (it must start with "http" or "file")
42+
*/
43+
public function __construct(\DOMElement $node, $currentUri, $method = 'GET')
44+
{
45+
if (!in_array(strtolower(substr($currentUri, 0, 4)), array('http', 'file'))) {
46+
throw new \InvalidArgumentException(sprintf('Current URI must be an absolute URL ("%s").', $currentUri));
47+
}
48+
49+
$this->setNode($node);
50+
$this->method = $method ? strtoupper($method) : null;
51+
$this->currentUri = $currentUri;
52+
}
53+
54+
/**
55+
* Gets the node associated with this element.
56+
*
57+
* @return \DOMElement A \DOMElement instance
58+
*/
59+
public function getNode()
60+
{
61+
return $this->node;
62+
}
63+
64+
/**
65+
* Gets the method associated with this element.
66+
*
67+
* @return string The method
68+
*/
69+
public function getMethod()
70+
{
71+
return $this->method;
72+
}
73+
74+
/**
75+
* Gets the absolute URI associated with this element, resolved against the current URI.
76+
*
77+
* @return string The URI
78+
*/
79+
public function getUri()
80+
{
81+
$uri = trim($this->getRawUri());
82+
83+
// absolute URL?
84+
if (null !== parse_url($uri, PHP_URL_SCHEME)) {
85+
return $uri;
86+
}
87+
88+
// empty URI
89+
if (!$uri) {
90+
return $this->currentUri;
91+
}
92+
93+
// an anchor
94+
if ('#' === $uri[0]) {
95+
return $this->cleanupAnchor($this->currentUri).$uri;
96+
}
97+
98+
$baseUri = $this->cleanupUri($this->currentUri);
99+
100+
if ('?' === $uri[0]) {
101+
return $baseUri.$uri;
102+
}
103+
104+
// absolute URL with relative schema
105+
if (0 === strpos($uri, '//')) {
106+
return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
107+
}
108+
109+
$baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);
110+
111+
// absolute path
112+
if ('/' === $uri[0]) {
113+
return $baseUri.$uri;
114+
}
115+
116+
// relative path
117+
$path = parse_url(substr($this->currentUri, strlen($baseUri)), PHP_URL_PATH);
118+
$path = $this->canonicalizePath(substr($path, 0, strrpos($path, '/')).'/'.$uri);
119+
120+
return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
121+
}
122+
123+
/**
124+
* Returns raw URI data.
125+
*
126+
* @return string
127+
*/
128+
abstract protected function getRawUri();
129+
130+
/**
131+
* Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
132+
*
133+
* @param string $path URI path
134+
*
135+
* @return string
136+
*/
137+
protected function canonicalizePath($path)
138+
{
139+
if ('' === $path || '/' === $path) {
140+
return $path;
141+
}
142+
143+
if ('.' === substr($path, -1)) {
144+
$path .= '/';
145+
}
146+
147+
$output = array();
148+
149+
foreach (explode('/', $path) as $segment) {
150+
if ('..' === $segment) {
151+
array_pop($output);
152+
} elseif ('.' !== $segment) {
153+
$output[] = $segment;
154+
}
155+
}
156+
157+
return implode('/', $output);
158+
}
159+
160+
/**
161+
* Sets current \DOMElement instance.
162+
*
163+
* @param \DOMElement $node A \DOMElement instance
164+
*
165+
* @throws \LogicException If the given node is not of the element type supported by the concrete class
166+
*/
167+
abstract protected function setNode(\DOMElement $node);
168+
169+
/**
170+
* Removes the query string and the anchor from the given uri.
171+
*
172+
* @param string $uri The uri to clean
173+
*
174+
* @return string
175+
*/
176+
private function cleanupUri($uri)
177+
{
178+
return $this->cleanupQuery($this->cleanupAnchor($uri));
179+
}
180+
181+
/**
182+
* Remove the query string from the uri.
183+
*
184+
* @param string $uri
185+
*
186+
* @return string
187+
*/
188+
private function cleanupQuery($uri)
189+
{
190+
if (false !== $pos = strpos($uri, '?')) {
191+
return substr($uri, 0, $pos);
192+
}
193+
194+
return $uri;
195+
}
196+
197+
/**
198+
* Remove the anchor from the uri.
199+
*
200+
* @param string $uri
201+
*
202+
* @return string
203+
*/
204+
private function cleanupAnchor($uri)
205+
{
206+
if (false !== $pos = strpos($uri, '#')) {
207+
return substr($uri, 0, $pos);
208+
}
209+
210+
return $uri;
211+
}
212+
}

src/Symfony/Component/DomCrawler/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
CHANGELOG
22
=========
33

4+
3.1.0
5+
-----
6+
7+
* All the URI parsing logic has been abstracted in the `AbstractUriElement` class. The `Link` class is now a child of `AbstractUriElement` which implements the new `UriElementInterface`, describing the common `getNode`, `getMethod` and `getUri` methods.
8+
* Added an `Image` class to crawl images and parse their `src` attribute, and `selectImage`, `image`, `images` methods in `Crawler`, the image version of the equivalent `link` methods.
9+
410
2.5.0
511
-----
612

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ class Crawler implements \Countable, \IteratorAggregate
5858
private $isHtml = true;
5959

6060
/**
61-
* Constructor.
62-
*
6361
* @param mixed $node A Node to use as the base for the crawling
6462
* @param string $currentUri The current URI
6563
* @param string $baseHref The base href value
@@ -668,6 +666,20 @@ public function selectLink($value)
668666
return $this->filterRelativeXPath($xpath);
669667
}
670668

669+
/**
670+
* Selects images by alt value.
671+
*
672+
* @param string $value The image alt
673+
*
674+
* @return Crawler A new instance of Crawler with the filtered list of nodes
675+
*/
676+
public function selectImage($value)
677+
{
678+
$xpath = sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', static::xpathLiteral($value));
679+
680+
return $this->filterRelativeXPath($xpath);
681+
}
682+
671683
/**
672684
* Selects a button by name or alt value for images.
673685
*
@@ -730,6 +742,47 @@ public function links()
730742
return $links;
731743
}
732744

745+
/**
746+
* Returns an Image object for the first node in the list.
747+
*
748+
* @return Image An Image instance
749+
*
750+
* @throws \InvalidArgumentException If the current node list is empty
751+
*/
752+
public function image()
753+
{
754+
if (!count($this)) {
755+
throw new \InvalidArgumentException('The current node list is empty.');
756+
}
757+
758+
$node = $this->getNode(0);
759+
760+
if (!$node instanceof \DOMElement) {
761+
throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node)));
762+
}
763+
764+
return new Image($node, $this->baseHref);
765+
}
766+
767+
/**
768+
* Returns an array of Image objects for the nodes in the list.
769+
*
770+
* @return Image[] An array of Image instances
771+
*/
772+
public function images()
773+
{
774+
$images = array();
775+
foreach ($this as $node) {
776+
if (!$node instanceof \DOMElement) {
777+
throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($node)));
778+
}
779+
780+
$images[] = new Image($node, $this->baseHref);
781+
}
782+
783+
return $images;
784+
}
785+
733786
/**
734787
* Returns a Form object for the first node in the list.
735788
*
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\DomCrawler;
13+
14+
/**
15+
* Image represents an HTML image (an HTML img tag).
16+
*/
17+
class Image extends AbstractUriElement
18+
{
19+
public function __construct(\DOMElement $node, $currentUri)
20+
{
21+
parent::__construct($node, $currentUri, 'GET');
22+
}
23+
24+
protected function getRawUri()
25+
{
26+
return $this->node->getAttribute('src');
27+
}
28+
29+
protected function setNode(\DOMElement $node)
30+
{
31+
if ('img' !== $node->nodeName) {
32+
throw new \LogicException(sprintf('Unable to visualize a "%s" tag.', $node->nodeName));
33+
}
34+
35+
$this->node = $node;
36+
}
37+
}

0 commit comments

Comments
 (0)
0