8000 [DomCrawler] [3.0] Abstract URI logic and crawl images by valeriangalliat · Pull Request #13650 · symfony/symfony · GitHub
[go: up one dir, main page]

Skip to content

[DomCrawler] [3.0] Abstract URI logic and crawl images #13650

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension 10000

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
[DomCrawler] Abstract URI-related logic
All the URI parsing logic is externalized in the AbstractUriElement
class, implementing the UriElementInterface interface.

The AbstractUriElement class has two abstract methods:

* setNode: validate the DOMElement node according to the concrete class
  rules, and set $this->node.
* getRawUri: get the raw URI from $this->node.

The Link class now extends AbstractUriElement.

This refactor is desirable for #12429.
  • Loading branch information
valeriangalliat committed Feb 20, 2015
commit 5259d279169ed4aef17dbe5087d24c9fc8d8a4a7
201 changes: 201 additions & 0 deletions src/Symfony/Component/DomCrawler/AbstractUriElement.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
<?php

/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Symfony\Component\DomCrawler;

/**
 * Any HTML element that can link to a URI.
 *
 * Concrete subclasses provide the node validation (setNode()) and the raw,
 * unresolved URI value (getRawUri()); this class turns that raw value into
 * an absolute URI by resolving it against the URI of the embedding page.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 *
 * @api
 */
abstract class AbstractUriElement implements UriElementInterface
{
    /**
     * @var \DOMElement
     */
    protected $node;

    /**
     * @var string The method to use for the element URI
     */
    protected $method;

    /**
     * @var string The URI of the page where the element is embedded (or the base href)
     */
    protected $currentUri;

    /**
     * Constructor.
     *
     * @param \DOMElement $node       A \DOMElement instance
     * @param string      $currentUri The URI of the page where the element is embedded (or the base href)
     * @param string      $method     The method to use for the element URI (get by default)
     *
     * @throws \InvalidArgumentException if the current URI does not start with "http" or "file"
     *                                   (case-insensitive); implementations of setNode() may
     *                                   additionally reject the node
     *
     * @api
     */
    public function __construct(\DOMElement $node, $currentUri, $method = 'GET')
    {
        // Only the first four characters are inspected, so "https://..." is accepted as well.
        if (!in_array(strtolower(substr($currentUri, 0, 4)), array('http', 'file'), true)) {
            throw new \InvalidArgumentException(sprintf('Current URI must be an absolute URL ("%s").', $currentUri));
        }

        $this->setNode($node);
        // A falsy method (null, '') is stored as null instead of being upper-cased.
        $this->method = $method ? strtoupper($method) : null;
        $this->currentUri = $currentUri;
    }

    /**
     * Returns the node stored by setNode().
     *
     * @return \DOMElement
     */
    public function getNode()
    {
        return $this->node;
    }

    /**
     * Returns the upper-cased method given to the constructor, or null when none was given.
     *
     * @return string|null
     */
    public function getMethod()
    {
        return $this->method;
    }

    /**
     * Resolves the raw URI of the element against the current URI.
     *
     * The raw URI is trimmed first. A URI with a scheme is returned unchanged;
     * an empty URI yields the current URI; anchors, query strings,
     * scheme-relative URIs, absolute paths and relative paths are each
     * combined with the relevant part of the current URI.
     *
     * @return string
     */
    public function getUri()
    {
        $uri = trim($this->getRawUri());

        // absolute URL? parse_url() gives null when there is no scheme. It gives
        // false for seriously malformed input, which is deliberately returned
        // untouched here as well.
        if (null !== parse_url($uri, PHP_URL_SCHEME)) {
            return $uri;
        }

        // empty URI: compared against '' on purpose, a truthiness test would
        // also swallow the valid relative reference "0"
        if ('' === $uri) {
            return $this->currentUri;
        }

        // an anchor: keep the query string of the current URI, replace its fragment
        if ('#' === $uri[0]) {
            return $this->cleanupAnchor($this->currentUri).$uri;
        }

        $baseUri = $this->cleanupUri($this->currentUri);

        // a query string: replace both query and fragment of the current URI
        if ('?' === $uri[0]) {
            return $baseUri.$uri;
        }

        // absolute URL with relative schema: reuse only the "scheme:" prefix
        if (0 === strpos($uri, '//')) {
            return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
        }

        // from here on only "scheme://authority" of the current URI is needed
        $baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);

        // absolute path
        if ('/' === $uri[0]) {
            return $baseUri.$uri;
        }

        // relative path: parse_url() returns null when the remainder has no
        // path component (e.g. "?query" only), so normalise to a string before
        // handing it to the string functions
        $path = (string) parse_url(substr($this->currentUri, strlen($baseUri)), PHP_URL_PATH);
        $lastSlash = strrpos($path, '/');
        $directory = false === $lastSlash ? '' : substr($path, 0, $lastSlash);
        $path = $this->canonicalizePath($directory.'/'.$uri);

        // canonicalizePath() may drop the leading slash (e.g. for "/../foo")
        return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Returns raw URI data.
     *
     * @return string
     */
    abstract protected function getRawUri();

    /**
     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
     *
     * "." segments are dropped and each ".." segment removes the segment
     * collected before it.
     *
     * @param string $path URI path
     *
     * @return string
     */
    protected function canonicalizePath($path)
    {
        if ('' === $path || '/' === $path) {
            return $path;
        }

        // A trailing "." or ".." denotes a directory: append a slash so the
        // result ends with one after the dot segment is removed.
        if ('.' === substr($path, -1)) {
            $path .= '/';
        }

        $output = array();

        foreach (explode('/', $path) as $segment) {
            if ('..' === $segment) {
                array_pop($output);
            } elseif ('.' !== $segment) {
                $output[] = $segment;
            }
        }

        return implode('/', $output);
    }

    /**
     * Sets current \DOMElement instance.
     *
     * @param \DOMElement $node A \DOMElement instance
     */
    abstract protected function setNode(\DOMElement $node);

    /**
     * Removes the query string and the anchor from the given uri.
     *
     * @param string $uri The uri to clean
     *
     * @return string
     */
    private function cleanupUri($uri)
    {
        return $this->cleanupQuery($this->cleanupAnchor($uri));
    }

    /**
     * Removes the query string from the uri.
     *
     * Everything from the first "?" onwards is cut off.
     *
     * @param string $uri
     *
     * @return string
     */
    private function cleanupQuery($uri)
    {
        $pos = strpos($uri, '?');

        return false === $pos ? $uri : substr($uri, 0, $pos);
    }

    /**
     * Removes the anchor from the uri.
     *
     * Everything from the first "#" onwards is cut off.
     *
     * @param string $uri
     *
     * @return string
     */
    private function cleanupAnchor($uri)
    {
        $pos = strpos($uri, '#');

        return false === $pos ? $uri : substr($uri, 0, $pos);
    }
}
5 changes: 5 additions & 0 deletions src/Symfony/Component/DomCrawler/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
CHANGELOG
=========

2.7.0
-----

* All the URI parsing logic has been abstracted in the `AbstractUriElement` class. The `Link` class is now a child of `AbstractUriElement` which implements the new `UriElementInterface`, describing the common `getNode`, `getMethod` and `getUri` methods.

2.5.0
-----

Expand Down
Loading
0