2026-02-25 06:59:34 +00:00
< ? php
/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
namespace Symfony\Component\DomCrawler ;
use Symfony\Component\CssSelector\CssSelectorConverter ;
/**
 * Crawler eases navigation of a list of \DOMNode objects.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 *
 * @implements \IteratorAggregate<int, \DOMNode>
 */
class Crawler implements \Countable , \IteratorAggregate
{
/**
 * Namespace prefix used by default with XPath and CSS expressions.
 */
private string $defaultNamespacePrefix = 'default';

/**
 * Manually registered namespaces.
 *
 * @var array<string, string>
 */
private array $namespaces = [];

/**
 * Cached namespaces.
 *
 * @var \ArrayObject<string, string|null>
 */
private \ArrayObject $cachedNamespaces;

/**
 * Base href; initialised by the constructor.
 */
private ?string $baseHref;

/**
 * Document owning the nodes held by this crawler (null while the crawler is empty).
 */
private ?\DOMDocument $document = null;

/**
 * @var list<\DOMNode>
 */
private array $nodes = [];

/**
 * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
 */
private bool $isHtml = true;
2026-02-25 06:59:34 +00:00
/**
 * Builds a crawler and seeds it with the given node(s) or markup.
 *
 * When $baseHref is null or otherwise falsy, $uri is used as the base href.
 *
 * @param \DOMNodeList<\DOMNode>|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
 */
public function __construct(
    \DOMNodeList|\DOMNode|array|string|null $node = null,
    protected ?string $uri = null,
    ?string $baseHref = null,
) {
    $this->cachedNamespaces = new \ArrayObject();
    $this->baseHref = $baseHref ? $baseHref : $uri;

    $this->add($node);
}
/**
 * Gives back the URI this crawler was created with.
 *
 * @return string|null The URI, or null when none was provided
 */
public function getUri(): ?string
{
    return $this->uri;
}
/**
 * Gives back the base href currently stored on the crawler.
 *
 * @return string|null The base href, or null when none is known
 */
public function getBaseHref(): ?string
{
    return $this->baseHref;
}
/**
 * Empties the crawler: drops every node, forgets the owning document
 * and starts a fresh namespace cache.
 */
public function clear(): void
{
    $this->cachedNamespaces = new \ArrayObject();
    $this->document = null;
    $this->nodes = [];
}
/**
 * Adds a node to the current list of nodes.
 *
 * Dispatches to the specialised add*() method matching the argument type;
 * a null argument is ignored.
 *
 * @param \DOMNodeList<\DOMNode>|\DOMNode|\DOMNode[]|string|null $node
 */
public function add(\DOMNodeList|\DOMNode|array|string|null $node): void
{
    match (true) {
        $node instanceof \DOMNodeList => $this->addNodeList($node),
        $node instanceof \DOMNode => $this->addNode($node),
        \is_array($node) => $this->addNodes($node),
        \is_string($node) => $this->addContent($node),
        default => null,
    };
}
/**
 * Adds HTML/XML content.
 *
 * If the charset is not set via the content type, it is assumed to be UTF-8,
 * or ISO-8859-1 as a fallback, which is the default charset defined by the
 * HTTP 1.1 specification.
 *
 * When $type is empty it is guessed from the content: a leading "<?xml"
 * selects "application/xml", anything else "text/html". Types that contain
 * neither "xml" nor "html" (compared case-insensitively) are ignored.
 *
 * @param string      $content The markup to parse
 * @param string|null $type    The content type, e.g. "text/html; charset=utf-8"
 */
public function addContent(string $content, ?string $type = null): void
{
    if (!$type) {
        $type = str_starts_with($content, '<?xml') ? 'application/xml' : 'text/html';
    }

    // DOM only for HTML/XML content
    if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
        return;
    }

    // Valid UTF-8 content is treated as UTF-8, anything else as ISO-8859-1.
    $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';

    // http://www.w3.org/TR/encoding/#encodings
    // http://www.w3.org/TR/REC-xml/#NT-EncName
    // Only the first charset declaration found in the content is rewritten.
    // The declared charset is adopted when convertToHtmlEntities() hands the probe
    // string back unchanged (presumably meaning the encoding is usable; confirm there).
    // A null result (PCRE failure) leaves the content untouched.
    $content = preg_replace_callback('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', function ($m) use (&$charset) {
        if ('charset=' === $this->convertToHtmlEntities('charset=', $m[2])) {
            $charset = $m[2];
        }

        return $m[1].$charset;
    }, $content, 1) ?? $content;

    // The pattern is case-insensitive, so the captured marker must be
    // lower-cased before comparing ("application/XML" is still XML).
    if ('x' === strtolower($xmlMatches[1])) {
        $this->addXmlContent($content, $charset);
    } else {
        $this->addHtmlContent($content, $charset);
    }
}
/**
 * Adds an HTML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * After parsing, the first non-empty <base href> found becomes the base href:
 * it is resolved against the current base href through a Link when one is
 * already set, and stored as-is otherwise.
 */
public function addHtmlContent(string $content, string $charset = 'UTF-8'): void
{
    $dom = $this->parseHtml5($content, $charset);
    $this->addDocument($dom);

    $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
    $baseHref = current($hrefs);

    if (!\count($hrefs) || !$baseHref) {
        return;
    }

    if (!$this->baseHref) {
        $this->baseHref = $baseHref;

        return;
    }

    // Resolve the document's <base href> relative to the base href we already have.
    $anchor = $dom->createElement('a');
    $anchor->setAttribute('href', $baseHref);
    $resolved = new Link($anchor, $this->baseHref);
    $this->baseHref = $resolved->getUri();
}
/**
 * Adds an XML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * Content made only of whitespace is not parsed; the crawler is flagged as
 * holding XML (not HTML) in every case.
 *
 * @param int $options Bitwise OR of the libxml option constants
 *                     LIBXML_PARSEHUGE is dangerous, see
 *                     http://symfony.com/blog/security-release-symfony-2-0-17-released
 */
public function addXmlContent(string $content, string $charset = 'UTF-8', int $options = \LIBXML_NONET): void
{
    // remove the default namespace if it's the only namespace to make XPath expressions simpler
    if (!str_contains($content, 'xmlns:')) {
        $content = str_replace('xmlns', 'ns', $content);
    }

    $internalErrors = libxml_use_internal_errors(true);

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        if ('' !== trim($content)) {
            @$dom->loadXML($content, $options);
        }
    } finally {
        // Always hand the caller's libxml error mode back, even if parsing throws.
        libxml_use_internal_errors($internalErrors);
    }

    $this->addDocument($dom);

    $this->isHtml = false;
}
/**
 * Adds a \DOMDocument to the list of nodes.
 *
 * Only the document's root element is added; a document without a root
 * element is ignored.
 */
public function addDocument(\DOMDocument $dom): void
{
    $root = $dom->documentElement;

    if (!$root) {
        return;
    }

    $this->addNode($root);
}
/**
 * Adds a \DOMNodeList to the list of nodes.
 *
 * Entries that are not \DOMNode instances are skipped.
 *
 * @param \DOMNodeList<\DOMNode> $nodes
 */
public function addNodeList(\DOMNodeList $nodes): void
{
    foreach ($nodes as $item) {
        if (!$item instanceof \DOMNode) {
            continue;
        }

        $this->addNode($item);
    }
}
/**
 * Adds an array of \DOMNode instances to the list of nodes.
 *
 * Each entry is passed to add(), so it goes through the same type dispatch.
 *
 * @param \DOMNode[] $nodes
 */
public function addNodes(array $nodes): void
{
    foreach ($nodes as $entry) {
        $this->add($entry);
    }
}
/**
 * Adds a \DOMNode instance to the list of nodes.
 *
 * A \DOMDocument is replaced by its root element; a document that has no
 * root element is ignored. A node already present in the list is not added
 * a second time.
 *
 * @throws \InvalidArgumentException When the node belongs to a different document than the nodes already held
 */
public function addNode(\DOMNode $node): void
{
    if ($node instanceof \DOMDocument) {
        $node = $node->documentElement;

        if (null === $node) {
            // Empty document: there is no element to register, and storing
            // null would corrupt the node list.
            return;
        }
    }

    if (null !== $this->document && $this->document !== $node->ownerDocument) {
        throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
    }

    // The first node added fixes the document this crawler is bound to.
    $this->document ??= $node->ownerDocument;

    // Don't add duplicate nodes in the Crawler
    if (\in_array($node, $this->nodes, true)) {
        return;
    }

    $this->nodes[] = $node;
}
/**
 * Returns a node given its position in the node list.
 *
 * The result is a sub-crawler; it is created from null when no node exists
 * at $position.
 */
public function eq(int $position): static
{
    return $this->createSubCrawler($this->nodes[$position] ?? null);
}
/**
 * Calls an anonymous function on each node of the list.
 *
 * The anonymous function receives the node wrapped in a Crawler instance
 * and its position as arguments.
 *
 * Example:
 *
 *     $crawler->filter('h1')->each(fn ($node, $i) => $node->text());
 *
 * @template R of mixed
 *
 * @param \Closure(static, int): R $closure
 *
 * @return list<R> An array of values returned by the anonymous function
 */
public function each(\Closure $closure): array
{
    return array_map(
        fn ($domNode, $index) => $closure($this->createSubCrawler($domNode), $index),
        $this->nodes,
        array_keys($this->nodes),
    );
}
/**
 * Slices the list of nodes by $offset and $length.
 *
 * Offset and length follow array_slice() semantics.
 */
public function slice(int $offset = 0, ?int $length = null): static
{
    $subset = \array_slice($this->nodes, $offset, $length);

    return $this->createSubCrawler($subset);
}
/**
 * Reduces the list of nodes by calling an anonymous function.
 *
 * To remove a node from the list, the anonymous function must return false
 * (strictly); any other return value keeps the node.
 *
 * @param \Closure(static, int): bool $closure
 */
public function reduce(\Closure $closure): static
{
    $kept = array_filter(
        $this->nodes,
        fn ($domNode, $index) => false !== $closure($this->createSubCrawler($domNode), $index),
        \ARRAY_FILTER_USE_BOTH,
    );

    // Re-index so the sub-crawler receives a plain list.
    return $this->createSubCrawler(array_values($kept));
}
/**
 * Returns the first node of the current selection (position 0).
 */
public function first(): static
{
    $firstIndex = 0;

    return $this->eq($firstIndex);
}
/**
 * Returns the last node of the current selection.
 *
 * On an empty list the computed position is -1, which eq() treats like any
 * other missing position.
 */
public function last(): static
{
    $lastIndex = \count($this->nodes) - 1;

    return $this->eq($lastIndex);
}
/**
 * Returns the siblings nodes of the current selection.
 *
 * The walk starts at the first child of the first node's parent and is
 * delegated to sibling().
 *
 * @throws \InvalidArgumentException When the current node is empty
 */
public function siblings(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstOfParent = $this->getNode(0)->parentNode->firstChild;

    return $this->createSubCrawler($this->sibling($firstOfParent));
}
/**
 * Tells whether at least one node of the list matches the CSS selector.
 *
 * The selector is converted to XPath with the "self::" prefix, so it is
 * tested against the nodes themselves. An empty list never matches.
 */
public function matches(string $selector): bool
{
    if ([] === $this->nodes) {
        return false;
    }

    $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'self::');
    $hits = $this->filterRelativeXPath($xpath);

    return 0 !== $hits->count();
}
/**
 * Return first parents (heading toward the document root) of the Element that matches the provided selector.
 *
 * The first node itself is tested before its ancestors; the walk stops at
 * the first node that is not an element.
 *
 * @see https://developer.mozilla.org/en-US/docs/Web/API/Element/closest#Polyfill
 *
 * @return static|null The matching node wrapped in a crawler, or null when nothing matches
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function closest(string $selector): ?static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $current = $this->getNode(0);

    while (null !== $current && \XML_ELEMENT_NODE === $current->nodeType) {
        $candidate = $this->createSubCrawler($current);

        if ($candidate->matches($selector)) {
            return $candidate;
        }

        // Step one level up towards the document root.
        $current = $candidate->getNode(0)->parentNode;
    }

    return null;
}
/**
 * Returns the next siblings nodes of the current selection.
 *
 * The siblings are collected by sibling(), starting from the first node.
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nextAll(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $following = $this->sibling($this->getNode(0));

    return $this->createSubCrawler($following);
}
/**
 * Returns the previous sibling nodes of the current selection.
 *
 * The siblings are collected by sibling() in the 'previousSibling'
 * direction, starting from the first node.
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function previousAll(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $preceding = $this->sibling($this->getNode(0), 'previousSibling');

    return $this->createSubCrawler($preceding);
}
/**
 * Returns the ancestors of the current selection.
 *
 * Walks up from the first node's parent and keeps only element nodes,
 * nearest ancestor first.
 *
 * @throws \InvalidArgumentException When the current node is empty
 */
public function ancestors(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $elements = [];

    for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
        if (\XML_ELEMENT_NODE !== $current->nodeType) {
            continue;
        }

        $elements[] = $current;
    }

    return $this->createSubCrawler($elements);
}
/**
 * Returns the children nodes of the current selection.
 *
 * Without a selector, the children of the first node are collected through
 * sibling(), starting at its first child. With a selector, the selector is
 * converted to XPath with the "child::" prefix and used as a filter.
 *
 * @throws \InvalidArgumentException When the current node is empty
 * @throws \RuntimeException         If the CssSelector Component is not available and $selector is provided
 */
public function children(?string $selector = null): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    if (null === $selector) {
        $firstChild = $this->getNode(0)->firstChild;

        return $this->createSubCrawler($firstChild ? $this->sibling($firstChild) : []);
    }

    $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'child::');

    return $this->filterRelativeXPath($xpath);
}
/**
 * Returns the attribute value of the first node of the list.
 *
 * @param string|null $default When not null: the value to return when the node or attribute is empty
 *
 * @return string|null The attribute value, or $default when the attribute is absent
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function attr(string $attribute, ?string $default = null): ?string
{
    if ([] === $this->nodes) {
        return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if (!$element->hasAttribute($attribute)) {
        return $default;
    }

    return $element->getAttribute($attribute);
}
/**
 * Returns the node name of the first node of the list.
 *
 * @throws \InvalidArgumentException When the current node is empty
 */
public function nodeName(): string
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $first = $this->getNode(0);

    return $first->nodeName;
}
/**
 * Returns the text of the first node of the list.
 *
 * Pass false as the second argument to get the raw node value without
 * whitespace normalization.
 *
 * @param string|null $default             When not null: the value to return when the current node is empty
 * @param bool        $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function text(?string $default = null, bool $normalizeWhitespace = true): string
{
    if ([] === $this->nodes) {
        return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
    }

    $raw = $this->getNode(0)->nodeValue;

    return $normalizeWhitespace ? $this->normalizeWhitespace($raw) : $raw;
}
/**
 * Returns only the inner text that is the direct descendent of the current node, excluding any child nodes.
 *
 * Only text and CDATA children of the first node are considered. With
 * normalization enabled, the first child that is not blank is returned in
 * normalized form; with normalization disabled, the first text/CDATA child
 * is returned verbatim. An empty string is returned when no such child
 * exists or when the crawler holds no node.
 *
 * @param bool $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
 */
public function innerText(bool $normalizeWhitespace = true): string
{
    $node = $this->getNode(0);

    if (null === $node) {
        // Empty crawler: nothing to read, and dereferencing null would only emit warnings.
        return '';
    }

    foreach ($node->childNodes as $childNode) {
        if (\XML_TEXT_NODE !== $childNode->nodeType && \XML_CDATA_SECTION_NODE !== $childNode->nodeType) {
            continue;
        }

        if (!$normalizeWhitespace) {
            return $childNode->nodeValue;
        }

        if ('' !== trim($childNode->nodeValue)) {
            return $this->normalizeWhitespace($childNode->nodeValue);
        }
    }

    return '';
}
/**
 * Returns the first node of the list as HTML.
 *
 * The result is the concatenated HTML serialization of the first node's
 * children (the node's own tag is not included).
 *
 * @param string|null $default When not null: the value to return when the current node is empty
 *
 * @throws \InvalidArgumentException When the current node is empty
 */
public function html(?string $default = null): string
{
    if ([] === $this->nodes) {
        return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);
    $document = $element->ownerDocument;

    $markup = '';
    foreach ($element->childNodes as $childNode) {
        $markup .= $document->saveHTML($childNode);
    }

    return $markup;
}
2026-02-27 00:03:00 +00:00
/**
 * Returns the first node of the list as HTML, including the node's own tag.
 *
 * @throws \InvalidArgumentException When the current node is empty
 */
public function outerHtml(): string
{
    if (0 === \count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    return $element->ownerDocument->saveHTML($element);
}
/**
 * Evaluates an XPath expression.
 *
 * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
 * this method will return either an array of simple types or a new Crawler instance.
 * The expression is evaluated once per node of the list; the type of the
 * first result decides which form is returned.
 *
 * @throws \LogicException When the crawler is not bound to a document yet
 */
public function evaluate(string $xpath): array|static
{
    if (null === $this->document) {
        throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
    }

    $prefixes = $this->findNamespacePrefixes($xpath);
    $domxpath = $this->createDOMXPath($this->document, $prefixes);

    $results = [];
    foreach ($this->nodes as $contextNode) {
        $results[] = $domxpath->evaluate($xpath, $contextNode);
    }

    if (($results[0] ?? null) instanceof \DOMNodeList) {
        return $this->createSubCrawler($results);
    }

    return $results;
}
/**
 * Extracts information from the list of nodes.
 *
 * You can extract attributes or/and the node value (_text) or the node
 * name (_name).
 *
 * Example:
 *
 *     $crawler->filter('h1 a')->extract(['_text', 'href']);
 *
 * @return array One entry per node: the single value when exactly one
 *               attribute is requested, a list of values otherwise
 */
public function extract(array $attributes): array
{
    $single = 1 === \count($attributes);
    // Work on a re-indexed copy so the per-node values always form a list.
    $names = array_values($attributes);

    $rows = [];
    foreach ($this->nodes as $domNode) {
        $values = array_map(
            static fn ($name) => match (true) {
                '_text' === $name => $domNode->nodeValue,
                '_name' === $name => $domNode->nodeName,
                default => $domNode->getAttribute($name),
            },
            $names,
        );

        $rows[] = $single ? $values[0] : $values;
    }

    return $rows;
}
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression is evaluated in the context of the crawler, which
 * is considered as a fake parent of the elements inside it.
 * This means that a child selector "div" or "./div" will match only
 * the div elements of the current crawler, not their children.
 */
public function filterXPath(string $xpath): static
{
    $relative = $this->relativize($xpath);

    // If we dropped all expressions in the XPath while preparing it, there would be no match
    return '' === $relative
        ? $this->createSubCrawler(null)
        : $this->filterRelativeXPath($relative);
}
/**
 * Filters the list of nodes with a CSS selector.
 *
 * This method only works if you have installed the CssSelector Symfony Component.
 *
 * @throws \LogicException if the CssSelector Component is not available
 */
public function filter(string $selector): static
{
    // The CssSelector already prefixes the selector with descendant-or-self::
    $xpath = $this->createCssSelectorConverter()->toXPath($selector);

    return $this->filterRelativeXPath($xpath);
}
/**
 * Selects links by name or alt value for clickable images.
 *
 * Matches <a> elements whose normalized text contains $value as a whole
 * space-delimited chunk, or that have an <img> child whose alt attribute does.
 */
public function selectLink(string $value): static
{
    // Pad with spaces so only whole space-delimited matches count.
    $needle = static::xpathLiteral(' '.$value.' ');
    $template = 'descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]';

    return $this->filterRelativeXPath(\sprintf($template, $needle));
}
/**
 * Selects images by alt value.
 *
 * Matches <img> elements whose normalized alt attribute contains $value.
 */
public function selectImage(string $value): static
{
    $needle = static::xpathLiteral($value);

    return $this->filterRelativeXPath(
        \sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $needle)
    );
}
/**
 * Selects a button by its text content, id, value, name or alt attribute.
 *
 * Covers <input> elements of type submit/button (by value) and image (by
 * alt), as well as <button> elements (by text or value); any of them also
 * matches on an exact id or name. The type attribute is compared
 * case-insensitively.
 */
public function selectButton(string $value): static
{
    // Lower-cases @type inside XPath 1.0, which has no lower-case() function.
    $lowerType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
    $paddedNeedle = static::xpathLiteral(' '.$value.' ');
    $exactNeedle = static::xpathLiteral($value);

    $template = 'descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s) or @id=%3$s or @name=%3$s]';

    return $this->filterRelativeXPath(\sprintf($template, $lowerType, $paddedNeedle, $exactNeedle));
}
/**
 * Returns a Link object for the first node in the list.
 *
 * The link is built with the crawler's base href and the given method.
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function link(string $method = 'get'): Link
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element instanceof \DOMElement) {
        return new Link($element, $this->baseHref, $method);
    }

    throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
}
/**
 * Returns an array of Link objects for the nodes in the list.
 *
 * Every link is created with the crawler's base href and the 'get' method.
 *
 * @return Link[]
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function links(): array
{
    $result = [];

    foreach ($this->nodes as $element) {
        if ($element instanceof \DOMElement) {
            $result[] = new Link($element, $this->baseHref, 'get');

            continue;
        }

        throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
    }

    return $result;
}
/**
 * Returns an Image object for the first node in the list.
 *
 * The image is built with the crawler's base href.
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function image(): Image
{
    if (0 === \count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element instanceof \DOMElement) {
        return new Image($element, $this->baseHref);
    }

    throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
}
/**
 * Returns an array of Image objects for the nodes in the list.
 *
 * Every image is created with the crawler's base href.
 *
 * @return Image[]
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function images(): array
{
    $result = [];

    foreach ($this as $element) {
        if ($element instanceof \DOMElement) {
            $result[] = new Image($element, $this->baseHref);

            continue;
        }

        throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
    }

    return $result;
}
/**
 * Returns a Form object for the first node in the list.
 *
 * The form is built with the crawler's URI and base href; when $values is
 * given, it is applied to the form through setValues().
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function form(?array $values = null, ?string $method = null): Form
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if (!$element instanceof \DOMElement) {
        throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
    }

    $form = new Form($element, $this->uri, $method, $this->baseHref);

    if (null === $values) {
        return $form;
    }

    $form->setValues($values);

    return $form;
}
/**
 * Overloads a default namespace prefix to be used with XPath and CSS expressions.
 *
 * @param string $prefix The prefix that replaces the current default
 */
public function setDefaultNamespacePrefix(string $prefix): void
{
    $this->defaultNamespacePrefix = $prefix;
}
2026-02-27 00:03:00 +00:00
/**
 * Registers a namespace under the given prefix, replacing any namespace
 * previously registered with the same prefix.
 */
public function registerNamespace(string $prefix, string $namespace): void
{
    $this->namespaces[$prefix] = $namespace;
}
/**
 * Converts string for XPath expressions.
 *
 * XPath 1.0 string literals have no escape mechanism, so the quoting style
 * is chosen from the content: single quotes when the string holds no
 * apostrophe, double quotes when it holds no double quote, and a concat()
 * of single-quoted chunks joined by a double-quoted apostrophe otherwise.
 *
 * Examples:
 *
 *     echo Crawler::xpathLiteral('foo " bar');
 *     //prints 'foo " bar'
 *
 *     echo Crawler::xpathLiteral("foo ' bar");
 *     //prints "foo ' bar"
 *
 *     echo Crawler::xpathLiteral('a\'b"c');
 *     //prints concat('a', "'", 'b"c')
 *
 * @param string $s The raw string to embed in an XPath expression
 *
 * @return string An XPath expression evaluating to $s
 */
public static function xpathLiteral(string $s): string
{
    if (!str_contains($s, "'")) {
        return \sprintf("'%s'", $s);
    }

    if (!str_contains($s, '"')) {
        return \sprintf('"%s"', $s);
    }

    // Both quote kinds are present: split on apostrophes and glue the chunks
    // back together with a double-quoted apostrophe between them.
    $parts = [];
    foreach (explode("'", $s) as $index => $chunk) {
        if (0 !== $index) {
            $parts[] = '"\'"';
        }

        $parts[] = \sprintf("'%s'", $chunk);
    }

    return \sprintf('concat(%s)', implode(', ', $parts));
}
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression should already be processed to apply it in the context of each node.
 * The query is run once per node and all results are gathered in a single
 * sub-crawler, which stays empty when the crawler has no document.
 */
private function filterRelativeXPath(string $xpath): static
{
    $result = $this->createSubCrawler(null);

    if (null !== $this->document) {
        $prefixes = $this->findNamespacePrefixes($xpath);
        $domxpath = $this->createDOMXPath($this->document, $prefixes);

        foreach ($this->nodes as $contextNode) {
            $result->add($domxpath->query($xpath, $contextNode));
        }
    }

    return $result;
}
/**
 * Make the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * The expression is split on top-level "|" (outside quotes and square
 * brackets) and each member is rewritten on its own. An expression with an
 * unterminated quote is returned unchanged.
 *
 * @param string $xpath The XPath expression as written by the caller
 *
 * @return string The rewritten expression, members joined with " | "
 */
private function relativize(string $xpath): string
{
    $expressions = [];

    // Placeholder that can never match (an element named "a" whose name() is "b");
    // it stands in for union members that cannot match anything in the crawler.
    $nonMatchingExpression = 'a[name() = "b"]';

    $xpathLen = \strlen($xpath);
    $openedBrackets = 0;
    $startPosition = strspn($xpath, " \t\n\r\0\x0B");

    // $i runs up to and including $xpathLen so the last member is flushed too.
    for ($i = $startPosition; $i <= $xpathLen; ++$i) {
        // Jump to the next character that can change the parsing state.
        $i += strcspn($xpath, '"\'[]|', $i);

        if ($i < $xpathLen) {
            switch ($xpath[$i]) {
                case '"':
                case "'":
                    // Skip the whole quoted literal, whatever it contains.
                    if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) {
                        return $xpath; // The XPath expression is invalid
                    }
                    continue 2;
                case '[':
                    ++$openedBrackets;
                    continue 2;
                case ']':
                    --$openedBrackets;
                    continue 2;
            }
        }

        // A "|" inside a predicate is not a top-level union separator.
        if ($openedBrackets) {
            continue;
        }

        if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) {
            // If the union is inside some braces, we need to preserve the opening braces and apply
            // the change only inside it.
            $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1);
            $parenthesis = substr($xpath, $startPosition, $j);
            $startPosition += $j;
        } else {
            $parenthesis = '';
        }

        $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition));

        if (str_starts_with($expression, 'self::*/')) {
            $expression = './'.substr($expression, 8);
        }

        // add prefix before absolute element selector
        if ('' === $expression) {
            $expression = $nonMatchingExpression;
        } elseif (str_starts_with($expression, '//')) {
            $expression = 'descendant-or-self::'.substr($expression, 2);
        } elseif (str_starts_with($expression, './/')) {
            $expression = 'descendant-or-self::'.substr($expression, 3);
        } elseif (str_starts_with($expression, './')) {
            $expression = 'self::'.substr($expression, 2);
        } elseif (str_starts_with($expression, 'child::')) {
            $expression = 'self::'.substr($expression, 7);
        } elseif ('/' === $expression[0] || '.' === $expression[0] || str_starts_with($expression, 'self::')) {
            $expression = $nonMatchingExpression;
        } elseif (str_starts_with($expression, 'descendant::')) {
            $expression = 'descendant-or-self::'.substr($expression, 12);
        } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
            // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
            $expression = $nonMatchingExpression;
        } elseif (!str_starts_with($expression, 'descendant-or-self::')) {
            $expression = 'self::'.$expression;
        }

        $expressions[] = $parenthesis.$expression;

        if ($i === $xpathLen) {
            return implode(' | ', $expressions);
        }

        // Skip the whitespace after the "|" and start the next member there.
        $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1);
        $startPosition = $i + 1;
    }

    return $xpath; // The XPath expression is invalid
}
2026-02-27 00:03:00 +00:00
/**
 * Returns the node stored at the given position, or null when there is none.
 */
public function getNode(int $position): ?\DOMNode
{
    return isset($this->nodes[$position]) ? $this->nodes[$position] : null;
}
2026-02-27 00:03:00 +00:00
/**
 * Returns the number of nodes held by this crawler.
 */
public function count(): int
{
    $total = \count($this->nodes);

    return $total;
}
/**
 * Wraps the crawler's nodes in an iterator.
 *
 * @return \ArrayIterator<int, \DOMNode>
 */
public function getIterator(): \ArrayIterator
{
    $iterator = new \ArrayIterator($this->nodes);

    return $iterator;
}
2026-02-27 00:03:00 +00:00
/**
 * Walks from $node through the property named by $siblingDir and collects
 * the element nodes met on the way (starting with $node itself), leaving
 * out the first node of this crawler.
 *
 * @return \DOMNode[]
 */
protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'): array
{
    $first = $this->getNode(0);
    $collected = [];

    // Stops as soon as the followed property yields a falsy value (no more sibling)
    for ($candidate = $node; $candidate; $candidate = $candidate->$siblingDir) {
        if ($candidate === $first) {
            continue;
        }

        if (\XML_ELEMENT_NODE === $candidate->nodeType) {
            $collected[] = $candidate;
        }
    }

    return $collected;
}
/**
 * Parses HTML content with \Dom\HTMLDocument and copies the resulting tree
 * into a new \DOMDocument.
 *
 * The content is first parsed with the given charset; when that attempt
 * raises a \ValueError, it is parsed again without an explicit charset.
 * libxml internal error handling is enabled only for the duration of the
 * parsing and its previous state is restored afterwards, even on failure.
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $internalErrors = libxml_use_internal_errors(true);

    try {
        try {
            $document = \Dom\HTMLDocument::createFromString($htmlContent, \Dom\HTML_NO_DEFAULT_NS, $charset);
        } catch (\ValueError) {
            // The charset was rejected: parse again without overriding the encoding
            $document = \Dom\HTMLDocument::createFromString($htmlContent, \Dom\HTML_NO_DEFAULT_NS);
        }
    } finally {
        // Never leave the process-wide libxml flag switched on behind us
        libxml_use_internal_errors($internalErrors);
    }

    $dom = new \DOMDocument('1.0', $document->inputEncoding);
    $this->copyFromHtml5ToDom($document->documentElement, $dom);

    return $dom;
}
/**
 * Converts charset to HTML-entities to ensure valid parsing.
 *
 * Characters from 0x80 upwards are encoded as numeric entities using the
 * given charset. When that fails, the content is converted to UTF-8 with
 * iconv and encoded again; when this fails too, the content is returned
 * in whatever state it had reached.
 */
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    $entityMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    // Turn any warning/notice raised by the conversions into a catchable exception
    set_error_handler(static function (): never {
        throw new \Exception();
    });

    try {
        try {
            return mb_encode_numericentity($htmlContent, $entityMap, $charset);
        } catch (\Exception|\ValueError) {
            // fall back to a conversion through iconv below
        }

        try {
            $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
            $htmlContent = mb_encode_numericentity($htmlContent, $entityMap, 'UTF-8');
        } catch (\Exception|\ValueError) {
            // best effort: keep the content as it is
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}
/**
 * Builds a \DOMXPath for the document and registers on it every given
 * prefix for which a namespace could be discovered.
 *
 * @throws \InvalidArgumentException
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = []): \DOMXPath
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $nsPrefix) {
        $uri = $this->discoverNamespace($xpathEngine, $nsPrefix);

        if (null === $uri) {
            // unknown prefix: nothing to register
            continue;
        }

        $xpathEngine->registerNamespace($nsPrefix, $uri);
    }

    return $xpathEngine;
}
/**
 * Resolves the namespace bound to a prefix.
 *
 * Manually registered namespaces are looked up first, then the cache;
 * otherwise the document is queried and the outcome, null included, is
 * stored in the cache before being returned.
 *
 * @throws \InvalidArgumentException
 */
private function discoverNamespace(\DOMXPath $domxpath, string $prefix): ?string
{
    if (\array_key_exists($prefix, $this->namespaces)) {
        return $this->namespaces[$prefix];
    }

    if ($this->cachedNamespaces->offsetExists($prefix)) {
        return $this->cachedNamespaces[$prefix];
    }

    // The default namespace prefix is looked up under the empty name
    $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

    // ask for one namespace, otherwise we'd get a collection with an item for each node
    $found = $domxpath->query(\sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName));

    $uri = $found->item(0)?->nodeValue;
    $this->cachedNamespaces[$prefix] = $uri;

    return $uri;
}
/**
 * Extracts the distinct namespace prefixes used in an XPath expression.
 *
 * A prefix is a name followed by a single colon; names followed by "::"
 * (axes) are not reported.
 *
 * @return string[]
 */
private function findNamespacePrefixes(string $xpath): array
{
    $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);

    return $found ? array_unique($matches['prefix']) : [];
}
/**
 * Creates a crawler for some subnodes.
 *
 * The new crawler is built with the same URI and base href, and receives
 * this crawler's document, HTML flag and namespace maps. The namespace
 * cache object is handed over as is, not copied.
 *
 * @param \DOMNodeList<\DOMNode>|\DOMNode|\DOMNode[]|string|null $nodes
 */
private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes): static
{
    $child = new static($nodes, $this->uri, $this->baseHref);

    $child->cachedNamespaces = $this->cachedNamespaces;
    $child->namespaces = $this->namespaces;
    $child->document = $this->document;
    $child->isHtml = $this->isHtml;

    return $child;
}
/**
 * Instantiates the CSS-to-XPath converter, configured with this crawler's HTML flag.
 *
 * @throws \LogicException If the CssSelector Component is not available
 */
private function createCssSelectorConverter(): CssSelectorConverter
{
    if (class_exists(CssSelectorConverter::class)) {
        return new CssSelectorConverter($this->isHtml);
    }

    throw new \LogicException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.');
}
2026-02-27 00:03:00 +00:00
/**
 * Copies a tree of \Dom nodes into a \DOMDocument.
 *
 * The tree is walked with an explicit stack rather than recursion. Text,
 * comment, CDATA and processing-instruction nodes are recreated with the
 * matching factory of the target document; elements are recreated with
 * their attributes. An element whose tag name is rejected by the target is
 * skipped together with its children, an attribute whose name is rejected
 * is dropped, and any other kind of node is ignored.
 */
private function copyFromHtml5ToDom(\Dom\Node $source, \DOMDocument $target): void
{
    /** @var list<array{0: iterable<\Dom\Node>, 1: \DOMNode}> $stack */
    $stack = [[[$source], $target]];

    while ($stack) {
        [$children, $parent] = array_pop($stack);

        foreach ($children as $source) {
            if ($source instanceof \Dom\CharacterData) {
                $parent->appendChild(match (true) {
                    // \Dom\CDATASection extends \Dom\Text: it must be tested before it
                    $source instanceof \Dom\CDATASection => $target->createCDATASection($source->data),
                    $source instanceof \Dom\Text => $target->createTextNode($source->data),
                    $source instanceof \Dom\Comment => $target->createComment($source->data),
                    $source instanceof \Dom\ProcessingInstruction => $target->createProcessingInstruction($source->target, $source->data),
                });

                continue;
            }

            if (!$source instanceof \Dom\Element) {
                continue;
            }

            try {
                $element = $target->createElement($source->tagName);
            } catch (\DOMException) {
                // invalid tag name: the element and its children are left out
                continue;
            }

            foreach ($source->attributes as $attr) {
                try {
                    $element->setAttribute($attr->name, $attr->value);
                } catch (\DOMException) {
                    // ignore invalid attribute name
                }

                if ('id' === $attr->name) {
                    // declare the attribute as an ID so it can be looked up by getElementById()
                    $element->setIdAttribute('id', true);
                }
            }

            $parent->appendChild($element);

            // The children are attached to the copy once this entry is popped
            $stack[] = [$source->childNodes, $element];
        }
    }
}
2026-02-27 00:03:00 +00:00
/**
 * Normalizes the whitespace of a string.
 *
 * Every run of two or more whitespace characters (space, line feed,
 * carriage return, tab, form feed) and every single non-space whitespace
 * character is replaced by one space; the result is then trimmed of those
 * same characters at both ends.
 */
private function normalizeWhitespace(string $string): string
{
    return trim(preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $string), " \n\r\t\x0C");
}
}