Skip to content

Commit 18f3d42

Browse files
committed
Add support for robots.txt disallow rules
1 parent 6edd661 commit 18f3d42

14 files changed

+252
-32
lines changed

.phan/config.php

+1
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@
342342
// your application should be included in this list.
343343
'directory_list' => [
344344
'src',
345+
'vendor/spatie/robots-txt/src',
345346
'vendor/guzzlehttp/guzzle/src',
346347
'vendor/guzzlehttp/psr7/src',
347348
'vendor/psr/http-client',

README.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PHP-Spider Features
99
- supports two traversal algorithms: breadth-first and depth-first
1010
- supports crawl depth limiting, queue size limiting and max downloads limiting
1111
- supports adding custom URI discovery logic, based on XPath, CSS selectors, or plain old PHP
12-
- comes with a useful set of URI filters, such as Domain limiting
12+
- comes with a useful set of URI filters, such as robots.txt and Domain limiting
1313
- supports custom URI filters, both prefetch (URI) and postfetch (Resource content)
1414
- supports custom request handling logic
1515
- supports Basic, Digest and NTLM HTTP authentication. See [example](example/example_basic_auth.php).
@@ -25,6 +25,10 @@ Installation
2525
------------
2626
The easiest way to install PHP-Spider is with [composer](https://getcomposer.org/). Find it on [Packagist](https://packagist.org/packages/vdb/php-spider).
2727

28+
```bash
29+
$ composer require vdb/php-spider
30+
```
31+
2832
Usage
2933
-----
3034
This is a very simple example. This code can be found in [example/example_simple.php](example/example_simple.php). For a more complete example with some logging, caching and filters, see [example/example_complex.php](example/example_complex.php). That file contains a more real-world example.

composer.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
"symfony/dom-crawler": "^3.0.0||^4.0.0||^5.0.0||^6.0",
2323
"symfony/finder": "^3.0.0||^4.0.0||^5.0.0||^6.0",
2424
"symfony/event-dispatcher": "^4.0.0||^5.0.0||^6.0",
25-
"vdb/uri": "^0.3.1",
25+
"vdb/uri": "^0.3.2",
26+
"spatie/robots-txt": "^1.0",
2627
"phan/phan": "^4.0||^5.0"
2728
},
2829
"require-dev": {

example/example_complex.php

+9-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
use VDB\Spider\EventListener\PolitenessPolicyListener;
1010
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
1111
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
12+
use VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter;
1213
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
1314
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
1415
use VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler;
@@ -36,6 +37,7 @@
3637

3738
$queueManager->getDispatcher()->addSubscriber($statsHandler);
3839
$queueManager->getDispatcher()->addSubscriber($LogHandler);
40+
$spider->getDownloader()->getDispatcher()->addSubscriber($statsHandler);
3941

4042
// Set some sane defaults for this example.
4143
// We only visit the first level of http://dmoztools.net. We stop at 10 queued resources
@@ -47,7 +49,7 @@
4749
$spider->setQueueManager($queueManager);
4850

4951
// We add an URI discoverer. Without it, the spider wouldn't get past the seed resource.
50-
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//*[@id='cat-list-content-2']/div/a"));
52+
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a"));
5153

5254
// Let's tell the spider to save all found resources on the filesystem
5355
$spider->getDownloader()->setPersistenceHandler(
@@ -60,6 +62,7 @@
6062
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
6163
$spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
6264
$spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
65+
$spider->getDiscovererSet()->addFilter(new RobotsTxtDisallowFilter($seed, 'PHP-Spider'));
6366

6467
// We add an event listener to the crawler that implements a politeness policy.
6568
// We wait 100ms between every request to the same domain
@@ -131,3 +134,8 @@ function (GenericEvent $event) {
131134
echo "\n - " . $contentLengthString . " $title ($uri)";
132135
}
133136
echo "\n";
137+
138+
echo "\nFAILED RESOURCES: ";
139+
foreach ($statsHandler->getFailed() as $uri => $message) {
140+
echo "\n - " . $uri . " failed because: " . $message;
141+
}

example/lib/Example/StatsHandler.php

+4-4
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,22 @@ public static function getSubscribedEvents(): array
3535
);
3636
}
3737

38-
public function addToQueued(GenericEvent $event)
38+
public function addToQueued(GenericEvent $event): void
3939
{
4040
$this->queued[] = $event->getArgument('uri');
4141
}
4242

43-
public function addToPersisted(GenericEvent $event)
43+
public function addToPersisted(GenericEvent $event): void
4444
{
4545
$this->persisted[] = $event->getArgument('uri');
4646
}
4747

48-
public function addToFiltered(GenericEvent $event)
48+
public function addToFiltered(GenericEvent $event): void
4949
{
5050
$this->filtered[] = $event->getArgument('uri');
5151
}
5252

53-
public function addToFailed(GenericEvent $event)
53+
public function addToFailed(GenericEvent $event): void
5454
{
5555
$this->failed[$event->getArgument('uri')->toString()] = $event->getArgument('message');
5656
}

src/Discoverer/DiscovererSet.php

+7-7
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public function __construct(array $discoverers = array())
4242
* If it already exists, it is not overwritten, since we want to keep the
4343
* first depth it was found at.
4444
*/
45-
private function markSeen(DiscoveredUri $uri)
45+
private function markSeen(DiscoveredUri $uri): void
4646
{
4747
$uriString = $uri->normalize()->toString();
4848
if (!array_key_exists($uriString, $this->alreadySeenUris)) {
@@ -97,20 +97,20 @@ public function discover(Resource $resource): array
9797
*
9898
* @param discovererInterface $discoverer The discoverer instance
9999
*/
100-
public function set(DiscovererInterface $discoverer)
100+
public function set(DiscovererInterface $discoverer): void
101101
{
102102
$this->discoverers[$discoverer->getName()] = $discoverer;
103103
}
104104

105-
public function addFilter(PreFetchFilterInterface $filter)
105+
public function addFilter(PreFetchFilterInterface $filter): void
106106
{
107107
$this->filters[] = $filter;
108108
}
109109

110110
/**
111111
* @param UriInterface[] $discoveredUris
112112
*/
113-
private function normalize(array &$discoveredUris)
113+
private function normalize(array &$discoveredUris): void
114114
{
115115
/** @var DiscoveredUri[] $discoveredUris */
116116
foreach ($discoveredUris as $k => $uri) {
@@ -121,7 +121,7 @@ private function normalize(array &$discoveredUris)
121121
/**
122122
* @param UriInterface[] $discoveredUris
123123
*/
124-
private function filterAlreadySeen(array &$discoveredUris)
124+
private function filterAlreadySeen(array &$discoveredUris): void
125125
{
126126
foreach ($discoveredUris as $k => $uri) {
127127
if (array_key_exists($uri->toString(), $this->alreadySeenUris)) {
@@ -134,7 +134,7 @@ private function filterAlreadySeen(array &$discoveredUris)
134134
* Filter out any URI that matches any of the filters
135135
* @param UriInterface[] $discoveredUris
136136
*/
137-
private function filter(array &$discoveredUris)
137+
private function filter(array &$discoveredUris): void
138138
{
139139
foreach ($discoveredUris as $k => $uri) {
140140
foreach ($this->filters as $filter) {
@@ -148,7 +148,7 @@ private function filter(array &$discoveredUris)
148148
/**
149149
* @param UriInterface[] $discoveredUris
150150
*/
151-
private function removeDuplicates(array &$discoveredUris)
151+
private function removeDuplicates(array &$discoveredUris): void
152152
{
153153
// make sure there are no duplicates in the list
154154
$tmp = array();
<?php

namespace VDB\Spider\Filter\Prefetch;

use RuntimeException;

/**
 * Thrown when no robots.txt URI can be derived from a seed URL,
 * i.e. the seed URL uses a scheme that is neither file- nor HTTP-based.
 */
class ExtractRobotsTxtException extends RuntimeException
{
}
<?php

namespace VDB\Spider\Filter\Prefetch;

use RuntimeException;

/**
 * Thrown when a robots.txt file could not be read from its resolved URI.
 */
class FetchRobotsTxtException extends RuntimeException
{
}
<?php

namespace VDB\Spider\Filter\Prefetch;

use ErrorException;
use Exception;
use Spatie\Robots\RobotsTxt;
use VDB\Spider\Filter\PreFetchFilterInterface;
use VDB\Uri\Exception\UriSyntaxException;
use VDB\Uri\FileUri;
use VDB\Uri\Http;
use VDB\Uri\Uri;
use VDB\Uri\UriInterface;

/**
 * Pre-fetch filter that rejects any URI disallowed by the robots.txt
 * of the seed URL's domain.
 *
 * @author Matthijs van den Bos <[email protected]>
 */
class RobotsTxtDisallowFilter implements PreFetchFilterInterface
{
    private RobotsTxt $parser;
    private ?string $userAgent;
    private Uri $seedUri;

    /**
     * @param string $seedUrl The robots.txt file will be loaded from this domain.
     * @param string|null $userAgent If set, agent-specific rules are applied in
     *                               addition to the generic ('*') rules.
     * @throws ErrorException
     * @throws UriSyntaxException
     * @throws ExtractRobotsTxtException When the seed URL scheme is unsupported.
     * @throws FetchRobotsTxtException When the robots.txt could not be read.
     */
    public function __construct(string $seedUrl, string $userAgent = null)
    {
        $this->seedUri = new Uri($seedUrl);
        $this->seedUri->normalize();
        $this->userAgent = $userAgent;
        $this->parser = new RobotsTxt(self::fetchRobotsTxt(self::extractRobotsTxtUri($seedUrl)));
    }

    /**
     * Read the robots.txt contents from the given URI.
     *
     * Note: file_get_contents() reports failure by returning false (and
     * emitting a warning), not by throwing, so the result must be checked
     * explicitly. The try/catch is kept for environments where an error
     * handler converts warnings into exceptions (e.g. ErrorException).
     *
     * @param string $robotsUri
     * @return string
     * @throws FetchRobotsTxtException When the file could not be read.
     */
    private static function fetchRobotsTxt(string $robotsUri): string
    {
        try {
            $robotsTxt = file_get_contents($robotsUri);
        } catch (Exception $e) {
            throw new FetchRobotsTxtException("Could not fetch $robotsUri: " . $e->getMessage());
        }

        if ($robotsTxt === false) {
            // file_get_contents() failed without throwing: surface it as
            // the same domain-specific exception instead of returning false.
            $lastError = error_get_last();
            $detail = $lastError !== null ? ': ' . $lastError['message'] : '';
            throw new FetchRobotsTxtException("Could not fetch $robotsUri" . $detail);
        }

        return $robotsTxt;
    }

    /**
     * Build the robots.txt URI for the seed URL: scheme://host[:port]/robots.txt.
     * Any path, query parameters and fragments of the seed URL are discarded.
     *
     * @param string $seedUrl
     * @return string
     *
     * @throws ErrorException
     * @throws UriSyntaxException
     * @throws ExtractRobotsTxtException When the scheme is neither file- nor HTTP-based.
     */
    private static function extractRobotsTxtUri(string $seedUrl): string
    {
        $uri = new Uri($seedUrl);
        if (in_array($uri->getScheme(), FileUri::$allowedSchemes)) {
            // Return a string, as declared: the original returned the FileUri
            // object itself, relying on implicit coercion to string.
            return (new FileUri($seedUrl . '/robots.txt'))->toString();
        } elseif (in_array($uri->getScheme(), Http::$allowedSchemes)) {
            return $uri->toBaseUri()->toString() . '/robots.txt';
        } else {
            throw new ExtractRobotsTxtException(
                "Seed URL scheme must be one of " .
                implode(', ', array_merge(FileUri::$allowedSchemes, Http::$allowedSchemes))
            );
        }
    }

    /**
     * Return true (i.e. filter the URI out) when robots.txt disallows it.
     */
    public function match(UriInterface $uri): bool
    {
        // Make the uri relative to $this->seedUri, so it will match with the rules in the robots.txt
        $relativeUri = str_replace($this->seedUri->toString(), '', $uri->normalize()->toString());
        return !$this->parser->allows($relativeUri, $this->userAgent);
    }
}

src/Uri/DiscoveredUri.php

+2-18
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,14 @@
99

1010
class DiscoveredUri implements UriInterface
1111
{
12-
/**
13-
* @var UriInterface
14-
*/
15-
protected $decorated;
16-
17-
/** @var int */
12+
protected string|UriInterface $decorated;
1813
private int $depthFound;
1914

2015
/**
21-
* @param string|UriInterface $decorated
22-
* @param int $depthFound
23-
*
2416
* @throws ErrorException
2517
* @throws UriSyntaxException
2618
*/
27-
public function __construct($decorated, int $depthFound)
19+
public function __construct(UriInterface|string $decorated, int $depthFound)
2820
{
2921
if (!$decorated instanceof UriInterface) {
3022
$decorated = new Uri($decorated);
@@ -42,14 +34,6 @@ public function getDepthFound(): int
4234
return $this->depthFound;
4335
}
4436

45-
// /**
46-
// * @param int $depthFound The depth this Uri was found on
47-
// */
48-
// public function setDepthFound(int $depthFound)
49-
// {
50-
// $this->depthFound = $depthFound;
51-
// }
52-
5337
// @codeCoverageIgnoreStart
5438
// We ignore coverage for all proxy methods below:
5539
// the constructor is tested and if that is successful there is no point

tests/Discoverer/DiscovererSetTest.php

+27
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
1818
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
1919
use VDB\Spider\Filter\Prefetch\AllowedPortsFilter;
20+
use VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter;
2021
use VDB\Spider\Filter\Prefetch\UriFilter;
2122
use VDB\Spider\Uri\DiscoveredUri;
2223
use VDB\Uri\Exception\UriSyntaxException;
24+
use VDB\Uri\FileUri;
2325

2426
/**
2527
*
@@ -86,6 +88,31 @@ public function testUriFilter()
8688
$this->assertCount(1, $uris);
8789
}
8890

91+
/**
92+
* @covers \VDB\Spider\Discoverer\DiscovererSet
93+
* @covers \VDB\Spider\Filter\Prefetch\RobotsTxtDisallowFilter
94+
*
95+
* @throws UriSyntaxException
96+
* @throws ErrorException
97+
* @throws Exception
98+
*/
99+
public function testRobotsTxtDisallowFilter()
100+
{
101+
$baseUri = "file://" . __DIR__;
102+
$resourceUri = new DiscoveredUri($baseUri, 0);
103+
$uriInBody1 = $baseUri . '/internal';
104+
$uriInBody2 = $baseUri . '/foo';
105+
106+
$spiderResource = self::createResourceWithLinks($resourceUri, [$uriInBody1, $uriInBody2]);
107+
108+
$discovererSet = new DiscovererSet([new XPathExpressionDiscoverer("//a")]);
109+
$discovererSet->addFilter(new RobotsTxtDisallowFilter($baseUri));
110+
111+
$uris = $discovererSet->discover($spiderResource);
112+
$this->assertCount(1, $uris);
113+
$this->assertNotContains((new FileUri($uriInBody2))->toString(), array_map(fn($uri): string => (new FileUri($uri->toString()))->toString(), $uris));
114+
}
115+
89116
/**
90117
* @covers \VDB\Spider\Discoverer\DiscovererSet
91118
* @covers \VDB\Spider\Filter\Prefetch\AllowedPortsFilter

tests/Discoverer/robots.txt

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
User-agent: *
2+
Disallow: /foo
3+
4+
User-agent: PHP-Spider
5+
Disallow: /bar

0 commit comments

Comments
 (0)