example_link_check.php
<?php
use Example\LogHandler;
use Example\StatsHandler;
use GuzzleHttp\Middleware;
use Symfony\Component\EventDispatcher\GenericEvent;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\EventListener\PolitenessPolicyListener;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\UriWithHashFragmentFilter;
use VDB\Spider\Filter\Prefetch\UriWithQueryStringFilter;
use VDB\Spider\QueueManager\InMemoryQueueManager;
use VDB\Spider\Spider;
/*
* This example is almost identical to example_complex, with one big difference:
* We set a custom request handler that does not throw exceptions on failed requests.
* This way, failed requests with their status code are also persisted.
* That means we can then use the spider as a link checker.
*/
require_once('example_complex_bootstrap.php');
// The URI we want to start crawling with
$seed = 'https://www.dmoz-odp.org/';
// We want to allow all subdomains of dmoz-odp.org
$allowSubDomains = true;
// Create spider
$spider = new Spider($seed);
$spider->getDownloader()->setDownloadLimit(10);
// Set a custom request handler that does not throw exceptions on failed requests
$spider->getDownloader()->setRequestHandler(new \Example\LinkCheckRequestHandler());
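// (Assumption: a handler like LinkCheckRequestHandler typically builds its Guzzle client with
// 'http_errors' => false, so 4xx/5xx responses come back as normal responses instead of being
// thrown as exceptions; see the example's LinkCheckRequestHandler class for the actual code.)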
$statsHandler = new StatsHandler();
$logHandler = new LogHandler();
$queueManager = new InMemoryQueueManager();
$queueManager->getDispatcher()->addSubscriber($statsHandler);
$queueManager->getDispatcher()->addSubscriber($logHandler);
// Set some sane defaults for this example: we only visit the first level below the seed, and the download limit of 10 set above keeps the crawl small
$spider->getDiscovererSet()->maxDepth = 1;
// This time, we set the traversal algorithm to breadth-first. The default is depth-first
$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
$spider->setQueueManager($queueManager);
// We add a URI discoverer. Without it, the spider wouldn't get past the seed resource.
//$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//*[@id='cat-list-content-2']/div/a"));
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a"));
// Let's tell the spider to save all found resources on the filesystem
$spider->getDownloader()->setPersistenceHandler(
    new \VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler(__DIR__ . '/results')
);
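// Every downloaded resource, including failed responses (thanks to the custom request handler),
// is serialized to a file under the results/ directory next to this script; the same handler is
// iterated at the bottom of this script to build the link report.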
// Add some prefetch filters. These are executed before a resource is requested.
// The more of these you have, the fewer HTTP requests and the less work for the processors
//$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('http')));
$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(array('https', 'http')));
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter(array($seed), $allowSubDomains));
$spider->getDiscovererSet()->addFilter(new UriWithHashFragmentFilter());
$spider->getDiscovererSet()->addFilter(new UriWithQueryStringFilter());
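// In short: only http(s) URIs on the seed's host (and its subdomains) are followed, and any URI
// containing a #fragment or a ?query string is skipped before a request is ever made.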
// We add an event listener to the crawler that implements a politeness policy. We wait 100ms between every request to the same domain
$politenessPolicyEventListener = new PolitenessPolicyListener(100);
$spider->getDownloader()->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_PRE_REQUEST,
    array($politenessPolicyEventListener, 'onCrawlPreRequest')
);
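// The listener also accumulates the total time spent waiting, which is reported in the metrics below.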
$spider->getDispatcher()->addSubscriber($statsHandler);
$spider->getDispatcher()->addSubscriber($logHandler);
// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
    function (GenericEvent $event) {
        echo "\nCrawl aborted by user.\n";
        exit();
    }
);
// Let's add a CLI progress meter for fun
echo "\nCrawling";
$spider->getDownloader()->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_POST_REQUEST,
    function (GenericEvent $event) {
        echo '.';
    }
);
// Set up profiling (request timing) on the HTTP client of the spider
$guzzleClient = $spider->getDownloader()->getRequestHandler()->getClient();
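// Note: $timerMiddleware is not created in this script; like $start it is assumed to be provided by
// example_complex_bootstrap.php (presumably an Example\TimerMiddleware instance that records how long
// each request takes). Middleware::tap() invokes its onRequest/onResponse hooks around every request.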
$tapMiddleware = Middleware::tap([$timerMiddleware, 'onRequest'], [$timerMiddleware, 'onResponse']);
$guzzleClient->getConfig('handler')->push($tapMiddleware, 'timer');
// Execute the crawl
$result = $spider->crawl();
// Report
echo "\n ENQUEUED: " . count($statsHandler->getQueued());
echo "\n SKIPPED: " . count($statsHandler->getFiltered());
echo "\n FAILED: " . count($statsHandler->getFailed());
echo "\n PERSISTED: " . count($statsHandler->getPersisted());
// With the information from some of the listeners and middleware, we can determine some metrics
$peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
$totalTime = round(microtime(true) - $start, 2);
$totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
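// totalDelay appears to be accumulated in microseconds by the politeness listener, hence the
// division by 1,000,000 to report seconds.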
echo "\n\nMETRICS:";
echo "\n PEAK MEM USAGE: " . $peakMem . 'MB';
echo "\n TOTAL TIME: " . $totalTime . 's';
echo "\n REQUEST TIME: " . $timerMiddleware->getTotal() . 's';
echo "\n POLITENESS WAIT TIME: " . $totalDelay . 's';
echo "\n PROCESSING TIME: " . ($totalTime - $timerMiddleware->getTotal() - $totalDelay) . 's';
// Finally we could start some processing on the downloaded resources
echo "\n\nDOWNLOADED RESOURCES: ";
$downloaded = $spider->getDownloader()->getPersistenceHandler();
/** @var \VDB\Spider\Resource $resource */
foreach ($downloaded as $resource) {
    $code = $resource->getResponse()->getStatusCode();
    $reason = $resource->getResponse()->getReasonPhrase();
    $title = $resource->getCrawler()->filterXPath('//title')->text("");
    $contentLength = (int)$resource->getResponse()->getHeaderLine('Content-Length');
    $contentLengthString = '';
    if ($contentLength >= 1024) {
        $contentLengthString = str_pad("[" . round($contentLength / 1024), 4, ' ', STR_PAD_LEFT) . "KB]";
    } else {
        $contentLengthString = str_pad("[" . $contentLength, 5, ' ', STR_PAD_LEFT) . "B]";
    }
    $uri = $resource->getUri()->toString();
    echo "\n - " . $contentLengthString . " $title ($uri) " . $code . " " . $reason;
}
echo "\n";