<?php

namespace PhpTek\Exodus\Crawl;

use PHPCrawl\PHPCrawler;
use PHPCrawl\PHPCrawlerDocumentInfo;
use PHPCrawl\PHPCrawlerURLDescriptor;
use SilverStripe\Core\Injector\Injectable;
use SilverStripe\Core\Config\Configurable;
use PhpTek\Exodus\Tool\StaticSiteUrlList;
use PhpTek\Exodus\Tool\StaticSiteUtils;

/**
 * Extends PHPCrawler essentially to override its handleDocumentInfo() method.
 *
 * @see {@link PHPCrawler}
 */
class StaticSiteCrawler extends PHPCrawler
{
    use Injectable;
    use Configurable;

    /**
     *
     * @var StaticSiteUrlList
     */
    protected $urlList;

    /**
     *
     * @var boolean
     */
    protected $verbose = false;

    /**
     * Holds the StaticSiteUtils object on construct
     *
     * @var StaticSiteUtils
     */
    protected $utils;

    /**
     * Set this by using the yml config system
     *
     * Example:
     * <code>
     * StaticSiteContentExtractor:
     *   log_file: ../logs/crawler-log.txt
     * </code>
     *
     * @var string
     */
    private static $log_file = null;

    /**
     *
     * @param StaticSiteUrlList $urlList
     * @param number $limit
     * @param boolean $verbose
     * @return void
     */
    public function __construct(StaticSiteUrlList $urlList, $limit = false, $verbose = false)
    {
        parent::__construct();

        $this->urlList = $urlList;
        $this->verbose = $verbose;
        $this->utils = singleton(StaticSiteUtils::class);
    }

    /**
     * After checking raw status codes out of PHPCrawler we continue to save each URL to our cache file.
     *
     * $PageInfo gives us:
     *
     * $PageInfo->url
     * $PageInfo->http_status_code
     * $PageInfo->links_found_url_descriptors
     *
     * @param PHPCrawlerDocumentInfo $PageInfo
     * @return int
     * @todo Can we make use of PHPCrawlerDocumentInfo#error_occured instead of manually checking server codes??
     * @todo The comments below state that badly formatted URLs never make it to our caching logic. Wrong!
     * - Pass the preg_replace() call for "fixing" $mossBracketRegex into StaticSiteUrlProcessor#postProcessUrl()
     * @todo Processor-specific logic (MOSS) should be ported into dedicated class under "Process" namespace
     */
    public function handleDocumentInfo(PHPCrawlerDocumentInfo $PageInfo): int
    {
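        // Note on return values (an assumption based on upstream PHPCrawl behaviour): only a
        // negative return from handleDocumentInfo() aborts the whole crawl, so both the
        // "skip" (1) and "cached" (0) paths below let the crawler continue with remaining URLs.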
        $info = $PageInfo; // upgraded phpcrawler compatibility
        /*
         * MOSS has many URLs with brackets, e.g. http://www.stuff.co.nz/news/cat-stuck-up-tree/(/
         * These result in 4xx response-codes returned from curl requests for them, and won't filter down to our
         * caching or URL Processor logic. We can "recover" these URLs by stripping the offending portion and
         * replacing it with a trailing slash. This allows us to fetch all the URL's children, if any.
         */
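        // Illustrative note: the check below simply looks for a literal "(" or its URL-encoded
        // form "%28" in the URL, so the example above (".../cat-stuck-up-tree/(/") is flagged
        // as recoverable even though the server responded with a 4xx status.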
        $isRecoverableUrl = (bool) preg_match('#(\(|%28)+(.+)?$#i', $info->url);
        // Ignore errors and redirects, they'll get logged for later analysis
        $badStatusCode = (($info->http_status_code < 200) || ($info->http_status_code > 299));
        /*
         * We're checking for a bad status code AND for "recoverability", because we might be able to recover the URL
         * when re-requesting it during the import stage, as long as we cache it correctly here.
         */
        if ($badStatusCode && !$isRecoverableUrl) {
            $message = $info->url . " Skipped. We got a bad status-code and URL was irrecoverable" . PHP_EOL;
            $this->utils->log($message);

            return 1;
        }

        // Continue building our cache
        $this->urlList->addAbsoluteURL($info->url, $info->content_type);
        $this->urlList->saveURLs();

        return 0;
    }

    /**
     *
     * @return void
     * @throws \InvalidArgumentException
     */
    protected function initCrawlerProcess(): void
    {
        parent::initCrawlerProcess();

        // Add additional URLs to crawl to the crawler's LinkCache
        // NOTE: This is using an undocumented API
        if ($extraURLs = $this->urlList->getExtraCrawlURLs()) {
            foreach ($extraURLs as $extraURL) {
                $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL));
            }
        }

        // Prevent URLs that match the exclude patterns from being fetched
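        // For example (hypothetical pattern): an exclude pattern of "/news/.*" is wrapped in pipe
        // delimiters below to form the PCRE "|/news/.*|", so any URL containing "/news/" is skipped
        // by the crawler. Literal pipes in a pattern are escaped first so they cannot terminate
        // the expression early.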
        if ($excludePatterns = $this->urlList->getExcludePatterns()) {
            foreach ($excludePatterns as $pattern) {
                $validRegExp = $this->addURLFilterRule('|' . str_replace('|', '\|', $pattern) . '|');

                if (!$validRegExp) {
                    throw new \InvalidArgumentException('Exclude url pattern "' . $pattern . '" is not a valid regular expression.');
                }
            }
        }
    }
}
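
/*
 * Minimal usage sketch (illustrative only: it assumes a pre-built StaticSiteUrlList
 * instance and the setURL()/go() methods inherited from the upstream PHPCrawl library):
 *
 *   $crawler = StaticSiteCrawler::create($urlList, 0, true);
 *   $crawler->setURL('http://www.example.org/');
 *   $crawler->go();
 *
 * The static create() factory comes from the Injectable trait.
 */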