<?php

namespace PhpTek\Exodus\Crawl;

use PHPCrawl\PHPCrawler;
use PHPCrawl\PHPCrawlerDocumentInfo;
use PHPCrawl\PHPCrawlerURLDescriptor;
use SilverStripe\Core\Injector\Injectable;
use SilverStripe\Core\Config\Configurable;
use PhpTek\Exodus\Tool\StaticSiteUrlList;
use PhpTek\Exodus\Tool\StaticSiteUtils;

/**
 * Extends PHPCrawler, essentially to override its handleDocumentInfo() method.
 *
 * @see {@link PHPCrawler}
 */
class StaticSiteCrawler extends PHPCrawler
{
    use Injectable;
    use Configurable;

    /**
     * @var StaticSiteUrlList
     */
    protected $urlList;

    /**
     * @var boolean
     */
    protected $verbose = false;

    /**
     * Holds the StaticSiteUtils object, set on construct.
     *
     * @var StaticSiteUtils
     */
    protected $utils;

    /**
     * Set this via the YML config system.
     *
     * Example:
     * <code>
     * StaticSiteContentExtractor:
     *   log_file: ../logs/crawler-log.txt
     * </code>
     *
     * @var string
     */
    private static $log_file = null;

    /**
     * @param StaticSiteUrlList $urlList
     * @param int|bool $limit
     * @param boolean $verbose
     * @return void
     */
    public function __construct(StaticSiteUrlList $urlList, $limit = false, $verbose = false)
    {
        parent::__construct();

        $this->urlList = $urlList;
        $this->verbose = $verbose;
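        // singleton() resolves StaticSiteUtils via SilverStripe's Injector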
        $this->utils = singleton(StaticSiteUtils::class);
    }

    /**
     * After checking raw status-codes out of PHPCrawler, we continue to save each URL to our cache file.
     *
     * $PageInfo gives us:
     *
     * $PageInfo->url
     * $PageInfo->http_status_code
     * $PageInfo->links_found_url_descriptors
     *
     * @param PHPCrawlerDocumentInfo $PageInfo
     * @return int
     * @todo Can we make use of PHPCrawlerDocumentInfo#error_occured instead of manually checking server codes?
     * @todo The comments below state that badly formatted URLs never make it to our caching logic. Wrong!
     *  - Pass the preg_replace() call for "fixing" $mossBracketRegex into StaticSiteUrlProcessor#postProcessUrl()
     * @todo Processor-specific logic (MOSS) should be ported into a dedicated class under the "Process" namespace
     */
    public function handleDocumentInfo(PHPCrawlerDocumentInfo $PageInfo): int
    {
        $info = $PageInfo; // upgraded phpcrawler compatibility
        /*
         * MOSS has many URLs with brackets, e.g. http://www.stuff.co.nz/news/cat-stuck-up-tree/(/
         * These produce 4xx response-codes from curl requests, and won't filter down to our
         * caching or URL Processor logic. We can "recover" these URLs by stripping the bracket and
         * replacing it with a trailing slash. This allows us to fetch all the URL's children, if any.
         */
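        // Matches a literal "(" or its URL-encoded form "%28", plus anything following it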
        $isRecoverableUrl = (bool) preg_match('#(\(|%28)+(.+)?$#i', $info->url);
        // Ignore errors and redirects; they'll get logged for later analysis
        $badStatusCode = (($info->http_status_code < 200) || ($info->http_status_code > 299));
        /*
         * We're checking for a bad status-code AND for "recoverability", because we might be able to recover the URL
         * when re-requesting it during the import stage, as long as we cache it correctly here.
         */
        if ($badStatusCode && !$isRecoverableUrl) {
            $message = $info->url . " Skipped. We got a bad status-code and the URL was irrecoverable" . PHP_EOL;
            $this->utils->log($message);

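            // In PHPCrawler, only a negative return value from handleDocumentInfo()
            // aborts the whole crawl, so returning 1 here merely skips caching this document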
            return 1;
        }

        // Continue building our cache
        $this->urlList->addAbsoluteURL($info->url, $info->content_type);
        $this->urlList->saveURLs();

        return 0;
    }

    /**
     * @return void
     * @throws \InvalidArgumentException
     */
    protected function initCrawlerProcess(): void
    {
        parent::initCrawlerProcess();

        // Add additional URLs to crawl to the crawler's LinkCache
        // NOTE: This is using an undocumented API
        if ($extraURLs = $this->urlList->getExtraCrawlURLs()) {
            foreach ($extraURLs as $extraURL) {
                $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL));
            }
        }

        // Prevent URLs that match the exclude patterns from being fetched
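        // Each pattern is wrapped in "|" delimiters below, so any literal pipe in a
        // pattern is escaped first to keep the assembled regex valid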
        if ($excludePatterns = $this->urlList->getExcludePatterns()) {
            foreach ($excludePatterns as $pattern) {
                $validRegExp = $this->addURLFilterRule('|' . str_replace('|', '\|', $pattern) . '|');

                if (!$validRegExp) {
                    throw new \InvalidArgumentException('Exclude URL pattern "' . $pattern . '" is not a valid regular expression.');
                }
            }
        }
    }
}
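
/*
 * Example usage: a minimal sketch only, not part of the original class. It
 * assumes $urlList is a fully-configured StaticSiteUrlList built elsewhere,
 * and "http://example.org/" is a placeholder for the site being crawled.
 *
 * <code>
 * $crawler = StaticSiteCrawler::create($urlList, false, true); // create() comes from Injectable
 * $crawler->setURL('http://example.org/'); // setURL() is inherited from PHPCrawler
 * $crawler->go(); // runs the crawl; handleDocumentInfo() fires once per fetched document
 * </code>
 */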