| Total Complexity | 93 |
| Total Lines | 759 |
| Duplicated Lines | 0 % |
| Changes | 8 | ||
| Bugs | 2 | Features | 0 |
Complex classes like StaticSiteUrlList often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use StaticSiteUrlList, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 25 | class StaticSiteUrlList |
||
| 26 | { |
||
| 27 | use Injectable; |
||
| 28 | use Configurable; |
||
| 29 | |||
| 30 | /** |
||
| 31 | * @var string |
||
| 32 | */ |
||
| 33 | public const CRAWL_STATUS_COMPLETE = 'Complete'; |
||
| 34 | |||
| 35 | /** |
||
| 36 | * @var string |
||
| 37 | */ |
||
| 38 | public const CRAWL_STATUS_PARTIAL = 'Partial'; |
||
| 39 | |||
| 40 | /** |
||
| 41 | * @var string |
||
| 42 | */ |
||
| 43 | public const CRAWL_STATUS_NOTSTARTED = 'Not started'; |
||
| 44 | |||
| 45 | /** |
||
| 46 | * |
||
| 47 | * @var string |
||
| 48 | */ |
||
| 49 | private static $undefined_mime_type = 'unknown'; |
||
| 50 | |||
| 51 | /** |
||
| 52 | * |
||
| 53 | * @var string |
||
| 54 | */ |
||
| 55 | protected $baseURL; |
||
| 56 | |||
| 57 | /** |
||
| 58 | * |
||
| 59 | * @var string |
||
| 60 | */ |
||
| 61 | protected $cacheDir; |
||
| 62 | |||
| 63 | /** |
||
| 64 | * Two element array: contains keys 'inferred' and 'regular': |
||
| 65 | * - 'regular' is an array mapping raw URLs to processed URLs |
||
| 66 | * - 'inferred' is an array of inferred URLs |
||
| 67 | * |
||
| 68 | * @var array |
||
| 69 | */ |
||
| 70 | protected $urls = null; |
||
| 71 | |||
| 72 | /** |
||
| 73 | * |
||
| 74 | * @var boolean |
||
| 75 | */ |
||
| 76 | protected $autoCrawl = false; |
||
| 77 | |||
| 78 | /** |
||
| 79 | * |
||
| 80 | * @var StaticSiteUrlProcessor |
||
| 81 | */ |
||
| 82 | protected $urlProcessor = null; |
||
| 83 | |||
| 84 | /** |
||
| 85 | * |
||
| 86 | * @var array |
||
| 87 | */ |
||
| 88 | protected $extraCrawlURLs = null; |
||
| 89 | |||
| 90 | /** |
||
| 91 | * A list of regular expression patterns to exclude from scraping |
||
| 92 | * |
||
| 93 | * @var array |
||
| 94 | */ |
||
| 95 | protected $excludePatterns = []; |
||
| 96 | |||
| 97 | /** |
||
| 98 | * The StaticSiteContentSource object |
||
| 99 | * |
||
| 100 | * @var StaticSiteContentSource |
||
| 101 | */ |
||
| 102 | protected $source; |
||
| 103 | |||
/**
 * Create a new URL list for a content source.
 *
 * The base URL is stored with a single trailing slash removed, and the cache
 * directory is stored with a trailing slash appended when it lacks one.
 *
 * @param StaticSiteContentSource $source Content source; its BaseUrl field supplies the site root
 * @param string $cacheDir The local path to cache data into
 */
public function __construct(StaticSiteContentSource $source, $cacheDir)
{
    $this->source = $source;

    // baseURL must not have a trailing slash
    $siteRoot = $source->BaseUrl;
    $this->baseURL = (substr($siteRoot, -1) == "/") ? substr($siteRoot, 0, -1) : $siteRoot;

    // cacheDir must have a trailing slash
    $this->cacheDir = (substr($cacheDir, -1) != "/") ? $cacheDir . "/" : $cacheDir;
}
||
| 128 | |||
/**
 * Set (or clear) the URL processor for this URL list.
 *
 * When a processor is set, generateProcessedURL() passes every URL through it
 * before the result is stored. This can be used to transform URLs from CMSs
 * that don't provide a natural hierarchy into something more useful.
 *
 * @see StaticSiteMOSSURLProcessor for an example.
 * @param StaticSiteUrlProcessor|null $urlProcessor The processor to use, or null for none
 * @return void
 */
public function setUrlProcessor(?StaticSiteUrlProcessor $urlProcessor = null)
{
    // Explicit nullable type: implicit nullability via "= null" is deprecated in PHP 8.4
    $this->urlProcessor = $urlProcessor;
}
||
| 144 | |||
/**
 * Store a list of additional crawl URLs.
 *
 * These are meant to be crawled in addition to the base URL, which can help
 * when pages are being missed by the crawl.
 *
 * @param array $extraCrawlURLs
 * @return void
 */
public function setExtraCrawlURls($extraCrawlURLs)
{
    $this->extraCrawlURLs = $extraCrawlURLs;
}
||
| 157 | |||
/**
 * Get the additional crawl URLs previously stored on this list.
 *
 * @return array|null The stored URLs, or null if none have been set
 */
public function getExtraCrawlURLs()
{
    return $this->extraCrawlURLs;
}
||
| 167 | |||
/**
 * Store the regular expression patterns for URLs that should be kept out of
 * the url list.
 *
 * @param array $excludePatterns
 * @return void
 */
public function setExcludePatterns(array $excludePatterns)
{
    $this->excludePatterns = $excludePatterns;
}
||
| 179 | |||
/**
 * Get the regular expression patterns for URLs that should be kept out of
 * the url list.
 *
 * @return array
 */
public function getExcludePatterns()
{
    return $this->excludePatterns;
}
||
| 190 | |||
| 191 | /** |
||
| 192 | * Set whether the crawl should be triggered on demand. |
||
| 193 | * |
||
| 194 | * @param boolean $autoCrawl |
||
| 195 | * @return StaticSiteUrlList |
||
| 196 | */ |
||
| 197 | public function setAutoCrawl(bool $autoCrawl): StaticSiteUrlList |
||
| 202 | } |
||
| 203 | |||
/**
 * Report how far the spidering has got, judged by the files in the cache directory.
 *
 * - no 'urls' file: not started
 * - 'urls' and 'crawlerid' files both present: partial
 * - 'urls' file only: complete
 *
 * @return string One of the CRAWL_STATUS_* constants
 */
public function getSpiderStatus(): string
{
    if (!file_exists($this->cacheDir . 'urls')) {
        return self::CRAWL_STATUS_NOTSTARTED;
    }

    // A leftover crawler id alongside the URL cache marks an unfinished crawl
    return file_exists($this->cacheDir . 'crawlerid')
        ? self::CRAWL_STATUS_PARTIAL
        : self::CRAWL_STATUS_COMPLETE;
}
||
| 221 | |||
/**
 * Raw URL+Mime data accessor, used by logic outside of the class.
 *
 * Prefers the in-memory list when it is non-empty; otherwise unserializes the
 * 'urls' cache file if one exists.
 *
 * @return mixed The URL data, or null if no cached URL/Mime data was found
 */
public function getRawCacheData()
{
    // Don't rely on loadUrls() as it chokes on partially completed imports
    if ($this->urls) {
        return $this->urls;
    }

    $cacheFile = $this->cacheDir . 'urls';

    return file_exists($cacheFile)
        ? unserialize(file_get_contents($cacheFile))
        : null;
}
||
| 239 | |||
| 240 | /** |
||
| 241 | * Return the number of URLs crawled so far. If the urlcache is incomplete or |
||
| 242 | * doesn't exist, assumes zero. |
||
| 243 | * |
||
| 244 | * @return mixed integer |
||
| 245 | */ |
||
| 246 | public function getNumURIs(): int |
||
| 268 | } |
||
| 269 | |||
| 270 | /** |
||
| 271 | * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values |
||
| 272 | * |
||
| 273 | * @return array |
||
| 274 | */ |
||
| 275 | public function getProcessedURLs(): array |
||
| 299 | ); |
||
| 300 | } |
||
| 301 | } |
||
| 302 | |||
| 303 | /** |
||
| 304 | * There are URLs and we're not in the middle of a crawl. |
||
| 305 | * |
||
| 306 | * @return boolean |
||
| 307 | */ |
||
| 308 | public function hasCrawled(): bool |
||
| 311 | } |
||
| 312 | |||
/**
 * Load the URLs, either by fetching them from cache or by crawling.
 *
 * - If hasCrawled() reports true, the 'urls' cache file is unserialized and
 *   the 'regular' and 'inferred' lists are guaranteed to exist afterwards.
 * - Otherwise, if autoCrawl is enabled, a crawl is run.
 * - Otherwise a LogicException is thrown, unless isRunningTest() reports a test run.
 *
 * @return void
 * @throws \LogicException If no crawl has been executed and autoCrawl is false
 */
public function loadUrls(): void
{
    if ($this->hasCrawled()) {
        // NOTE(review): unserialize() trusts the cache file's contents; confirm the
        // cache directory is not writable by untrusted parties.
        $cached = unserialize(file_get_contents($this->cacheDir . 'urls'));

        // An unreadable or corrupt cache yields a non-array (e.g. false); start
        // from an empty list rather than indexing into that value.
        $this->urls = is_array($cached) ? $cached : [];

        // Clear out obsolete format
        if (!isset($this->urls['regular'])) {
            $this->urls['regular'] = [];
        }
        if (!isset($this->urls['inferred'])) {
            $this->urls['inferred'] = [];
        }

        return;
    }

    if ($this->autoCrawl) {
        $this->crawl();

        return;
    }

    // This is grim, but we get to keep the useful check
    if (!$this->isRunningTest()) {
        // This happens if you move a cache-file out of the way during a real (non-test) run...
        $msg = 'Crawl hasn\'t been executed yet and autoCrawl is false. Has the cache file been moved?';
        throw new \LogicException($msg);
    }
}
||
| 342 | |||
/**
 * Heuristic check for whether the code is executing under a test run.
 *
 * True when SS_BASE_URL equals 'http://localhost', or when the cache
 * directory path with every run of digits replaced by '0' exists on disk.
 *
 * @return boolean
 */
private function isRunningTest(): bool
{
    // Github tests have SS_BASE_URL set as follows
    if (Environment::getEnv('SS_BASE_URL') == 'http://localhost') {
        return true;
    }

    // Tests use "static-site-0" as cache dirname
    $zeroedCacheDir = preg_replace('#[0-9]+#', '0', $this->cacheDir);

    return file_exists($zeroedCacheDir);
}
||
| 355 | |||
/**
 * Run the URL processor again over every entry in the 'regular' list.
 *
 * Needed when the URL processor was changed after the site was crawled. The
 * 'inferred' list is emptied first and rebuilt as a side effect of the parent
 * back-filling, then the result is written out via saveURLs().
 *
 * @return void
 */
public function reprocessUrls()
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Inferred URLs are regenerated below by the parent back-filling
    $this->urls['inferred'] = [];

    foreach ($this->urls['regular'] as $rawURL => $storedData) {
        // Reprocess, in case the processing has changed since the last crawl
        $reprocessed = $this->generateProcessedURL($storedData);
        $this->urls['regular'][$rawURL] = $reprocessed;

        // Back-fill any missing parent of the newly processed URL
        $this->parentProcessedURL($reprocessed);
    }

    $this->saveURLs();
}
||
| 383 | |||
/**
 * Crawl the source site and cache the discovered URLs.
 *
 * Starts a fresh crawl, or resumes an earlier one when a 'crawlerid' file is
 * present in the cache directory. Once the crawler returns, the 'crawlerid'
 * marker is removed, both URL lists are sorted by key and the result is
 * written out via saveURLs().
 *
 * @param int|bool $limit Passed through to StaticSiteCrawler::create()
 * @param bool $verbose Passed through to StaticSiteCrawler::create()
 * @return StaticSiteCrawler The crawler that performed the crawl
 * @throws \Exception If the cache directory cannot be created
 */
public function crawl($limit = false, $verbose = false)
{
    Environment::increaseTimeLimitTo(3600);

    // Create the cache directory, including missing parents. The trailing is_dir()
    // tolerates another process creating it between the check and the mkdir().
    if (!is_dir($this->cacheDir) && !mkdir($this->cacheDir, 0777, true) && !is_dir($this->cacheDir)) {
        throw new \Exception('Unable to create cache directory at: ' . $this->cacheDir);
    }

    $crawler = StaticSiteCrawler::create($this, $limit, $verbose);
    $crawler->enableResumption();
    $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
    $crawler->setWorkingDirectory($this->cacheDir);

    // Find links in externally-linked CSS files
    if ($this->source->ParseCSS) {
        $crawler->addLinkSearchContentType("#text/css# i");
    }

    // Set some proxy options for phpCrawler
    singleton(StaticSiteUtils::class)->defineProxyOpts(!Director::isDev(), $crawler);

    // cacheDir always ends in a slash (see the constructor), so no separator is added
    $crawlerIdFile = $this->cacheDir . 'crawlerid';
    $urlCacheFile = $this->cacheDir . 'urls';
    $emptyList = [
        'regular' => [],
        'inferred' => [],
    ];

    if (file_exists($crawlerIdFile)) {
        // Resuming an incomplete crawl: re-load the partial list of URLs, if any
        $cached = file_exists($urlCacheFile)
            ? unserialize(file_get_contents($urlCacheFile))
            : null;

        // Fall back to empty lists for a missing/corrupt cache or missing keys,
        // so the ksort() calls below always receive arrays
        $this->urls = is_array($cached) ? $cached + $emptyList : $emptyList;

        $crawler->resume(file_get_contents($crawlerIdFile));
    } else {
        // Fresh crawl: record the crawler id so an interrupted run can be resumed
        file_put_contents($crawlerIdFile, $crawler->getCrawlerId());
        $this->urls = $emptyList;
    }

    $crawler->setURL($this->baseURL);
    $crawler->go();

    // The crawl ran to completion; drop the resume marker if it is still there
    if (file_exists($crawlerIdFile)) {
        unlink($crawlerIdFile);
    }

    // Keep both lists ordered by URL key before persisting them
    ksort($this->urls['regular']);
    ksort($this->urls['inferred']);

    $this->saveURLs();

    return $crawler;
}
||
| 452 | |||
| 453 | /** |
||
| 454 | * Cache the current list of URLs to disk. |
||
| 455 | * |
||
| 456 | * @return void |
||
| 457 | */ |
||
| 458 | public function saveURLs() |
||
| 461 | } |
||
| 462 | |||
/**
 * Add a URL to this list, given the absolute URL.
 *
 * The URL must belong to this list's base URL (compared after simplifyURL(),
 * so scheme and "www." differences are ignored). Its scheme and host are then
 * stripped and the remaining relative URL is handed to addURL().
 *
 * @param string $url The absolute URL
 * @param string $content_type The Mime-Type found at this URL e.g text/html or image/png
 * @throws \InvalidArgumentException If the URL is not under the base URL
 * @return void
 */
public function addAbsoluteURL($url, $content_type)
{
    $simplifiedURL = $this->simplifyURL($url);
    $simplifiedBase = $this->simplifyURL($this->baseURL);

    // Check we're adhering to the correct base URL
    if (substr($simplifiedURL, 0, strlen($simplifiedBase)) != $simplifiedBase) {
        throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
    }

    // Strip only the leading scheme + host. The pattern is anchored so that
    // absolute URLs embedded later on (e.g. in a query string) are left intact,
    // and case-insensitive to match what simplifyURL() accepts.
    $relURL = preg_replace('#^https?://[^/]+#i', '', $url);

    $this->addURL($relURL, $content_type);
}
||
| 485 | |||
/**
 * Process a URL and store it in the 'regular' list, keyed by the URL as given.
 *
 * Also triggers parent back-filling for the processed result.
 *
 * @param string $url
 * @param string $contentType
 * @return void
 */
public function addURL($url, $contentType)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Generate and save the processed URL data
    $processed = $this->generateProcessedURL([
        'url' => $url,
        'mime' => $contentType,
    ]);
    $this->urls['regular'][$url] = $processed;

    // Trigger parent URL back-filling
    $this->parentProcessedURL($processed);
}
||
| 510 | |||
/**
 * Store an inferred URL in the 'inferred' list, keyed by its processed URL.
 *
 * Since the unprocessed URL isn't available, the processed URL is used in its
 * place. This should be used with some caution.
 *
 * @param array $inferredURLData Contains the processed URL ('url') and Mime-Type to add
 * @return void
 */
public function addInferredURL($inferredURLData)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    $inferredKey = $inferredURLData['url'];
    $this->urls['inferred'][$inferredKey] = $inferredURLData;

    // Trigger parent URL back-filling
    $this->parentProcessedURL($inferredURLData);
}
||
| 532 | |||
/**
 * Return true if the given URL exists in the 'regular' or 'inferred' list.
 *
 * An absolute URL is first made relative to the base URL (ignoring scheme and
 * "www." differences, see simplifyURL()).
 *
 * @param string $url The URL, either absolute, or relative starting with "/"
 * @return boolean Does the URL exist
 * @throws \InvalidArgumentException If an absolute URL is not under the base URL
 */
public function hasURL($url)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Try and relativise an absolute URL. substr() is used rather than $url[0]
    // so that an empty string does not trigger an "uninitialized offset" warning.
    if (substr($url, 0, 1) != '/') {
        $simplifiedURL = $this->simplifyURL($url);
        $simplifiedBase = $this->simplifyURL($this->baseURL);

        if (substr($simplifiedURL, 0, strlen($simplifiedBase)) != $simplifiedBase) {
            throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
        }

        $url = substr($simplifiedURL, strlen($simplifiedBase));
    }

    // Both lists are keyed by URL (see addURL() / addInferredURL()), so look the
    // key up directly; searching the values would compare against URL-data arrays.
    return isset($this->urls['regular'][$url]) || isset($this->urls['inferred'][$url]);
}
||
| 560 | |||
/**
 * Simplify a URL so that http/https and "www." / non-"www." variants compare equal.
 *
 * A leading "http://" or "https://" (any letter case), optionally followed by
 * "www.", is replaced with "http://www.". URLs without such a prefix are
 * returned unchanged.
 *
 * @param string $url
 * @return string
 * @todo Why does this ignore https/http differences? Should it?
 */
public function simplifyURL($url)
{
    // The dot is escaped so that only a literal "www." prefix is normalised,
    // not hosts that merely start with "www" (e.g. "wwwx.example.com").
    return preg_replace('#^http(s)?://(www\.)?#i', 'http://www.', $url);
}
||
| 572 | |||
/**
 * Returns true if the given URL is a key of the 'regular' or 'inferred' list.
 *
 * @param string $processedURL The processed URL
 * @return boolean True if it exists, false otherwise
 */
public function hasProcessedURL($processedURL)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Only strings and integers can be array keys
    if (!is_string($processedURL) && !is_int($processedURL)) {
        return false;
    }

    // Direct key lookups instead of scanning array_keys(): this runs once per
    // URL during parent back-filling, so a linear scan grows quadratically.
    // The "?? []" keeps the lookup safe when no list could be loaded.
    return array_key_exists($processedURL, $this->urls['regular'] ?? [])
        || array_key_exists($processedURL, $this->urls['inferred'] ?? []);
}
||
| 588 | |||
/**
 * Work out the parent of a processed URL, registering it as an inferred URL
 * when it is not already known.
 *
 * The parent is everything before the last '?' or '/' in the URL. The result
 * carries the child's Mime-Type only when the mime processor reports it as
 * HTML; otherwise the configured undefined Mime-Type is used.
 *
 * @param string|array $processedURLData A processed URL, or URLData with 'url' and 'mime' keys
 * @return array URLData ('url' and 'mime') describing the parent
 */
public function parentProcessedURL($processedURLData)
{
    if (is_array($processedURLData)) {
        /*
         * If the child is not HTML, it's unlikely its parent
         * is anything useful (Prob just a directory)
         */
        $mimeProcessor = singleton(StaticSiteMimeProcessor::class);
        $childMime = $processedURLData['mime'];
        $mime = $mimeProcessor->IsOfHtml($childMime) ? $childMime : self::$undefined_mime_type;
        $processedURL = $processedURLData['url'];
    } else {
        $mime = self::$undefined_mime_type;
        $processedURL = $processedURLData;
    }

    // The root's parent is reported as an empty URL
    if ($processedURL == "/") {
        return [
            'url' => '',
            'mime' => $mime,
        ];
    }

    // URL hierarchy can be broken down by querystring or by URL
    $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/'));

    // Special case for children of the root
    if ($breakpoint == 0) {
        return [
            'url' => '/',
            'mime' => $mime,
        ];
    }

    $parentData = [
        'url' => substr($processedURL, 0, $breakpoint),
        'mime' => $mime,
    ];

    // If an intermediary URL doesn't exist, create it
    if (!$this->hasProcessedURL($parentData['url'])) {
        $this->addInferredURL($parentData);
    }

    return $parentData;
}
||
| 646 | |||
/**
 * Look up the stored URLData for a URL, checking the 'regular' list first and
 * the 'inferred' list second.
 *
 * A 'regular' entry stored as boolean true is generated on the fly and cached.
 *
 * @param string|array $urlData A URL, or URLData with 'url' and 'mime' keys
 * @return array|null The stored URLData, or null when the URL is in neither list
 */
public function processedURL($urlData)
{
    if (is_array($urlData)) {
        $url = $urlData['url'];
        $mime = $urlData['mime'];
    } else {
        $url = $urlData;
        $mime = self::$undefined_mime_type;
    }

    if ($this->urls === null) {
        $this->loadUrls();
    }

    if (isset($this->urls['regular'][$url])) {
        // Generate it if missing
        if ($this->urls['regular'][$url] === true) {
            $this->urls['regular'][$url] = $this->generateProcessedURL([
                'url' => $url,
                'mime' => $mime,
            ]);
        }

        return $this->urls['regular'][$url];
    }

    if (isset($this->urls['inferred'][$url])) {
        return $this->urls['inferred'][$url];
    }

    return null;
}
||
| 683 | |||
/**
 * Run the configured URL processor, if any, over a piece of URLData.
 *
 * This is the hook for custom logic applied before hierarchy generation, such
 * as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions. With
 * no processor set, the URLData is returned as given.
 *
 * @param array $urlData The unprocessed URLData; must have a 'url' entry
 * @return array The processed URLData
 * @throws \LogicException If 'url' is not set, or the processor returns an empty result
 */
public function generateProcessedURL(array $urlData): array
{
    if (!isset($urlData['url'])) {
        throw new \LogicException("Can't pass a blank URL to generateProcessedURL");
    }

    // Without a processor the input already holds a 'url', so it cannot be blank
    if (!$this->urlProcessor) {
        return $urlData;
    }

    $processed = $this->urlProcessor->processURL($urlData);

    if (!$processed) {
        throw new \LogicException(get_class($this->urlProcessor) . " returned a blank URL.");
    }

    return $processed;
}
||
| 709 | |||
/**
 * Return the URLs that are direct children of the given URL.
 *
 * The URL is resolved through processedURL() (falling back to '/' when it is
 * unknown). A child is any key of the 'regular' or 'inferred' list made up of
 * that URL followed by exactly one more '/'- or '?'-separated segment.
 *
 * @param string $url
 * @return array List of matching URL keys, without duplicates
 */
public function getChildren($url)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    $parentData = $this->processedURL($url);
    $parentURL = $parentData['url'] ?? '/';

    // Subtly different regex if the URL ends in '?' or '/': the separator is
    // then already part of the parent and must not be required again
    $separator = preg_match('#[/?]$#', $parentURL) ? '' : '[/?]';
    $regEx = '#^' . preg_quote($parentURL, '#') . $separator . '[^/?]+$#';

    $children = [];

    // Regular URLs first, then inferred ones; only the keys are of interest
    foreach (['regular', 'inferred'] as $listName) {
        foreach ($this->urls[$listName] as $candidate => $unusedData) {
            if (preg_match($regEx, $candidate) && !isset($children[$candidate])) {
                $children[$candidate] = $candidate;
            }
        }
    }

    return array_values($children);
}
||
| 754 | |||
/**
 * Simple property getter. Used in unit-testing.
 *
 * @param string $prop Name of the property to read
 * @return mixed The property's value when it is truthy, otherwise null
 */
public function getProperty($prop)
{
    // Falsy values (false, null, '', 0, []) are all reported as null
    return $this->$prop ?: null;
}
||
| 767 | |||
| 768 | /** |
||
| 769 | * Get the serialized cache content and return the unserialized string |
||
| 770 | * |
||
| 771 | * @todo implement to replace x3 refs to unserialize(file_get_contents($this->cacheDir . 'urls')); |
||
| 772 | * @return string |
||
| 773 | */ |
||
| 774 | public function getCacheFileContents() |
||
| 784 | } |
||
| 785 | } |
||
| 786 |