| Metric | Value |
|---|---|
| Total Complexity | 98 |
| Total Lines | 784 |
| Duplicated Lines | 0 % |
| Changes | 8 |
| Bugs | 2 |
| Features | 0 |
Complex classes like StaticSiteUrlList often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use StaticSiteUrlList and, based on those observations, apply Extract Interface too. A minimal sketch of such an extraction follows.
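For instance, the cache-handling members (cacheDir, getRawCacheData(), saveURLs(), getCacheFileContents()) form one cohesive group. A minimal sketch of extracting them, with a hypothetical class name and method names, might look like this:

```php
<?php

// Hypothetical extraction of the cache-handling concern out of
// StaticSiteUrlList. The class and method names are illustrative only.
class StaticSiteUrlCache
{
    /**
     * @var string Local path the URL data is cached into (trailing slash)
     */
    protected $cacheDir;

    public function __construct(string $cacheDir)
    {
        $this->cacheDir = rtrim($cacheDir, '/') . '/';
    }

    /**
     * Read the cached URL/Mime data, or null if none exists yet.
     */
    public function read(): ?array
    {
        $cacheFile = $this->cacheDir . 'urls';

        if (!file_exists($cacheFile)) {
            return null;
        }

        return unserialize(file_get_contents($cacheFile));
    }

    /**
     * Persist the URL/Mime data to disk.
     */
    public function write(array $urls): void
    {
        file_put_contents($this->cacheDir . 'urls', serialize($urls));
    }
}
```

StaticSiteUrlList would then delegate to this object instead of repeating the unserialize(file_get_contents(...)) idiom three times, which is what the @todo on getCacheFileContents() below already asks for.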
```php
<?php

// ...

class StaticSiteUrlList
{
    use Injectable;
    use Configurable;

    /**
     * @var string
     */
    public const CRAWL_STATUS_COMPLETE = 'Complete';

    /**
     * @var string
     */
    public const CRAWL_STATUS_PARTIAL = 'Partial';

    /**
     * @var string
     */
    public const CRAWL_STATUS_NOTSTARTED = 'Not started';

    /**
     * @var string
     */
    private static $undefined_mime_type = 'unknown/unknown';

    /**
     * @var string
     */
    protected $baseURL;

    /**
     * @var string
     */
    protected $cacheDir;

    /**
     * Two element array: contains keys 'inferred' and 'regular':
     *  - 'regular' is an array mapping raw URLs to processed URLs
     *  - 'inferred' is an array of inferred URLs
     *
     * @var array
     */
    protected $urls = null;

    /**
     * @var boolean
     */
    protected $autoCrawl = false;

    /**
     * @var StaticSiteUrlProcessor
     */
    protected $urlProcessor = null;

    /**
     * @var array
     */
    protected $extraCrawlURLs = null;

    /**
     * A list of regular expression patterns to exclude from scraping
     *
     * @var array
     */
    protected $excludePatterns = [];

    /**
     * The StaticSiteContentSource object
     *
     * @var StaticSiteContentSource
     */
    protected $source;

    /**
     * Create a new URL List
     *
     * @param StaticSiteContentSource $source
     * @param string $cacheDir The local path to cache data into
     * @return void
     */
    public function __construct(StaticSiteContentSource $source, $cacheDir)
    {
        $this->setIsRunningTest();

        // baseURL must not have a trailing slash
        $baseURL = $source->BaseUrl;

        if (substr($baseURL, -1) == "/") {
            $baseURL = substr($baseURL, 0, -1);
        }

        // cacheDir must have a trailing slash
        if (substr($cacheDir, -1) != "/") {
            $cacheDir .= "/";
        }

        $this->baseURL = $baseURL;
        $this->cacheDir = $cacheDir;
        $this->source = $source;
    }
```
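For reference, the cache that $cacheDir points at holds the two-element structure described in the $urls docblock. A representative snapshot, with hypothetical URLs:

```php
// Illustrative only: a snapshot of the $this->urls structure. Both
// sub-arrays hold URLData arrays of 'url' and 'mime'; 'regular' is
// keyed by the raw crawled URL, 'inferred' by the processed URL itself.
$urls = [
    'regular' => [
        '/about-us' => ['url' => '/about-us', 'mime' => 'text/html'],
        '/about-us/team' => ['url' => '/about-us/team', 'mime' => 'text/html'],
    ],
    'inferred' => [
        '/assets' => ['url' => '/assets', 'mime' => 'unknown/unknown'],
    ],
];
```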
```php
    /**
     * Set a URL processor for this URL List.
     *
     * URL processors process the URLs before the site hierarchy and any inferred metadata are generated.
     * These can be used to transform URLs from CMSs that don't provide a natural hierarchy into something
     * more useful.
     *
     * @see StaticSiteMOSSURLProcessor for an example.
     * @param StaticSiteUrlProcessor $urlProcessor
     * @return void
     */
    public function setUrlProcessor(StaticSiteUrlProcessor $urlProcessor = null)
    {
        $this->urlProcessor = $urlProcessor;
    }
```
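Based on how generateProcessedURL() invokes it further down, a processor receives and returns a URLData array. A minimal sketch of a custom processor, assuming (rather than showing) the shape of the StaticSiteUrlProcessor interface:

```php
// A hypothetical URL processor, assuming StaticSiteUrlProcessor exposes
// a processURL() method that takes and returns a URLData array, as
// generateProcessedURL() implies.
class StripPagesSegmentProcessor implements StaticSiteUrlProcessor
{
    public function processURL(array $urlData)
    {
        // Drop the "/Pages/" path segment that MOSS inserts into URLs
        $urlData['url'] = str_replace('/Pages/', '/', $urlData['url']);

        return $urlData;
    }
}
```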
| 147 | |||
| 148 | /** |
||
| 149 | * Define additional crawl URLs as an array |
||
| 150 | * Each of these URLs will be crawled in addition the base URL. |
||
| 151 | * This can be helpful if pages are getting missed by the crawl |
||
| 152 | * |
||
| 153 | * @param array $extraCrawlURLs |
||
| 154 | * @return void |
||
| 155 | */ |
||
| 156 | public function setExtraCrawlURls($extraCrawlURLs) |
||
| 157 | { |
||
| 158 | $this->extraCrawlURLs = $extraCrawlURLs; |
||
| 159 | } |
||
| 160 | |||
| 161 | /** |
||
| 162 | * Return the additional crawl URLs as an array |
||
| 163 | * |
||
| 164 | * @return array |
||
| 165 | */ |
||
| 166 | public function getExtraCrawlURLs() |
||
| 167 | { |
||
| 168 | return $this->extraCrawlURLs; |
||
| 169 | } |
||
| 170 | |||
| 171 | /** |
||
| 172 | * Set an array of regular expression patterns that should be excluded from |
||
| 173 | * being added to the url list. |
||
| 174 | * |
||
| 175 | * @param array $excludePatterns |
||
| 176 | * @return void |
||
| 177 | */ |
||
| 178 | public function setExcludePatterns(array $excludePatterns) |
||
| 179 | { |
||
| 180 | $this->excludePatterns = $excludePatterns; |
||
| 181 | } |
||
| 182 | |||
| 183 | /** |
||
| 184 | * Get an array of regular expression patterns that should not be added to |
||
| 185 | * the url list. |
||
| 186 | * |
||
| 187 | * @return array |
||
| 188 | */ |
||
| 189 | public function getExcludePatterns() |
||
| 190 | { |
||
| 191 | return $this->excludePatterns; |
||
| 192 | } |
||
| 193 | |||
| 194 | /** |
||
| 195 | * Set whether the crawl should be triggered on demand. |
||
| 196 | * |
||
| 197 | * @param boolean $autoCrawl |
||
| 198 | * @return StaticSiteUrlList |
||
| 199 | */ |
||
| 200 | public function setAutoCrawl(bool $autoCrawl): StaticSiteUrlList |
||
| 201 | { |
||
| 202 | $this->autoCrawl = $autoCrawl; |
||
| 203 | |||
| 204 | return $this; |
||
| 205 | } |
||
| 206 | |||
| 207 | /** |
||
| 208 | * Returns the status of the spidering. |
||
| 209 | * |
||
| 210 | * @return string |
||
| 211 | */ |
||
| 212 | public function getSpiderStatus(): string |
||
| 213 | { |
||
| 214 | if (file_exists($this->cacheDir . 'urls')) { |
||
| 215 | if (file_exists($this->cacheDir . 'crawlerid')) { |
||
| 216 | return self::CRAWL_STATUS_PARTIAL; |
||
| 217 | } |
||
| 218 | |||
| 219 | return self::CRAWL_STATUS_COMPLETE; |
||
| 220 | } |
||
| 221 | |||
| 222 | return self::CRAWL_STATUS_NOTSTARTED; |
||
| 223 | } |
||
| 224 | |||
| 225 | /** |
||
| 226 | * Raw URL+Mime data accessor method, used internally by logic outside of the class. |
||
| 227 | * |
||
| 228 | * @return mixed string $urls | null if no cached URL/Mime data found |
||
| 229 | */ |
||
| 230 | public function getRawCacheData() |
||
| 231 | { |
||
| 232 | if ($this->urls) { |
||
| 233 | // Don't rely on loadUrls() as it chokes on partially completed imports |
||
| 234 | $urls = $this->urls; |
||
| 235 | } elseif (file_exists($this->cacheDir . 'urls')) { |
||
| 236 | $urls = unserialize(file_get_contents($this->cacheDir . 'urls')); |
||
| 237 | } else { |
||
| 238 | return null; |
||
| 239 | } |
||
| 240 | return $urls; |
||
| 241 | } |
||
| 242 | |||
| 243 | /** |
||
| 244 | * Return the number of URLs crawled so far. If the urlcache is incomplete or |
||
| 245 | * doesn't exist, assumes zero. |
||
| 246 | * |
||
| 247 | * @return mixed integer |
||
| 248 | */ |
||
| 249 | public function getNumURIs(): int |
||
| 250 | { |
||
| 251 | if (!$urls = $this->getRawCacheData()) { |
||
| 252 | return 0; |
||
| 253 | } |
||
| 254 | |||
| 255 | if (!isset($urls['regular']) || !isset($urls['regular'])) { |
||
| 256 | return 0; |
||
| 257 | } |
||
| 258 | |||
| 259 | $_regular = []; |
||
| 260 | $_inferred = []; |
||
| 261 | |||
| 262 | foreach ($urls['regular'] as $key => $urlData) { |
||
| 263 | array_push($_regular, $urlData['url']); |
||
| 264 | } |
||
| 265 | |||
| 266 | foreach ($urls['inferred'] as $key => $urlData) { |
||
| 267 | array_push($_inferred, $urlData['url']); |
||
| 268 | } |
||
| 269 | |||
| 270 | return count(array_unique($_regular)) + count($_inferred); |
||
| 271 | } |
||
| 272 | |||
| 273 | /** |
||
| 274 | * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values |
||
| 275 | * |
||
| 276 | * @return array |
||
| 277 | */ |
||
| 278 | public function getProcessedURLs(): array |
||
| 279 | { |
||
| 280 | if ($this->hasCrawled() || $this->autoCrawl) { |
||
| 281 | if ($this->urls === null) { |
||
| 282 | $this->loadUrls(); |
||
| 283 | } |
||
| 284 | |||
| 285 | $_regular = []; |
||
| 286 | $_inferred = null; |
||
| 287 | |||
| 288 | foreach ($this->urls['regular'] as $key => $urlData) { |
||
| 289 | $_regular[$key] = $urlData['url']; |
||
| 290 | } |
||
| 291 | |||
| 292 | if ($this->urls['inferred']) { |
||
| 293 | $_inferred = []; |
||
| 294 | foreach ($this->urls['inferred'] as $key => $urlData) { |
||
| 295 | $_inferred[$key] = $urlData['url']; |
||
| 296 | } |
||
| 297 | } |
||
| 298 | |||
| 299 | return array_merge( |
||
| 300 | $_regular, |
||
| 301 | $_inferred ? array_combine($_inferred, $_inferred) : [] |
||
| 302 | ); |
||
| 303 | } |
||
| 304 | } |
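Note that the merged map mixes two key styles: regular entries are keyed by their raw URL, while inferred entries, which have no raw form, are keyed by themselves. A hypothetical result:

```php
// Illustrative only: a possible return value of getProcessedURLs().
$map = [
    '/Pages/about-us.aspx' => '/about-us', // regular, rewritten by a URL processor
    '/about-us'            => '/about-us', // inferred parent, self-keyed
];
```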
```php
    /**
     * There are URLs and we're not in the middle of a crawl.
     *
     * @return boolean
     */
    public function hasCrawled(): bool
    {
        return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid');
    }

    /**
     * Load the URLs, either by crawling, or by fetching from cache.
     *
     * @return void
     * @throws \LogicException
     */
    public function loadUrls(): void
    {
        if ($this->hasCrawled()) {
            $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));

            // Ensure both sub-arrays exist; caches in the obsolete format may lack them
            if (!isset($this->urls['regular'])) {
                $this->urls['regular'] = [];
            }
            if (!isset($this->urls['inferred'])) {
                $this->urls['inferred'] = [];
            }
        } elseif ($this->autoCrawl) {
            $this->crawl();
        } else {
            // This happens if you move a cache-file out of the way during a real (non-test) run...
            $msg = 'Crawl hasn\'t been executed yet and autoCrawl is false. Has the cache file been moved?';
            throw new \LogicException($msg);
        }
    }

    /**
     * @return void
     */
    private function setIsRunningTest(): void
    {
        $isGithub = Environment::getEnv('SS_BASE_URL') == 'http://localhost'; // GitHub tests have SS_BASE_URL set

        if ($isGithub && !file_exists(ASSETS_PATH)) {
            mkdir(ASSETS_PATH, 0777, true);
        }
    }

    /**
     * Re-execute the URL processor on all the fetched URLs.
     * If the site has been crawled and then subsequently the URLProcessor was changed through
     * user-interaction in the "external content" CMS admin, then we need to ensure that
     * URLs are re-processed using the newly selected URL Preprocessor.
     *
     * @return void
     */
    public function reprocessUrls()
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Clear out all inferred URLs; these will be re-added as the regular URLs are reprocessed
        $this->urls['inferred'] = [];

        // Reprocess URLs, in case the processing has changed since the last crawl
        foreach ($this->urls['regular'] as $url => $urlData) {
            // TODO Log this in exodus.log
            if (empty($urlData['url'])) {
                continue;
            }

            $processedURLData = $this->generateProcessedURL($urlData);
            $this->urls['regular'][$url] = $processedURLData;

            // Trigger parent URL back-filling on new processed URL
            $this->parentProcessedURL($processedURLData);
        }

        $this->saveURLs();
    }

    /**
     * Crawl the site, or resume a partial crawl.
     *
     * @param int|false $limit
     * @param bool $verbose
     * @return StaticSiteCrawler
     * @throws \Exception
     */
    public function crawl($limit = false, $verbose = false)
    {
        Environment::increaseTimeLimitTo(3600);

        if (!is_dir($this->cacheDir)) {
            if (!mkdir($this->cacheDir)) {
                throw new \Exception('Unable to create cache directory at: ' . $this->cacheDir);
            }
        }

        $crawler = StaticSiteCrawler::create($this, $limit, $verbose);
        $crawler->enableResumption();
        $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
        $crawler->setWorkingDirectory($this->cacheDir);

        // Find links in externally-linked CSS files
        if ($this->source->ParseCSS) {
            $crawler->addLinkSearchContentType("#text/css# i");
        }

        // Set some proxy options for phpCrawler
        singleton(StaticSiteUtils::class)->defineProxyOpts(!Director::isDev(), $crawler);

        // Allow for resuming an incomplete crawl
        if (file_exists($this->cacheDir . 'crawlerid')) {
            // We should re-load the partial list of URLs, if relevant.
            // This should only happen when we are resuming a partial crawl.
            if (file_exists($this->cacheDir . 'urls')) {
                $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));
            } else {
                $this->urls = [
                    'regular' => [],
                    'inferred' => [],
                ];
            }

            $crawlerID = file_get_contents($this->cacheDir . 'crawlerid');
            $crawler->resume($crawlerID);
        } else {
            $crawlerID = $crawler->getCrawlerId();

            $this->urls = [
                'regular' => [],
                'inferred' => [],
            ];
        }

        $crawler->setURL($this->baseURL);
        $crawler->setPort(preg_match('#^https#', $this->baseURL) ? 443 : 80);
        $crawler->go();

        // TODO Why were we deleting this originally?
        // unlink($this->cacheDir . 'crawlerid');

        // TODO Document these
        ksort($this->urls['regular']);
        ksort($this->urls['inferred']);

        $this->saveURLs();

        return $crawler;
    }
```
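Because the crawler writes a crawlerid file while it runs, an interrupted import can be resumed simply by calling crawl() again. A hypothetical invocation (the cache path is illustrative; $source is assumed to be a configured StaticSiteContentSource):

```php
// Illustrative usage only.
$urlList = StaticSiteUrlList::create($source, ASSETS_PATH . '/static-site-cache/');
$urlList->setAutoCrawl(true);

// Crawl up to 500 URLs, with verbose output. If a 'crawlerid' file is
// left over from an interrupted run, crawl() resumes it automatically.
$crawler = $urlList->crawl(500, true);

echo $urlList->getSpiderStatus() . "\n"; // "Complete" or "Partial"
```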
| 458 | |||
| 459 | /** |
||
| 460 | * Cache the current list of URLs to disk. |
||
| 461 | * |
||
| 462 | * @return void |
||
| 463 | */ |
||
| 464 | public function saveURLs() |
||
| 467 | } |
||
| 468 | |||
| 469 | /** |
||
| 470 | * Add a URL to this list, given the absolute URL. |
||
| 471 | * |
||
| 472 | * @param string $url The absolute URL |
||
| 473 | * @param string $content_type The Mime-Type found at this URL e.g text/html or image/png |
||
| 474 | * @throws \InvalidArgumentException |
||
| 475 | * @return void |
||
| 476 | */ |
||
| 477 | public function addAbsoluteURL($url, $content_type) |
||
| 478 | { |
||
| 479 | $simplifiedURL = $this->simplifyURL($url); |
||
| 480 | $simplifiedBase = $this->simplifyURL($this->baseURL); |
||
| 481 | |||
| 482 | // Check we're adhering to the correct base URL |
||
| 483 | if (substr($simplifiedURL, 0, strlen($simplifiedBase)) == $simplifiedBase) { |
||
| 484 | $relURL = preg_replace("#https?://(www.)?[^/]+#", '', $url); |
||
| 485 | } else { |
||
| 486 | throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL"); |
||
| 487 | } |
||
| 488 | |||
| 489 | $this->addURL($relURL, $content_type); |
||
| 490 | } |
||
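The effect is to relativise any on-site URL before it is stored. Assuming a baseURL of https://example.org, a hypothetical call:

```php
// Illustrative only, assuming the list's baseURL is https://example.org
$urlList->addAbsoluteURL('https://www.example.org/about-us/team', 'text/html');
// ...stores the relative URL '/about-us/team' via addURL(), because
// simplifyURL() treats http/https and www/non-www variants as equal.

// An off-site URL is rejected instead:
$urlList->addAbsoluteURL('https://elsewhere.net/page', 'text/html');
// throws \InvalidArgumentException
```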
| 491 | |||
| 492 | /** |
||
| 493 | * Appends a processed URL onto the URL cache. |
||
| 494 | * |
||
| 495 | * @param string $url |
||
| 496 | * @param string $contentType |
||
| 497 | * @return mixed null|void |
||
| 498 | */ |
||
| 499 | public function addURL($url, $contentType) |
||
| 500 | { |
||
| 501 | if ($this->urls === null) { |
||
| 502 | $this->loadUrls(); |
||
| 503 | } |
||
| 504 | |||
| 505 | if (empty($url)) { |
||
| 506 | return null; |
||
| 507 | } |
||
| 508 | |||
| 509 | // Generate and save the processed URLs |
||
| 510 | $urlData = [ |
||
| 511 | 'url' => $url, |
||
| 512 | 'mime' => $contentType, |
||
| 513 | ]; |
||
| 514 | |||
| 515 | $this->urls['regular'][$url] = $this->generateProcessedURL($urlData); |
||
| 516 | |||
| 517 | // Trigger parent URL back-filling |
||
| 518 | $this->parentProcessedURL($this->urls['regular'][$url]); |
||
| 519 | } |
||
| 520 | |||
| 521 | /** |
||
| 522 | * Add an inferred URL to the list. |
||
| 523 | * |
||
| 524 | * Since the unprocessed URL isn't available, we use the processed URL in its place. |
||
| 525 | * This should be used with some caution. |
||
| 526 | * |
||
| 527 | * @param array $inferredURLData Contains the processed URL and Mime-Type to add |
||
| 528 | * @return void |
||
| 529 | */ |
||
| 530 | public function addInferredURL($inferredURLData) |
||
| 531 | { |
||
| 532 | if ($this->urls === null) { |
||
| 533 | $this->loadUrls(); |
||
| 534 | } |
||
| 535 | |||
| 536 | // Generate and save the processed URLs |
||
| 537 | $this->urls['inferred'][$inferredURLData['url']] = $inferredURLData; |
||
| 538 | |||
| 539 | // Trigger parent URL back-filling |
||
| 540 | $this->parentProcessedURL($inferredURLData); |
||
| 541 | } |
||
| 542 | |||
| 543 | /** |
||
| 544 | * Return true if the given URL exists. |
||
| 545 | * |
||
| 546 | * @param string $url The URL, either absolute, or relative starting with "/" |
||
| 547 | * @return boolean Does the URL exist |
||
| 548 | * @throws \InvalidArgumentException |
||
| 549 | */ |
||
| 550 | public function hasURL($url) |
||
| 551 | { |
||
| 552 | if ($this->urls === null) { |
||
| 553 | $this->loadUrls(); |
||
| 554 | } |
||
| 555 | |||
| 556 | // Try and relativise an absolute URL |
||
| 557 | if ($url[0] != '/') { |
||
| 558 | $simpifiedURL = $this->simplifyURL($url); |
||
| 559 | $simpifiedBase = $this->simplifyURL($this->baseURL); |
||
| 560 | |||
| 561 | if (substr($simpifiedURL, 0, strlen($simpifiedBase)) == $simpifiedBase) { |
||
| 562 | $url = substr($simpifiedURL, strlen($simpifiedBase)); |
||
| 563 | } else { |
||
| 564 | throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL"); |
||
| 565 | } |
||
| 566 | } |
||
| 567 | |||
| 568 | return isset($this->urls['regular'][$url]) || in_array($url, $this->urls['inferred']); |
||
| 569 | } |
||
| 570 | |||
| 571 | /** |
||
| 572 | * Simplify a URL. Ignores https/http differences and "www." / non differences. |
||
| 573 | * |
||
| 574 | * @param string $url |
||
| 575 | * @return string |
||
| 576 | * @todo Why does this ignore https/http differences? Should it? |
||
| 577 | */ |
||
| 578 | public function simplifyURL($url) |
||
| 579 | { |
||
| 580 | return preg_replace("#^http(s)?://(www.)?#i", 'http://www.', $url); |
||
| 581 | } |
||
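Normalising both the candidate URL and the base this way is what lets the scheme and www-prefix vary between the crawl and the configured BaseUrl. For example:

```php
// All of these normalise to the same string:
$urlList->simplifyURL('http://example.org/page');      // http://www.example.org/page
$urlList->simplifyURL('https://example.org/page');     // http://www.example.org/page
$urlList->simplifyURL('https://www.example.org/page'); // http://www.example.org/page
```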
| 582 | |||
| 583 | /** |
||
| 584 | * Returns true if the given URL is in the list of processed URls |
||
| 585 | * |
||
| 586 | * @param string $processedURL The processed URL |
||
| 587 | * @return boolean True if it exists, false otherwise |
||
| 588 | */ |
||
| 589 | public function hasProcessedURL($processedURL) |
||
| 590 | { |
||
| 591 | if ($this->urls === null) { |
||
| 592 | $this->loadUrls(); |
||
| 593 | } |
||
| 594 | |||
| 595 | return in_array($processedURL, array_keys($this->urls['regular'])) || |
||
| 596 | in_array($processedURL, array_keys($this->urls['inferred'])); |
||
| 597 | } |
||
| 598 | |||
| 599 | /** |
||
| 600 | * Return the processed URL that is the parent of the given one. |
||
| 601 | * |
||
| 602 | * Both input and output are processed URLs |
||
| 603 | * |
||
| 604 | * @param array $processedURLData URLData comprising a relative URL and Mime-Type |
||
| 605 | * @return array |
||
| 606 | */ |
||
| 607 | public function parentProcessedURL(array $processedURLData): array |
||
| 608 | { |
||
| 609 | $mime = self::$undefined_mime_type; |
||
| 610 | $processedURL = $processedURLData; |
||
| 611 | |||
| 612 | if (is_array($processedURLData)) { |
||
| 613 | if (empty($processedURLData['url'])) { |
||
| 614 | $processedURLData['url'] = '/'; // This will be dealt with, with the selected duplication strategy |
||
| 615 | } |
||
| 616 | |||
| 617 | if (empty($processedURLData['mime'])) { |
||
| 618 | $processedURLData['mime'] = self::$undefined_mime_type; |
||
| 619 | } |
||
| 620 | |||
| 621 | /* |
||
| 622 | * If $processedURLData['url'] is not HTML, it's unlikely its parent |
||
| 623 | * is anything useful (Prob just a directory) |
||
| 624 | */ |
||
| 625 | $sng = singleton(StaticSiteMimeProcessor::class); |
||
| 626 | $mime = $sng->IsOfHtml($processedURLData['mime']) ? |
||
| 627 | $processedURLData['mime'] : |
||
| 628 | self::$undefined_mime_type; |
||
| 629 | $processedURL = $processedURLData['url']; |
||
| 630 | } |
||
| 631 | |||
| 632 | $default = function ($fragment) use ($mime) { |
||
| 633 | return [ |
||
| 634 | 'url' => $fragment, |
||
| 635 | 'mime' => $mime, |
||
| 636 | ]; |
||
| 637 | }; |
||
| 638 | |||
| 639 | if ($processedURL == "/") { |
||
| 640 | return $default(''); |
||
| 641 | } |
||
| 642 | |||
| 643 | // URL hierarchy can be broken down by querystring or by URL |
||
| 644 | $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/')); |
||
| 645 | |||
| 646 | // Special case for children of the root |
||
| 647 | if ($breakpoint == 0) { |
||
| 648 | return $default('/'); |
||
| 649 | } |
||
| 650 | |||
| 651 | // Get parent URL |
||
| 652 | $parentProcessedURL = substr($processedURL, 0, $breakpoint); |
||
| 653 | |||
| 654 | $processedURLData = [ |
||
| 655 | 'url' => $parentProcessedURL, |
||
| 656 | 'mime' => $mime, |
||
| 657 | ]; |
||
| 658 | |||
| 659 | // If an intermediary URL doesn't exist, create it |
||
| 660 | if (!$this->hasProcessedURL($parentProcessedURL)) { |
||
| 661 | $this->addInferredURL($processedURLData); |
||
| 662 | } |
||
| 663 | |||
| 664 | return $processedURLData; |
||
| 665 | } |
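The breakpoint logic walks one level up per call, back-filling any missing intermediary URL as an inferred entry. A hypothetical trace:

```php
// Illustrative trace of parentProcessedURL(), assuming the list is empty:
$parent = $urlList->parentProcessedURL(['url' => '/about-us/team/jane', 'mime' => 'text/html']);
// $breakpoint is the last '/' (or '?'), so $parent is:
//   ['url' => '/about-us/team', 'mime' => 'text/html']
// Since '/about-us/team' wasn't in the list, it is added as an inferred
// URL, whose own insertion back-fills '/about-us' in turn.
```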
```php
    /**
     * Find the processed URL in the URL list.
     *
     * @param string|array $urlData
     * @return array
     * @todo Under what circumstances would $this->urls['regular'][$url] === true (line ~696)?
     */
    public function processedURL($urlData): array
    {
        // Load the cache into memory
        if ($this->urls === null) {
            $this->loadUrls();
        }

        if (is_array($urlData)) {
            $url = $urlData['url'];
            $mime = $urlData['mime'];
        } else {
            $url = $urlData;
            $mime = self::$undefined_mime_type;
        }

        $urlData = [
            'url' => $url,
            'mime' => $mime,
        ];

        // Cached URLs use $url as the key
        if (isset($this->urls['regular'][$url])) {
            // Generate it if missing
            if ($this->urls['regular'][$url] === true) {
                $this->urls['regular'][$url] = $this->generateProcessedURL($urlData);
            }

            return $this->urls['regular'][$url];
        } elseif (isset($this->urls['inferred'][$url])) {
            return $this->urls['inferred'][$url];
        }

        return [];
    }

    /**
     * Execute custom logic for processing URLs prior to hierarchy generation.
     *
     * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions.
     *
     * @param array $urlData The unprocessed URLData
     * @return array $urlData The processed URLData
     * @throws \LogicException
     */
    public function generateProcessedURL(array $urlData): array
    {
        if (empty($urlData['url'])) {
            throw new \LogicException("Can't pass a blank URL to generateProcessedURL");
        }

        if ($this->urlProcessor) {
            $urlData = $this->urlProcessor->processURL($urlData);
        }

        if (!$urlData) {
            // Even if $urlData has a mime-type, it's useless without a URI
            throw new \LogicException(get_class($this->urlProcessor) . " returned a blank URL.");
        }

        return $urlData;
    }
```
| 735 | |||
| 736 | /** |
||
| 737 | * Return the URLs that are a child of the given URL |
||
| 738 | * |
||
| 739 | * @param string $url |
||
| 740 | * @return array |
||
| 741 | */ |
||
| 742 | public function getChildren($url) |
||
| 743 | { |
||
| 744 | if ($this->urls === null) { |
||
| 745 | $this->loadUrls(); |
||
| 746 | } |
||
| 747 | |||
| 748 | $processedURL = $this->processedURL($url); |
||
| 749 | $processedURL = $processedURL['url'] ?? '/'; |
||
| 750 | |||
| 751 | // Subtly different regex if the URL ends in '?' or '/' |
||
| 752 | if (preg_match('#[/?]$#', $processedURL)) { |
||
| 753 | $regEx = '#^' . preg_quote($processedURL, '#') . '[^/?]+$#'; |
||
| 754 | } else { |
||
| 755 | $regEx = '#^' . preg_quote($processedURL, '#') . '[/?][^/?]+$#'; |
||
| 756 | } |
||
| 757 | |||
| 758 | $children = []; |
||
| 759 | |||
| 760 | foreach ($this->urls['regular'] as $urlKey => $potentialProcessedChild) { |
||
| 761 | $potentialProcessedChild = $urlKey; |
||
| 762 | if (preg_match($regEx, $potentialProcessedChild)) { |
||
| 763 | if (!isset($children[$potentialProcessedChild])) { |
||
| 764 | $children[$potentialProcessedChild] = $potentialProcessedChild; |
||
| 765 | } |
||
| 766 | } |
||
| 767 | } |
||
| 768 | |||
| 769 | foreach ($this->urls['inferred'] as $urlKey => $potentialProcessedChild) { |
||
| 770 | $potentialProcessedChild = $urlKey; |
||
| 771 | if (preg_match($regEx, $potentialProcessedChild)) { |
||
| 772 | if (!isset($children[$potentialProcessedChild])) { |
||
| 773 | $children[$potentialProcessedChild] = $potentialProcessedChild; |
||
| 774 | } |
||
| 775 | } |
||
| 776 | } |
||
| 777 | |||
| 778 | return array_values($children); |
||
| 779 | } |
||
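The anchored pattern admits exactly one extra path or querystring segment, so only direct children match. For example:

```php
// With $processedURL = '/about-us', $regEx becomes '#^/about\-us[/?][^/?]+$#'
preg_match($regEx, '/about-us/team');      // 1: direct child
preg_match($regEx, '/about-us?page=2');    // 1: querystring child
preg_match($regEx, '/about-us/team/jane'); // 0: grandchild, two segments deep
preg_match($regEx, '/about-usurper');      // 0: no '/' or '?' separator
```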
| 780 | |||
| 781 | /** |
||
| 782 | * Simple property getter. Used in unit-testing. |
||
| 783 | * |
||
| 784 | * @param string $prop |
||
| 785 | * @return mixed |
||
| 786 | */ |
||
| 787 | public function getProperty($prop) |
||
| 788 | { |
||
| 789 | if ($this->$prop) { |
||
| 790 | return $this->$prop; |
||
| 791 | } |
||
| 792 | } |
||
| 793 | |||
| 794 | /** |
||
| 795 | * Get the serialized cache content and return the unserialized string |
||
| 796 | * |
||
| 797 | * @todo implement to replace x3 refs to unserialize(file_get_contents($this->cacheDir . 'urls')); |
||
| 798 | * @return string |
||
| 799 | */ |
||
| 800 | public function getCacheFileContents() |
||
| 810 | } |
||
| 811 | } |
||
| 812 |
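Taken together, the class supports a simple hierarchy walk once a crawl has completed. A hypothetical consumer:

```php
// Illustrative only: walk the crawled hierarchy depth-first,
// starting from the site root ('/').
function walkUrlTree(StaticSiteUrlList $urlList, string $url = '/', int $depth = 0): void
{
    foreach ($urlList->getChildren($url) as $child) {
        echo str_repeat('  ', $depth) . $child . "\n";
        walkUrlTree($urlList, $child, $depth + 1);
    }
}

walkUrlTree($urlList); // assumes $urlList has already crawled, or autoCrawl is on
```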