Total Complexity | 93 |
Total Lines | 759 |
Duplicated Lines | 0 % |
Changes | 8 | ||
Bugs | 2 | Features | 0 |
Complex classes like StaticSiteUrlList often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use StaticSiteUrlList, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
25 | class StaticSiteUrlList |
||
26 | { |
||
use Injectable;
use Configurable;

/**
 * Crawl status: a crawl has run to completion.
 *
 * @var string
 */
public const CRAWL_STATUS_COMPLETE = 'Complete';

/**
 * Crawl status: a crawl was started but has not yet finished.
 *
 * @var string
 */
public const CRAWL_STATUS_PARTIAL = 'Partial';

/**
 * Crawl status: no crawl has been run yet.
 *
 * @var string
 */
public const CRAWL_STATUS_NOTSTARTED = 'Not started';

/**
 * Mime-type recorded against a URL when the real one is unknown.
 *
 * @var string
 */
private static $undefined_mime_type = 'unknown';

/**
 * The site's base URL. Stored without a trailing slash (see __construct()).
 *
 * @var string
 */
protected $baseURL;

/**
 * Local path that crawl data is cached into. Stored with a trailing slash
 * (see __construct()).
 *
 * @var string
 */
protected $cacheDir;

/**
 * Two element array: contains keys 'inferred' and 'regular':
 * - 'regular' is an array mapping raw URLs to processed URLs
 * - 'inferred' is an array of inferred URLs
 *
 * Null until loadUrls() or crawl() populates it.
 *
 * @var array
 */
protected $urls = null;

/**
 * Whether loadUrls() is allowed to trigger a crawl on demand.
 *
 * @var boolean
 */
protected $autoCrawl = false;

/**
 * Optional processor applied to URLs before the site hierarchy and any
 * inferred metadata are generated.
 *
 * @var StaticSiteUrlProcessor
 */
protected $urlProcessor = null;

/**
 * URLs crawled in addition to the base URL.
 *
 * @var array
 */
protected $extraCrawlURLs = null;

/**
 * A list of regular expression patterns to exclude from scraping.
 *
 * @var array
 */
protected $excludePatterns = [];

/**
 * The StaticSiteContentSource object this list belongs to.
 *
 * @var StaticSiteContentSource
 */
protected $source;
||
103 | |||
/**
 * Create a new URL List.
 *
 * Normalises the two paths it stores: the base URL never carries a trailing
 * slash, while the cache directory always does.
 *
 * @param StaticSiteContentSource $source
 * @param string $cacheDir The local path to cache data into
 * @return void
 */
public function __construct(StaticSiteContentSource $source, $cacheDir)
{
    $base = $source->BaseUrl;

    // baseURL must not have a trailing slash
    if (substr($base, -1) == "/") {
        $base = substr($base, 0, -1);
    }

    // cacheDir must have a trailing slash
    if (substr($cacheDir, -1) != "/") {
        $cacheDir .= "/";
    }

    $this->baseURL = $base;
    $this->cacheDir = $cacheDir;
    $this->source = $source;
}
||
128 | |||
/**
 * Set a URL processor for this URL List.
 *
 * URL processors process the URLs before the site hierarchy and any inferred metadata are generated.
 * These can be used to transform URLs from CMS's that don't provide a natural hierarchy, into something
 * more useful.
 *
 * @see {@link StaticSiteMOSSURLProcessor} for an example.
 * @param StaticSiteUrlProcessor|null $urlProcessor Pass null to clear the processor
 * @return void
 */
public function setUrlProcessor(?StaticSiteUrlProcessor $urlProcessor = null)
{
    // Explicit "?" nullable type: the previous implicitly-nullable form
    // (Type $x = null) is deprecated as of PHP 8.4
    $this->urlProcessor = $urlProcessor;
}
||
144 | |||
/**
 * Define additional crawl URLs as an array.
 * Each of these URLs will be crawled in addition to the base URL.
 * This can be helpful if pages are getting missed by the crawl.
 *
 * NOTE(review): the method name misspells "URLs" as "URls"; renaming would
 * break callers, so the public name is kept as-is.
 *
 * @param array $extraCrawlURLs
 * @return void
 */
public function setExtraCrawlURls($extraCrawlURLs)
{
    $this->extraCrawlURLs = $extraCrawlURLs;
}
||
157 | |||
/**
 * Return the additional crawl URLs as an array.
 *
 * @return array
 */
public function getExtraCrawlURLs()
{
    return $this->extraCrawlURLs;
}
||
167 | |||
/**
 * Set an array of regular expression patterns that should be excluded from
 * being added to the URL list.
 *
 * @param array $excludePatterns
 * @return void
 */
public function setExcludePatterns(array $excludePatterns)
{
    $this->excludePatterns = $excludePatterns;
}
||
179 | |||
/**
 * Get the array of regular expression patterns that should not be added to
 * the URL list.
 *
 * @return array
 */
public function getExcludePatterns()
{
    return $this->excludePatterns;
}
||
190 | |||
191 | /** |
||
192 | * Set whether the crawl should be triggered on demand. |
||
193 | * |
||
194 | * @param boolean $autoCrawl |
||
195 | * @return StaticSiteUrlList |
||
196 | */ |
||
197 | public function setAutoCrawl(bool $autoCrawl): StaticSiteUrlList |
||
202 | } |
||
203 | |||
/**
 * Returns the status of the spidering, derived from which cache files exist:
 * no 'urls' file means no crawl has run; a lingering 'crawlerid' file means a
 * crawl was started but not finished.
 *
 * @return string One of the CRAWL_STATUS_* constants
 */
public function getSpiderStatus(): string
{
    if (!file_exists($this->cacheDir . 'urls')) {
        return self::CRAWL_STATUS_NOTSTARTED;
    }

    // The resume marker is only present mid-crawl (see crawl())
    if (file_exists($this->cacheDir . 'crawlerid')) {
        return self::CRAWL_STATUS_PARTIAL;
    }

    return self::CRAWL_STATUS_COMPLETE;
}
||
221 | |||
/**
 * Raw URL+Mime data accessor method, used internally by logic outside of the class.
 *
 * @return mixed array $urls | null if no cached URL/Mime data found
 */
public function getRawCacheData()
{
    // Don't rely on loadUrls() as it chokes on partially completed imports
    if ($this->urls) {
        return $this->urls;
    }

    if (file_exists($this->cacheDir . 'urls')) {
        return unserialize(file_get_contents($this->cacheDir . 'urls'));
    }

    return null;
}
||
239 | |||
240 | /** |
||
241 | * Return the number of URLs crawled so far. If the urlcache is incomplete or |
||
242 | * doesn't exist, assumes zero. |
||
243 | * |
||
244 | * @return mixed integer |
||
245 | */ |
||
246 | public function getNumURIs(): int |
||
268 | } |
||
269 | |||
270 | /** |
||
271 | * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values |
||
272 | * |
||
273 | * @return array |
||
274 | */ |
||
275 | public function getProcessedURLs(): array |
||
299 | ); |
||
300 | } |
||
301 | } |
||
302 | |||
303 | /** |
||
304 | * There are URLs and we're not in the middle of a crawl. |
||
305 | * |
||
306 | * @return boolean |
||
307 | */ |
||
308 | public function hasCrawled(): bool |
||
311 | } |
||
312 | |||
/**
 * Load the URLs, either by crawling, or by fetching from cache.
 *
 * @return void
 * @throws \LogicException When no cache exists, autoCrawl is off, and we are
 *                         not inside a test run
 */
public function loadUrls(): void
{
    if ($this->hasCrawled()) {
        $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));

        // Normalise caches written in the obsolete format
        if (!isset($this->urls['regular'])) {
            $this->urls['regular'] = [];
        }

        if (!isset($this->urls['inferred'])) {
            $this->urls['inferred'] = [];
        }

        return;
    }

    if ($this->autoCrawl) {
        $this->crawl();
        return;
    }

    // This is grim, but we get to keep the useful check. It happens if you
    // move a cache-file out of the way during a real (non-test) run...
    if (!$this->isRunningTest()) {
        $msg = 'Crawl hasn\'t been executed yet and autoCrawl is false. Has the cache file been moved?';
        throw new \LogicException($msg);
    }
}
||
342 | |||
/**
 * Best-effort detection of a test environment.
 *
 * @return boolean
 */
private function isRunningTest(): bool
{
    // Github tests have SS_BASE_URL set as follows
    if (Environment::getEnv('SS_BASE_URL') == 'http://localhost') {
        return true;
    }

    // Tests use "static-site-0" as cache dirname
    return file_exists(preg_replace('#[0-9]+#', '0', $this->cacheDir));
}
||
355 | |||
/**
 * Re-execute the URL processor on all the fetched URLs.
 * If the site has been crawled and then subsequently the URLProcessor was changed, we need to ensure
 * URLs are re-processed using the newly selected URL Preprocessor.
 *
 * @return void
 */
public function reprocessUrls()
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Clear out all inferred URLs; these will be re-added via back-filling
    $this->urls['inferred'] = [];

    // Reprocess URLs, in case the processing has changed since the last crawl
    foreach (array_keys($this->urls['regular']) as $url) {
        $processed = $this->generateProcessedURL($this->urls['regular'][$url]);
        $this->urls['regular'][$url] = $processed;

        // Trigger parent URL back-filling on the new processed URL
        $this->parentProcessedURL($processed);
    }

    $this->saveURLs();
}
||
383 | |||
/**
 * Crawl the site (or resume a previously-interrupted crawl) and cache the
 * resulting URL list to disk.
 *
 * @param mixed $limit No. of pages to limit the crawl to, or false for no limit
 * @param bool $verbose
 * @return StaticSiteCrawler
 * @throws \Exception When the cache directory cannot be created
 */
public function crawl($limit = false, $verbose = false)
{
    Environment::increaseTimeLimitTo(3600);

    if (!is_dir($this->cacheDir) && !mkdir($this->cacheDir)) {
        throw new \Exception('Unable to create cache directory at: ' . $this->cacheDir);
    }

    $crawler = StaticSiteCrawler::create($this, $limit, $verbose);
    $crawler->enableResumption();
    $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
    $crawler->setWorkingDirectory($this->cacheDir);

    // Find links in externally-linked CSS files
    if ($this->source->ParseCSS) {
        $crawler->addLinkSearchContentType("#text/css# i");
    }

    // Set some proxy options for phpCrawler
    singleton(StaticSiteUtils::class)->defineProxyOpts(!Director::isDev(), $crawler);

    // BUGFIX: the resume marker was written to cacheDir . '/crawlerid'
    // (cacheDir already has a trailing slash - see __construct()) but read
    // from cacheDir . 'crawlerid'. Use one canonical path for both.
    $crawlerIDFile = $this->cacheDir . 'crawlerid';

    // Allow for resuming an incomplete crawl
    if (file_exists($crawlerIDFile)) {
        // We should re-load the partial list of URLs, if relevant.
        // This should only happen when we are resuming a partial crawl.
        if (file_exists($this->cacheDir . 'urls')) {
            $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));
        } else {
            $this->urls = [
                'regular' => [],
                'inferred' => [],
            ];
        }

        $crawler->resume(file_get_contents($crawlerIDFile));
    } else {
        file_put_contents($crawlerIDFile, $crawler->getCrawlerId());

        $this->urls = [
            'regular' => [],
            'inferred' => [],
        ];
    }

    $crawler->setURL($this->baseURL);
    $crawler->go();

    // Crawl finished: remove the resume marker so getSpiderStatus() reports
    // the crawl as complete
    if (file_exists($crawlerIDFile)) {
        unlink($crawlerIDFile);
    }

    // Keep the cached lists in a stable, sorted order
    ksort($this->urls['regular']);
    ksort($this->urls['inferred']);

    $this->saveURLs();

    return $crawler;
}
||
452 | |||
453 | /** |
||
454 | * Cache the current list of URLs to disk. |
||
455 | * |
||
456 | * @return void |
||
457 | */ |
||
458 | public function saveURLs() |
||
461 | } |
||
462 | |||
/**
 * Add a URL to this list, given the absolute URL.
 *
 * @param string $url The absolute URL
 * @param string $content_type The Mime-Type found at this URL e.g text/html or image/png
 * @throws \InvalidArgumentException When $url is not under the base URL
 * @return void
 */
public function addAbsoluteURL($url, $content_type)
{
    $simplifiedURL = $this->simplifyURL($url);
    $simplifiedBase = $this->simplifyURL($this->baseURL);

    // Check we're adhering to the correct base URL
    if (substr($simplifiedURL, 0, strlen($simplifiedBase)) != $simplifiedBase) {
        throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
    }

    // Strip scheme + host to get the site-relative URL.
    // BUGFIX: the "." in "www." was previously unescaped, so it matched any
    // character (e.g. "wwwx") rather than a literal dot.
    $relURL = preg_replace("#https?://(www\.)?[^/]+#", '', $url);

    $this->addURL($relURL, $content_type);
}
||
485 | |||
/**
 * Appends a processed URL onto the URL cache.
 *
 * @param string $url A site-relative URL
 * @param string $contentType The Mime-Type found at this URL
 * @return void
 */
public function addURL($url, $contentType)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Generate and save the processed URL
    $this->urls['regular'][$url] = $this->generateProcessedURL([
        'url' => $url,
        'mime' => $contentType,
    ]);

    // Trigger parent URL back-filling
    $this->parentProcessedURL($this->urls['regular'][$url]);
}
||
510 | |||
/**
 * Add an inferred URL to the list.
 *
 * Since the unprocessed URL isn't available, we use the processed URL in its place.
 * This should be used with some caution.
 *
 * @param array $inferredURLData Contains the processed URL and Mime-Type to add
 * @return void
 */
public function addInferredURL($inferredURLData)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Keyed by the (processed) URL itself
    $this->urls['inferred'][$inferredURLData['url']] = $inferredURLData;

    // Trigger parent URL back-filling
    $this->parentProcessedURL($inferredURLData);
}
||
532 | |||
/**
 * Return true if the given URL exists.
 *
 * @param string $url The URL, either absolute, or relative starting with "/"
 * @return boolean Does the URL exist
 * @throws \InvalidArgumentException When an absolute URL is not under the base URL
 */
public function hasURL($url)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Try and relativise an absolute URL
    if ($url[0] != '/') {
        $simplifiedURL = $this->simplifyURL($url);
        $simplifiedBase = $this->simplifyURL($this->baseURL);

        if (substr($simplifiedURL, 0, strlen($simplifiedBase)) == $simplifiedBase) {
            $url = substr($simplifiedURL, strlen($simplifiedBase));
        } else {
            throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
        }
    }

    // BUGFIX: 'inferred' is keyed by URL with URL-data *arrays* as values
    // (see addInferredURL()), so the old in_array($url, ...) compared a
    // string against arrays and never matched. Check the keys instead; the
    // strict in_array() is retained for any legacy caches that stored plain
    // URL strings as values.
    return isset($this->urls['regular'][$url])
        || isset($this->urls['inferred'][$url])
        || in_array($url, $this->urls['inferred'], true);
}
||
560 | |||
/**
 * Simplify a URL. Ignores https/http differences and "www." / non differences.
 *
 * @param string $url
 * @return string
 * @todo Why does this ignore https/http differences? Should it?
 */
public function simplifyURL($url)
{
    // BUGFIX: the "." in "www." was previously unescaped, so it matched any
    // character after "www" (e.g. "wwwx.example.com"), not a literal dot
    return preg_replace("#^http(s)?://(www\.)?#i", 'http://www.', $url);
}
||
572 | |||
/**
 * Returns true if the given URL is in the list of processed URLs.
 *
 * Note: 'regular' is keyed by *raw* URL (see addURL()) while 'inferred' is
 * keyed by processed URL (see addInferredURL()); this check matches against
 * the keys of both, preserving the original behaviour.
 *
 * @param string $processedURL The processed URL
 * @return boolean True if it exists, false otherwise
 */
public function hasProcessedURL($processedURL)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // array_key_exists() replaces the old O(n) in_array(..., array_keys(...))
    // scan with a direct O(1) key lookup - same result, less work
    return array_key_exists($processedURL, $this->urls['regular'])
        || array_key_exists($processedURL, $this->urls['inferred']);
}
||
588 | |||
/**
 * Return the processed URL that is the parent of the given one.
 *
 * Both input and output are processed URLs. As a side effect, any missing
 * intermediary (parent) URL is back-filled into the 'inferred' list.
 *
 * @param mixed string|array $processedURLData URLData comprising a relative URL and Mime-Type
 * @return array $processedURLData
 */
public function parentProcessedURL($processedURLData)
{
    $mime = self::$undefined_mime_type;
    $processedURL = $processedURLData;

    if (is_array($processedURLData)) {
        /*
         * If $processedURLData['url'] is not HTML, it's unlikely its parent
         * is anything useful (Prob just a directory)
         */
        $mimeProcessor = singleton(StaticSiteMimeProcessor::class);

        if ($mimeProcessor->IsOfHtml($processedURLData['mime'])) {
            $mime = $processedURLData['mime'];
        }

        $processedURL = $processedURLData['url'];
    }

    // Builds URL-data for a given parent fragment, carrying the mime through
    $makeURLData = function ($fragment) use ($mime) {
        return [
            'url' => $fragment,
            'mime' => $mime,
        ];
    };

    if ($processedURL == "/") {
        return $makeURLData('');
    }

    // URL hierarchy can be broken down by querystring or by URL
    $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/'));

    // Special case for children of the root
    if ($breakpoint == 0) {
        return $makeURLData('/');
    }

    // Everything up to (not including) the last '/' or '?' is the parent
    $parentURLData = $makeURLData(substr($processedURL, 0, $breakpoint));

    // If an intermediary URL doesn't exist, create it
    if (!$this->hasProcessedURL($parentURLData['url'])) {
        $this->addInferredURL($parentURLData);
    }

    return $parentURLData;
}
||
646 | |||
/**
 * Find the processed URL in the URL list.
 *
 * @param mixed string|array $urlData Raw URL string, or URL-data array with 'url' and 'mime'
 * @return array|null $urlData, or null when the URL is not in either list
 */
public function processedURL($urlData)
{
    if (is_array($urlData)) {
        $url = $urlData['url'];
        $mime = $urlData['mime'];
    } else {
        $url = $urlData;
        $mime = self::$undefined_mime_type;
    }

    if ($this->urls === null) {
        $this->loadUrls();
    }

    $urlData = [
        'url' => $url,
        'mime' => $mime,
    ];

    if (isset($this->urls['regular'][$url])) {
        // Generate the processed form lazily if it hasn't been built yet
        if ($this->urls['regular'][$url] === true) {
            $this->urls['regular'][$url] = $this->generateProcessedURL($urlData);
        }

        return $this->urls['regular'][$url];
    }

    if (isset($this->urls['inferred'][$url])) {
        return $this->urls['inferred'][$url];
    }

    // Unknown URL: matches the original's implicit null return
    return null;
}
||
683 | |||
/**
 * Execute custom logic for processing URLs prior to hierarchy generation.
 *
 * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions.
 *
 * @param array $urlData The unprocessed URLData
 * @return array $urlData The processed URLData
 * @throws \LogicException When no URL is passed, or the processor returns a blank one
 */
public function generateProcessedURL(array $urlData): array
{
    if (!isset($urlData['url'])) {
        throw new \LogicException("Can't pass a blank URL to generateProcessedURL");
    }

    // Delegate to the configured processor, when one is set
    if ($this->urlProcessor) {
        $urlData = $this->urlProcessor->processURL($urlData);
    }

    // A processor must never swallow the URL entirely
    if (!$urlData) {
        throw new \LogicException(get_class($this->urlProcessor) . " returned a blank URL.");
    }

    return $urlData;
}
||
709 | |||
/**
 * Return the URLs that are a child of the given URL.
 *
 * @param string $url
 * @return array
 */
public function getChildren($url)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    $processedURL = $this->processedURL($url);
    $processedURL = $processedURL['url'] ?? '/';

    // Subtly different regex if the URL ends in '?' or '/'
    if (preg_match('#[/?]$#', $processedURL)) {
        $regEx = '#^' . preg_quote($processedURL, '#') . '[^/?]+$#';
    } else {
        $regEx = '#^' . preg_quote($processedURL, '#') . '[/?][^/?]+$#';
    }

    $children = [];

    // The original code iterated key => value but immediately overwrote the
    // value with the key, and duplicated the same loop for both buckets:
    // only the keys (candidate processed URLs) ever mattered. Iterate the
    // keys of both buckets directly, de-duplicating via the array key.
    foreach ([$this->urls['regular'], $this->urls['inferred']] as $bucket) {
        foreach (array_keys($bucket) as $candidate) {
            if (preg_match($regEx, $candidate)) {
                $children[$candidate] = $candidate;
            }
        }
    }

    return array_values($children);
}
||
754 | |||
/**
 * Simple property getter. Used in unit-testing.
 *
 * NOTE(review): falsy property values (0, '', [], false, null) yield an
 * implicit null, matching the original behaviour.
 *
 * @param string $prop
 * @return mixed
 */
public function getProperty($prop)
{
    if ($this->$prop) {
        return $this->$prop;
    }

    return null;
}
||
767 | |||
768 | /** |
||
769 | * Get the serialized cache content and return the unserialized string |
||
770 | * |
||
771 | * @todo implement to replace x3 refs to unserialize(file_get_contents($this->cacheDir . 'urls')); |
||
772 | * @return string |
||
773 | */ |
||
774 | public function getCacheFileContents() |
||
784 | } |
||
785 | } |
||
786 |