Total Complexity | 98 |
Total Lines | 779 |
Duplicated Lines | 0 % |
Changes | 8 | ||
Bugs | 2 | Features | 0 |
Complex classes like StaticSiteUrlList often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use StaticSiteUrlList, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
26 | class StaticSiteUrlList |
||
27 | { |
||
28 | use Injectable; |
||
29 | use Configurable; |
||
30 | |||
31 | /** |
||
32 | * @var string |
||
33 | */ |
||
34 | public const CRAWL_STATUS_COMPLETE = 'Complete'; |
||
35 | |||
36 | /** |
||
37 | * @var string |
||
38 | */ |
||
39 | public const CRAWL_STATUS_PARTIAL = 'Partial'; |
||
40 | |||
41 | /** |
||
42 | * @var string |
||
43 | */ |
||
44 | public const CRAWL_STATUS_NOTSTARTED = 'Not started'; |
||
45 | |||
46 | /** |
||
47 | * |
||
48 | * @var string |
||
49 | */ |
||
50 | private static $undefined_mime_type = 'unknown/unknown'; |
||
51 | |||
52 | /** |
||
53 | * |
||
54 | * @var string |
||
55 | */ |
||
56 | protected $baseURL; |
||
57 | |||
58 | /** |
||
59 | * |
||
60 | * @var string |
||
61 | */ |
||
62 | protected $cacheDir; |
||
63 | |||
64 | /** |
||
65 | * Two element array: contains keys 'inferred' and 'regular': |
||
66 | * - 'regular' is an array mapping raw URLs to processed URLs |
||
67 | * - 'inferred' is an array of inferred URLs |
||
68 | * |
||
69 | * @var array |
||
70 | */ |
||
71 | protected $urls = null; |
||
72 | |||
73 | /** |
||
74 | * |
||
75 | * @var boolean |
||
76 | */ |
||
77 | protected $autoCrawl = false; |
||
78 | |||
79 | /** |
||
80 | * |
||
81 | * @var StaticSiteUrlProcessor |
||
82 | */ |
||
83 | protected $urlProcessor = null; |
||
84 | |||
85 | /** |
||
86 | * |
||
87 | * @var array |
||
88 | */ |
||
89 | protected $extraCrawlURLs = null; |
||
90 | |||
91 | /** |
||
92 | * A list of regular expression patterns to exclude from scraping |
||
93 | * |
||
94 | * @var array |
||
95 | */ |
||
96 | protected $excludePatterns = []; |
||
97 | |||
98 | /** |
||
99 | * The StaticSiteContentSource object |
||
100 | * |
||
101 | * @var StaticSiteContentSource |
||
102 | */ |
||
103 | protected $source; |
||
104 | |||
/**
 * Create a new URL List.
 *
 * Normalises the two paths involved: the source's BaseUrl is stored
 * without a trailing slash, while the cache directory always gains one.
 *
 * @param StaticSiteContentSource $source
 * @param string $cacheDir The local path to cache crawl data into
 * @return void
 */
public function __construct(StaticSiteContentSource $source, $cacheDir)
{
    $this->setIsRunningTest();

    // baseURL must not have a trailing slash: strip a single one if present
    $base = (string) $source->BaseUrl;

    if (substr($base, -1) == "/") {
        $base = substr($base, 0, -1);
    }

    // cacheDir must have a trailing slash: append one if missing
    if (substr($cacheDir, -1) != "/") {
        $cacheDir = $cacheDir . "/";
    }

    $this->baseURL = $base;
    $this->cacheDir = $cacheDir;
    $this->source = $source;
}
||
131 | |||
/**
 * Set a URL processor for this URL List.
 *
 * URL processors process the URLs before the site hierarchy and any inferred metadata are generated.
 * These can be used to transform URLs from CMS's that don't provide a natural hierarchy, into something
 * more useful. Pass null to clear any previously-set processor.
 *
 * @see implementors of {@link StaticSiteUrlProcessor} for examples.
 * @param StaticSiteUrlProcessor|null $urlProcessor
 * @return void
 */
public function setUrlProcessor(?StaticSiteUrlProcessor $urlProcessor = null)
{
    // Explicit nullable type: implicitly-nullable parameters
    // ("Type $x = null") are deprecated as of PHP 8.4
    $this->urlProcessor = $urlProcessor;
}
||
147 | |||
/**
 * Define additional crawl URLs as an array.
 * Each of these URLs will be crawled in addition to the base URL, which
 * can be helpful when pages are getting missed by the crawl.
 *
 * NOTE: the method-name casing ("URls") is preserved for backwards compatibility.
 *
 * @param array $extraCrawlURLs
 * @return void
 */
public function setExtraCrawlURls($extraCrawlURLs)
{
    $this->extraCrawlURLs = $extraCrawlURLs;
}
||
160 | |||
/**
 * Return the additional crawl URLs as an array.
 *
 * @return array
 */
public function getExtraCrawlURLs()
{
    return $this->extraCrawlURLs;
}
||
170 | |||
/**
 * Set an array of regular-expression patterns; any URL matching one of
 * them is excluded from being added to the URL list.
 *
 * @param array $excludePatterns
 * @return void
 */
public function setExcludePatterns(array $excludePatterns)
{
    $this->excludePatterns = $excludePatterns;
}
||
182 | |||
/**
 * Get the array of regular-expression patterns that should not be added
 * to the URL list.
 *
 * @return array
 */
public function getExcludePatterns()
{
    return $this->excludePatterns;
}
||
193 | |||
/**
 * Set whether the crawl should be triggered on demand.
 *
 * @param boolean $autoCrawl
 * @return StaticSiteUrlList Fluent interface: returns $this
 */
public function setAutoCrawl(bool $autoCrawl): StaticSiteUrlList
{
    $this->autoCrawl = $autoCrawl;
    return $this;
}
||
206 | |||
/**
 * Returns the status of the spidering, derived from which cache files
 * exist: a 'urls' file means a crawl has produced data; a 'crawlerid'
 * file alongside it means that crawl has not finished.
 *
 * @return string One of the CRAWL_STATUS_* constants
 */
public function getSpiderStatus(): string
{
    $hasUrlCache = file_exists($this->cacheDir . 'urls');
    $hasCrawlerId = file_exists($this->cacheDir . 'crawlerid');

    if (!$hasUrlCache) {
        return self::CRAWL_STATUS_NOTSTARTED;
    }

    return $hasCrawlerId ? self::CRAWL_STATUS_PARTIAL : self::CRAWL_STATUS_COMPLETE;
}
||
224 | |||
/**
 * Raw URL+Mime data accessor method, used internally by logic outside of the class.
 *
 * @return mixed array $urls | null if no cached URL/Mime data found
 */
public function getRawCacheData()
{
    // Prefer the in-memory list: don't rely on loadUrls() as it chokes
    // on partially completed imports
    if ($this->urls) {
        return $this->urls;
    }

    $cacheFile = $this->cacheDir . 'urls';

    if (!file_exists($cacheFile)) {
        return null;
    }

    // NOTE(review): unserialize() of a local cache file — assumes the
    // cache directory contents are trusted
    return unserialize(file_get_contents($cacheFile));
}
||
243 | |||
/**
 * Return the number of URLs crawled so far. If the urlcache is incomplete or
 * doesn't exist, assumes zero.
 *
 * Regular URLs are de-duplicated before counting; inferred URLs are
 * counted as-is.
 *
 * @return int
 */
public function getNumURIs(): int
{
    if (!$urls = $this->getRawCacheData()) {
        return 0;
    }

    // Bugfix: the second condition previously re-checked 'regular'
    // instead of 'inferred', so a cache missing the 'inferred' key
    // would trigger undefined-index warnings below
    if (!isset($urls['regular']) || !isset($urls['inferred'])) {
        return 0;
    }

    // Collect just the 'url' member of each URLData entry
    $regular = array_column($urls['regular'], 'url');
    $inferred = array_column($urls['inferred'], 'url');

    return count(array_unique($regular)) + count($inferred);
}
||
273 | |||
/**
 * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values.
 *
 * Returns an empty array when no crawl has completed and auto-crawl is
 * disabled.
 *
 * @return array
 */
public function getProcessedURLs(): array
{
    // Bugfix: previously this method fell off the end (returning nothing)
    // when neither condition held — a TypeError for a method declared
    // to return array. Guard up front and return an empty list instead.
    if (!$this->hasCrawled() && !$this->autoCrawl) {
        return [];
    }

    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Raw URL => processed URL
    $regular = [];
    foreach ($this->urls['regular'] as $rawUrl => $urlData) {
        $regular[$rawUrl] = $urlData['url'];
    }

    // Inferred URLs have no raw form, so they map to themselves below
    $inferred = [];
    foreach ($this->urls['inferred'] as $urlData) {
        $inferred[] = $urlData['url'];
    }

    return array_merge(
        $regular,
        $inferred ? array_combine($inferred, $inferred) : []
    );
}
||
306 | |||
/**
 * There are URLs and we're not in the middle of a crawl: a 'urls' cache
 * file exists with no accompanying 'crawlerid' resumption marker.
 *
 * @return boolean
 */
public function hasCrawled(): bool
{
    $urlsCached = file_exists($this->cacheDir . 'urls');
    $crawlInProgress = file_exists($this->cacheDir . 'crawlerid');

    return $urlsCached && !$crawlInProgress;
}
||
316 | |||
/**
 * Load the URLs, either by crawling, or by fetching from cache.
 *
 * @return void
 * @throws \LogicException When no cache exists and auto-crawl is disabled
 */
public function loadUrls(): void
{
    if ($this->hasCrawled()) {
        $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));

        // Backfill keys missing from caches written in the obsolete format
        foreach (['regular', 'inferred'] as $bucket) {
            if (!isset($this->urls[$bucket])) {
                $this->urls[$bucket] = [];
            }
        }

        return;
    }

    if ($this->autoCrawl) {
        $this->crawl();
        return;
    }

    // This happens if you move a cache-file out of the way during a real (non-test) run...
    $msg = 'Crawl hasn\'t been executed yet and autoCrawl is false. Has the cache file been moved?';
    throw new \LogicException($msg);
}
||
343 | |||
/**
 * Ensure the assets directory exists when running under CI.
 *
 * @return void
 */
private function setIsRunningTest(): void
{
    // Github tests have SS_BASE_URL set to http://localhost
    $runningOnCI = Environment::getEnv('SS_BASE_URL') == 'http://localhost';

    if ($runningOnCI && !file_exists(ASSETS_PATH)) {
        mkdir(ASSETS_PATH, 0777, true);
    }
}
||
355 | |||
/**
 * Re-execute the URL processor on all the fetched URLs.
 *
 * If the site has been crawled and then subsequently the URLProcessor was changed through
 * user-interaction in the "migration" CMS admin, then we need to ensure that
 * URLs are re-processed using the newly selected URL Preprocessor.
 *
 * @return void
 */
public function reprocessUrls()
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Drop all inferred URLs; parentProcessedURL() re-adds them below
    $this->urls['inferred'] = [];

    // Reprocess URLs, in case the processing has changed since the last crawl
    foreach ($this->urls['regular'] as $rawUrl => $urlData) {
        // TODO Log this in exodus.log
        if (empty($urlData['url'])) {
            continue;
        }

        $processed = $this->generateProcessedURL($urlData);
        $this->urls['regular'][$rawUrl] = $processed;

        // Trigger parent URL back-filling on the new processed URL
        $this->parentProcessedURL($processed);
    }

    $this->saveURLs();
}
||
389 | |||
/**
 * Run (or resume) the crawler over the source site, then cache the
 * resulting URL list to disk for the importer.
 *
 * @param number $limit
 * @param bool $verbose
 * @return StaticSiteCrawler
 * @throws \Exception When the cache directory cannot be created
 */
public function crawl($limit = false, $verbose = false)
{
    Environment::increaseTimeLimitTo(3600);

    if (!is_dir($this->cacheDir) && !mkdir($this->cacheDir)) {
        throw new \Exception('Unable to create cache directory at: ' . $this->cacheDir);
    }

    $crawler = StaticSiteCrawler::create($this, $limit, $verbose);
    $crawler->enableResumption();
    $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_MEMORY);
    $crawler->setWorkingDirectory($this->cacheDir);

    // Find links in externally-linked CSS files
    if ($this->source->ParseCSS) {
        $crawler->addLinkSearchContentType("#text/css# i");
    }

    // Set some proxy options for phpCrawler
    singleton(StaticSiteUtils::class)->defineProxyOpts(!Director::isDev(), $crawler);

    $this->urls = [
        'regular' => [],
        'inferred' => [],
    ];

    // Allow for resuming an incomplete crawl
    if (file_exists($this->cacheDir . 'crawlerid')) {
        // We should re-load the partial list of URLs, if relevant.
        // This should only happen when we are resuming a partial crawl.
        if (file_exists($this->cacheDir . 'urls')) {
            $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));
        }

        $crawlerID = file_get_contents($this->cacheDir . 'crawlerid');
        $crawler->resume($crawlerID);
    } else {
        $crawlerID = $crawler->getCrawlerId();
    }

    $crawler->setURL($this->baseURL);
    // https sites get crawled on 443, everything else on 80
    $crawler->setPort(preg_match('#^https#', $this->baseURL) ? 443 : 80);
    $crawler->go();

    // Keep both URL buckets in deterministic (key-sorted) order
    ksort($this->urls['regular']);
    ksort($this->urls['inferred']);

    // Cache the URLs to a file for use by the importer
    $this->saveURLs();

    return $crawler;
}
||
452 | |||
453 | /** |
||
454 | * Cache the current list of URLs to disk. |
||
455 | * |
||
456 | * @return void |
||
457 | */ |
||
458 | public function saveURLs() |
||
461 | } |
||
462 | |||
463 | /** |
||
464 | * Add a URL to this list, given the absolute URL. |
||
465 | * |
||
466 | * @param string $url The absolute URL |
||
467 | * @param string $content_type The Mime-Type found at this URL e.g text/html or image/png |
||
468 | * @throws \InvalidArgumentException |
||
469 | * @return void |
||
470 | */ |
||
471 | public function addAbsoluteURL($url, $content_type) |
||
484 | } |
||
485 | |||
/**
 * Appends a processed URL onto the URL cache.
 *
 * @param string $url
 * @param string $contentType The Mime-Type found at this URL
 * @return mixed null|void
 */
public function addURL($url, $contentType)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    if (empty($url)) {
        return null;
    }

    // Generate and save the processed URL
    $this->urls['regular'][$url] = $this->generateProcessedURL([
        'url' => $url,
        'mime' => $contentType,
    ]);

    // Trigger parent URL back-filling
    $this->parentProcessedURL($this->urls['regular'][$url]);
}
||
514 | |||
/**
 * Add an inferred URL to the list.
 *
 * Since the unprocessed URL isn't available, we use the processed URL in its place.
 * This should be used with some caution.
 *
 * @param array $inferredURLData Contains the processed URL and Mime-Type to add
 * @return void
 */
public function addInferredURL($inferredURLData)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Key the inferred bucket by the processed URL itself
    $key = $inferredURLData['url'];
    $this->urls['inferred'][$key] = $inferredURLData;

    // Trigger parent URL back-filling
    $this->parentProcessedURL($inferredURLData);
}
||
536 | |||
537 | /** |
||
538 | * Return true if the given URL exists. |
||
539 | * |
||
540 | * @param string $url The URL, either absolute, or relative starting with "/" |
||
541 | * @return boolean Does the URL exist |
||
542 | * @throws \InvalidArgumentException |
||
543 | */ |
||
544 | public function hasURL($url) |
||
563 | } |
||
564 | |||
/**
 * Simplify a URL. Ignores https/http differences and "www." / non differences,
 * so that equivalent URLs compare equal after simplification.
 *
 * @param string $url
 * @return string
 * @todo Why does this ignore https/http differences? Should it?
 */
public function simplifyURL($url)
{
    // Bugfix: the "." in "www." was previously unescaped, so the pattern
    // matched any fourth character (e.g. "wwwX"); escape it to match a
    // literal dot only
    return preg_replace("#^http(s)?://(www\.)?#i", 'http://www.', $url);
}
||
576 | |||
577 | /** |
||
578 | * Returns true if the given URL is in the list of processed URls |
||
579 | * |
||
580 | * @param string $processedURL The processed URL |
||
581 | * @return boolean True if it exists, false otherwise |
||
582 | */ |
||
583 | public function hasProcessedURL($processedURL) |
||
591 | } |
||
592 | |||
/**
 * Return the processed URL that is the parent of the given one.
 *
 * Both input and output are processed URLData arrays comprising 'url'
 * and 'mime' keys. If the computed parent doesn't exist in the list yet,
 * it is added as an inferred URL.
 *
 * @param array $processedURLData URLData comprising a relative URL and Mime-Type
 * @return array
 */
public function parentProcessedURL(array $processedURLData): array
{
    // The parameter is type-hinted as array, so the old is_array() guard
    // was dead code and has been removed.
    if (empty($processedURLData['url'])) {
        $processedURLData['url'] = '/'; // This will be dealt with, with the selected duplication strategy
    }

    if (empty($processedURLData['mime'])) {
        $processedURLData['mime'] = self::$undefined_mime_type;
    }

    /*
     * If $processedURLData['url'] is not HTML, it's unlikely its parent
     * is anything useful (Prob just a directory)
     */
    $sng = singleton(StaticSiteMimeProcessor::class);
    $mime = $sng->IsOfHtml($processedURLData['mime'])
        ? $processedURLData['mime']
        : self::$undefined_mime_type;
    $processedURL = $processedURLData['url'];

    // Helper: build URLData for a given URL fragment with the resolved mime
    $default = function ($fragment) use ($mime) {
        return [
            'url' => $fragment,
            'mime' => $mime,
        ];
    };

    if ($processedURL == "/") {
        return $default('');
    }

    // URL hierarchy can be broken down by querystring or by URL
    $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/'));

    // Special case for children of the root
    if ($breakpoint == 0) {
        return $default('/');
    }

    // Truncate at the last separator to get the parent URL
    $parentProcessedURL = substr($processedURL, 0, $breakpoint);

    $parentURLData = [
        'url' => $parentProcessedURL,
        'mime' => $mime,
    ];

    // If an intermediary URL doesn't exist, create it
    if (!$this->hasProcessedURL($parentProcessedURL)) {
        $this->addInferredURL($parentURLData);
    }

    return $parentURLData;
}
||
660 | |||
/**
 * Find the processed URL in the URL list.
 *
 * @param mixed string | array $urlData Either a raw URL string, or URLData with 'url' and 'mime'
 * @return array Empty array when the URL is unknown
 * @todo Under what circumstances would $this->urls['regular'][$url] === true?
 */
public function processedURL($urlData): array
{
    // Load-up the cache into memory on first use
    if ($this->urls === null) {
        $this->loadUrls();
    }

    // Accept either URLData or a bare URL string
    if (is_array($urlData)) {
        $url = $urlData['url'];
        $mime = $urlData['mime'];
    } else {
        $url = $urlData;
        $mime = self::$undefined_mime_type;
    }

    $urlData = [
        'url' => $url,
        'mime' => $mime,
    ];

    // Cached urls use $url as the key..
    if (isset($this->urls['regular'][$url])) {
        // Generate it if missing
        if ($this->urls['regular'][$url] === true) {
            $this->urls['regular'][$url] = $this->generateProcessedURL($urlData);
        }

        return $this->urls['regular'][$url];
    }

    if (isset($this->urls['inferred'][$url])) {
        return $this->urls['inferred'][$url];
    }

    return [];
}
||
702 | |||
/**
 * Execute custom logic for processing URLs prior to hierarchy generation.
 *
 * This can be used to implement logic such as ignoring specific components of URLs, or dropping extensions.
 * See implementors of {@link StaticSiteUrlProcessor}.
 *
 * @param array $urlData The unprocessed URLData
 * @return array The processed URLData
 * @throws \LogicException When given a blank URL, or the processor returns one
 */
public function generateProcessedURL(array $urlData): array
{
    if (empty($urlData['url'])) {
        throw new \LogicException("Can't pass a blank URL to generateProcessedURL");
    }

    if ($this->urlProcessor) {
        $urlData = $this->urlProcessor->processURL($urlData);
    }

    // Even if $urlData has a mime-type, it's useless without a URI
    if (!$urlData) {
        throw new \LogicException(get_class($this->urlProcessor) . " returned a blank URL.");
    }

    return $urlData;
}
||
730 | |||
/**
 * Return the URLs that are a child of the given URL.
 *
 * Matches the keys of both the 'regular' and 'inferred' buckets against
 * a pattern derived from the given URL's processed form.
 *
 * @param string $url
 * @return array List of child URL keys (de-duplicated)
 */
public function getChildren($url)
{
    if ($this->urls === null) {
        $this->loadUrls();
    }

    $processedURL = $this->processedURL($url);
    $processedURL = $processedURL['url'] ?? '/';

    // Subtly different regex if the URL ends in '?' or '/'
    if (preg_match('#[/?]$#', $processedURL)) {
        $regEx = '#^' . preg_quote($processedURL, '#') . '[^/?]+$#';
    } else {
        $regEx = '#^' . preg_quote($processedURL, '#') . '[/?][^/?]+$#';
    }

    // Cleanup: the original looped over values only to immediately
    // overwrite them with the key, and duplicated the loop body for
    // each bucket — iterate keys of both buckets in one pass instead
    $children = [];

    foreach ([$this->urls['regular'], $this->urls['inferred']] as $bucket) {
        foreach (array_keys($bucket) as $candidate) {
            if (preg_match($regEx, (string) $candidate)) {
                $children[$candidate] = $candidate;
            }
        }
    }

    return array_values($children);
}
||
775 | |||
/**
 * Simple property getter. Used in unit-testing.
 *
 * NOTE(review): uses a truthiness check, so properties holding falsy
 * values (0, '', [], false) return null just like unknown ones —
 * confirm callers only read truthy-valued properties.
 *
 * @param string $prop
 * @return mixed
 */
public function getProperty($prop)
{
    return $this->$prop ? $this->$prop : null;
}
||
788 | |||
789 | /** |
||
790 | * Get the serialized cache content and return the unserialized string |
||
791 | * |
||
792 | * @todo implement to replace x3 refs to unserialize(file_get_contents($this->cacheDir . 'urls')); |
||
793 | * @return string |
||
794 | */ |
||
795 | public function getCacheFileContents() |
||
805 | } |
||
806 | } |
||
807 |