Total Complexity | 98
Total Lines      | 784
Duplicated Lines | 0 %
Changes          | 8
Bugs             | 2
Features         | 0
Complex classes like StaticSiteUrlList often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined which fields belong together, you can apply the Extract Class refactoring (see the sketch below). If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use StaticSiteUrlList and, based on these observations, apply Extract Interface as well.
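As a concrete illustration of the first step: the fields and methods that read and write the crawl cache (cacheDir, getRawCacheData(), saveURLs(), getCacheFileContents(), and the 'urls'/'crawlerid' file checks in getSpiderStatus() and hasCrawled()) form one such cohesive component. Below is a minimal Extract Class sketch; the class and method names are hypothetical, not part of the module.

```php
// Hypothetical extraction target; all names here are illustrative.
class StaticSiteUrlCache
{
    /**
     * @var string
     */
    protected $cacheDir;

    public function __construct(string $cacheDir)
    {
        $this->cacheDir = $cacheDir;
    }

    // True once a crawl has written its URL list to disk
    public function hasUrls(): bool
    {
        return file_exists($this->cacheDir . 'urls');
    }

    // True while a resumable crawl is still in progress
    public function isPartial(): bool
    {
        return file_exists($this->cacheDir . 'crawlerid');
    }

    // Returns the cached URL data, or null if nothing has been cached yet
    public function load(): ?array
    {
        if (!$this->hasUrls()) {
            return null;
        }

        $data = unserialize(file_get_contents($this->cacheDir . 'urls'));

        return is_array($data) ? $data : null;
    }

    public function save(array $urls): void
    {
        file_put_contents($this->cacheDir . 'urls', serialize($urls));
    }
}
```

StaticSiteUrlList would then hold a StaticSiteUrlCache instead of a raw $cacheDir string. For Extract Interface, the contract other classes actually rely on (hasURL(), getProcessedURLs(), getChildren(), addAbsoluteURL() and so on) could be captured in an interface, so that callers such as StaticSiteCrawler (which receives $this in crawl()) depend on that contract rather than on the concrete class.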
The StaticSiteUrlList class as analyzed:

```php
<?php

/*
 * NOTE: The report elides source lines 2-25 (namespace and use statements).
 * Judging by the classes referenced below, these include
 * SilverStripe\Core\Injector\Injectable, SilverStripe\Core\Config\Configurable,
 * SilverStripe\Core\Environment, SilverStripe\Control\Director, and the
 * module's own StaticSite* classes.
 */

class StaticSiteUrlList
{
    use Injectable;
    use Configurable;

    /**
     * @var string
     */
    public const CRAWL_STATUS_COMPLETE = 'Complete';

    /**
     * @var string
     */
    public const CRAWL_STATUS_PARTIAL = 'Partial';

    /**
     * @var string
     */
    public const CRAWL_STATUS_NOTSTARTED = 'Not started';

    /**
     * @var string
     */
    private static $undefined_mime_type = 'unknown/unknown';

    /**
     * @var string
     */
    protected $baseURL;

    /**
     * @var string
     */
    protected $cacheDir;

    /**
     * Two-element array with keys 'regular' and 'inferred':
     * - 'regular' is an array mapping raw URLs to processed URL data
     * - 'inferred' is an array of inferred URL data
     *
     * @var array
     */
    protected $urls = null;

    /**
     * @var boolean
     */
    protected $autoCrawl = false;

    /**
     * @var StaticSiteUrlProcessor
     */
    protected $urlProcessor = null;

    /**
     * @var array
     */
    protected $extraCrawlURLs = null;

    /**
     * A list of regular expression patterns to exclude from scraping
     *
     * @var array
     */
    protected $excludePatterns = [];

    /**
     * The StaticSiteContentSource object
     *
     * @var StaticSiteContentSource
     */
    protected $source;

    /**
     * Create a new URL List
     *
     * @param StaticSiteContentSource $source
     * @param string $cacheDir The local path to cache data into
     */
    public function __construct(StaticSiteContentSource $source, $cacheDir)
    {
        $this->setIsRunningTest();

        // baseURL must not have a trailing slash
        $baseURL = $source->BaseUrl;

        if (substr($baseURL, -1) == "/") {
            $baseURL = substr($baseURL, 0, -1);
        }

        // cacheDir must have a trailing slash
        if (substr($cacheDir, -1) != "/") {
            $cacheDir .= "/";
        }

        $this->baseURL = $baseURL;
        $this->cacheDir = $cacheDir;
        $this->source = $source;
    }

    /**
     * Set a URL processor for this URL List.
     *
     * URL processors process the URLs before the site hierarchy and any inferred metadata are generated.
     * These can be used to transform URLs from CMSs that don't provide a natural hierarchy into something
     * more useful.
     *
     * @see {@link StaticSiteMOSSURLProcessor} for an example.
     * @param StaticSiteUrlProcessor $urlProcessor
     * @return void
     */
    public function setUrlProcessor(StaticSiteUrlProcessor $urlProcessor = null)
    {
        $this->urlProcessor = $urlProcessor;
    }

    /**
     * Define additional crawl URLs as an array.
     * Each of these URLs will be crawled in addition to the base URL.
     * This can be helpful if pages are getting missed by the crawl.
     *
     * @param array $extraCrawlURLs
     * @return void
     */
    public function setExtraCrawlURls($extraCrawlURLs)
    {
        $this->extraCrawlURLs = $extraCrawlURLs;
    }

    /**
     * Return the additional crawl URLs as an array
     *
     * @return array
     */
    public function getExtraCrawlURLs()
    {
        return $this->extraCrawlURLs;
    }

    /**
     * Set an array of regular expression patterns that should be excluded from
     * being added to the URL list.
     *
     * @param array $excludePatterns
     * @return void
     */
    public function setExcludePatterns(array $excludePatterns)
    {
        $this->excludePatterns = $excludePatterns;
    }

    /**
     * Get the array of regular expression patterns that should not be added to
     * the URL list.
     *
     * @return array
     */
    public function getExcludePatterns()
    {
        return $this->excludePatterns;
    }

    /**
     * Set whether the crawl should be triggered on demand.
     *
     * @param boolean $autoCrawl
     * @return StaticSiteUrlList
     */
    public function setAutoCrawl(bool $autoCrawl): StaticSiteUrlList
    {
        $this->autoCrawl = $autoCrawl;

        return $this;
    }

    /**
     * Returns the status of the spidering.
     *
     * @return string
     */
    public function getSpiderStatus(): string
    {
        if (file_exists($this->cacheDir . 'urls')) {
            if (file_exists($this->cacheDir . 'crawlerid')) {
                return self::CRAWL_STATUS_PARTIAL;
            }

            return self::CRAWL_STATUS_COMPLETE;
        }

        return self::CRAWL_STATUS_NOTSTARTED;
    }
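
    // For reference, getSpiderStatus() reduces to this truth table over the
    // two cache files (derived directly from the checks above):
    //
    //   urls file | crawlerid file | status
    //   ----------+----------------+---------------
    //   absent    | (either)       | 'Not started'
    //   present   | present        | 'Partial'
    //   present   | absent         | 'Complete'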

    /**
     * Raw URL + Mime data accessor method, used by logic outside of the class.
     *
     * @return array|null The cached URL/Mime data, or null if none found
     */
    public function getRawCacheData()
    {
        if ($this->urls) {
            // Don't rely on loadUrls() as it chokes on partially completed imports
            $urls = $this->urls;
        } elseif (file_exists($this->cacheDir . 'urls')) {
            $urls = unserialize(file_get_contents($this->cacheDir . 'urls'));
        } else {
            return null;
        }

        return $urls;
    }

    /**
     * Return the number of URLs crawled so far. If the URL cache is incomplete or
     * doesn't exist, assumes zero.
     *
     * @return int
     */
    public function getNumURIs(): int
    {
        if (!$urls = $this->getRawCacheData()) {
            return 0;
        }

        if (!isset($urls['regular']) || !isset($urls['inferred'])) {
            return 0;
        }

        $_regular = [];
        $_inferred = [];

        foreach ($urls['regular'] as $key => $urlData) {
            array_push($_regular, $urlData['url']);
        }

        foreach ($urls['inferred'] as $key => $urlData) {
            array_push($_inferred, $urlData['url']);
        }

        return count(array_unique($_regular)) + count($_inferred);
    }

    /**
     * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values
     *
     * @return array
     */
    public function getProcessedURLs(): array
    {
        if ($this->hasCrawled() || $this->autoCrawl) {
            if ($this->urls === null) {
                $this->loadUrls();
            }

            $_regular = [];
            $_inferred = null;

            foreach ($this->urls['regular'] as $key => $urlData) {
                $_regular[$key] = $urlData['url'];
            }

            if ($this->urls['inferred']) {
                $_inferred = [];
                foreach ($this->urls['inferred'] as $key => $urlData) {
                    $_inferred[$key] = $urlData['url'];
                }
            }

            return array_merge(
                $_regular,
                $_inferred ? array_combine($_inferred, $_inferred) : []
            );
        }

        // Not crawled yet and autoCrawl is off: nothing to report
        return [];
    }
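
    // Shape of the cache for orientation (the values are illustrative):
    //
    //   $this->urls = [
    //       'regular' => [
    //           // raw URL => processed URLData, as written by addURL()
    //           '/about-us.aspx' => ['url' => '/about-us', 'mime' => 'text/html'],
    //       ],
    //       'inferred' => [
    //           // processed URL => URLData, as written by addInferredURL()
    //           '/about' => ['url' => '/about', 'mime' => 'unknown/unknown'],
    //       ],
    //   ];
    //
    // getProcessedURLs() would then return, e.g.:
    //   ['/about-us.aspx' => '/about-us', '/about' => '/about']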

    /**
     * There are URLs and we're not in the middle of a crawl.
     *
     * @return boolean
     */
    public function hasCrawled(): bool
    {
        return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid');
    }

    /**
     * Load the URLs, either by crawling, or by fetching from cache.
     *
     * @return void
     * @throws \LogicException
     */
    public function loadUrls(): void
    {
        if ($this->hasCrawled()) {
            $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));

            // Clear out obsolete format
            if (!isset($this->urls['regular'])) {
                $this->urls['regular'] = [];
            }
            if (!isset($this->urls['inferred'])) {
                $this->urls['inferred'] = [];
            }
        } elseif ($this->autoCrawl) {
            $this->crawl();
        } else {
            // This happens if you move a cache-file out of the way during a real (non-test) run...
            $msg = 'Crawl hasn\'t been executed yet and autoCrawl is false. Has the cache file been moved?';
            throw new \LogicException($msg);
        }
    }

    /**
     * @return void
     */
    private function setIsRunningTest(): void
    {
        $isGithub = Environment::getEnv('SS_BASE_URL') == 'http://localhost'; // GitHub tests have SS_BASE_URL set

        if ($isGithub && !file_exists(ASSETS_PATH)) {
            mkdir(ASSETS_PATH, 0777, true);
        }
    }

    /**
     * Re-execute the URL processor on all the fetched URLs.
     * If the site has been crawled and then subsequently the URLProcessor was changed through
     * user-interaction in the "external content" CMS admin, then we need to ensure that
     * URLs are re-processed using the newly selected URL Preprocessor.
     *
     * @return void
     */
    public function reprocessUrls()
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Clear out all inferred URLs; these will be re-added
        $this->urls['inferred'] = [];

        // Reprocess URLs, in case the processing has changed since the last crawl
        foreach ($this->urls['regular'] as $url => $urlData) {
            // TODO Log this in exodus.log
            if (empty($urlData['url'])) {
                continue;
            }

            $processedURLData = $this->generateProcessedURL($urlData);
            $this->urls['regular'][$url] = $processedURLData;

            // Trigger parent URL back-filling on new processed URL
            $this->parentProcessedURL($processedURLData);
        }

        $this->saveURLs();
    }

    /**
     * @param int|false $limit
     * @param bool $verbose
     * @return StaticSiteCrawler
     * @throws \Exception
     */
    public function crawl($limit = false, $verbose = false)
    {
        Environment::increaseTimeLimitTo(3600);

        if (!is_dir($this->cacheDir)) {
            if (!mkdir($this->cacheDir)) {
                throw new \Exception('Unable to create cache directory at: ' . $this->cacheDir);
            }
        }

        $crawler = StaticSiteCrawler::create($this, $limit, $verbose);
        $crawler->enableResumption();
        $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
        $crawler->setWorkingDirectory($this->cacheDir);

        // Find links in externally-linked CSS files
        if ($this->source->ParseCSS) {
            $crawler->addLinkSearchContentType("#text/css# i");
        }

        // Set some proxy options for phpCrawler
        singleton(StaticSiteUtils::class)->defineProxyOpts(!Director::isDev(), $crawler);

        // Allow for resuming an incomplete crawl
        if (file_exists($this->cacheDir . 'crawlerid')) {
            // We should re-load the partial list of URLs, if relevant
            // This should only happen when we are resuming a partial crawl
            if (file_exists($this->cacheDir . 'urls')) {
                $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls'));
            } else {
                $this->urls = [
                    'regular' => [],
                    'inferred' => [],
                ];
            }

            $crawlerID = file_get_contents($this->cacheDir . 'crawlerid');
            $crawler->resume($crawlerID);
        } else {
            $crawlerID = $crawler->getCrawlerId();

            $this->urls = [
                'regular' => [],
                'inferred' => [],
            ];
        }

        $crawler->setURL($this->baseURL);
        $crawler->setPort(preg_match('#^https#', $this->baseURL) ? 443 : 80);
        $crawler->go();

        // TODO Why were we deleting this originally?
        // unlink($this->cacheDir . 'crawlerid');

        // Keep both halves of the cache sorted by key (URL)
        ksort($this->urls['regular']);
        ksort($this->urls['inferred']);

        $this->saveURLs();

        return $crawler;
    }

    /**
     * Cache the current list of URLs to disk.
     *
     * @return void
     */
    public function saveURLs()
    {
        // The report elides source lines 465-466. Given the matching
        // unserialize(file_get_contents($this->cacheDir . 'urls')) reads
        // elsewhere in the class, the body is presumably the inverse write:
        file_put_contents($this->cacheDir . 'urls', serialize($this->urls));
    }

    /**
     * Add a URL to this list, given the absolute URL.
     *
     * @param string $url The absolute URL
     * @param string $content_type The Mime-Type found at this URL e.g. text/html or image/png
     * @throws \InvalidArgumentException
     * @return void
     */
    public function addAbsoluteURL($url, $content_type)
    {
        $simplifiedURL = $this->simplifyURL($url);
        $simplifiedBase = $this->simplifyURL($this->baseURL);

        // Check we're adhering to the correct base URL
        if (substr($simplifiedURL, 0, strlen($simplifiedBase)) == $simplifiedBase) {
            $relURL = preg_replace("#https?://(www.)?[^/]+#", '', $url);
        } else {
            throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
        }

        $this->addURL($relURL, $content_type);
    }

    /**
     * Appends a processed URL onto the URL cache.
     *
     * @param string $url
     * @param string $contentType
     * @return null|void
     */
    public function addURL($url, $contentType)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        if (empty($url)) {
            return null;
        }

        // Generate and save the processed URLs
        $urlData = [
            'url' => $url,
            'mime' => $contentType,
        ];

        $this->urls['regular'][$url] = $this->generateProcessedURL($urlData);

        // Trigger parent URL back-filling
        $this->parentProcessedURL($this->urls['regular'][$url]);
    }

    /**
     * Add an inferred URL to the list.
     *
     * Since the unprocessed URL isn't available, we use the processed URL in its place.
     * This should be used with some caution.
     *
     * @param array $inferredURLData Contains the processed URL and Mime-Type to add
     * @return void
     */
    public function addInferredURL($inferredURLData)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Generate and save the processed URLs
        $this->urls['inferred'][$inferredURLData['url']] = $inferredURLData;

        // Trigger parent URL back-filling
        $this->parentProcessedURL($inferredURLData);
    }

    /**
     * Return true if the given URL exists.
     *
     * @param string $url The URL, either absolute, or relative starting with "/"
     * @return boolean Does the URL exist
     * @throws \InvalidArgumentException
     */
    public function hasURL($url)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Try and relativise an absolute URL
        if ($url[0] != '/') {
            $simplifiedURL = $this->simplifyURL($url);
            $simplifiedBase = $this->simplifyURL($this->baseURL);

            if (substr($simplifiedURL, 0, strlen($simplifiedBase)) == $simplifiedBase) {
                $url = substr($simplifiedURL, strlen($simplifiedBase));
            } else {
                throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
            }
        }

        return isset($this->urls['regular'][$url]) || in_array($url, $this->urls['inferred']);
    }

    /**
     * Simplify a URL. Ignores https/http differences and "www." / non differences.
     *
     * @param string $url
     * @return string
     * @todo Why does this ignore https/http differences? Should it?
     */
    public function simplifyURL($url)
    {
        return preg_replace("#^http(s)?://(www.)?#i", 'http://www.', $url);
    }
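
    // e.g. simplifyURL('https://example.org/contact')    => 'http://www.example.org/contact'
    //      simplifyURL('http://www.example.org/contact') => 'http://www.example.org/contact'
    // so scheme and leading "www." never affect URL comparisons.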

    /**
     * Returns true if the given URL is in the list of processed URLs
     *
     * @param string $processedURL The processed URL
     * @return boolean True if it exists, false otherwise
     */
    public function hasProcessedURL($processedURL)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        return in_array($processedURL, array_keys($this->urls['regular'])) ||
            in_array($processedURL, array_keys($this->urls['inferred']));
    }

    /**
     * Return the processed URL that is the parent of the given one.
     *
     * Both input and output are processed URLs.
     *
     * @param array $processedURLData URLData comprising a relative URL and Mime-Type
     * @return array
     */
    public function parentProcessedURL(array $processedURLData): array
    {
        $mime = self::$undefined_mime_type;
        $processedURL = $processedURLData;

        if (is_array($processedURLData)) {
            if (empty($processedURLData['url'])) {
                $processedURLData['url'] = '/'; // This will be dealt with, with the selected duplication strategy
            }

            if (empty($processedURLData['mime'])) {
                $processedURLData['mime'] = self::$undefined_mime_type;
            }

            /*
             * If $processedURLData['url'] is not HTML, it's unlikely its parent
             * is anything useful (probably just a directory)
             */
            $sng = singleton(StaticSiteMimeProcessor::class);
            $mime = $sng->IsOfHtml($processedURLData['mime']) ?
                $processedURLData['mime'] :
                self::$undefined_mime_type;
            $processedURL = $processedURLData['url'];
        }

        $default = function ($fragment) use ($mime) {
            return [
                'url' => $fragment,
                'mime' => $mime,
            ];
        };

        if ($processedURL == "/") {
            return $default('');
        }

        // URL hierarchy can be broken down by querystring or by URL
        $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/'));

        // Special case for children of the root
        if ($breakpoint == 0) {
            return $default('/');
        }

        // Get parent URL
        $parentProcessedURL = substr($processedURL, 0, $breakpoint);

        $processedURLData = [
            'url' => $parentProcessedURL,
            'mime' => $mime,
        ];

        // If an intermediary URL doesn't exist, create it
        if (!$this->hasProcessedURL($parentProcessedURL)) {
            $this->addInferredURL($processedURLData);
        }

        return $processedURLData;
    }
667 | /** |
||
668 | * Find the processed URL in the URL list |
||
669 | * |
||
670 | * @param mixed string | array $urlData |
||
671 | * @return array |
||
672 | * @todo Under what circumstances would $this->urls['regular'][$url] === true (line ~696)? |
||
673 | */ |
||
674 | public function processedURL($urlData): array |
||
675 | { |
||
676 | // Load-up the cache into memory |
||
677 | if ($this->urls === null) { |
||
678 | $this->loadUrls(); |
||
679 | } |
||
680 | |||
681 | if (is_array($urlData)) { |
||
682 | $url = $urlData['url']; |
||
683 | $mime = $urlData['mime']; |
||
684 | } else { |
||
685 | $url = $urlData; |
||
686 | $mime = self::$undefined_mime_type; |
||
687 | } |
||
688 | |||
689 | $urlData = [ |
||
690 | 'url' => $url, |
||
691 | 'mime' => $mime, |
||
692 | ]; |
||
693 | |||
694 | // Cached urls use $url as the key.. |
||
695 | if (isset($this->urls['regular'][$url])) { |
||
696 | // Generate it if missing |
||
697 | if ($this->urls['regular'][$url] === true) { |
||
698 | $this->urls['regular'][$url] = $this->generateProcessedURL($urlData); |
||
699 | } |
||
700 | |||
701 | return $this->urls['regular'][$url]; |
||
702 | } elseif(isset($this->urls['inferred'][$url])) { |
||
703 | return $this->urls['inferred'][$url]; |
||
704 | } |
||
705 | |||
706 | return []; |
||
707 | } |
||
708 | |||
709 | /** |
||
710 | * Execute custom logic for processing URLs prior to heirachy generation. |
||
711 | * |
||
712 | * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions. |
||
713 | * |
||
714 | * @param array $urlData The unprocessed URLData |
||
715 | * @return array $urlData The processed URLData |
||
716 | * @throws \LogicException |
||
717 | */ |
||
718 | public function generateProcessedURL(array $urlData): array |
||
719 | { |
||
720 | if (empty($urlData['url'])) { |
||
721 | throw new \LogicException("Can't pass a blank URL to generateProcessedURL"); |
||
722 | } |
||
723 | |||
724 | if ($this->urlProcessor) { |
||
725 | $urlData = $this->urlProcessor->processURL($urlData); |
||
726 | } |
||
727 | |||
728 | if (!$urlData) { |
||
729 | //return []; // Even if $urlData has a mime-type, it's useless without a URI |
||
730 | throw new \LogicException(get_class($this->urlProcessor) . " returned a blank URL."); |
||
731 | } |
||
732 | |||
733 | return $urlData; |
||
734 | } |
||
735 | |||
736 | /** |
||
737 | * Return the URLs that are a child of the given URL |
||
738 | * |
||
739 | * @param string $url |
||
740 | * @return array |
||
741 | */ |
||
742 | public function getChildren($url) |
||
743 | { |
||
744 | if ($this->urls === null) { |
||
745 | $this->loadUrls(); |
||
746 | } |
||
747 | |||
748 | $processedURL = $this->processedURL($url); |
||
749 | $processedURL = $processedURL['url'] ?? '/'; |
||
750 | |||
751 | // Subtly different regex if the URL ends in '?' or '/' |
||
752 | if (preg_match('#[/?]$#', $processedURL)) { |
||
753 | $regEx = '#^' . preg_quote($processedURL, '#') . '[^/?]+$#'; |
||
754 | } else { |
||
755 | $regEx = '#^' . preg_quote($processedURL, '#') . '[/?][^/?]+$#'; |
||
756 | } |
||
757 | |||
758 | $children = []; |
||
759 | |||
760 | foreach ($this->urls['regular'] as $urlKey => $potentialProcessedChild) { |
||
761 | $potentialProcessedChild = $urlKey; |
||
762 | if (preg_match($regEx, $potentialProcessedChild)) { |
||
763 | if (!isset($children[$potentialProcessedChild])) { |
||
764 | $children[$potentialProcessedChild] = $potentialProcessedChild; |
||
765 | } |
||
766 | } |
||
767 | } |
||
768 | |||
769 | foreach ($this->urls['inferred'] as $urlKey => $potentialProcessedChild) { |
||
770 | $potentialProcessedChild = $urlKey; |
||
771 | if (preg_match($regEx, $potentialProcessedChild)) { |
||
772 | if (!isset($children[$potentialProcessedChild])) { |
||
773 | $children[$potentialProcessedChild] = $potentialProcessedChild; |
||
774 | } |
||
775 | } |
||
776 | } |
||
777 | |||
778 | return array_values($children); |
||
779 | } |
||
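
    // e.g. for $processedURL '/about' the pattern is '#^/about[/?][^/?]+$#':
    //   '/about/team'      matches (direct child)
    //   '/about?page=2'    matches (querystring child)
    //   '/about/team/bios' does not match (grandchild)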

    /**
     * Simple property getter. Used in unit-testing.
     *
     * @param string $prop
     * @return mixed
     */
    public function getProperty($prop)
    {
        if ($this->$prop) {
            return $this->$prop;
        }
    }

    /**
     * Get the serialized cache content and return the unserialized string
     *
     * @todo implement to replace x3 refs to unserialize(file_get_contents($this->cacheDir . 'urls'));
     * @return string
     */
    public function getCacheFileContents()
    {
        // The report elides source lines 801-809. Judging by the docblock and
        // the three reads it is meant to replace, the body presumably reads
        // and unserializes the 'urls' cache file, along these lines:
        $cache = '';
        $cacheFile = $this->cacheDir . 'urls';

        if (file_exists($cacheFile)) {
            $cache = unserialize(file_get_contents($cacheFile));
        }

        return $cache;
    }
}
```
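
For orientation, here is a minimal usage sketch of the class as listed above. The setup lines are assumptions (a StaticSiteContentSource record with its BaseUrl set, and a writable cache path); the calls on the list itself are taken from the listing, with create() supplied by the Injectable trait.

```php
// Assumed setup: a content-source record with a BaseUrl (hypothetical values).
$source = StaticSiteContentSource::create();
$source->BaseUrl = 'https://example.org/'; // the constructor strips this trailing slash

// The constructor appends a trailing slash to the cache dir if it is missing.
$urlList = StaticSiteUrlList::create($source, ASSETS_PATH . '/static-site-cache');

$urlList->setAutoCrawl(true); // crawl lazily on the first loadUrls() call

echo $urlList->getSpiderStatus(); // 'Not started' until a 'urls' cache file exists

$map = $urlList->getProcessedURLs(); // triggers the crawl, then raw => processed URLs
$children = $urlList->getChildren('/about');
```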