Test Failed
Push — master ( 06def7...a41bab )
by Russell
05:46
created

StaticSiteUrlList::isRunningTest()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 3
c 1
b 0
f 0
dl 0
loc 7
rs 10
cc 2
nc 2
nop 0
1
<?php
2
3
namespace PhpTek\Exodus\Tool;
4
5
use PHPCrawl\Enums\PHPCrawlerUrlCacheTypes;
6
use PhpTek\Exodus\Model\StaticSiteContentSource;
7
use PhpTek\Exodus\Tool\StaticSiteUtils;
8
use PhpTek\Exodus\Tool\StaticSiteMimeProcessor;
9
use PhpTek\Exodus\Crawl\StaticSiteCrawler;
10
use SilverStripe\Control\Director;
11
use SilverStripe\Core\Config\Configurable;
12
use SilverStripe\Core\Environment;
13
use SilverStripe\Core\Injector\Injectable;
14
15
/**
16
 * Represents a set of URLs parsed from a site.
17
 *
18
 * Makes use of PHPCrawl to prepare a list of URLs on the site
19
 *
20
 * @package phptek/silverstripe-exodus
21
 * @author Sam Minee <[email protected]>
22
 * @author Russell Michell <[email protected]>
23
 */
24
25
class StaticSiteUrlList
26
{
27
    use Injectable;
28
    use Configurable;
29
30
    /**
31
     * @var string
32
     */
33
    public const CRAWL_STATUS_COMPLETE = 'Complete';
34
35
    /**
36
     * @var string
37
     */
38
    public const CRAWL_STATUS_PARTIAL = 'Partial';
39
40
    /**
41
     * @var string
42
     */
43
    public const CRAWL_STATUS_NOTSTARTED = 'Not started';
44
45
    /**
46
     *
47
     * @var string
48
     */
49
    private static $undefined_mime_type = 'unknown';
50
51
    /**
52
     *
53
     * @var string
54
     */
55
    protected $baseURL;
56
57
    /**
58
     *
59
     * @var string
60
     */
61
    protected $cacheDir;
62
63
    /**
64
     * Two element array: contains keys 'inferred' and 'regular':
65
     *  - 'regular' is an array mapping raw URLs to processed URLs
66
     *  - 'inferred' is an array of inferred URLs
67
     *
68
     * @var array
69
     */
70
    protected $urls = null;
71
72
    /**
73
     *
74
     * @var boolean
75
     */
76
    protected $autoCrawl = false;
77
78
    /**
79
     *
80
     * @var StaticSiteUrlProcessor
81
     */
82
    protected $urlProcessor = null;
83
84
    /**
85
     *
86
     * @var array
87
     */
88
    protected $extraCrawlURLs = null;
89
90
    /**
91
     * A list of regular expression patterns to exclude from scraping
92
     *
93
     * @var array
94
     */
95
    protected $excludePatterns = [];
96
97
    /**
98
     * The StaticSiteContentSource object
99
     *
100
     * @var StaticSiteContentSource
101
     */
102
    protected $source;
103
104
    /**
     * Create a new URL list for the given content source.
     *
     * The base URL is stored without a trailing slash and the cache directory
     * with one, so path fragments can be appended to either directly.
     *
     * @param StaticSiteContentSource $source   Source whose BaseUrl value is used as the site root
     * @param string                  $cacheDir The local path to cache data into
     */
    public function __construct(StaticSiteContentSource $source, $cacheDir)
    {
        // BaseUrl is read through the source's magic accessor and can be null;
        // normalise to a string so the string functions below never receive null.
        $baseURL = (string) $source->BaseUrl;

        // baseURL must not have a trailing slash
        if (substr($baseURL, -1) === '/') {
            $baseURL = substr($baseURL, 0, -1);
        }

        // cacheDir must have a trailing slash
        if (substr($cacheDir, -1) !== '/') {
            $cacheDir .= '/';
        }

        $this->baseURL = $baseURL;
        $this->cacheDir = $cacheDir;
        $this->source = $source;
    }
128
129
    /**
     * Set a URL processor for this URL List.
     *
     * URL processors process the URLs before the site hierarchy and any inferred metadata are generated.
     * These can be used to transform URLs from CMS's that don't provide a natural hierarchy, into something
     * more useful.
     *
     * @see {@link StaticSiteMOSSURLProcessor} for an example.
     * @param StaticSiteUrlProcessor|null $urlProcessor The processor to use, or null to disable URL processing
     * @return void
     */
    public function setUrlProcessor(?StaticSiteUrlProcessor $urlProcessor = null)
    {
        $this->urlProcessor = $urlProcessor;
    }
144
145
    /**
     * Store the list of additional URLs to crawl.
     *
     * The value is kept as given and handed back by getExtraCrawlURLs();
     * the intent is that these URLs are crawled in addition to the base URL,
     * which helps when pages are being missed by the crawl.
     *
     * @param array $extraCrawlURLs
     * @return void
     */
    public function setExtraCrawlURls($extraCrawlURLs)
    {
        $this->extraCrawlURLs = $extraCrawlURLs;
    }
157
158
    /**
     * Return the additional crawl URLs previously set on this list.
     *
     * @return array|null Null until setExtraCrawlURls() has been called
     */
    public function getExtraCrawlURLs()
    {
        return $this->extraCrawlURLs;
    }
167
168
    /**
     * Store the regular expression patterns for URLs that should be kept out
     * of the URL list.
     *
     * @param array $excludePatterns
     * @return void
     */
    public function setExcludePatterns(array $excludePatterns)
    {
        $this->excludePatterns = $excludePatterns;
    }
179
180
    /**
     * Return the regular expression patterns for URLs that should not be
     * added to the URL list.
     *
     * @return array Empty when no patterns have been set
     */
    public function getExcludePatterns()
    {
        return $this->excludePatterns;
    }
190
191
    /**
     * Enable or disable on-demand crawling.
     *
     * When enabled, loadUrls() starts a crawl if no completed cache exists.
     *
     * @param boolean $autoCrawl
     * @return StaticSiteUrlList This instance, for chaining
     */
    public function setAutoCrawl(bool $autoCrawl): StaticSiteUrlList
    {
        $this->autoCrawl = $autoCrawl;

        return $this;
    }
203
204
    /**
     * Report how far the spidering has progressed, judged by the cache files on disk.
     *
     * No 'urls' file means the crawl has not started; a 'urls' file alongside
     * a 'crawlerid' file means it is partial; a 'urls' file alone means it is complete.
     *
     * @return string One of the CRAWL_STATUS_* constants
     */
    public function getSpiderStatus(): string
    {
        if (!file_exists($this->cacheDir . 'urls')) {
            return self::CRAWL_STATUS_NOTSTARTED;
        }

        return file_exists($this->cacheDir . 'crawlerid')
            ? self::CRAWL_STATUS_PARTIAL
            : self::CRAWL_STATUS_COMPLETE;
    }
221
222
    /**
     * Raw URL+Mime data accessor, used by logic outside of this class.
     *
     * Prefers the in-memory list when it is non-empty; otherwise reads and
     * unserializes the 'urls' cache file directly. loadUrls() is deliberately
     * not used as it chokes on partially completed imports.
     *
     * @return mixed The URL data, or null if there is neither in-memory data nor a cache file
     */
    public function getRawCacheData()
    {
        if ($this->urls) {
            return $this->urls;
        }

        $cacheFile = $this->cacheDir . 'urls';

        if (!file_exists($cacheFile)) {
            return null;
        }

        return unserialize(file_get_contents($cacheFile));
    }
239
240
    /**
     * Return the number of URLs crawled so far. If the urlcache is incomplete or
     * doesn't exist, assumes zero.
     *
     * "Incomplete" means either the 'regular' or the 'inferred' list is missing
     * from the cached data.
     *
     * @return int Unique processed regular URLs plus all inferred URLs
     */
    public function getNumURIs(): int
    {
        $urls = $this->getRawCacheData();

        // Both lists must be present; the second list is read below.
        if (!$urls || !isset($urls['regular'], $urls['inferred'])) {
            return 0;
        }

        $regular = [];

        foreach ($urls['regular'] as $urlData) {
            $regular[] = $urlData['url'];
        }

        // Different raw URLs can share one processed URL, so regular entries are
        // de-duplicated by processed URL. Inferred entries are counted as stored.
        return count(array_unique($regular)) + count($urls['inferred']);
    }
269
270
    /**
     * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values.
     *
     * Inferred URLs have no raw form, so they appear with the processed URL as
     * both key and value.
     *
     * @return array Empty when no crawl has completed and auto-crawling is off
     */
    public function getProcessedURLs(): array
    {
        // Without a completed crawl or permission to start one there is nothing to
        // report; return an empty map so the declared array return type always holds.
        if (!$this->hasCrawled() && !$this->autoCrawl) {
            return [];
        }

        if ($this->urls === null) {
            $this->loadUrls();
        }

        $regular = [];

        // The "?? []" covers the case where loadUrls() returned without populating the list.
        foreach ($this->urls['regular'] ?? [] as $rawURL => $urlData) {
            $regular[$rawURL] = $urlData['url'];
        }

        $inferred = [];

        foreach ($this->urls['inferred'] ?? [] as $urlData) {
            $inferred[$urlData['url']] = $urlData['url'];
        }

        return array_merge($regular, $inferred);
    }
302
303
    /**
     * Whether a crawl has finished: the 'urls' cache file exists and no
     * 'crawlerid' file (written while a crawl is in progress) is present.
     *
     * @return boolean
     */
    public function hasCrawled(): bool
    {
        if (!file_exists($this->cacheDir . 'urls')) {
            return false;
        }

        return !file_exists($this->cacheDir . 'crawlerid');
    }
312
313
    /**
     * Load the URLs, either by fetching from cache or by crawling.
     *
     * - If a crawl has completed, the 'urls' cache file is unserialized into memory.
     * - Otherwise, if auto-crawling is enabled, a crawl is started.
     * - Otherwise a LogicException is thrown, unless isRunningTest() reports a test run,
     *   in which case the method returns without loading anything.
     *
     * @return void
     * @throws \LogicException
     */
    public function loadUrls(): void
    {
        if ($this->hasCrawled()) {
            $cached = unserialize(file_get_contents($this->cacheDir . 'urls'));

            // unserialize() gives false (or some other non-array) for a corrupt or
            // truncated cache file; start from an empty list rather than indexing into it.
            if (!is_array($cached)) {
                $cached = [];
            }

            // Older cache formats may lack one or both lists; make sure each exists.
            foreach (['regular', 'inferred'] as $listName) {
                if (!isset($cached[$listName])) {
                    $cached[$listName] = [];
                }
            }

            $this->urls = $cached;

            return;
        }

        if ($this->autoCrawl) {
            $this->crawl();

            return;
        }

        // This is grim, but we get to keep the useful check
        if (!$this->isRunningTest()) {
            // This happens if you move a cache-file out of the way during a real (non-test) run...
            $msg = 'Crawl hasn\'t been executed yet and autoCrawl is false. Has the cache file been moved?';
            throw new \LogicException($msg);
        }
    }
342
343
    /**
     * Heuristic check for a test run.
     *
     * True when SS_BASE_URL is 'http://localhost', or when a path exists that
     * equals the cache directory with every run of digits replaced by "0".
     *
     * @return boolean
     */
    private function isRunningTest(): bool
    {
        // Github tests have SS_BASE_URL set as follows
        if (Environment::getEnv('SS_BASE_URL') == 'http://localhost') {
            return true;
        }

        // Tests use "static-site-0" as cache dirname
        $testCacheDir = preg_replace('#[0-9]+#', '0', $this->cacheDir);

        return file_exists($testCacheDir);
    }
355
356
    /**
     * Re-run the URL processor over every fetched URL and persist the result.
     *
     * Needed when the site was crawled and the URL processor was changed afterwards:
     * the stored processed URLs and the inferred parents derived from them must be
     * rebuilt with the newly selected processor.
     *
     * @return void
     */
    public function reprocessUrls()
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Inferred URLs are derived data; drop them and let the loop below rebuild them
        $this->urls['inferred'] = [];

        foreach ($this->urls['regular'] as $rawURL => $rawURLData) {
            // Replace the stored entry with the freshly processed one...
            $this->urls['regular'][$rawURL] = $this->generateProcessedURL($rawURLData);

            // ...and back-fill any missing parent URLs for it
            $this->parentProcessedURL($this->urls['regular'][$rawURL]);
        }

        $this->saveURLs();
    }
383
384
    /**
     * Crawl the site from the base URL and cache the resulting URL list to disk.
     *
     * A 'crawlerid' file in the cache directory marks a crawl in progress. If one is
     * found, the previous crawl is resumed along with any partial URL list; otherwise
     * a new crawl is started. The marker is removed once the crawl finishes.
     *
     * @param mixed $limit   Passed through to StaticSiteCrawler::create() (presumably a cap
     *                       on crawled pages, with false meaning none; verify against the crawler)
     * @param bool  $verbose Passed through to StaticSiteCrawler::create()
     * @return StaticSiteCrawler The crawler that was run
     * @throws \Exception If the cache directory cannot be created
     */
    public function crawl($limit = false, $verbose = false)
    {
        Environment::increaseTimeLimitTo(3600);

        // Create the cache directory including missing parents. The trailing is_dir()
        // tolerates another process creating it between the check and the mkdir().
        if (
            !is_dir($this->cacheDir)
            && !mkdir($this->cacheDir, 0777, true)
            && !is_dir($this->cacheDir)
        ) {
            throw new \Exception('Unable to create cache directory at: ' . $this->cacheDir);
        }

        $crawler = StaticSiteCrawler::create($this, $limit, $verbose);
        $crawler->enableResumption();
        $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
        $crawler->setWorkingDirectory($this->cacheDir);

        // Find links in externally-linked CSS files
        if ($this->source->ParseCSS) {
            $crawler->addLinkSearchContentType("#text/css# i");
        }

        // Set some proxy options for phpCrawler
        singleton(StaticSiteUtils::class)->defineProxyOpts(!Director::isDev(), $crawler);

        // cacheDir always ends in a slash, so file names are appended without one
        $crawlerIdFile = $this->cacheDir . 'crawlerid';
        $urlsFile = $this->cacheDir . 'urls';
        $emptyList = [
            'regular' => [],
            'inferred' => [],
        ];

        // Allow for resuming an incomplete crawl
        if (file_exists($crawlerIdFile)) {
            // Re-load the partial list of URLs, if there is one
            $cached = file_exists($urlsFile) ? unserialize(file_get_contents($urlsFile)) : null;

            // Fall back to an empty list for unreadable data, and fill in whichever
            // list is missing so the ksort() calls below always receive arrays.
            $this->urls = is_array($cached) ? $cached + $emptyList : $emptyList;

            $crawler->resume(file_get_contents($crawlerIdFile));
        } else {
            file_put_contents($crawlerIdFile, $crawler->getCrawlerId());
            $this->urls = $emptyList;
        }

        $crawler->setURL($this->baseURL);
        $crawler->go();

        // The crawl has finished: drop the in-progress marker
        unlink($crawlerIdFile);

        // Store both lists ordered by URL key
        ksort($this->urls['regular']);
        ksort($this->urls['inferred']);

        $this->saveURLs();

        return $crawler;
    }
452
453
    /**
     * Persist the in-memory URL list: serialize it and write it to the 'urls'
     * file in the cache directory.
     *
     * @return void
     */
    public function saveURLs()
    {
        $cacheFile = $this->cacheDir . 'urls';
        $payload = serialize($this->urls);

        file_put_contents($cacheFile, $payload);
    }
462
463
    /**
     * Add a URL to this list, given the absolute URL.
     *
     * The URL must start with this list's base URL, compared after simplifyURL()
     * has normalised the scheme and "www." prefix. It is then made relative by
     * removing the leading scheme and host, and handed to addURL().
     *
     * @param string $url The absolute URL
     * @param string $content_type The Mime-Type found at this URL e.g text/html or image/png
     * @throws \InvalidArgumentException If the URL does not belong to the base URL
     * @return void
     */
    public function addAbsoluteURL($url, $content_type)
    {
        $simplifiedURL = $this->simplifyURL($url);
        $simplifiedBase = $this->simplifyURL($this->baseURL);

        // Check we're adhering to the correct base URL
        if (substr($simplifiedURL, 0, strlen($simplifiedBase)) != $simplifiedBase) {
            throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
        }

        // Anchored so only the leading scheme + host is removed: an absolute URL embedded
        // later in the string (e.g. inside a query string) must be left intact.
        // Case-insensitive to agree with simplifyURL().
        $relURL = preg_replace('#^https?://[^/]+#i', '', $url);

        $this->addURL($relURL, $content_type);
    }
485
486
    /**
     * Process a URL and store it in the 'regular' list, keyed by the unprocessed URL.
     *
     * Parent URLs missing from the list are back-filled as inferred URLs.
     *
     * @param string $url
     * @param string $contentType
     * @return void
     */
    public function addURL($url, $contentType)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Generate and save the processed URL data
        $processed = $this->generateProcessedURL([
            'url' => $url,
            'mime' => $contentType,
        ]);
        $this->urls['regular'][$url] = $processed;

        // Trigger parent URL back-filling
        $this->parentProcessedURL($processed);
    }
510
511
    /**
     * Add an inferred URL to the list.
     *
     * No unprocessed URL is available for an inferred entry, so the processed
     * URL doubles as its key. This should be used with some caution.
     *
     * @param array $inferredURLData Contains the processed URL ('url') and Mime-Type ('mime') to add
     * @return void
     */
    public function addInferredURL($inferredURLData)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        $inferredKey = $inferredURLData['url'];
        $this->urls['inferred'][$inferredKey] = $inferredURLData;

        // The inferred URL may itself have parents that are missing: back-fill those too
        $this->parentProcessedURL($inferredURLData);
    }
532
533
    /**
     * Return true if the given URL exists in either the regular or the inferred list.
     *
     * @param string $url The URL, either absolute, or relative starting with "/"
     * @return boolean Does the URL exist
     * @throws \InvalidArgumentException If an absolute URL does not belong to the base URL
     */
    public function hasURL($url)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Try and relativise an absolute URL. substr() rather than $url[0] so that an
        // empty string is handled without an uninitialised-offset warning.
        if (substr($url, 0, 1) !== '/') {
            $simplifiedURL = $this->simplifyURL($url);
            $simplifiedBase = $this->simplifyURL($this->baseURL);

            if (substr($simplifiedURL, 0, strlen($simplifiedBase)) != $simplifiedBase) {
                throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
            }

            $url = substr($simplifiedURL, strlen($simplifiedBase));
        }

        // Both lists are keyed by URL and hold URL-data arrays as values, so membership
        // is a key lookup; searching the values for the URL string can never match.
        return isset($this->urls['regular'][$url]) || isset($this->urls['inferred'][$url]);
    }
560
561
    /**
     * Simplify a URL. Ignores https/http differences and "www." / non differences.
     *
     * A leading "http://" or "https://", with or without a following "www.", is
     * rewritten to "http://www." (matched case-insensitively). Anything else,
     * such as a relative URL, is returned unchanged.
     *
     * @param  string $url
     * @return string
     * @todo Why does this ignore https/http differences? Should it?
     */
    public function simplifyURL($url)
    {
        // The dot is escaped so that only a literal "www." is swallowed; unescaped it
        // would also consume the first character of hosts such as "wwwx.example.com".
        return preg_replace('#^http(s)?://(www\.)?#i', 'http://www.', $url);
    }
572
573
    /**
     * Returns true if the given URL is a key of the regular or the inferred URL list.
     *
     * @param string $processedURL The processed URL
     * @return boolean True if it exists, false otherwise
     */
    public function hasProcessedURL($processedURL)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Direct key lookups: this runs once for every URL added to the list, so scanning
        // array_keys() each time made a crawl quadratic. isset() also copes with a
        // missing list instead of raising a TypeError.
        return isset($this->urls['regular'][$processedURL])
            || isset($this->urls['inferred'][$processedURL]);
    }
588
589
    /**
     * Return the URL data for the parent of the given processed URL.
     *
     * Both input and output are processed URLs. If the parent is not yet known
     * to the list, it is added as an inferred URL (which in turn back-fills its
     * own parents).
     *
     * @param array|string $processedURLData URLData comprising a relative URL and Mime-Type,
     *                                       or the relative URL on its own
     * @return array URLData ('url' and 'mime' keys) of the parent
     */
    public function parentProcessedURL($processedURLData)
    {
        $mime = self::$undefined_mime_type;
        $processedURL = $processedURLData;

        if (is_array($processedURLData)) {
            $processedURL = $processedURLData['url'];

            /*
             * If $processedURLData['url'] is not HTML, it's unlikely its parent
             * is anything useful (Prob just a directory), so the mime stays undefined
             */
            $mimeProcessor = singleton(StaticSiteMimeProcessor::class);

            if ($mimeProcessor->IsOfHtml($processedURLData['mime'])) {
                $mime = $processedURLData['mime'];
            }
        }

        // The root itself: its "parent" is the empty URL
        if ($processedURL == "/") {
            return [
                'url' => '',
                'mime' => $mime,
            ];
        }

        // URL hierarchy can be broken down by querystring or by URL
        $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/'));

        // Special case for children of the root
        if ($breakpoint == 0) {
            return [
                'url' => '/',
                'mime' => $mime,
            ];
        }

        // Everything before the last separator is the parent URL
        $parentURLData = [
            'url' => substr($processedURL, 0, $breakpoint),
            'mime' => $mime,
        ];

        // If an intermediary URL doesn't exist, create it
        if (!$this->hasProcessedURL($parentURLData['url'])) {
            $this->addInferredURL($parentURLData);
        }

        return $parentURLData;
    }
646
647
    /**
     * Look up the stored URL data for a URL in the regular list, then the inferred list.
     *
     * A regular entry stored as the placeholder value true is generated on
     * demand and written back before being returned.
     *
     * @param  mixed string | array $urlData The URL, or URLData with 'url' and 'mime' keys
     * @return array|null The stored URLData, or null if the URL is in neither list
     */
    public function processedURL($urlData)
    {
        if (is_array($urlData)) {
            $url = $urlData['url'];
            $mime = $urlData['mime'];
        } else {
            $url = $urlData;
            $mime = self::$undefined_mime_type;
        }

        if ($this->urls === null) {
            $this->loadUrls();
        }

        if (isset($this->urls['regular'][$url])) {
            // Generate it if missing
            if ($this->urls['regular'][$url] === true) {
                $this->urls['regular'][$url] = $this->generateProcessedURL([
                    'url' => $url,
                    'mime' => $mime,
                ]);
            }

            return $this->urls['regular'][$url];
        }

        if (isset($this->urls['inferred'][$url])) {
            return $this->urls['inferred'][$url];
        }

        return null;
    }
683
684
    /**
     * Execute custom logic for processing URLs prior to hierarchy generation.
     *
     * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions.
     * With no URL processor set, the URLData is returned as given.
     *
     * @param  array $urlData The unprocessed URLData
     * @return array $urlData The processed URLData
     * @throws \LogicException If no 'url' is given, or the processor returns an empty result
     */
    public function generateProcessedURL(array $urlData): array
    {
        if (!isset($urlData['url'])) {
            throw new \LogicException("Can't pass a blank URL to generateProcessedURL");
        }

        // Nothing to do without a processor
        if (!$this->urlProcessor) {
            return $urlData;
        }

        $processedData = $this->urlProcessor->processURL($urlData);

        if (!$processedData) {
            throw new \LogicException(get_class($this->urlProcessor) . " returned a blank URL.");
        }

        return $processedData;
    }
709
710
    /**
     * Return the URLs that are a direct child of the given URL.
     *
     * The given URL is resolved through processedURL() (falling back to "/" when
     * unknown). Keys of the regular and inferred lists that extend it by exactly
     * one further path or querystring segment are returned.
     *
     * @param string $url
     * @return array List of matching URL keys, regular ones first, without duplicates
     */
    public function getChildren($url)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        $parentData = $this->processedURL($url);
        $parentURL = $parentData['url'] ?? '/';

        // Subtly different regex if the URL ends in '?' or '/': the separator
        // is then already part of the parent and must not be matched again
        $tail = preg_match('#[/?]$#', $parentURL) ? '[^/?]+$#' : '[/?][^/?]+$#';
        $regEx = '#^' . preg_quote($parentURL, '#') . $tail;

        $children = [];

        foreach (['regular', 'inferred'] as $listName) {
            foreach ($this->urls[$listName] as $candidate => $unusedData) {
                if (preg_match($regEx, $candidate) && !isset($children[$candidate])) {
                    $children[$candidate] = $candidate;
                }
            }
        }

        return array_values($children);
    }
754
755
    /**
     * Simple property getter. Used in unit-testing.
     *
     * @param string $prop Name of the property to read
     * @return mixed The property's value, or null when that value is falsy
     */
    public function getProperty($prop)
    {
        return $this->$prop ?: null;
    }
767
768
    /**
     * Read the 'urls' cache file and return its unserialized content.
     *
     * @todo implement to replace x3 refs to unserialize(file_get_contents($this->cacheDir . 'urls'));
     * @return mixed The unserialized cache data, or an empty string if there is no cache file
     */
    public function getCacheFileContents()
    {
        $cacheFile = $this->cacheDir . 'urls';

        if (!file_exists($cacheFile)) {
            return '';
        }

        return unserialize(file_get_contents($cacheFile));
    }
785
}
786