Test Failed
Push — master ( 7a0753...802928 )
by Russell
12:42 queued 13s
created

StaticSiteUrlList::setIsRunningTest()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 3
c 0
b 0
f 0
nc 2
nop 0
dl 0
loc 6
rs 10
1
<?php
2
3
namespace PhpTek\Exodus\Tool;
4
5
use PHPCrawl\Enums\PHPCrawlerMultiProcessModes;
6
use PHPCrawl\Enums\PHPCrawlerUrlCacheTypes;
7
use PhpTek\Exodus\Model\StaticSiteContentSource;
8
use PhpTek\Exodus\Tool\StaticSiteUtils;
9
use PhpTek\Exodus\Tool\StaticSiteMimeProcessor;
10
use PhpTek\Exodus\Crawl\StaticSiteCrawler;
11
use SilverStripe\Control\Director;
12
use SilverStripe\Core\Config\Configurable;
13
use SilverStripe\Core\Environment;
14
use SilverStripe\Core\Injector\Injectable;
15
16
/**
17
 * Represents a set of URLs parsed from a site.
18
 *
19
 * Makes use of PHPCrawl to prepare a list of URLs on the site
20
 *
21
 * @package phptek/silverstripe-exodus
22
 * @author Sam Minee <[email protected]>
23
 * @author Russell Michell <[email protected]>
24
 */
25
26
class StaticSiteUrlList
27
{
28
    use Injectable;
29
    use Configurable;
30
31
    /**
32
     * @var string
33
     */
34
    public const CRAWL_STATUS_COMPLETE = 'Complete';
35
36
    /**
37
     * @var string
38
     */
39
    public const CRAWL_STATUS_PARTIAL = 'Partial';
40
41
    /**
42
     * @var string
43
     */
44
    public const CRAWL_STATUS_NOTSTARTED = 'Not started';
45
46
    /**
47
     *
48
     * @var string
49
     */
50
    private static $undefined_mime_type = 'unknown/unknown';
51
52
    /**
53
     *
54
     * @var string
55
     */
56
    protected $baseURL;
57
58
    /**
59
     *
60
     * @var string
61
     */
62
    protected $cacheDir;
63
64
    /**
65
     * Two element array: contains keys 'inferred' and 'regular':
66
     *  - 'regular' is an array mapping raw URLs to processed URLs
67
     *  - 'inferred' is an array of inferred URLs
68
     *
69
     * @var array
70
     */
71
    protected $urls = null;
72
73
    /**
74
     *
75
     * @var boolean
76
     */
77
    protected $autoCrawl = false;
78
79
    /**
80
     *
81
     * @var StaticSiteUrlProcessor
82
     */
83
    protected $urlProcessor = null;
84
85
    /**
86
     *
87
     * @var array
88
     */
89
    protected $extraCrawlURLs = null;
90
91
    /**
92
     * A list of regular expression patterns to exclude from scraping
93
     *
94
     * @var array
95
     */
96
    protected $excludePatterns = [];
97
98
    /**
99
     * The StaticSiteContentSource object
100
     *
101
     * @var StaticSiteContentSource
102
     */
103
    protected $source;
104
105
    /**
106
     * Create a new URL List
107
     * @param StaticSiteContentSource $source
108
     * @param string $cacheDir The local path to cache data into
109
     * @return void
110
     */
111
    public function __construct(StaticSiteContentSource $source, $cacheDir)
    {
        $this->setIsRunningTest();

        // BaseUrl is read dynamically from the source and may be null (e.g. when it
        // has not been set yet). Cast it so the string functions below never
        // receive null, which is deprecated from PHP 8.1 onwards.
        $baseURL = (string) $source->BaseUrl;

        // baseURL must not have a trailing slash (only a single one is removed)
        if (substr($baseURL, -1) === '/') {
            $baseURL = substr($baseURL, 0, -1);
        }

        // cacheDir must have a trailing slash, as file names are appended to it directly
        if (substr($cacheDir, -1) !== '/') {
            $cacheDir .= '/';
        }

        $this->baseURL = $baseURL;
        $this->cacheDir = $cacheDir;
        $this->source = $source;
    }
131
132
    /**
133
     * Set a URL processor for this URL List.
134
     *
135
     * URL processors process the URLs before the site hierarchy and any inferred metadata are generated.
136
     * These can be used to transform URLs from CMSs that don't provide a natural hierarchy, into something
137
     * more useful.
138
     *
139
     * @see {@link StaticSiteMOSSURLProcessor} for an example.
140
     * @param StaticSiteUrlProcessor $urlProcessor
141
     * @return void
142
     */
143
    public function setUrlProcessor(?StaticSiteUrlProcessor $urlProcessor = null)
    {
        // The parameter is explicitly nullable: relying on a null default to make a
        // typed parameter nullable is deprecated as of PHP 8.4.
        // Passing null (or nothing) clears any previously assigned processor.
        $this->urlProcessor = $urlProcessor;
    }
147
148
    /**
149
     * Define additional crawl URLs as an array
150
     * Each of these URLs will be crawled in addition to the base URL.
151
     * This can be helpful if pages are getting missed by the crawl
152
     *
153
     * @param array $extraCrawlURLs
154
     * @return void
155
     */
156
    public function setExtraCrawlURls($extraCrawlURLs)
157
    {
158
        $this->extraCrawlURLs = $extraCrawlURLs;
159
    }
160
161
    /**
162
     * Return the additional crawl URLs as an array
163
     *
164
     * @return array
165
     */
166
    public function getExtraCrawlURLs()
167
    {
168
        return $this->extraCrawlURLs;
169
    }
170
171
    /**
172
     * Set an array of regular expression patterns that should be excluded from
173
     * being added to the url list.
174
     *
175
     * @param array $excludePatterns
176
     * @return void
177
     */
178
    public function setExcludePatterns(array $excludePatterns)
179
    {
180
        $this->excludePatterns = $excludePatterns;
181
    }
182
183
    /**
184
     * Get an array of regular expression patterns that should not be added to
185
     * the url list.
186
     *
187
     * @return array
188
     */
189
    public function getExcludePatterns()
190
    {
191
        return $this->excludePatterns;
192
    }
193
194
    /**
195
     * Set whether the crawl should be triggered on demand.
196
     *
197
     * @param boolean $autoCrawl
198
     * @return StaticSiteUrlList
199
     */
200
    public function setAutoCrawl(bool $autoCrawl): StaticSiteUrlList
201
    {
202
        $this->autoCrawl = $autoCrawl;
203
204
        return $this;
205
    }
206
207
    /**
208
     * Returns the status of the spidering.
209
     *
210
     * @return string
211
     */
212
    public function getSpiderStatus(): string
213
    {
214
        if (file_exists($this->cacheDir . 'urls')) {
215
            if (file_exists($this->cacheDir . 'crawlerid')) {
216
                return self::CRAWL_STATUS_PARTIAL;
217
            }
218
219
            return self::CRAWL_STATUS_COMPLETE;
220
        }
221
222
        return self::CRAWL_STATUS_NOTSTARTED;
223
    }
224
225
    /**
226
     * Raw URL+Mime data accessor method, used internally by logic outside of the class.
227
     *
228
     * @return array|null The cached URL/Mime data, or null if no cached URL/Mime data is found
229
     */
230
    public function getRawCacheData()
    {
        // Prefer the in-memory list. loadUrls() is deliberately not used here
        // because it throws when a crawl is only partially complete.
        if (!empty($this->urls)) {
            return $this->urls;
        }

        $cacheFile = $this->cacheDir . 'urls';

        if (!file_exists($cacheFile)) {
            return null;
        }

        $serialized = file_get_contents($cacheFile);

        if ($serialized === false) {
            // The file exists but could not be read: treat it as "no cached data"
            return null;
        }

        // The cache is written by saveURLs() from the $urls array and is expected to
        // contain only arrays and scalars, so object instantiation is disabled.
        $urls = unserialize($serialized, ['allowed_classes' => false]);

        // A corrupt cache unserializes to false; report that as "no cached data" too
        return is_array($urls) ? $urls : null;
    }
242
243
    /**
244
     * Return the number of URLs crawled so far. If the urlcache is incomplete or
245
     * doesn't exist, assumes zero.
246
     *
247
     * @return int
248
     */
249
    public function getNumURIs(): int
    {
        $urls = $this->getRawCacheData();

        if (empty($urls) || !is_array($urls)) {
            return 0;
        }

        // Both lists must be present. The previous check tested 'regular' twice and
        // never verified 'inferred', which is iterated below.
        if (!is_array($urls['regular'] ?? null) || !is_array($urls['inferred'] ?? null)) {
            return 0;
        }

        // Pull the 'url' entry out of every URL-data row
        $regular = array_column($urls['regular'], 'url');
        $inferred = array_column($urls['inferred'], 'url');

        // Regular URLs are de-duplicated before counting; inferred URLs are counted as-is
        return count(array_unique($regular)) + count($inferred);
    }
272
273
    /**
274
     * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values
275
     *
276
     * @return array
277
     */
278
    public function getProcessedURLs(): array
    {
        // No completed crawl and crawling on demand is disabled: there is nothing
        // to return. An explicit empty array is required here, because falling off
        // the end of the method returns null and violates the declared array return
        // type (TypeError).
        if (!$this->hasCrawled() && !$this->autoCrawl) {
            return [];
        }

        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Regular URLs: raw URL (the list key) => processed URL
        $regular = [];
        foreach ($this->urls['regular'] ?? [] as $rawURL => $urlData) {
            $regular[$rawURL] = $urlData['url'];
        }

        // Inferred URLs have no raw counterpart, so the processed URL is used as
        // both key and value
        $inferred = [];
        foreach ($this->urls['inferred'] ?? [] as $urlData) {
            $inferred[$urlData['url']] = $urlData['url'];
        }

        return array_merge($regular, $inferred);
    }
305
306
    /**
307
     * There are URLs and we're not in the middle of a crawl.
308
     *
309
     * @return boolean
310
     */
311
    public function hasCrawled(): bool
312
    {
313
        return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid');
314
    }
315
316
    /**
317
     * Load the URLs, either by crawling, or by fetching from cache.
318
     *
319
     * @return void
320
     * @throws \LogicException
321
     */
322
    public function loadUrls(): void
    {
        if ($this->hasCrawled()) {
            $serialized = file_get_contents($this->cacheDir . 'urls');

            // The cache is expected to contain only arrays and scalars, so object
            // instantiation is disabled while unserializing.
            $urls = $serialized === false
                ? false
                : unserialize($serialized, ['allowed_classes' => false]);

            // An unreadable or corrupt cache yields false. Start from an empty list
            // rather than auto-vivifying an array from false below, which is
            // deprecated since PHP 8.1.
            $this->urls = is_array($urls) ? $urls : [];

            // Clear out obsolete format
            if (!isset($this->urls['regular'])) {
                $this->urls['regular'] = [];
            }
            if (!isset($this->urls['inferred'])) {
                $this->urls['inferred'] = [];
            }
        } elseif ($this->autoCrawl) {
            $this->crawl();
        } else {
            // This happens if you move a cache-file out of the way during a real (non-test) run...
            $msg = 'Crawl hasn\'t been executed yet and autoCrawl is false. Has the cache file been moved?';
            throw new \LogicException($msg);
        }
    }
342
343
    /**
344
     * @return void
345
     */
346
    private function setIsRunningTest(): void
347
    {
348
        $isGithub = Environment::getEnv('SS_BASE_URL') == 'http://localhost'; // Github tests have SS_BASE_URL set
349
350
        if ($isGithub && !file_exists(ASSETS_PATH)) {
351
            mkdir(ASSETS_PATH, 0777, true);
352
        }
353
    }
354
355
    /**
356
     * Re-execute the URL processor on all the fetched URLs.
357
     * If the site has been crawled and then subsequently the URLProcessor was changed through
358
     * user-interaction in the "external content" CMS admin, then we need to ensure that
359
     * URLs are re-processed using the newly selected URL Preprocessor.
360
     *
361
     * @return void
362
     */
363
    public function reprocessUrls()
364
    {
365
        if ($this->urls === null) {
366
            $this->loadUrls();
367
        }
368
369
        // Clear out all inferred URLs; these will be added
370
        $this->urls['inferred'] = [];
371
372
        // Reprocess URLs, in case the processing has changed since the last crawl
373
        foreach ($this->urls['regular'] as $url => $urlData) {
374
            // TODO Log this in exodus.log
375
            if (empty($urlData['url'])) {
376
               // echo $urlData['mime'] . "\n";
377
                continue;
378
            }
379
380
            $processedURLData = $this->generateProcessedURL($urlData);
381
            $this->urls['regular'][$url] = $processedURLData;
382
            // Trigger parent URL back-filling on new processed URL
383
            $this->parentProcessedURL($processedURLData);
384
        }
385
386
        $this->saveURLs();
387
    }
388
389
    /**
390
     *
391
     * @param number $limit
392
     * @param bool $verbose
393
     * @return StaticSiteCrawler
394
     * @throws \Exception
395
     */
396
    public function crawl($limit = false, $verbose = false)
    {
        Environment::increaseTimeLimitTo(3600);

        // Create the cache directory, including any missing parent directories.
        // The trailing is_dir() check covers another process creating the directory
        // between the first check and the mkdir() call.
        if (
            !is_dir($this->cacheDir)
            && !mkdir($this->cacheDir, 0777, true)
            && !is_dir($this->cacheDir)
        ) {
            throw new \Exception('Unable to create cache directory at: ' . $this->cacheDir);
        }

        $crawler = StaticSiteCrawler::create($this, $limit, $verbose);
        $crawler->enableResumption();
        $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
        $crawler->setWorkingDirectory($this->cacheDir);

        // Find links in externally-linked CSS files
        if ($this->source->ParseCSS) {
            $crawler->addLinkSearchContentType("#text/css# i");
        }

        // Set some proxy options for phpCrawler
        singleton(StaticSiteUtils::class)->defineProxyOpts(!Director::isDev(), $crawler);

        $emptyList = [
            'regular' => [],
            'inferred' => [],
        ];

        // Allow for resuming an incomplete crawl
        if (file_exists($this->cacheDir . 'crawlerid')) {
            // We should re-load the partial list of URLs, if relevant
            // This should only happen when we are resuming a partial crawl
            $this->urls = $emptyList;

            if (file_exists($this->cacheDir . 'urls')) {
                $serialized = file_get_contents($this->cacheDir . 'urls');
                $cached = $serialized === false
                    ? false
                    : unserialize($serialized, ['allowed_classes' => false]);

                // Ignore an unreadable or corrupt cache, and make sure both lists
                // exist so that the ksort() calls below always receive arrays.
                if (is_array($cached)) {
                    $this->urls = $cached + $emptyList;
                }
            }

            $crawlerID = file_get_contents($this->cacheDir . 'crawlerid');
            $crawler->resume($crawlerID);
        } else {
            // NOTE(review): the return value was never used. The call is retained in
            // case StaticSiteCrawler::getCrawlerId() has side effects - confirm and remove.
            $crawler->getCrawlerId();

            $this->urls = $emptyList;
        }

        $crawler->setURL($this->baseURL);

        // Honour an explicit port in the base URL (e.g. http://localhost:8080);
        // otherwise fall back to the scheme's default port.
        $explicitPort = parse_url((string) $this->baseURL, PHP_URL_PORT);
        $defaultPort = preg_match('#^https://#i', (string) $this->baseURL) ? 443 : 80;
        $crawler->setPort($explicitPort ?: $defaultPort);

        $crawler->go();

        // TODO Why were we deleting this originally?
        // unlink($this->cacheDir . 'crawlerid');

        // Sort both lists by key before they are cached
        ksort($this->urls['regular']);
        ksort($this->urls['inferred']);

        $this->saveURLs();

        return $crawler;
    }
458
459
    /**
460
     * Cache the current list of URLs to disk.
461
     *
462
     * @return void
463
     */
464
    public function saveURLs()
465
    {
466
        file_put_contents($this->cacheDir . 'urls', serialize($this->urls));
467
    }
468
469
    /**
470
     * Add a URL to this list, given the absolute URL.
471
     *
472
     * @param string $url The absolute URL
473
     * @param string $content_type The Mime-Type found at this URL e.g text/html or image/png
474
     * @throws \InvalidArgumentException
475
     * @return void
476
     */
477
    public function addAbsoluteURL($url, $content_type)
    {
        $simplifiedURL = $this->simplifyURL($url);
        $simplifiedBase = $this->simplifyURL($this->baseURL);

        // Check we're adhering to the correct base URL
        if (substr($simplifiedURL, 0, strlen($simplifiedBase)) !== $simplifiedBase) {
            throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
        }

        // Strip only the leading scheme and host. The pattern is anchored so that an
        // absolute URL embedded later on (e.g. "?redirect=http://other.example/")
        // is left intact, and it is case-insensitive to agree with simplifyURL().
        $relURL = preg_replace('#^https?://[^/]+#i', '', $url);

        $this->addURL($relURL, $content_type);
    }
491
492
    /**
493
     * Appends a processed URL onto the URL cache.
494
     *
495
     * @param string $url
496
     * @param string $contentType
497
     * @return mixed null|void
498
     */
499
    public function addURL($url, $contentType)
500
    {
501
        if ($this->urls === null) {
502
            $this->loadUrls();
503
        }
504
505
        if (empty($url)) {
506
            return null;
507
        }
508
509
        // Generate and save the processed URLs
510
        $urlData = [
511
            'url' => $url,
512
            'mime' => $contentType,
513
        ];
514
515
        $this->urls['regular'][$url] = $this->generateProcessedURL($urlData);
516
517
        // Trigger parent URL back-filling
518
        $this->parentProcessedURL($this->urls['regular'][$url]);
519
    }
520
521
    /**
522
     * Add an inferred URL to the list.
523
     *
524
     * Since the unprocessed URL isn't available, we use the processed URL in its place.
525
     * This should be used with some caution.
526
     *
527
     * @param array $inferredURLData Contains the processed URL and Mime-Type to add
528
     * @return void
529
     */
530
    public function addInferredURL($inferredURLData)
531
    {
532
        if ($this->urls === null) {
533
            $this->loadUrls();
534
        }
535
536
        // Generate and save the processed URLs
537
        $this->urls['inferred'][$inferredURLData['url']] = $inferredURLData;
538
539
        // Trigger parent URL back-filling
540
        $this->parentProcessedURL($inferredURLData);
541
    }
542
543
    /**
544
     * Return true if the given URL exists.
545
     *
546
     * @param string $url The URL, either absolute, or relative starting with "/"
547
     * @return boolean Does the URL exist
548
     * @throws \InvalidArgumentException
549
     */
550
    public function hasURL($url)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Try and relativise an absolute URL.
        // substr() is used instead of $url[0] so an empty string does not raise an
        // "uninitialized string offset" warning; it still ends in the exception below.
        if (substr((string) $url, 0, 1) !== '/') {
            $simplifiedURL = $this->simplifyURL($url);
            $simplifiedBase = $this->simplifyURL($this->baseURL);

            if (substr($simplifiedURL, 0, strlen($simplifiedBase)) !== $simplifiedBase) {
                throw new \InvalidArgumentException("URL $url is not from the site $this->baseURL");
            }

            $url = (string) substr($simplifiedURL, strlen($simplifiedBase));
        }

        // Both lists are keyed by URL. The inferred list holds URL-data arrays as
        // values, so it must be checked by key: in_array() against those values
        // could never match a URL string.
        return isset($this->urls['regular'][$url]) || isset($this->urls['inferred'][$url]);
    }
570
571
    /**
572
     * Simplify a URL. Ignores https/http differences and "www." / non differences.
573
     *
574
     * @param  string $url
575
     * @return string
576
     * @todo Why does this ignore https/http differences? Should it?
577
     */
578
    public function simplifyURL($url)
    {
        // Collapse http/https and an optional "www." prefix into one canonical
        // "http://www." prefix so that URLs can be compared by prefix.
        // The dot is escaped so that only a literal "www." is matched, not "www"
        // followed by any character. The cast avoids passing null as the subject.
        return preg_replace('#^https?://(www\.)?#i', 'http://www.', (string) $url);
    }
582
583
    /**
584
     * Returns true if the given URL is in the list of processed URLs
585
     *
586
     * @param string $processedURL The processed URL
587
     * @return boolean True if it exists, false otherwise
588
     */
589
    public function hasProcessedURL($processedURL)
    {
        if ($this->urls === null) {
            $this->loadUrls();
        }

        // Look the URL up by key instead of scanning array_keys() with in_array().
        // This method is called for every URL that is added (via parentProcessedURL()),
        // so a linear scan made building the list quadratic.
        return isset($this->urls['regular'][$processedURL])
            || isset($this->urls['inferred'][$processedURL]);
    }
598
599
    /**
600
     * Return the processed URL that is the parent of the given one.
601
     *
602
     * Both input and output are processed URLs
603
     *
604
     * @param array $processedURLData URLData comprising a relative URL and Mime-Type
605
     * @return array
606
     */
607
    public function parentProcessedURL(array $processedURLData): array
    {
        // A blank URL is treated as the root; this will be dealt with, with the
        // selected duplication strategy
        $processedURL = empty($processedURLData['url']) ? '/' : (string) $processedURLData['url'];
        $ownMime = empty($processedURLData['mime'])
            ? self::$undefined_mime_type
            : $processedURLData['mime'];

        /*
         * If $processedURLData['url'] is not HTML, it's unlikely its parent
         * is anything useful (Prob just a directory)
         */
        $mime = singleton(StaticSiteMimeProcessor::class)->IsOfHtml($ownMime)
            ? $ownMime
            : self::$undefined_mime_type;

        // The root has no parent: signalled by a blank URL
        if ($processedURL === '/') {
            return [
                'url' => '',
                'mime' => $mime,
            ];
        }

        // URL hierarchy can be broken down by querystring or by URL.
        // strrpos() returns false when the character is absent; the cast maps that
        // to 0 so it falls into the same "child of the root" case as position 0.
        $breakpoint = max(
            (int) strrpos($processedURL, '?'),
            (int) strrpos($processedURL, '/')
        );

        // Special case for children of the root
        if ($breakpoint === 0) {
            return [
                'url' => '/',
                'mime' => $mime,
            ];
        }

        // Get parent URL: everything before the last separator
        $parentURLData = [
            'url' => substr($processedURL, 0, $breakpoint),
            'mime' => $mime,
        ];

        // If an intermediary URL doesn't exist, create it. addInferredURL() calls
        // back into this method, so missing ancestors are filled in up to the root.
        if (!$this->hasProcessedURL($parentURLData['url'])) {
            $this->addInferredURL($parentURLData);
        }

        return $parentURLData;
    }
666
667
    /**
668
     * Find the processed URL in the URL list
669
     *
670
     * @param  mixed string | array $urlData
671
     * @return array
672
     * @todo Under what circumstances would $this->urls['regular'][$url] === true (line ~696)?
673
     */
674
    public function processedURL($urlData): array
675
    {
676
        // Load-up the cache into memory
677
        if ($this->urls === null) {
678
            $this->loadUrls();
679
        }
680
681
        if (is_array($urlData)) {
682
            $url = $urlData['url'];
683
            $mime = $urlData['mime'];
684
        } else {
685
            $url = $urlData;
686
            $mime = self::$undefined_mime_type;
687
        }
688
689
        $urlData = [
690
            'url' => $url,
691
            'mime' => $mime,
692
        ];
693
694
        // Cached urls use $url as the key..
695
        if (isset($this->urls['regular'][$url])) {
696
            // Generate it if missing
697
            if ($this->urls['regular'][$url] === true) {
698
                $this->urls['regular'][$url] = $this->generateProcessedURL($urlData);
699
            }
700
701
            return $this->urls['regular'][$url];
702
        } elseif(isset($this->urls['inferred'][$url])) {
703
            return $this->urls['inferred'][$url];
704
        }
705
706
        return [];
707
    }
708
709
    /**
710
     * Execute custom logic for processing URLs prior to hierarchy generation.
711
     *
712
     * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions.
713
     *
714
     * @param  array $urlData The unprocessed URLData
715
     * @return array $urlData The processed URLData
716
     * @throws \LogicException
717
     */
718
    public function generateProcessedURL(array $urlData): array
719
    {
720
        if (empty($urlData['url'])) {
721
            throw new \LogicException("Can't pass a blank URL to generateProcessedURL");
722
        }
723
724
        if ($this->urlProcessor) {
725
            $urlData = $this->urlProcessor->processURL($urlData);
726
        }
727
728
        if (!$urlData) {
0 ignored issues
show
Bug Best Practice introduced by
The expression $urlData of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
729
            //return []; // Even if $urlData has a mime-type, it's useless without a URI
730
            throw new \LogicException(get_class($this->urlProcessor) . " returned a blank URL.");
731
        }
732
733
        return $urlData;
734
    }
735
736
    /**
737
     * Return the URLs that are a child of the given URL
738
     *
739
     * @param string $url
740
     * @return array
741
     */
742
    public function getChildren($url)
743
    {
744
        if ($this->urls === null) {
745
            $this->loadUrls();
746
        }
747
748
        $processedURL = $this->processedURL($url);
749
        $processedURL = $processedURL['url'] ?? '/';
750
751
        // Subtly different regex if the URL ends in '?' or '/'
752
        if (preg_match('#[/?]$#', $processedURL)) {
753
            $regEx = '#^' . preg_quote($processedURL, '#') . '[^/?]+$#';
754
        } else {
755
            $regEx = '#^' . preg_quote($processedURL, '#') . '[/?][^/?]+$#';
756
        }
757
758
        $children = [];
759
760
        foreach ($this->urls['regular'] as $urlKey => $potentialProcessedChild) {
761
            $potentialProcessedChild = $urlKey;
762
            if (preg_match($regEx, $potentialProcessedChild)) {
763
                if (!isset($children[$potentialProcessedChild])) {
764
                    $children[$potentialProcessedChild] = $potentialProcessedChild;
765
                }
766
            }
767
        }
768
769
        foreach ($this->urls['inferred'] as $urlKey => $potentialProcessedChild) {
770
            $potentialProcessedChild = $urlKey;
771
            if (preg_match($regEx, $potentialProcessedChild)) {
772
                if (!isset($children[$potentialProcessedChild])) {
773
                    $children[$potentialProcessedChild] = $potentialProcessedChild;
774
                }
775
            }
776
        }
777
778
        return array_values($children);
779
    }
780
781
    /**
782
     * Simple property getter. Used in unit-testing.
783
     *
784
     * @param string $prop
785
     * @return mixed
786
     */
787
    public function getProperty($prop)
788
    {
789
        if ($this->$prop) {
790
            return $this->$prop;
791
        }
792
    }
793
794
    /**
795
     * Get the serialized cache content and return the unserialized string
796
     *
797
     * @todo implement to replace x3 refs to unserialize(file_get_contents($this->cacheDir . 'urls'));
798
     * @return mixed The unserialized cache data, or an empty string if the cache file does not exist
799
     */
800
    public function getCacheFileContents()
801
    {
802
        $cache = '';
803
        $cacheFile = $this->cacheDir . 'urls';
804
805
        if (file_exists($cacheFile)) {
806
            $cache = unserialize(file_get_contents($cacheFile));
807
        }
808
809
        return $cache;
810
    }
811
}
812