ConvertHelper_URLFinder::getDefaultOptions()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 5
c 0
b 0
f 0
dl 0
loc 7
rs 10
cc 1
nc 1
nop 0
1
<?php
2
/**
3
 * File containing the {@see AppUtils\ConvertHelper_URLFinder} class.
4
 *
5
 * @package Application Utils
6
 * @subpackage ConvertHelper
7
 * @see AppUtils\ConvertHelper_URLFinder
8
 */
9
10
declare(strict_types=1);
11
12
namespace AppUtils;
13
14
use AppUtils\URLInfo\URISchemes;
15
16
/**
17
 * Can find any URLs in a string, be it plain text, HTML or XML.
18
 *
19
 * @package Application Utils
20
 * @subpackage ConvertHelper
21
 * @author Sebastian Mordziol <[email protected]>
22
 * 
23
 * @see ConvertHelper::createURLFinder()
24
 */
25
class ConvertHelper_URLFinder implements Interface_Optionable
26
{
27
    /**
     * Error code used when a configured detector class does not
     * extend the detector base class.
     */
    public const ERROR_INVALID_DETECTOR_CLASS = 87901;

    use Traits_Optionable;

    /**
     * Detected URLs, indexed by the matched string.
     *
     * @var array<string,URLInfo>
     */
    private $urls = [];

    /**
     * Detected email addresses, indexed by the lowercased
     * address including the `mailto:` scheme.
     *
     * @var array<string,URLInfo>
     */
    private $emails = [];

    /**
     * Detector class names, each mapped to whether it is enabled.
     *
     * @var array<string,bool>
     */
    private $enabledDetectorClasses = [
        ConvertHelper_URLFinder_Detector_Tel::class => true,
        ConvertHelper_URLFinder_Detector_HTMLAttributes::class => false,
        ConvertHelper_URLFinder_Detector_IPV4::class => true,
    ];

    /**
     * Raw matches collected while parsing, before they are
     * sorted into URLs and email addresses.
     *
     * @var string[]
     */
    private $matches = [];

    /**
     * Characters that are replaced with a space before the
     * subject is split into individual words.
     *
     * @var string[]
     */
    private $boundaries = [
        "\n",
        "\t",
        "\r",
        '"',
        "'",
        '|',
        ',',
        ';',
        '<',
        '>',
    ];

    /**
     * Detector instances created from the enabled detector classes.
     *
     * @var ConvertHelper_URLFinder_Detector[]
     */
    private $detectors = [];

    /**
     * The string to search in. Released once parsing is done.
     *
     * @var string
     */
    private $subject;

    /**
     * Set on the first parse, so the subject is only parsed once.
     *
     * @var bool
     */
    private $parsed = false;
85
86
    /**
     * Stores the string to search in. No parsing happens here:
     * it is deferred until results are requested.
     *
     * @param string $subject The text to search for URLs and email addresses.
     */
    public function __construct(string $subject)
    {
        $this->subject = $subject;
    }
90
    
91
    /**
     * Default values for all finder options: every feature
     * is switched off until explicitly enabled.
     *
     * @return array<string,bool>
     */
    public function getDefaultOptions() : array
    {
        return [
            'include-emails' => false,
            'omit-mailto' => false,
            'sorting' => false,
            'normalize' => false,
        ];
    }
100
101
    /**
     * Toggles normalization of the results. When enabled, URLs are
     * reported in the form returned by their `getNormalized()` method
     * instead of the raw match, so that matches sharing the same
     * normalized form are treated as duplicates.
     *
     * NOTE(review): the normalization itself lives in URLInfo; it is
     * described as ordering parameters alphabetically and removing
     * whitespace - confirm against that class.
     *
     * @param bool $enabled
     * @return $this
     */
    public function enableNormalizing(bool $enabled = true) : ConvertHelper_URLFinder
    {
        $this->setOption('normalize', $enabled);

        return $this;
    }
114
    
115
   /**
    * Toggles sorting of the results (off by default).
    *
    * @param bool $enabled
    * @return $this
    */
    public function enableSorting(bool $enabled = true) : ConvertHelper_URLFinder
    {
        $this->setOption('sorting', $enabled);

        return $this;
    }
126
    
127
   /**
    * Toggles whether email addresses are appended to the list
    * returned by getURLs(). Other getters ignore this option.
    *
    * @param bool $include
    * @return ConvertHelper_URLFinder
    */
    public function includeEmails(bool $include = true) : ConvertHelper_URLFinder
    {
        $this->setOption('include-emails', $include);

        return $this;
    }
140
    
141
   /**
    * Toggles stripping of the `mailto:` scheme, which is otherwise
    * prepended to every detected email address.
    *
    * @param bool $omit
    * @return ConvertHelper_URLFinder
    */
    public function omitMailto(bool $omit = true) : ConvertHelper_URLFinder
    {
        $this->setOption('omit-mailto', $omit);

        return $this;
    }
152
153
    /**
     * Breaks the subject into individual words: all boundary
     * characters are turned into spaces, the string is split on
     * spaces, and only the words that pass {@see analyzeLine()}
     * are kept. Duplicate words are dropped.
     *
     * @param string $subject
     * @return string[] Keys are not guaranteed to be sequential.
     */
    private function splitSubject(string $subject) : array
    {
        $words = ConvertHelper::explodeTrim(
            ' ',
            str_replace($this->boundaries, ' ', $subject)
        );

        $candidates = array();

        foreach($words as $word)
        {
            $analyzed = $this->analyzeLine($word);

            if($analyzed === null) {
                continue;
            }

            $candidates[] = $analyzed;
        }

        return array_unique($candidates);
    }
181
182
    /**
     * Decides whether a single word is worth a closer look:
     * it must either start with a known URI scheme, or contain
     * a dot (a domain name or email address needs one).
     *
     * @param string $line
     * @return string|null The word without leading/trailing dots, or null to discard it.
     */
    private function analyzeLine(string $line) : ?string
    {
        // Dots at either end are sentence punctuation,
        // as in "visit domain.com." - drop them.
        $candidate = trim($line, '.');

        // A scheme always needs a colon, so the scheme
        // lookup is skipped entirely when there is none.
        if(strpos($candidate, ':') !== false && URISchemes::detectScheme($candidate) !== null) {
            return $candidate;
        }

        // Without a scheme, only domain names and email
        // addresses are left, and both need a dot.
        if(strpos($candidate, '.') !== false) {
            return $candidate;
        }

        return null;
    }
215
216
    /**
     * Prepares the subject before the regular URL and email
     * detection: slashes are stripped, then every detector
     * registered to run beforehand processes the string and
     * contributes its matches.
     *
     * @param string $subject
     * @return string The subject as left by the detectors.
     */
    private function filterSubjectBefore(string $subject) : string
    {
        $filtered = stripslashes($subject);

        foreach($this->detectors as $detector)
        {
            $runsBefore = $detector->getRunPosition() === ConvertHelper_URLFinder_Detector::RUN_BEFORE;

            // The validity check only runs for detectors
            // that are scheduled for this phase.
            if($runsBefore && $detector->isValidFor($filtered))
            {
                $filtered = $detector->processString($filtered);

                $this->matches = array_merge($this->matches, $detector->getMatches());
            }
        }

        return $filtered;
    }
241
242
    /**
     * Instantiates a detector by class name, and verifies
     * that the created object is a detector.
     *
     * @param string $className
     * @return ConvertHelper_URLFinder_Detector
     * @throws ConvertHelper_Exception {@see self::ERROR_INVALID_DETECTOR_CLASS}
     */
    private function createDetector(string $className) : ConvertHelper_URLFinder_Detector
    {
        $instance = new $className();

        if(!($instance instanceof ConvertHelper_URLFinder_Detector))
        {
            $details = sprintf(
                'The class [%s] is not an instance of [%s].',
                $className,
                ConvertHelper_URLFinder_Detector::class
            );

            throw new ConvertHelper_Exception(
                'Not a valid detector class.',
                $details,
                self::ERROR_INVALID_DETECTOR_CLASS
            );
        }

        return $instance;
    }
266
267
   /**
    * Retrieves every URL found in the subject string. Email
    * addresses are appended when the `include-emails` option
    * is enabled, and the list is sorted with a natural,
    * case insensitive comparison when `sorting` is enabled.
    *
    * @return string[]
    */
    public function getURLs() : array
    {
        $this->parse();

        $urls = $this->getItemsAsString($this->urls);

        if($this->getBoolOption('include-emails')) {
            $urls = array_merge($urls, $this->getEmails());
        }

        if($this->getBoolOption('sorting')) {
            usort($urls, static function(string $first, string $second) : int {
                return strnatcasecmp($first, $second);
            });
        }

        return $urls;
    }
292
293
    /**
     * Converts a collection of URL infos to a flat list of unique
     * strings, in order of first appearance. With the `normalize`
     * option enabled, the normalized form of each entry is used
     * instead of its collection key.
     *
     * Uniqueness is tracked through array keys rather than a loose
     * `in_array()` scan: this compares entries as exact strings
     * (loose comparison treats numeric-looking strings such as
     * "1e1" and "10" as equal), and avoids rescanning the result
     * list for every entry.
     *
     * NOTE(review): assumes `getNormalized()` returns a string.
     *
     * @param array<string,URLInfo> $collection
     * @return string[]
     */
    private function getItemsAsString(array $collection) : array
    {
        $normalize = $this->getBoolOption('normalize');

        $result = array();
        $seen = array();

        foreach($collection as $url => $info)
        {
            if($normalize) {
                $url = $info->getNormalized();
            }

            if(isset($seen[$url])) {
                continue;
            }

            $seen[$url] = true;
            $result[] = $url;
        }

        return $result;
    }
315
316
    /**
     * Creates one detector instance for every detector class
     * that is currently flagged as enabled, and adds them to
     * the detectors list.
     *
     * @throws ConvertHelper_Exception
     */
    private function initDetectors() : void
    {
        // Without a callback, array_filter keeps only
        // the entries whose flag is truthy.
        $classNames = array_keys(array_filter($this->enabledDetectorClasses));

        foreach($classNames as $className) {
            $this->detectors[] = $this->createDetector($className);
        }
    }
332
333
    /**
     * Runs the detection once: later calls return immediately.
     * The detectors are created, all matches are collected from
     * the subject, and each match is then parsed and filed as
     * either an email address or a URL.
     *
     * The subject string is released afterwards.
     */
    private function parse() : void
    {
        if($this->parsed) {
            return;
        }

        $this->parsed = true;

        $this->initDetectors();
        $this->detectMatches($this->subject);

        unset($this->subject);

        foreach($this->matches as $match)
        {
            $info = parseURL($match);

            if(!$info->isEmail())
            {
                $this->urls[$match] = $info;
                continue;
            }

            // Email addresses are indexed by their unified
            // form (lowercase, with the mailto: scheme).
            $this->emails[$this->filterEmailAddress($match)] = $info;
        }
    }
366
367
    /**
     * Switches the HTML attributes detector on or off (it is
     * off by default). Detectors are instantiated on the first
     * parse, so this must be called before fetching any results
     * to have an effect.
     *
     * @param bool $enable
     * @return $this
     */
    public function enableHTMLAttributes(bool $enable = true) : ConvertHelper_URLFinder
    {
        $detectorClass = ConvertHelper_URLFinder_Detector_HTMLAttributes::class;

        $this->enabledDetectorClasses[$detectorClass] = $enable;

        return $this;
    }
378
379
    /**
     * Unifies an email address: the `mailto:` scheme is prepended
     * unless the string already contains it (in any letter case),
     * and the result is converted to lowercase.
     *
     * @param string $email
     * @return string
     */
    private function filterEmailAddress(string $email) : string
    {
        $hasScheme = stripos($email, 'mailto:') !== false;

        return strtolower($hasScheme ? $email : 'mailto:'.$email);
    }
394
395
    /**
     * Collects all matches from the subject. The "before" detectors
     * run first, then every remaining word is accepted if it has a
     * known URI scheme or ends in a known domain extension, and
     * finally the "after" detectors get their turn.
     *
     * @param string $subject
     */
    private function detectMatches(string $subject) : void
    {
        $filtered = $this->filterSubjectBefore($subject);

        $candidates = $this->splitSubject($filtered);
        $extensions = new ConvertHelper_URLFinder_DomainExtensions();

        foreach($candidates as $candidate)
        {
            // The extension lookup is only needed
            // when no scheme could be detected.
            $isMatch =
                URISchemes::detectScheme($candidate) !== null
                ||
                $extensions->nameExists($this->detectDomainExtension($candidate));

            if($isMatch) {
                $this->matches[] = $candidate;
            }
        }

        $this->filterSubjectAfter($filtered);
    }
424
425
    /**
     * Runs the detectors that are scheduled after the regular
     * detection. All matches found so far are blanked out of the
     * subject first, so these detectors only see what is left.
     *
     * @param string $subject
     */
    private function filterSubjectAfter(string $subject) : void
    {
        // Longest matches are replaced first, so that a short
        // match cannot cut a piece out of a longer one.
        $byLength = $this->matches;

        usort($byLength, static function(string $first, string $second) : int {
            return strlen($second) <=> strlen($first);
        });

        $remaining = str_replace($byLength, ' ', $subject);

        foreach($this->detectors as $detector)
        {
            $runsAfter = $detector->getRunPosition() === ConvertHelper_URLFinder_Detector::RUN_AFTER;

            if($runsAfter && $detector->isValidFor($remaining))
            {
                $remaining = $detector->processString($remaining);

                $this->matches = array_merge($this->matches, $detector->getMatches());
            }
        }
    }
447
448
    /**
     * Extracts the last dot-separated segment of the part of the
     * URL that precedes its path or query string, i.e. the domain
     * extension candidate ("com" for "example.com/page?a=b").
     *
     * The string is cut at whichever delimiter (`/` or `?`) comes
     * first. Checking for a slash anywhere in the string and
     * splitting on that alone breaks URLs whose query string
     * contains a slash: "example.com?redirect=/home" must
     * yield "com", not "com?redirect=".
     *
     * If there is no dot, the whole remaining string is returned.
     *
     * @param string $url
     * @return string
     * @see ConvertHelper_URLFinder_DomainExtensions
     */
    private function detectDomainExtension(string $url) : string
    {
        // Everything before the first slash, then everything before
        // the first question mark within that: together this is the
        // prefix up to the earliest of the two delimiters.
        $beforePath = explode('/', $url, 2)[0];
        $host = explode('?', $beforePath, 2)[0];

        $parts = explode('.', $host);

        return array_pop($parts);
    }
473
474
   /**
    * Retrieves every email address found in the subject string.
    * The `mailto:` scheme is removed when the `omit-mailto`
    * option is enabled, and the list is sorted with a natural,
    * case insensitive comparison when `sorting` is enabled.
    *
    * @return string[]
    *
    * @see omitMailto()
    */
    public function getEmails() : array
    {
        $this->parse();

        $emails = $this->getItemsAsString($this->emails);

        if($this->getBoolOption('omit-mailto')) {
            $emails = array_map(
                static function($email) {
                    return str_replace('mailto:', '', $email);
                },
                $emails
            );
        }

        if($this->getBoolOption('sorting')) {
            usort($emails, static function(string $first, string $second) : int {
                return strnatcasecmp($first, $second);
            });
        }

        return $emails;
    }
505
    
506
   /**
    * Retrieves the URLInfo instances of all URLs that were found
    * (email addresses are not included). With the `normalize`
    * option enabled, URLs sharing the same normalized form are
    * reduced to a single entry; with `sorting` enabled, the
    * entries are ordered by their URL string.
    *
    * @return URLInfo[]
    */
    public function getInfos() : array
    {
        $this->parse();

        $indexed = array();
        $normalize = $this->getBoolOption('normalize');

        foreach($this->urls as $url => $info)
        {
            $key = $normalize ? $info->getNormalized() : $url;

            // Entries with an identical key overwrite each other,
            // which is what removes the duplicates.
            $indexed[$key] = $info;
        }

        if($this->getBoolOption('sorting')) {
            ksort($indexed);
        }

        return array_values($indexed);
    }
534
}
535