Passed
Push — master ( 707ddf...aa8b2f )
by Sebastian
04:56
created

ConvertHelper_URLFinder::getItemsAsString()   A

Complexity

Conditions 4
Paths 5

Size

Total Lines 17
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 8
c 0
b 0
f 0
dl 0
loc 17
rs 10
cc 4
nc 5
nop 1
1
<?php
2
/**
3
 * File containing the {@see AppUtils\ConvertHelper_URLFinder} class.
4
 *
5
 * @package Application Utils
6
 * @subpackage ConvertHelper
7
 * @see AppUtils\ConvertHelper_URLFinder
8
 */
9
10
declare(strict_types=1);
11
12
namespace AppUtils;
13
14
/**
15
 * Can find any URLs in a string, be it plain text or HTML, XML.
16
 *
17
 * @package Application Utils
18
 * @subpackage ConvertHelper
19
 * @author Sebastian Mordziol <[email protected]>
20
 * 
21
 * @see ConvertHelper::createURLFinder()
22
 */
23
class ConvertHelper_URLFinder implements Interface_Optionable
{
    public const ERROR_INVALID_DETECTOR_CLASS = 87901;

    use Traits_Optionable;

    /**
     * URLs found in the subject, indexed by the matched string.
     *
     * @var array<string,URLInfo>
     */
    private $urls = array();

    /**
     * Email addresses found in the subject, indexed by the lowercased
     * address including the `mailto:` prefix.
     *
     * @var array<string,URLInfo>
     * @see ConvertHelper_URLFinder::filterEmailAddress()
     */
    private $emails = array();

    /**
     * Detector class names, mapped to whether they are enabled.
     *
     * @var array<string,bool>
     */
    private $enabledDetectorClasses = array(
        ConvertHelper_URLFinder_Detector_Tel::class => true,
        ConvertHelper_URLFinder_Detector_HTMLAttributes::class => false,
        ConvertHelper_URLFinder_Detector_IPV4::class => true
    );

    /**
     * All raw matches collected while parsing the subject.
     *
     * @var string[]
     */
    private $matches = array();

    /**
     * Characters that are replaced with a space before the
     * subject is split into individual words.
     *
     * @var string[]
     */
    private $boundaries = array(
        "\n",
        "\t",
        "\r",
        '"',
        "'",
        '|',
        ',',
        ';',
        '<',
        '>'
    );

    /**
     * Instances of the enabled detector classes.
     *
     * @var ConvertHelper_URLFinder_Detector[]
     */
    private $detectors = array();

    /**
     * The string to search in. Cleared once it has been parsed.
     *
     * @var string
     */
    private $subject;

    /**
     * Whether the subject has already been parsed.
     *
     * @var bool
     */
    private $parsed = false;

    /**
     * @param string $subject The text (plain text, HTML, XML...) to search for URLs.
     */
    public function __construct(string $subject)
    {
        $this->subject = $subject;
    }

    /**
     * Default values of all options supported by the finder.
     *
     * @return array<string,bool>
     */
    public function getDefaultOptions() : array
    {
        return array(
            'include-emails' => false,
            'omit-mailto' => false,
            'sorting' => false,
            'normalize' => false
        );
    }

    /**
     * Whether all URLs should be normalized (parameters ordered alphabetically,
     * whitespace removal). This ensures that URL duplicates are detected even
     * if they have a different order of parameters.
     *
     * @param bool $enabled
     * @return $this
     */
    public function enableNormalizing(bool $enabled=true) : ConvertHelper_URLFinder
    {
        $this->setOption('normalize', $enabled);
        return $this;
    }

    /**
     * Whether to enable sorting the URLs alphabetically (disabled by default).
     *
     * @param bool $enabled
     * @return $this
     */
    public function enableSorting(bool $enabled=true) : ConvertHelper_URLFinder
    {
        $this->setOption('sorting', $enabled);
        return $this;
    }

    /**
     * Whether to include email addresses in the search.
     * This is only relevant when using the getURLs()
     * method.
     *
     * @param bool $include
     * @return $this
     */
    public function includeEmails(bool $include=true) : ConvertHelper_URLFinder
    {
        $this->setOption('include-emails', $include);
        return $this;
    }

    /**
     * Whether to omit the mailto: that is automatically added to all email addresses.
     *
     * @param bool $omit
     * @return $this
     */
    public function omitMailto(bool $omit=true) : ConvertHelper_URLFinder
    {
        $this->setOption('omit-mailto', $omit);
        return $this;
    }

    /**
     * Enables the search for relative URLs in HTML attributes.
     *
     * @param bool $enable
     * @return $this
     */
    public function enableHTMLAttributes(bool $enable=true) : ConvertHelper_URLFinder
    {
        $this->enabledDetectorClasses[ConvertHelper_URLFinder_Detector_HTMLAttributes::class] = $enable;
        return $this;
    }

    /**
     * Fetches all URLs that can be found in the subject string.
     * Email addresses are appended if the `include-emails`
     * option is enabled.
     *
     * @return string[]
     */
    public function getURLs() : array
    {
        $this->parse();

        $result = $this->getItemsAsString($this->urls);

        if($this->getBoolOption('include-emails'))
        {
            $result = array_merge($result, $this->getEmails());
        }

        if($this->getBoolOption('sorting'))
        {
            $result = $this->sortNatural($result);
        }

        return $result;
    }

    /**
     * Retrieves all email addresses from the subject string.
     *
     * @return string[]
     *
     * @see omitMailto()
     */
    public function getEmails() : array
    {
        $this->parse();

        $result = $this->getItemsAsString($this->emails);

        if($this->getBoolOption('omit-mailto'))
        {
            $prefix = 'mailto:';
            $prefixLength = strlen($prefix);

            foreach($result as $idx => $email)
            {
                // Only strip the leading scheme: a "mailto:" further
                // into the string is part of the address data.
                if(strpos($email, $prefix) === 0)
                {
                    $result[$idx] = (string)substr($email, $prefixLength);
                }
            }
        }

        if($this->getBoolOption('sorting'))
        {
            $result = $this->sortNatural($result);
        }

        return $result;
    }

    /**
     * Retrieves all URLs as URLInfo instances.
     *
     * NOTE(review): sorting here uses ksort() on the URL keys, while
     * getURLs() and getEmails() use a natural, case-insensitive
     * comparison, so the order can differ between them. Confirm whether
     * this is intended before aligning them.
     *
     * @return URLInfo[]
     */
    public function getInfos() : array
    {
        $this->parse();

        $result = array();
        $normalize = $this->getBoolOption('normalize');

        foreach($this->urls as $url => $info)
        {
            if($normalize)
            {
                $url = $info->getNormalized();
            }

            // Identical keys overwrite each other, which removes duplicates.
            $result[$url] = $info;
        }

        if($this->getBoolOption('sorting'))
        {
            ksort($result);
        }

        return array_values($result);
    }

    /**
     * Converts a collection of URLInfo instances to a list of unique
     * strings, using the normalized form if the `normalize` option
     * is enabled. The order of first appearance is preserved.
     *
     * @param array<string,URLInfo> $collection
     * @return string[]
     */
    private function getItemsAsString(array $collection) : array
    {
        $normalize = $this->getBoolOption('normalize');

        $result = array();

        // Lookup table keyed by the strings already added: an exact-match
        // duplicate check that avoids both the loose comparison of
        // in_array() and rescanning the result list for every item.
        $seen = array();

        foreach($collection as $url => $info)
        {
            // The cast guarantees strings even if PHP converted a
            // numeric-looking array key to an integer.
            $url = (string)($normalize ? $info->getNormalized() : $url);

            if(isset($seen[$url]))
            {
                continue;
            }

            $seen[$url] = true;
            $result[] = $url;
        }

        return $result;
    }

    /**
     * Sorts a list of strings using a natural, case-insensitive comparison.
     *
     * @param string[] $items
     * @return string[]
     */
    private function sortNatural(array $items) : array
    {
        usort($items, function(string $a, string $b) : int {
            return strnatcasecmp($a, $b);
        });

        return $items;
    }

    /**
     * Parses the specified string to detect all URLs and Email addresses.
     * For accurate results, this does not use a regex, but splits the
     * string into a list of strings that are likely to be either an URL
     * or Email address. Each of these is then checked for a valid scheme
     * or domain name extension.
     *
     * Only runs once: subsequent calls return immediately.
     *
     * @throws ConvertHelper_Exception
     */
    private function parse() : void
    {
        if($this->parsed)
        {
            return;
        }

        $this->parsed = true;

        $this->initDetectors();
        $this->detectMatches($this->subject);

        // The subject is not needed anymore: release it, but keep the
        // declared property intact instead of unsetting it.
        $this->subject = '';

        foreach($this->matches as $match)
        {
            $info = parseURL($match);

            if($info->isEmail())
            {
                $this->emails[$this->filterEmailAddress($match)] = $info;
                continue;
            }

            $this->urls[$match] = $info;
        }
    }

    /**
     * Instantiates the selected detector classes, which are
     * used to detect specific elements in the target string
     * (beyond regular URLs and Email addresses).
     *
     * @throws ConvertHelper_Exception
     */
    private function initDetectors() : void
    {
        foreach($this->enabledDetectorClasses as $className => $enabled)
        {
            if($enabled)
            {
                $this->detectors[] = $this->createDetector($className);
            }
        }
    }

    /**
     * Creates an instance of the specified detector class.
     *
     * @param string $className
     * @return ConvertHelper_URLFinder_Detector
     * @throws ConvertHelper_Exception If the class is not a detector.
     */
    private function createDetector(string $className) : ConvertHelper_URLFinder_Detector
    {
        $detector = new $className();

        if($detector instanceof ConvertHelper_URLFinder_Detector)
        {
            return $detector;
        }

        throw new ConvertHelper_Exception(
            'Not a valid detector class.',
            sprintf(
                'The class [%s] is not an instance of [%s].',
                $className,
                ConvertHelper_URLFinder_Detector::class
            ),
            self::ERROR_INVALID_DETECTOR_CLASS
        );
    }

    /**
     * Detects all URL and Email matches in the specified string.
     *
     * @param string $subject
     */
    private function detectMatches(string $subject) : void
    {
        $subject = $this->filterSubjectBefore($subject);

        $lines = $this->splitSubject($subject);
        $domains = new ConvertHelper_URLFinder_DomainExtensions();

        foreach($lines as $line)
        {
            // Anything with a known scheme is a match as-is.
            if(URLInfo_Schemes::detectScheme($line) !== null)
            {
                $this->matches[] = $line;
                continue;
            }

            // Without a scheme, a known domain extension is required.
            if($domains->nameExists($this->detectDomainExtension($line)))
            {
                $this->matches[] = $line;
            }
        }

        $this->filterSubjectAfter($subject);
    }

    /**
     * Filters the subject string before trying to detect regular HTTP/HTTPS
     * URLs as well as email addresses that are domain-based.
     *
     * @param string $subject
     * @return string
     */
    private function filterSubjectBefore(string $subject) : string
    {
        return $this->runDetectors(
            stripslashes($subject),
            ConvertHelper_URLFinder_Detector::RUN_BEFORE
        );
    }

    /**
     * Removes all matches found so far from the subject string, then
     * lets the detectors that run after the main detection process
     * what is left of it.
     *
     * @param string $subject
     */
    private function filterSubjectAfter(string $subject) : void
    {
        // Sort the matches from longest to shortest, to avoid
        // replacing parts of URLs.
        $remove = $this->matches;

        usort($remove, function(string $a, string $b) : int {
            return strlen($b) <=> strlen($a);
        });

        $subject = str_replace($remove, ' ', $subject);

        $this->runDetectors($subject, ConvertHelper_URLFinder_Detector::RUN_AFTER);
    }

    /**
     * Runs all detectors registered for the specified run position on
     * the subject string, and collects their matches. Each detector
     * receives the string as processed by the previous one.
     *
     * @param string $subject
     * @param mixed $runPosition One of the detector's `RUN_*` constants.
     * @return string The subject string after processing by the detectors.
     */
    private function runDetectors(string $subject, $runPosition) : string
    {
        foreach($this->detectors as $detector)
        {
            // Avoid processing the string if it is not needed.
            if($detector->getRunPosition() !== $runPosition || !$detector->isValidFor($subject))
            {
                continue;
            }

            $subject = $detector->processString($subject);

            $this->matches = array_merge($this->matches, $detector->getMatches());
        }

        return $subject;
    }

    /**
     * Splits the string by a list of word boundaries, so that all relevant
     * words are separated into individual lines. Each line is then checked
     * to keep only strings that are more or less likely to contain a domain name.
     *
     * @param string $subject
     * @return string[] Unique candidates. Keys are not necessarily consecutive.
     */
    private function splitSubject(string $subject) : array
    {
        $lines = ConvertHelper::explodeTrim(
            ' ',
            str_replace($this->boundaries, ' ', $subject)
        );

        $keep = array();

        foreach($lines as $line)
        {
            $line = $this->analyzeLine($line);

            if($line !== null)
            {
                $keep[] = $line;
            }
        }

        return array_unique($keep);
    }

    /**
     * Analyzes a single line to see if it is likely to contain a domain name.
     *
     * @param string $line
     * @return string|null The line without leading and trailing dots, or null if it is not a candidate.
     */
    private function analyzeLine(string $line) : ?string
    {
        // Strip dots from the beginning and end,
        // to exclude the end of phrases, e.g. "domain.com."
        $line = trim($line, '.');

        // A scheme requires a colon: skip the detection without one.
        if(strpos($line, ':') !== false && URLInfo_Schemes::detectScheme($line) !== null)
        {
            return $line;
        }

        // From here on out, the only things we can still
        // detect are email addresses and domain names.

        // No dot? Then it's certainly not a domain name.
        if(strpos($line, '.') === false)
        {
            return null;
        }

        return $line;
    }

    /**
     * Ensures that the email address has the `mailto:` scheme prepended,
     * and lowercases it to avoid case mixups.
     *
     * @param string $email
     * @return string
     */
    private function filterEmailAddress(string $email) : string
    {
        // The scheme only counts at the very beginning of the string.
        if(stripos($email, 'mailto:') !== 0)
        {
            $email = 'mailto:'.$email;
        }

        return strtolower($email);
    }

    /**
     * Attempts to extract a valid domain name extension from
     * the specified URL.
     *
     * @param string $url
     * @return string
     * @see ConvertHelper_URLFinder_DomainExtensions
     */
    private function detectDomainExtension(string $url) : string
    {
        // Remove the path or query parts to access the domain extension only.
        // The string is cut at whichever of the two delimiters comes first,
        // so that a slash inside the query string cannot hide the "?".
        $host = (string)substr($url, 0, strcspn($url, '/?'));

        $parts = explode('.', $host);

        return (string)array_pop($parts);
    }
}
538