Completed
Pull Request — master (#6)
by Colin
05:53 queued 02:19
created

UrlAutolinkProcessor::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark-ext-autolink package.
5
 *
6
 * (c) Colin O'Dell <[email protected]>
7
 *
8
 * For the full copyright and license information, please view the LICENSE
9
 * file that was distributed with this source code.
10
 */
11
12
namespace League\CommonMark\Ext\Autolink;
13
14
use League\CommonMark\Block\Element\Document;
15
use League\CommonMark\DocumentProcessorInterface;
16
use League\CommonMark\Inline\Element\Link;
17
use League\CommonMark\Inline\Element\Text;
18
19
final class UrlAutolinkProcessor implements DocumentProcessorInterface
20
{
21
    // RegEx adapted from https://github.com/symfony/symfony/blob/4.2/src/Symfony/Component/Validator/Constraints/UrlValidator.php
22
    const REGEX = '~
23
        (?<=^|[ \\t\\n\\x0b\\x0c\\x0d*_\\~\\(])  # Can only come at the beginning of a line, after whitespace, or certain delimiting characters
24
        (
25
            # Must start with a supported scheme + auth, or "www"
26
            (?:
27
                (?:%s)://                                 # protocol
28
                (?:([\.\pL\pN-]+:)?([\.\pL\pN-]+)@)?      # basic auth
29
            |www\.)
30
            (?:
31
                (?:[\pL\pN\pS\-\.])+(?:\.?(?:[\pL\pN]|xn\-\-[\pL\pN-]+)+\.?) # a domain name
32
                    |                                                 # or
33
                \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}                    # an IP address
34
                    |                                                 # or
35
                \[
36
                    (?:(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){6})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:::(?:(?:(?:[0-9a-f]{1,4})):){5})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){4})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,1}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){3})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,2}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){2})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,3}(?:(?:[0-9a-f]{1,4})))?::(?:(?:[0-9a-f]{1,4})):)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,4}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,5}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,6}(?:(?:[0-9a-f]{1,4})))?::))))
37
                \]  # an IPv6 address
38
            )
39
            (?::[0-9]+)?                              # a port (optional)
40
            (?:/ (?:[\pL\pN\-._\~!$&\'()*+,;=:@]|%%[0-9A-Fa-f]{2})* )*      # a path
41
            (?:\? (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a query (optional)
42
            (?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a fragment (optional)
43
        )~ixu';
44
45
    private $allowedProtocols;
46
47 75
    /**
     * @param string[] $allowedProtocols URL schemes to auto-link; processDocument() joins them
     *                                   with "|" and substitutes them into the "%s" placeholder
     *                                   of self::REGEX without escaping
     */
    public function __construct(array $allowedProtocols = ['http', 'https', 'ftp'])
    {
        $this->allowedProtocols = $allowedProtocols;
    }
51
52
    /**
     * Converts bare URLs inside the document's Text nodes into Link nodes.
     *
     * Every entered Text node whose content matches self::REGEX is replaced by
     * a sequence of Text and Link nodes inserted in its place. Trailing
     * punctuation, a trailing entity-like reference and an unbalanced closing
     * parenthesis are kept out of the link and emitted as plain text after it.
     * URLs starting with "www." get "http://" prepended to the link target,
     * while the link label keeps the text as written.
     *
     * @param Document $document
     *
     * @return void
     */
    public function processDocument(Document $document)
    {
        // The protocols are interpolated into the pattern verbatim (they are not escaped)
        $regex = sprintf(self::REGEX, implode('|', $this->allowedProtocols));

        $walker = $document->walker();

        while ($event = $walker->next()) {
            if (!$event->isEntering() || !($event->getNode() instanceof Text)) {
                continue;
            }

            /** @var Text $node */
            $node = $event->getNode();
            $text = $node->getContent();

            // Locate URLs by byte offset instead of preg_split() with
            // PREG_SPLIT_DELIM_CAPTURE: the pattern's optional basic-auth
            // sub-groups would add extra entries to the split result and
            // break the "text, URL, text, ..." alternation.
            $found = preg_match_all($regex, $text, $matches, PREG_OFFSET_CAPTURE);
            if (!$found) {
                // PCRE error (false) or no URL (0): leave the node untouched
                continue;
            }

            $cursor = 0;
            $leftovers = '';
            foreach ($matches[0] as $match) {
                $url = $match[0];
                $offset = $match[1];

                // Plain text between the previous link (plus whatever was trimmed off it) and this URL
                $before = $leftovers . (string) substr($text, $cursor, $offset - $cursor);
                if ($before !== '') {
                    $node->insertBefore(new Text($before));
                }
                $cursor = $offset + strlen($url);

                $parts = self::splitTrailingCharacters($url);
                $url = $parts[0];
                $leftovers = $parts[1];

                // Auto-prefix 'http://' onto 'www' URLs; the pattern is
                // case-insensitive, so the prefix check has to be as well
                if (preg_match('/^www\./i', $url) === 1) {
                    $node->insertBefore(new Link('http://' . $url, $url));
                } else {
                    $node->insertBefore(new Link($url, $url));
                }
            }

            // Whatever follows the last URL, preceded by the characters trimmed off it
            $after = $leftovers . (string) substr($text, $cursor);
            if ($after !== '') {
                $node->insertBefore(new Text($after));
            }

            // The original node has been fully replaced by the nodes inserted before it
            $node->detach();
        }
    }

    /**
     * Separates a matched URL from trailing characters that should not be part of the link.
     *
     * @param string $url The text matched by the URL pattern
     *
     * @return string[] Two elements: the URL to link, and the trailing text to emit after it
     */
    private static function splitTrailingCharacters($url)
    {
        $leftovers = '';

        // Does the URL end with punctuation that should be stripped?
        // The lazy quantifier makes the second group take the whole trailing run, not just its last character.
        if (preg_match('/^(.+?)([?!.,:*_~]+)$/', $url, $matches)) {
            $url = $matches[1];
            $leftovers = $matches[2];
        }

        // Does the URL end with something that looks like an entity reference?
        if (preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $url, $matches)) {
            $url = $matches[1];
            $leftovers = $matches[2] . $leftovers;
        }

        // Does the URL need its closing paren chopped off?
        if (substr($url, -1) === ')' && self::hasMoreCloserParensThanOpeners($url)) {
            $url = substr($url, 0, -1);
            // The paren came before anything stripped above, so it goes in front
            $leftovers = ')' . $leftovers;
        }

        return [$url, $leftovers];
    }
113
114
    /**
     * Tells whether the given text contains more ")" than "(" characters.
     *
     * The whole autolink is considered. When closing parentheses outnumber
     * opening ones, the caller does not treat the last character as part of
     * the autolink, which makes it possible to put an autolink inside
     * parentheses.
     *
     * @param string $content
     *
     * @return bool
     */
    private static function hasMoreCloserParensThanOpeners($content)
    {
        return substr_count($content, ')') > substr_count($content, '(');
    }
134
}
135