Passed
Push — main ( 5de797...be0e90 )
by Colin
03:01
created

UrlAutolinkParser::diffParens()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 5
c 0
b 0
f 0
dl 0
loc 14
ccs 6
cts 6
cp 1
rs 10
cc 2
nc 2
nop 1
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
/*
6
 * This file is part of the league/commonmark package.
7
 *
8
 * (c) Colin O'Dell <colinodell@gmail.com>
9
 *
10
 * For the full copyright and license information, please view the LICENSE
11
 * file that was distributed with this source code.
12
 */
13
14
namespace League\CommonMark\Extension\Autolink;
15
16
use League\CommonMark\Extension\CommonMark\Node\Inline\Link;
17
use League\CommonMark\Parser\Inline\InlineParserInterface;
18
use League\CommonMark\Parser\Inline\InlineParserMatch;
19
use League\CommonMark\Parser\InlineParserContext;
20
21
final class UrlAutolinkParser implements InlineParserInterface
22
{
23
    private const ALLOWED_AFTER = [null, ' ', "\t", "\n", "\x0b", "\x0c", "\x0d", '*', '_', '~', '('];
24
25
    // RegEx adapted from https://github.com/symfony/symfony/blob/6.3/src/Symfony/Component/Validator/Constraints/UrlValidator.php
26
    private const REGEX = '~
27
        (
28
            # Must start with a supported scheme + auth, or "www"
29
            (?:
30
                (?:%s)://                                                                            # protocol
31
                (?:(?:(?:[\_\.\pL\pN-]|%%[0-9A-Fa-f]{2})+:)?((?:[\_\.\pL\pN-]|%%[0-9A-Fa-f]{2})+)@)? # basic auth
32
            |www\.)
33
            (?:
34
                (?:
35
                    (?:xn--[a-z0-9-]++\.)*+xn--[a-z0-9-]++            # a domain name using punycode
36
                        |
37
                    (?:[\pL\pN\pS\pM\-\_]++\.)+[\pL\pN\pM]++          # a multi-level domain name
38
                        |
39
                    [a-z0-9\-\_]++                                    # a single-level domain name
40
                )\.?
41
                    |                                                 # or
42
                \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}                    # an IP address
43
                    |                                                 # or
44
                \[
45
                    (?:(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){6})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:::(?:(?:(?:[0-9a-f]{1,4})):){5})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){4})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,1}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){3})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,2}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){2})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,3}(?:(?:[0-9a-f]{1,4})))?::(?:(?:[0-9a-f]{1,4})):)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,4}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,5}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,6}(?:(?:[0-9a-f]{1,4})))?::))))
46
                \]  # an IPv6 address
47
            )
48
            (?::[0-9]+)?                              # a port (optional)
49
            (?:/ (?:[\pL\pN\-._\~!$&\'()*+,;=:@]|%%[0-9A-Fa-f]{2})* )*        # a path
50
            (?:\? (?:[\pL\pN\-._\~!$&\'\[\]()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )? # a query (optional)
51
            (?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?     # a fragment (optional)
52
        )~ixu';
53
54
    /**
55
     * @var string[]
56
     *
57
     * @psalm-readonly
58
     */
59
    private array $prefixes = ['www.'];
60
61
    /**
62
     * @psalm-var non-empty-string
63
     *
64
     * @psalm-readonly
65
     */
66
    private string $finalRegex;
67
68
    private string $defaultProtocol;
69
70
    /**
     * Builds the URL-matching pattern and the list of literal prefixes for the given protocols.
     *
     * Each protocol is treated as literal text: it is escaped before being placed into the
     * pattern, so scheme names containing regex metacharacters (for example "git+ssh")
     * match exactly what the corresponding "<protocol>://" prefix matches.
     *
     * @param array<int, string> $allowedProtocols Scheme names (without "://") that may start an autolink
     * @param string             $defaultProtocol  Scheme prepended to URLs that start with "www."
     */
    public function __construct(array $allowedProtocols = ['http', 'https', 'ftp'], string $defaultProtocol = 'http')
    {
        $quotedProtocols = [];
        foreach ($allowedProtocols as $protocol) {
            // "~" is the delimiter used by self::REGEX, so it must be escaped as well
            $quotedProtocols[] = \preg_quote($protocol, '~');
            $this->prefixes[]  = $protocol . '://';
        }

        /**
         * @psalm-suppress PropertyTypeCoercion
         */
        $this->finalRegex = \sprintf(self::REGEX, \implode('|', $quotedProtocols));

        $this->defaultProtocol = $defaultProtocol;
    }
86
87 190
    /**
     * Builds the match definition from every known URL prefix
     * ("www." plus "<protocol>://" for each allowed protocol).
     */
    public function getMatchDefinition(): InlineParserMatch
    {
        $knownPrefixes = $this->prefixes;

        return InlineParserMatch::oneOf(...$knownPrefixes);
    }
91
92 86
    /**
     * Attempts to turn the text at the cursor into an autolinked URL.
     *
     * A link is only created when the character before the cursor is one of
     * self::ALLOWED_AFTER and a URL matching the configured pattern begins exactly
     * at the cursor. Trailing punctuation, a trailing entity-like reference and
     * surplus closing parentheses are left out of the link.
     *
     * @return bool true when a Link node was appended and the cursor advanced, false otherwise
     */
    public function parse(InlineParserContext $inlineContext): bool
    {
        $cursor = $inlineContext->getCursor();

        // Autolinks can only come at the beginning of a line, after whitespace, or certain delimiting characters
        $previousChar = $cursor->peek(-1);
        if (! \in_array($previousChar, self::ALLOWED_AFTER, true)) {
            return false;
        }

        // Check if we have a valid URL
        if (\preg_match($this->finalRegex, $cursor->getRemainder(), $matches, \PREG_OFFSET_CAPTURE) !== 1) {
            return false;
        }

        [$url, $offset] = $matches[0];

        // The pattern is not anchored, so it may find a URL further along the remainder.
        // The cursor is advanced below by the URL's length from its current position,
        // which is only correct when the URL starts right here.
        if ($offset !== 0) {
            return false;
        }

        // Does the URL end with punctuation that should be stripped?
        if (\preg_match('/(.+?)([?!.,:*_~]+)$/', $url, $matches)) {
            // Add the punctuation later
            $url = $matches[1];
        }

        // Does the URL end with something that looks like an entity reference?
        if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $url, $matches)) {
            $url = $matches[1];
        }

        // Does the URL need unmatched parens chopped off?
        if (\substr($url, -1) === ')' && ($diff = self::diffParens($url)) > 0) {
            $url = \substr($url, 0, -$diff);
        }

        $cursor->advanceBy(\mb_strlen($url, 'UTF-8'));

        // Auto-prefix 'http(s)://' onto 'www' URLs. The URL pattern is case-insensitive,
        // so "WWW." must be recognised too; otherwise the link would have no scheme.
        $destination = \strncasecmp($url, 'www.', 4) === 0
            ? $this->defaultProtocol . '://' . $url
            : $url;

        $inlineContext->getContainer()->appendChild(new Link($destination, $url));

        return true;
    }
138
139
    /**
     * Reports how many more closing than opening parentheses the text contains.
     *
     * Every parenthesis in the string is counted, not only those at the end.
     * A positive result means there are surplus ")" characters; the caller uses
     * that to drop characters from the end of the autolink so that an autolink
     * can be written inside a parenthesis.
     *
     * @psalm-pure
     */
    private static function diffParens(string $content): int
    {
        \preg_match_all('/[()]/', $content, $found);

        // Running balance: +1 for each ")" and -1 for each "("
        $surplusClosing = 0;
        foreach ($found[0] as $paren) {
            if ($paren === ')') {
                $surplusClosing++;
            } else {
                $surplusClosing--;
            }
        }

        return $surplusClosing;
    }
157
}
158