1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
/* |
6
|
|
|
* This file is part of the league/commonmark package. |
7
|
|
|
* |
8
|
|
|
* (c) Colin O'Dell <[email protected]> |
9
|
|
|
* |
10
|
|
|
* For the full copyright and license information, please view the LICENSE |
11
|
|
|
* file that was distributed with this source code. |
12
|
|
|
*/ |
13
|
|
|
|
14
|
|
|
namespace League\CommonMark\Extension\Autolink; |
15
|
|
|
|
16
|
|
|
use League\CommonMark\Event\DocumentParsedEvent; |
17
|
|
|
use League\CommonMark\Extension\CommonMark\Node\Inline\Link; |
18
|
|
|
use League\CommonMark\Node\Inline\Text; |
19
|
|
|
|
20
|
|
|
final class UrlAutolinkProcessor |
21
|
|
|
{ |
22
|
|
|
// RegEx adapted from https://github.com/symfony/symfony/blob/4.2/src/Symfony/Component/Validator/Constraints/UrlValidator.php |
23
|
|
|
private const REGEX = '~ |
24
|
|
|
(?<=^|[ \\t\\n\\x0b\\x0c\\x0d*_\\~\\(]) # Can only come at the beginning of a line, after whitespace, or certain delimiting characters |
25
|
|
|
( |
26
|
|
|
# Must start with a supported scheme + auth, or "www" |
27
|
|
|
(?: |
28
|
|
|
(?:%s):// # protocol |
29
|
|
|
(?:([\.\pL\pN-]+:)?([\.\pL\pN-]+)@)? # basic auth |
30
|
|
|
|www\.) |
31
|
|
|
(?: |
32
|
|
|
(?:[\pL\pN\pS\-\.])+(?:\.?(?:[\pL\pN]|xn\-\-[\pL\pN-]+)+\.?) # a domain name |
33
|
|
|
| # or |
34
|
|
|
\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} # an IP address |
35
|
|
|
| # or |
36
|
|
|
\[ |
37
|
|
|
(?:(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){6})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:::(?:(?:(?:[0-9a-f]{1,4})):){5})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){4})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,1}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){3})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,2}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){2})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,3}(?:(?:[0-9a-f]{1,4})))?::(?:(?:[0-9a-f]{1,4})):)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,4}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,5}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,6}(?:(?:[0-9a-f]{1,4})))?::)))) |
38
|
|
|
\] # an IPv6 address |
39
|
|
|
) |
40
|
|
|
(?::[0-9]+)? # a port (optional) |
41
|
|
|
(?:/ (?:[\pL\pN\-._\~!$&\'()*+,;=:@]|%%[0-9A-Fa-f]{2})* )* # a path |
42
|
|
|
(?:\? (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )? # a query (optional) |
43
|
|
|
(?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )? # a fragment (optional) |
44
|
|
|
)~ixu'; |
45
|
|
|
|
46
|
|
|
/** |
47
|
|
|
* @var string |
48
|
|
|
* |
49
|
|
|
* @psalm-readonly |
50
|
|
|
*/ |
51
|
|
|
private $finalRegex; |
52
|
|
|
|
53
|
|
|
/**
 * Builds the final URL-matching pattern for the given set of protocols.
 *
 * Every protocol is escaped before it is spliced into the pattern, so schemes
 * containing regex metacharacters (for example "svn+ssh" or "coap.tcp") are
 * matched literally instead of being interpreted as regex syntax.
 *
 * @param array<int, string> $allowedProtocols Schemes (without "://") that may start an autolinked URL
 */
public function __construct(array $allowedProtocols = ['http', 'https', 'ftp'])
{
    $alternatives = [];
    foreach ($allowedProtocols as $protocol) {
        // '~' is the delimiter of self::REGEX, so it has to be escaped as well
        $alternatives[] = \preg_quote((string) $protocol, '~');
    }

    // The protocol list fills the single %s placeholder of the pattern template
    $this->finalRegex = \sprintf(self::REGEX, \implode('|', $alternatives));
}
60
|
|
|
|
61
|
231 |
|
/**
 * Walks the parsed document and runs autolink processing on every text node
 * that is not a direct child of a link.
 */
public function __invoke(DocumentParsedEvent $e): void
{
    $documentWalker = $e->getDocument()->walker();

    while ($step = $documentWalker->next()) {
        $current = $step->getNode();

        // Only plain text can contain bare URLs
        if (! $current instanceof Text) {
            continue;
        }

        // Text whose parent is a link is left untouched
        if ($current->parent() instanceof Link) {
            continue;
        }

        self::processAutolinks($current, $this->finalRegex);
    }
}
72
|
|
|
|
73
|
213 |
|
/**
 * Replaces a text node with alternating text and link nodes, one link per URL
 * found in the node's literal.
 *
 * When the literal contains no URL (or the pattern fails to run) the node is
 * left untouched. Otherwise the replacement nodes are inserted before $node in
 * their original order and $node itself is detached.
 *
 * @param Text   $node  Text node whose literal is scanned for URLs
 * @param string $regex Complete URL-matching pattern
 */
private static function processAutolinks(Text $node, string $regex): void
{
    $literal = $node->getLiteral();

    // Work from whole-match offsets rather than preg_split() with PREG_SPLIT_DELIM_CAPTURE:
    // the latter also returns every nested capture group that took part in a match (such as
    // the basic-auth groups), which breaks a strict "text, URL, text, ..." alternation.
    $found = \preg_match_all($regex, $literal, $matches, \PREG_OFFSET_CAPTURE);
    if ($found === false || $found === 0) {
        return;
    }

    $cursor    = 0;  // Byte offset just past the previously matched URL
    $leftovers = ''; // Characters trimmed off the previous URL, re-emitted as plain text

    foreach ($matches[0] as [$url, $offset]) {
        // Text preceding this URL, prefixed with anything trimmed off the previous one
        $text = $leftovers . \substr($literal, $cursor, $offset - $cursor);
        if ($text !== '') {
            $node->insertBefore(new Text($text));
        }

        $cursor    = $offset + \strlen($url);
        $leftovers = '';

        // Does the URL end with punctuation that should be stripped?
        // The URL part is matched lazily so the whole trailing run is removed, not only its last character.
        if (\preg_match('/(.+?)([?!.,:*_~]+)$/', $url, $parts)) {
            // Add the punctuation later
            $url       = $parts[1];
            $leftovers = $parts[2];
        }

        // Does the URL end with something that looks like an entity reference?
        if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $url, $parts)) {
            $url       = $parts[1];
            $leftovers = $parts[2] . $leftovers;
        }

        // Does the URL need unmatched parens chopped off?
        if (\substr($url, -1) === ')' && ($diff = self::diffParens($url)) > 0) {
            // Hand back exactly the characters that were removed so no document text is altered
            $leftovers = \substr($url, -$diff) . $leftovers;
            $url       = \substr($url, 0, -$diff);
        }

        self::addLink($node, $url);
    }

    // Whatever follows the last URL, plus anything trimmed off it
    $tail = $leftovers . (string) \substr($literal, $cursor);
    if ($tail !== '') {
        $node->insertBefore(new Text($tail));
    }

    // The original node has been fully replaced by the nodes inserted above
    $node->detach();
}
121
|
|
|
|
122
|
93 |
|
/**
 * Inserts a link for the given URL immediately before the given text node.
 *
 * URLs starting with "www." (in any letter case) carry no scheme, so "http://"
 * is prepended to the link destination. The link label is always the URL
 * exactly as it was passed in.
 *
 * @param Text   $node Node before which the link is inserted
 * @param string $url  URL text to link
 */
private static function addLink(Text $node, string $url): void
{
    // The URL pattern is case-insensitive, so "WWW.example.com" can reach this point too;
    // compare without regard to case so it also gets a scheme.
    $destination = \strncasecmp($url, 'www.', 4) === 0 ? 'http://' . $url : $url;

    $node->insertBefore(new Link($destination, $url));
}
133
|
|
|
|
134
|
|
|
/**
 * Returns how many more closing than opening parentheses the string contains.
 *
 * The whole string is scanned. A positive result means there are surplus
 * closing parentheses; zero or a negative result means there are none.
 *
 * @psalm-pure
 */
private static function diffParens(string $content): int
{
    \preg_match_all('/[()]/', $content, $found);

    $opening = 0;
    $closing = 0;
    foreach ($found[0] as $paren) {
        if ($paren === '(') {
            $opening++;
        } else {
            $closing++;
        }
    }

    return $closing - $opening;
}
152
|
|
|
} |
153
|
|
|
|