Completed
Pull Request — master (#6)
by Colin
01:34
created

UrlAutolinkProcessor::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark-ext-autolink package.
5
 *
6
 * (c) Colin O'Dell <[email protected]>
7
 *
8
 * For the full copyright and license information, please view the LICENSE
9
 * file that was distributed with this source code.
10
 */
11
12
namespace League\CommonMark\Ext\Autolink;
13
14
use League\CommonMark\Block\Element\Document;
15
use League\CommonMark\DocumentProcessorInterface;
16
use League\CommonMark\Inline\Element\Link;
17
use League\CommonMark\Inline\Element\Text;
18
19
/**
 * Document processor that turns bare URLs found in Text nodes into Link nodes.
 *
 * Every Text node in the document is scanned with REGEX. Each URL found is
 * replaced by a Link node, and the text around it is re-inserted as new Text
 * nodes. Text nodes without any URL are left untouched.
 */
final class UrlAutolinkProcessor implements DocumentProcessorInterface
{
    // RegEx adapted from https://github.com/symfony/symfony/blob/4.2/src/Symfony/Component/Validator/Constraints/UrlValidator.php
    //
    // This is a sprintf() template: "%s" receives the alternation of allowed
    // schemes, and literal percent signs are therefore written as "%%".
    const REGEX = '~
        (?<=^|[ \\t\\n\\x0b\\x0c\\x0d*_\\~\\(])  # Can only come at the beginning of a line, after whitespace, or certain delimiting characters
        (
            # Must start with a supported scheme + auth, or "www"
            (?:
                (?:%s)://                                 # protocol
                (?:([\.\pL\pN-]+:)?([\.\pL\pN-]+)@)?      # basic auth
            |www\.)
            (?:
                (?:[\pL\pN\pS\-\.])+(?:\.?(?:[\pL\pN]|xn\-\-[\pL\pN-]+)+\.?) # a domain name
                    |                                                 # or
                \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}                    # an IP address
                    |                                                 # or
                \[
                    (?:(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){6})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:::(?:(?:(?:[0-9a-f]{1,4})):){5})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){4})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,1}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){3})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,2}(?:(?:[0-9a-f]{1,4})))?::(?:(?:(?:[0-9a-f]{1,4})):){2})(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,3}(?:(?:[0-9a-f]{1,4})))?::(?:(?:[0-9a-f]{1,4})):)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,4}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:(?:(?:(?:[0-9a-f]{1,4})):(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9]))\.){3}(?:(?:25[0-5]|(?:[1-9]|1[0-9]|2[0-4])?[0-9])))))))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,5}(?:(?:[0-9a-f]{1,4})))?::)(?:(?:[0-9a-f]{1,4})))|(?:(?:(?:(?:(?:(?:[0-9a-f]{1,4})):){0,6}(?:(?:[0-9a-f]{1,4})))?::))))
                \]  # an IPv6 address
            )
            (?::[0-9]+)?                              # a port (optional)
            (?:/ (?:[\pL\pN\-._\~!$&\'()*+,;=:@]|%%[0-9A-Fa-f]{2})* )*      # a path
            (?:\? (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a query (optional)
            (?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )?   # a fragment (optional)
        )~ixu';

    /**
     * @var string[] URI schemes (without "://") that may start an autolink
     */
    private $allowedProtocols;

    /**
     * @param string[] $allowedProtocols URI schemes (without "://") that may start an autolink
     */
    public function __construct(array $allowedProtocols = ['http', 'https', 'ftp'])
    {
        $this->allowedProtocols = $allowedProtocols;
    }

    /**
     * Walks the document and autolinks the URLs found in every Text node.
     *
     * @param Document $document
     *
     * @return void
     */
    public function processDocument(Document $document)
    {
        $regex = $this->buildRegex();
        $walker = $document->walker();

        while ($event = $walker->next()) {
            $node = $event->getNode();
            if ($event->isEntering() && $node instanceof Text) {
                self::processAutolinks($node, $regex);
            }
        }
    }

    /**
     * Fills the REGEX template with the configured schemes.
     *
     * @return string
     */
    private function buildRegex()
    {
        $schemes = [];
        foreach ($this->allowedProtocols as $protocol) {
            // Schemes may legally contain "+", "-" and "."; quote them so they
            // are matched literally instead of being read as regex syntax.
            $schemes[] = preg_quote($protocol, '~');
        }

        // An empty alternation would match the empty string and link bare
        // "://host" text; "(?!)" never matches, leaving only "www." links.
        $alternation = empty($schemes) ? '(?!)' : implode('|', $schemes);

        return sprintf(self::REGEX, $alternation);
    }

    /**
     * Replaces a Text node with a sequence of Text and Link nodes.
     *
     * The node is left untouched when it contains no URL or when the regex
     * engine reports an error.
     *
     * @param Text   $node
     * @param string $regex
     *
     * @return void
     */
    private static function processAutolinks(Text $node, $regex)
    {
        $content = $node->getContent();

        // Offsets of group 1 are used rather than preg_split(), because REGEX
        // contains further capturing groups (basic auth) that a delimiter-
        // capturing split would also emit as separate pieces.
        $matchCount = preg_match_all($regex, $content, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        if (!$matchCount) {
            // false (PCRE error) or 0 (nothing to link): keep the original node
            return;
        }

        $cursor = 0;     // byte offset of the first character not yet emitted
        $leftovers = ''; // characters stripped from the end of the previous URL

        foreach ($matches as $match) {
            $url = $match[1][0];
            $offset = $match[1][1];

            $text = $leftovers . (string) substr($content, $cursor, $offset - $cursor);
            if ($text !== '') {
                $node->insertBefore(new Text($text));
            }
            $cursor = $offset + strlen($url);

            list($url, $leftovers) = self::splitTrailingCharacters($url);

            // Auto-prefix 'http://' onto 'www' URLs; REGEX is case-insensitive,
            // so the prefix check has to be case-insensitive as well.
            $destination = strncasecmp($url, 'www.', 4) === 0 ? 'http://' . $url : $url;
            $node->insertBefore(new Link($destination, $url));
        }

        $text = $leftovers . (string) substr($content, $cursor);
        if ($text !== '') {
            $node->insertBefore(new Text($text));
        }

        $node->detach();
    }

    /**
     * Separates a matched URL from trailing characters that are not part of it.
     *
     * @param string $url
     *
     * @return string[] Two elements: the trimmed URL, then the removed suffix
     */
    private static function splitTrailingCharacters($url)
    {
        $leftovers = '';

        // Does the URL end with punctuation that should be stripped?
        // The first group is lazy so the second one receives the whole run of
        // trailing punctuation, not just its last character.
        if (preg_match('/(.+?)([?!.,:*_~]+)$/', $url, $matches)) {
            $url = $matches[1];
            $leftovers = $matches[2];
        }

        // Does the URL need its closing paren chopped off?
        if (substr($url, -1) === ')' && self::hasMoreCloserParensThanOpeners($url)) {
            $url = substr($url, 0, -1);
            // The paren preceded the punctuation removed above, so it goes first
            $leftovers = ')' . $leftovers;
        }

        return [$url, $leftovers];
    }

    /**
     * Tells whether the autolink holds more closing than opening parentheses.
     *
     * If it does, the last character is not considered part of the autolink,
     * in order to facilitate including an autolink inside a parenthesis.
     *
     * @param string $content
     *
     * @return bool
     */
    private static function hasMoreCloserParensThanOpeners($content)
    {
        return substr_count($content, ')') > substr_count($content, '(');
    }
}
127