Completed
Push — spec-0.28 ( 9f1e7c...c432d3 )
by Colin
04:30
created

RegexHelper   A

Complexity

Total Complexity 34

Size/Duplication

Total Lines 302
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Test Coverage

Coverage 94.83%

Importance

Changes 4
Bugs 0 Features 0
Metric Value
wmc 34
c 4
b 0
f 0
lcom 1
cbo 1
dl 0
loc 302
ccs 110
cts 116
cp 0.9483
rs 9.2

15 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 4 1
A getInstance() 0 8 2
B buildRegexPatterns() 0 37 1
A getPartialRegex() 0 4 1
A getHtmlTagRegex() 0 4 1
A getLinkTitleRegex() 0 4 1
A getLinkDestinationRegex() 0 6 1
A getLinkDestinationBracesRegex() 0 4 1
A getThematicBreakRegex() 0 4 1
A matchAt() 0 13 2
B matchAll() 0 21 5
A unescape() 0 11 1
B getHtmlBlockOpenRegex() 0 21 8
B getHtmlBlockCloseRegex() 0 15 6
A isLinkPotentiallyUnsafe() 0 4 2
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark package.
5
 *
6
 * (c) Colin O'Dell <[email protected]>
7
 *
8
 * Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
9
 *  - (c) John MacFarlane
10
 *
11
 * For the full copyright and license information, please view the LICENSE
12
 * file that was distributed with this source code.
13
 */
14
15
namespace League\CommonMark\Util;
16
17
use League\CommonMark\Block\Element\HtmlBlock;
18
19
/**
20
 * Provides regular expressions and utilities for parsing Markdown
21
 *
22
 * Singletons are generally bad, but it allows us to build the regexes once (and only once).
23
 */
24
class RegexHelper
25
{
26
    // Integer keys identifying the partial patterns that buildRegexPatterns()
    // stores in $this->regex; pass one of them to getPartialRegex() to read it.
    const ESCAPABLE = 0;
27
    const ESCAPED_CHAR = 1;
28
    const IN_DOUBLE_QUOTES = 2;
29
    const IN_SINGLE_QUOTES = 3;
30
    const IN_PARENS = 4;
31
    const REG_CHAR = 5;
32
    const IN_PARENS_NOSP = 6;
33
    const TAGNAME = 7;
34
    const BLOCKTAGNAME = 8;
35
    const ATTRIBUTENAME = 9;
36
    const UNQUOTEDVALUE = 10;
37
    const SINGLEQUOTEDVALUE = 11;
38
    const DOUBLEQUOTEDVALUE = 12;
39
    const ATTRIBUTEVALUE = 13;
40
    const ATTRIBUTEVALUESPEC = 14;
41
    const ATTRIBUTE = 15;
42
    const OPENTAG = 16;
43
    const CLOSETAG = 17;
44
    const OPENBLOCKTAG = 18;
45
    const CLOSEBLOCKTAG = 19;
46
    const HTMLCOMMENT = 20;
47
    const PROCESSINGINSTRUCTION = 21;
48
    const DECLARATION = 22;
49
    const CDATA = 23;
50
    const HTMLTAG = 24;
51
    const HTMLBLOCKOPEN = 25;
52
    const LINK_TITLE = 26;
53
54
    // Character class of the characters that may be backslash-escaped.
    // Undelimited: buildRegexPatterns() and unescape() embed it in larger patterns.
    const REGEX_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]';
55
    // Undelimited pattern for an entity reference (hex, decimal or named);
    // unescape() wraps it in /.../i before use.
    const REGEX_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';
56
    // Complete pattern: the subject starts with a punctuation character.
    const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u';
57
    // Used by isLinkPotentiallyUnsafe().
    // NOTE(review): "^" binds only to the "javascript:" alternative; the other
    // schemes match anywhere in the subject - confirm that this is intended.
    const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
58
    // data: URLs of these image types are exempted by isLinkPotentiallyUnsafe().
    const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';
59
    // Matches the first character that is not ASCII whitespace.
    const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/';
60
61
    // The subject starts with one ASCII whitespace character.
    const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/';
62
    // A run of one or more ASCII whitespace characters, anywhere in the subject.
    const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/';
63
    // NOTE(review): "^" binds only to "\pZ"; "\s" matches anywhere in the subject.
    // Equivalent to an anchored test only for single-character subjects - confirm
    // that callers never pass longer strings.
    const REGEX_UNICODE_WHITESPACE_CHAR = '/^\pZ|\s/u';
64
65
    /**
66
     * @deprecated
67
     */
68
    const REGEX_UNICODE_WHITESPACE = '/\pZ|\s/u';
69
70
    // Partial (undelimited) patterns keyed by the integer constants above;
    // filled in by buildRegexPatterns().
    protected $regex = [];
71
72
    // Shared instance handed out by getInstance(); null until first requested.
    protected static $instance;
73
74
    /**
     * Protected constructor: instances are obtained through getInstance().
     *
     * Builds the table of partial regexes as soon as the object exists, so
     * every getter can read from it without further initialisation.
     */
    protected function __construct()
    {
        $this->buildRegexPatterns();
    }
81
82
    /**
     * Returns the shared helper, creating it the first time it is requested.
     *
     * @return RegexHelper
     */
    public static function getInstance()
    {
        // Fast path: the helper has already been built.
        if (self::$instance !== null) {
            return self::$instance;
        }

        self::$instance = new self();

        return self::$instance;
    }
93
94
    /**
     * Builds the partial regular expressions required to parse Markdown
     *
     * Every entry is an undelimited pattern fragment stored under one of the
     * integer class constants. Later fragments are composed from earlier ones,
     * so the order of the assignments below matters.
     *
     * We could hard-code them all as pre-built constants, but that would be more difficult to manage.
     */
    protected function buildRegexPatterns()
    {
        $regex = [];

        // Backslash escapes and the quoted/parenthesised forms built on them.
        $regex[self::ESCAPABLE] = self::REGEX_ESCAPABLE;
        $regex[self::ESCAPED_CHAR] = '\\\\' . $regex[self::ESCAPABLE];
        $regex[self::IN_DOUBLE_QUOTES] = '"(' . $regex[self::ESCAPED_CHAR] . '|[^"\x00])*"';
        $regex[self::IN_SINGLE_QUOTES] = '\'(' . $regex[self::ESCAPED_CHAR] . '|[^\'\x00])*\'';
        $regex[self::IN_PARENS] = '\\((' . $regex[self::ESCAPED_CHAR] . '|[^)\x00])*\\)';
        $regex[self::REG_CHAR] = '[^\\\\()\x00-\x20]';
        $regex[self::IN_PARENS_NOSP] = '\((' . $regex[self::REG_CHAR] . '|' . $regex[self::ESCAPED_CHAR] . '|\\\\)*\)';

        // HTML tag names. The block-level list covers h1 through h6 so that it
        // agrees with the element list used by getHtmlBlockOpenRegex().
        $regex[self::TAGNAME] = '[A-Za-z][A-Za-z0-9-]*';
        $regex[self::BLOCKTAGNAME] = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[1-6]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';

        // HTML attributes.
        $regex[self::ATTRIBUTENAME] = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
        $regex[self::UNQUOTEDVALUE] = '[^"\'=<>`\x00-\x20]+';
        $regex[self::SINGLEQUOTEDVALUE] = '\'[^\']*\'';
        $regex[self::DOUBLEQUOTEDVALUE] = '"[^"]*"';
        $regex[self::ATTRIBUTEVALUE] = '(?:' . $regex[self::UNQUOTEDVALUE] . '|' . $regex[self::SINGLEQUOTEDVALUE] . '|' . $regex[self::DOUBLEQUOTEDVALUE] . ')';
        $regex[self::ATTRIBUTEVALUESPEC] = '(?:' . '\s*=' . '\s*' . $regex[self::ATTRIBUTEVALUE] . ')';
        $regex[self::ATTRIBUTE] = '(?:' . '\s+' . $regex[self::ATTRIBUTENAME] . $regex[self::ATTRIBUTEVALUESPEC] . '?)';

        // Opening and closing tags, for any element and for block-level elements only.
        $regex[self::OPENTAG] = '<' . $regex[self::TAGNAME] . $regex[self::ATTRIBUTE] . '*' . '\s*\/?>';
        $regex[self::CLOSETAG] = '<\/' . $regex[self::TAGNAME] . '\s*[>]';
        $regex[self::OPENBLOCKTAG] = '<' . $regex[self::BLOCKTAGNAME] . $regex[self::ATTRIBUTE] . '*' . '\s*\/?>';
        $regex[self::CLOSEBLOCKTAG] = '<\/' . $regex[self::BLOCKTAGNAME] . '\s*[>]';

        // Other inline HTML constructs.
        $regex[self::HTMLCOMMENT] = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
        $regex[self::PROCESSINGINSTRUCTION] = '[<][?].*?[?][>]';
        $regex[self::DECLARATION] = '<![A-Z]+' . '\s+[^>]*>';
        $regex[self::CDATA] = '<!\[CDATA\[[\s\S]*?]\]>';

        $regex[self::HTMLTAG] = '(?:' . implode('|', [
            $regex[self::OPENTAG],
            $regex[self::CLOSETAG],
            $regex[self::HTMLCOMMENT],
            $regex[self::PROCESSINGINSTRUCTION],
            $regex[self::DECLARATION],
            $regex[self::CDATA],
        ]) . ')';

        $regex[self::HTMLBLOCKOPEN] = '<(?:' . $regex[self::BLOCKTAGNAME] . '(?:[\s\/>]|$)' . '|' .
            '\/' . $regex[self::BLOCKTAGNAME] . '(?:[\s>]|$)' . '|' . '[?!])';

        // A link title is one of the three quoted forms defined above, anchored
        // at the start of the subject.
        $regex[self::LINK_TITLE] = '^(?:' . $regex[self::IN_DOUBLE_QUOTES] .
            '|' . $regex[self::IN_SINGLE_QUOTES] .
            '|' . $regex[self::IN_PARENS] . ')';

        $this->regex = $regex;
    }
136
137
    /**
     * Looks up one of the pre-built pattern fragments
     *
     * The fragment has no delimiters or flags; wrap it with /.../ before
     * handing it to a preg_* function.
     *
     * @param int $const One of the integer class constants (e.g. self::OPENTAG)
     *
     * @return string
     */
    public function getPartialRegex($const)
    {
        $fragment = $this->regex[$const];

        return $fragment;
    }
150
151
    /**
     * Complete, case-insensitive pattern matching the HTMLTAG fragment at the
     * start of the subject
     *
     * @return string
     */
    public function getHtmlTagRegex()
    {
        return sprintf('/^%s/i', $this->regex[self::HTMLTAG]);
    }
158
159
    /**
     * Complete pattern for a link title; the LINK_TITLE fragment already
     * carries its own start-of-subject anchor
     *
     * @return string
     */
    public function getLinkTitleRegex()
    {
        return sprintf('/%s/', $this->regex[self::LINK_TITLE]);
    }
166
167
    /**
     * Complete pattern for a link destination that is not wrapped in <...>
     *
     * Raises an E_USER_DEPRECATED error on every call.
     *
     * @return string
     *
     * @deprecated
     */
    public function getLinkDestinationRegex()
    {
        // "@" keeps the deprecation out of the default error output.
        @trigger_error('RegexHelper::getLinkDestinationRegex() is no longer used and will be removed in a future 0.x release.', E_USER_DEPRECATED);

        // Each alternative consumes one piece of the destination.
        $alternatives = [
            $this->regex[self::REG_CHAR] . '+',
            $this->regex[self::ESCAPED_CHAR],
            '\\\\',
            $this->regex[self::IN_PARENS_NOSP],
        ];

        return '/^(?:' . implode('|', $alternatives) . ')*/';
    }
178
179
    /**
     * Complete pattern for a link destination wrapped in angle brackets,
     * anchored at the start of the subject
     *
     * @return string
     */
    public function getLinkDestinationBracesRegex()
    {
        $escapedChar = $this->regex[self::ESCAPED_CHAR];

        // Between "<" and ">": an ordinary character, an escaped character,
        // or a lone backslash.
        return '/^(?:[<](?:[^ <>\\t\\n\\\\\\x00]|' . $escapedChar . '|\\\\)*[>])/';
    }
186
187
    /**
     * Complete pattern for a line made of three or more "*", "_" or "-"
     * characters (one kind only), each optionally followed by spaces or tabs
     *
     * @return string
     */
    public function getThematicBreakRegex()
    {
        $pattern = '/^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$/';

        return $pattern;
    }
194
195
    /**
     * Searches $string for $regex, starting at a character offset
     *
     * @param string $regex  Complete (delimited) pattern
     * @param string $string UTF-8 subject
     * @param int    $offset Number of characters to skip before searching
     *
     * @return int|null Character index of the match within $string, or null when nothing matches
     */
    public static function matchAt($regex, $string, $offset = 0)
    {
        $tail = mb_substr($string, $offset, null, 'utf-8');
        $found = [];

        if (preg_match($regex, $tail, $found, PREG_OFFSET_CAPTURE)) {
            // preg_match reports the position in bytes; measure the text in
            // front of the match to turn that into a character count.
            $byteOffset = $found[0][1];
            $prefix = mb_strcut($tail, 0, $byteOffset, 'utf-8');

            return $offset + mb_strlen($prefix, 'utf-8');
        }

        return null;
    }
217
218
    /**
     * Functional wrapper around preg_match_all
     *
     * When the pattern matches exactly once, every group is collapsed to its
     * single value, so the result has the shape preg_match would produce.
     * With several matches the PREG_PATTERN_ORDER arrays are returned as-is.
     *
     * @param string $pattern Complete (delimited) pattern
     * @param string $subject
     * @param int    $offset  Number of bytes to skip (applied with substr())
     *
     * @return array|null Null when the pattern does not match at all
     */
    public static function matchAll($pattern, $subject, $offset = 0)
    {
        $matches = [];
        preg_match_all($pattern, substr($subject, $offset), $matches, PREG_PATTERN_ORDER);

        $fullMatches = reset($matches);
        if (empty($fullMatches)) {
            return null;
        }

        if (count($fullMatches) === 1) {
            // Iterate by value and write back through the key: a by-reference
            // loop would leave a dangling reference to the last element.
            foreach ($matches as $group => $groupMatches) {
                $matches[$group] = reset($groupMatches);
            }
        }

        return $matches;
    }
248
249
    /**
     * Removes backslash escapes and decodes entity references
     *
     * First every backslash followed by an escapable character is replaced
     * by that character; then each entity reference in the result is passed
     * to Html5Entities::decodeEntity().
     *
     * @param string $string
     *
     * @return string
     */
    public static function unescape($string)
    {
        $withoutBackslashes = preg_replace('/\\\\(' . self::REGEX_ESCAPABLE . ')/', '$1', $string);

        return preg_replace_callback(
            '/' . self::REGEX_ENTITY . '/i',
            function ($entity) {
                return Html5Entities::decodeEntity($entity[0]);
            },
            $withoutBackslashes
        );
    }
267
268
    /**
     * Pattern recognising the opening line of an HTML block of the given type
     *
     * @param int $type HTML block type (one of the HtmlBlock::TYPE_* constants)
     *
     * @return string|null Null when $type is not one of the seven known types
     */
    public static function getHtmlBlockOpenRegex($type)
    {
        $pattern = null;

        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                $pattern = '/^<(?:script|pre|style)(?:\s|>|$)/i';
                break;
            case HtmlBlock::TYPE_2_COMMENT:
                $pattern = '/^<!--/';
                break;
            case HtmlBlock::TYPE_3:
                $pattern = '/^<[?]/';
                break;
            case HtmlBlock::TYPE_4:
                $pattern = '/^<![A-Z]/';
                break;
            case HtmlBlock::TYPE_5_CDATA:
                $pattern = '/^<!\[CDATA\[/';
                break;
            case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
                $pattern = '%^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)%i';
                break;
            case HtmlBlock::TYPE_7_MISC_ELEMENT:
                // Any complete open or close tag that is alone on its line.
                $helper = self::getInstance();
                $pattern = '/^(?:' . $helper->getPartialRegex(self::OPENTAG) . '|' . $helper->getPartialRegex(self::CLOSETAG) . ')\\s*$/i';
                break;
        }

        return $pattern;
    }
294
295
    /**
     * Pattern recognising the text that ends an HTML block of the given type
     *
     * Only types 1 to 5 have a closing pattern here.
     *
     * @param int $type HTML block type (one of the HtmlBlock::TYPE_* constants)
     *
     * @return string|null Null for any other type
     */
    public static function getHtmlBlockCloseRegex($type)
    {
        $pattern = null;

        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                $pattern = '%<\/(?:script|pre|style)>%i';
                break;
            case HtmlBlock::TYPE_2_COMMENT:
                $pattern = '/-->/';
                break;
            case HtmlBlock::TYPE_3:
                $pattern = '/\?>/';
                break;
            case HtmlBlock::TYPE_4:
                $pattern = '/>/';
                break;
            case HtmlBlock::TYPE_5_CDATA:
                $pattern = '/\]\]>/';
                break;
        }

        return $pattern;
    }
315
316
    /**
     * Tells whether a URL uses one of the protocols flagged as unsafe
     *
     * A URL matching REGEX_UNSAFE_PROTOCOL is reported as unsafe unless it
     * also matches REGEX_SAFE_DATA_PROTOCOL.
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isLinkPotentiallyUnsafe($url)
    {
        // No flagged protocol at all: nothing to worry about.
        if (preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) === 0) {
            return false;
        }

        // Flagged, unless it is one of the whitelisted data: image types.
        return preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
    }
325
}
326