Completed
Push — master ( 235351...ea1f65 )
by Colin
10s
created

RegexHelper::isEscapable()   A

Complexity

Conditions 2
Paths 2

Duplication

Lines 0
Ratio 0 %

Size

Total Lines 8
Code Lines 4

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 8
ccs 4
cts 4
cp 1
rs 9.4285
cc 2
eloc 4
nc 2
nop 1
crap 2
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark package.
5
 *
6
 * (c) Colin O'Dell <colinodell@gmail.com>
7
 *
8
 * Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
9
 *  - (c) John MacFarlane
10
 *
11
 * For the full copyright and license information, please view the LICENSE
12
 * file that was distributed with this source code.
13
 */
14
15
namespace League\CommonMark\Util;
16
17
use League\CommonMark\Block\Element\HtmlBlock;
18
19
/**
 * Provides regular expressions and utilities for parsing Markdown
 *
 * The PARTIAL_* constants are regex fragments without delimiters; wrap them
 * with `/` on each side (plus any anchors/flags) before use. The REGEX_*
 * constants are complete, ready-to-use patterns.
 */
final class RegexHelper
{
    // Legacy integer identifiers accepted by getPartialRegex().

    /** @deprecated Use PARTIAL_ESCAPABLE instead */
    const ESCAPABLE = 0;

    /** @deprecated Use PARTIAL_ESCAPED_CHAR instead */
    const ESCAPED_CHAR = 1;

    /** @deprecated Use PARTIAL_IN_DOUBLE_QUOTES instead */
    const IN_DOUBLE_QUOTES = 2;

    /** @deprecated Use PARTIAL_IN_SINGLE_QUOTES instead */
    const IN_SINGLE_QUOTES = 3;

    /** @deprecated Use PARTIAL_IN_PARENS instead */
    const IN_PARENS = 4;

    /** @deprecated Use PARTIAL_REG_CHAR instead */
    const REG_CHAR = 5;

    /** @deprecated Use PARTIAL_IN_PARENS_NOSP instead */
    const IN_PARENS_NOSP = 6;

    /** @deprecated Use PARTIAL_TAGNAME instead */
    const TAGNAME = 7;

    /** @deprecated Use PARTIAL_BLOCKTAGNAME instead */
    const BLOCKTAGNAME = 8;

    /** @deprecated Use PARTIAL_ATTRIBUTENAME instead */
    const ATTRIBUTENAME = 9;

    /** @deprecated Use PARTIAL_UNQUOTEDVALUE instead */
    const UNQUOTEDVALUE = 10;

    /** @deprecated Use PARTIAL_SINGLEQUOTEDVALUE instead */
    const SINGLEQUOTEDVALUE = 11;

    /** @deprecated Use PARTIAL_DOUBLEQUOTEDVALUE instead */
    const DOUBLEQUOTEDVALUE = 12;

    /** @deprecated Use PARTIAL_ATTRIBUTEVALUE instead */
    const ATTRIBUTEVALUE = 13;

    /** @deprecated Use PARTIAL_ATTRIBUTEVALUESPEC instead */
    const ATTRIBUTEVALUESPEC = 14;

    /** @deprecated Use PARTIAL_ATTRIBUTE instead */
    const ATTRIBUTE = 15;

    /** @deprecated Use PARTIAL_OPENTAG instead */
    const OPENTAG = 16;

    /** @deprecated Use PARTIAL_CLOSETAG instead */
    const CLOSETAG = 17;

    /** @deprecated Use PARTIAL_OPENBLOCKTAG instead */
    const OPENBLOCKTAG = 18;

    /** @deprecated Use PARTIAL_CLOSEBLOCKTAG instead */
    const CLOSEBLOCKTAG = 19;

    /** @deprecated Use PARTIAL_HTMLCOMMENT instead */
    const HTMLCOMMENT = 20;

    /** @deprecated Use PARTIAL_PROCESSINGINSTRUCTION instead */
    const PROCESSINGINSTRUCTION = 21;

    /** @deprecated Use PARTIAL_DECLARATION instead */
    const DECLARATION = 22;

    /** @deprecated Use PARTIAL_CDATA instead */
    const CDATA = 23;

    /** @deprecated Use PARTIAL_HTMLTAG instead */
    const HTMLTAG = 24;

    /** @deprecated Use PARTIAL_HTMLBLOCKOPEN instead */
    const HTMLBLOCKOPEN = 25;

    /** @deprecated Use PARTIAL_LINK_TITLE instead */
    const LINK_TITLE = 26;

    // Partial regular expressions (wrap with `/` on each side before use)
    const PARTIAL_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';
    // Character class of the ASCII punctuation characters that may be backslash-escaped.
    const PARTIAL_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]';
    const PARTIAL_ESCAPED_CHAR = '\\\\' . self::PARTIAL_ESCAPABLE;
    const PARTIAL_IN_DOUBLE_QUOTES = '"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"';
    const PARTIAL_IN_SINGLE_QUOTES = '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'';
    const PARTIAL_IN_PARENS = '\\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\\)';
    const PARTIAL_REG_CHAR = '[^\\\\()\x00-\x20]';
    const PARTIAL_IN_PARENS_NOSP = '\((' . self::PARTIAL_REG_CHAR . '|' . self::PARTIAL_ESCAPED_CHAR . '|\\\\)*\)';
    const PARTIAL_TAGNAME = '[A-Za-z][A-Za-z0-9-]*';
    const PARTIAL_BLOCKTAGNAME = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';
    const PARTIAL_ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
    const PARTIAL_UNQUOTEDVALUE = '[^"\'=<>`\x00-\x20]+';
    const PARTIAL_SINGLEQUOTEDVALUE = '\'[^\']*\'';
    const PARTIAL_DOUBLEQUOTEDVALUE = '"[^"]*"';
    const PARTIAL_ATTRIBUTEVALUE = '(?:' . self::PARTIAL_UNQUOTEDVALUE . '|' . self::PARTIAL_SINGLEQUOTEDVALUE . '|' . self::PARTIAL_DOUBLEQUOTEDVALUE . ')';
    const PARTIAL_ATTRIBUTEVALUESPEC = '(?:' . '\s*=' . '\s*' . self::PARTIAL_ATTRIBUTEVALUE . ')';
    const PARTIAL_ATTRIBUTE = '(?:' . '\s+' . self::PARTIAL_ATTRIBUTENAME . self::PARTIAL_ATTRIBUTEVALUESPEC . '?)';
    const PARTIAL_OPENTAG = '<' . self::PARTIAL_TAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
    const PARTIAL_CLOSETAG = '<\/' . self::PARTIAL_TAGNAME . '\s*[>]';
    const PARTIAL_OPENBLOCKTAG = '<' . self::PARTIAL_BLOCKTAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
    const PARTIAL_CLOSEBLOCKTAG = '<\/' . self::PARTIAL_BLOCKTAGNAME . '\s*[>]';
    const PARTIAL_HTMLCOMMENT = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
    const PARTIAL_PROCESSINGINSTRUCTION = '[<][?].*?[?][>]';
    const PARTIAL_DECLARATION = '<![A-Z]+' . '\s+[^>]*>';
    const PARTIAL_CDATA = '<!\[CDATA\[[\s\S]*?]\]>';
    const PARTIAL_HTMLTAG = '(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . '|' . self::PARTIAL_HTMLCOMMENT . '|' .
        self::PARTIAL_PROCESSINGINSTRUCTION . '|' . self::PARTIAL_DECLARATION . '|' . self::PARTIAL_CDATA . ')';
    const PARTIAL_HTMLBLOCKOPEN = '<(?:' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s\/>]|$)' . '|' .
        '\/' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s>]|$)' . '|' . '[?!])';
    const PARTIAL_LINK_TITLE = '^(?:"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"' .
        '|' . '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'' .
        '|' . '\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\))';

    /** @deprecated Use PARTIAL_ESCAPABLE instead */
    const REGEX_ESCAPABLE = self::PARTIAL_ESCAPABLE;

    /** @deprecated Use PARTIAL_ENTITY instead */
    const REGEX_ENTITY = self::PARTIAL_ENTITY;

    // Complete regular expressions (delimiters and flags included)
    const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u';
    // NOTE(review): `^` binds only to the first alternative (`javascript:`); the other
    // alternatives match anywhere in the subject. Left as-is because this is a public
    // constant and tightening it could change which URLs are treated as unsafe - confirm intent.
    const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
    const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';
    const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/';

    const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/';
    const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/';
    // NOTE(review): `^` applies to `\pZ` only; `\s` matches anywhere in the subject - confirm intent.
    const REGEX_UNICODE_WHITESPACE_CHAR = '/^\pZ|\s/u';
    const REGEX_THEMATIC_BREAK = '/^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$/';
    const REGEX_LINK_DESTINATION_BRACES = '/^(?:' . '[<](?:[^ <>\\t\\n\\\\\\x00]' . '|' . self::PARTIAL_ESCAPED_CHAR . '|' . '\\\\)*[>]' . ')/';

    /**
     * @deprecated Instance methods will be removed in 0.18 or 1.0 (whichever comes first)
     */
    protected static $instance;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * Always raises a (silenced) E_USER_DEPRECATED notice.
     *
     * @return RegexHelper
     *
     * @deprecated Instances are no longer needed and will be removed in 0.18 or 1.0
     */
    public static function getInstance()
    {
        @trigger_error('RegexHelper no longer uses the singleton pattern. Directly grab the REGEX_ or PARTIAL_ constant you need instead.', E_USER_DEPRECATED);

        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Tells whether the given character matches PARTIAL_ESCAPABLE.
     *
     * @param string|null $character
     *
     * @return bool False for null or when nothing in the input matches
     */
    public static function isEscapable($character)
    {
        if ($character === null) {
            return false;
        }

        return preg_match('/' . self::PARTIAL_ESCAPABLE . '/', $character) === 1;
    }

    /**
     * Returns a partial regex
     *
     * It'll need to be wrapped with /.../ before use
     *
     * @param int $const One of the deprecated integer identifiers (ESCAPABLE ... LINK_TITLE)
     *
     * @return string|null The matching PARTIAL_* fragment, or null for an unknown identifier
     *
     * @deprecated Just grab the constant directly
     */
    public function getPartialRegex($const)
    {
        @trigger_error('RegexHelper no longer supports the getPartialRegex() function. Directly grab the PARTIAL_ constant you need instead.', E_USER_DEPRECATED);

        switch ($const) {
            case self::ESCAPABLE: return self::PARTIAL_ESCAPABLE;
            case self::ESCAPED_CHAR: return self::PARTIAL_ESCAPED_CHAR;
            case self::IN_DOUBLE_QUOTES: return self::PARTIAL_IN_DOUBLE_QUOTES;
            case self::IN_SINGLE_QUOTES: return self::PARTIAL_IN_SINGLE_QUOTES;
            case self::IN_PARENS: return self::PARTIAL_IN_PARENS;
            case self::REG_CHAR: return self::PARTIAL_REG_CHAR;
            case self::IN_PARENS_NOSP: return self::PARTIAL_IN_PARENS_NOSP;
            case self::TAGNAME: return self::PARTIAL_TAGNAME;
            case self::BLOCKTAGNAME: return self::PARTIAL_BLOCKTAGNAME;
            case self::ATTRIBUTENAME: return self::PARTIAL_ATTRIBUTENAME;
            case self::UNQUOTEDVALUE: return self::PARTIAL_UNQUOTEDVALUE;
            case self::SINGLEQUOTEDVALUE: return self::PARTIAL_SINGLEQUOTEDVALUE;
            case self::DOUBLEQUOTEDVALUE: return self::PARTIAL_DOUBLEQUOTEDVALUE;
            case self::ATTRIBUTEVALUE: return self::PARTIAL_ATTRIBUTEVALUE;
            case self::ATTRIBUTEVALUESPEC: return self::PARTIAL_ATTRIBUTEVALUESPEC;
            case self::ATTRIBUTE: return self::PARTIAL_ATTRIBUTE;
            case self::OPENTAG: return self::PARTIAL_OPENTAG;
            case self::CLOSETAG: return self::PARTIAL_CLOSETAG;
            case self::OPENBLOCKTAG: return self::PARTIAL_OPENBLOCKTAG;
            case self::CLOSEBLOCKTAG: return self::PARTIAL_CLOSEBLOCKTAG;
            case self::HTMLCOMMENT: return self::PARTIAL_HTMLCOMMENT;
            case self::PROCESSINGINSTRUCTION: return self::PARTIAL_PROCESSINGINSTRUCTION;
            case self::DECLARATION: return self::PARTIAL_DECLARATION;
            case self::CDATA: return self::PARTIAL_CDATA;
            case self::HTMLTAG: return self::PARTIAL_HTMLTAG;
            case self::HTMLBLOCKOPEN: return self::PARTIAL_HTMLBLOCKOPEN;
            case self::LINK_TITLE: return self::PARTIAL_LINK_TITLE;
        }

        // Unknown identifier: make the previously implicit null explicit.
        return null;
    }

    /**
     * @return string PARTIAL_HTMLTAG anchored at the start, case-insensitive
     *
     * @deprecated Use PARTIAL_HTMLTAG and wrap it yourself instead
     */
    public function getHtmlTagRegex()
    {
        @trigger_error('RegexHelper::getHtmlTagRegex() has been deprecated. Use the RegexHelper::PARTIAL_HTMLTAG constant instead.', E_USER_DEPRECATED);

        return '/^' . self::PARTIAL_HTMLTAG . '/i';
    }

    /**
     * @return string PARTIAL_LINK_TITLE wrapped in `/` delimiters
     *
     * @deprecated Use PARTIAL_LINK_TITLE and wrap it yourself instead
     */
    public function getLinkTitleRegex()
    {
        @trigger_error('RegexHelper::getLinkTitleRegex() has been deprecated. Use the RegexHelper::PARTIAL_LINK_TITLE constant instead.', E_USER_DEPRECATED);

        return '/' . self::PARTIAL_LINK_TITLE . '/';
    }

    /**
     * @return string
     *
     * @deprecated Use REGEX_LINK_DESTINATION_BRACES instead
     */
    public function getLinkDestinationBracesRegex()
    {
        @trigger_error('RegexHelper::getLinkDestinationBracesRegex() has been deprecated. Use the RegexHelper::REGEX_LINK_DESTINATION_BRACES constant instead.', E_USER_DEPRECATED);

        return self::REGEX_LINK_DESTINATION_BRACES;
    }

    /**
     * @return string
     *
     * @deprecated Use the REGEX_THEMATIC_BREAK constant directly
     */
    public function getThematicBreakRegex()
    {
        @trigger_error('RegexHelper::getThematicBreakRegex() has been deprecated. Use the RegexHelper::REGEX_THEMATIC_BREAK constant instead.', E_USER_DEPRECATED);

        return self::REGEX_THEMATIC_BREAK;
    }

    /**
     * Attempt to match a regex in string s at offset offset
     *
     * The offset and the returned index are both counted in UTF-8 characters,
     * not bytes.
     *
     * @param string $regex
     * @param string $string
     * @param int    $offset Character offset at which to start searching
     *
     * @return int|null Index of match, or null
     */
    public static function matchAt($regex, $string, $offset = 0)
    {
        $matches = [];
        $string = mb_substr($string, $offset, null, 'utf-8');
        if (!preg_match($regex, $string, $matches, PREG_OFFSET_CAPTURE)) {
            return null;
        }

        // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying
        $charPos = mb_strlen(mb_strcut($string, 0, $matches[0][1], 'utf-8'), 'utf-8');

        return $offset + $charPos;
    }

    /**
     * Functional wrapper around preg_match_all
     *
     * When the pattern matches exactly once, the result is flattened so that
     * index 0 is the full match and the following indexes are the capture
     * groups. With several matches the PREG_PATTERN_ORDER structure is
     * returned untouched.
     *
     * @param string $pattern
     * @param string $subject
     * @param int    $offset  Byte offset (applied with substr()) at which to start searching
     *
     * @return array|null Null when nothing matched
     */
    public static function matchAll($pattern, $subject, $offset = 0)
    {
        $matches = [];
        $subject = substr($subject, $offset);
        preg_match_all($pattern, $subject, $matches, PREG_PATTERN_ORDER);

        $fullMatches = reset($matches);
        if (empty($fullMatches)) {
            return null;
        }

        if (count($fullMatches) === 1) {
            foreach ($matches as &$match) {
                $match = reset($match);
            }
            // Break the reference so later writes to $match cannot alias the last element.
            unset($match);
        }

        return $matches;
    }

    /**
     * Replace backslash escapes with literal characters
     *
     * Backslash-escaped PARTIAL_ESCAPABLE characters are unescaped first;
     * afterwards every PARTIAL_ENTITY match is passed through
     * Html5Entities::decodeEntity().
     *
     * @param string $string
     *
     * @return string
     */
    public static function unescape($string)
    {
        $allEscapedChar = '/\\\\(' . self::PARTIAL_ESCAPABLE . ')/';

        $escaped = preg_replace($allEscapedChar, '$1', $string);
        $replaced = preg_replace_callback('/' . self::PARTIAL_ENTITY . '/i', function ($e) {
            return Html5Entities::decodeEntity($e[0]);
        }, $escaped);

        return $replaced;
    }

    /**
     * Returns the regex that detects the opening line of the given HTML block type.
     *
     * @param int $type HTML block type
     *
     * @return string|null Null when the type is not one of the HtmlBlock::TYPE_* values handled here
     */
    public static function getHtmlBlockOpenRegex($type)
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '/^<(?:script|pre|style)(?:\s|>|$)/i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/^<!--/';
            case HtmlBlock::TYPE_3:
                return '/^<[?]/';
            case HtmlBlock::TYPE_4:
                return '/^<![A-Z]/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/^<!\[CDATA\[/';
            case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
                return '%^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)%i';
            case HtmlBlock::TYPE_7_MISC_ELEMENT:
                return '/^(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . ')\\s*$/i';
        }

        return null;
    }

    /**
     * Returns the regex that detects the closing marker of the given HTML block type.
     *
     * @param int $type HTML block type
     *
     * @return string|null Null for types without a closing pattern here (only types 1-5 have one)
     */
    public static function getHtmlBlockCloseRegex($type)
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '%<\/(?:script|pre|style)>%i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/-->/';
            case HtmlBlock::TYPE_3:
                return '/\?>/';
            case HtmlBlock::TYPE_4:
                return '/>/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/\]\]>/';
        }

        return null;
    }

    /**
     * Tells whether a URL matches REGEX_UNSAFE_PROTOCOL without also matching
     * the REGEX_SAFE_DATA_PROTOCOL allow-list.
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isLinkPotentiallyUnsafe($url)
    {
        return preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) !== 0 && preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
    }
}
414