Completed
Push — master ( b06b97...f73407 )
by Colin
16s
created

RegexHelper::getHtmlBlockOpenRegex()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 21

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 8.0155

Importance

Changes 0
Metric Value
dl 0
loc 21
ccs 15
cts 16
cp 0.9375
rs 8.4444
c 0
b 0
f 0
cc 8
nc 8
nop 1
crap 8.0155
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark package.
5
 *
6
 * (c) Colin O'Dell <[email protected]>
7
 *
8
 * Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
9
 *  - (c) John MacFarlane
10
 *
11
 * For the full copyright and license information, please view the LICENSE
12
 * file that was distributed with this source code.
13
 */
14
15
namespace League\CommonMark\Util;
16
17
use League\CommonMark\Block\Element\HtmlBlock;
18
19
/**
20
 * Provides regular expressions and utilities for parsing Markdown
21
 */
22
final class RegexHelper
23
{
24
    /** @deprecated Use PARTIAL_ESCAPABLE instead */
25
    const ESCAPABLE = 0;
26
27
    /** @deprecated Use PARTIAL_ESCAPED_CHAR instead */
28
    const ESCAPED_CHAR = 1;
29
30
    /** @deprecated Use PARTIAL_IN_DOUBLE_QUOTES instead */
31
    const IN_DOUBLE_QUOTES = 2;
32
33
    /** @deprecated Use PARTIAL_IN_SINGLE_QUOTES instead */
34
    const IN_SINGLE_QUOTES = 3;
35
36
    /** @deprecated Use PARTIAL_IN_PARENS instead */
37
    const IN_PARENS = 4;
38
39
    /** @deprecated Use PARTIAL_REG_CHAR instead */
40
    const REG_CHAR = 5;
41
42
    /** @deprecated Use PARTIAL_IN_PARENS_NOSP instead */
43
    const IN_PARENS_NOSP = 6;
44
45
    /** @deprecated Use PARTIAL_TAGNAME instead */
46
    const TAGNAME = 7;
47
48
    /** @deprecated Use PARTIAL_BLOCKTAGNAME instead */
49
    const BLOCKTAGNAME = 8;
50
51
    /** @deprecated Use PARTIAL_ATTRIBUTENAME instead */
52
    const ATTRIBUTENAME = 9;
53
54
    /** @deprecated Use PARTIAL_UNQUOTEDVALUE instead */
55
    const UNQUOTEDVALUE = 10;
56
57
    /** @deprecated Use PARTIAL_SINGLEQUOTEDVALUE instead */
58
    const SINGLEQUOTEDVALUE = 11;
59
60
    /** @deprecated Use PARTIAL_DOUBLEQUOTEDVALUE instead */
61
    const DOUBLEQUOTEDVALUE = 12;
62
63
    /** @deprecated Use PARTIAL_ATTRIBUTEVALUE instead */
64
    const ATTRIBUTEVALUE = 13;
65
66
    /** @deprecated Use PARTIAL_ATTRIBUTEVALUESPEC instead */
67
    const ATTRIBUTEVALUESPEC = 14;
68
69
    /** @deprecated Use PARTIAL_ATTRIBUTE instead */
70
    const ATTRIBUTE = 15;
71
72
    /** @deprecated Use PARTIAL_OPENTAG instead */
73
    const OPENTAG = 16;
74
75
    /** @deprecated Use PARTIAL_CLOSETAG instead */
76
    const CLOSETAG = 17;
77
78
    /** @deprecated Use PARTIAL_OPENBLOCKTAG instead */
79
    const OPENBLOCKTAG = 18;
80
81
    /** @deprecated Use PARTIAL_CLOSEBLOCKTAG instead */
82
    const CLOSEBLOCKTAG = 19;
83
84
    /** @deprecated Use PARTIAL_HTMLCOMMENT instead */
85
    const HTMLCOMMENT = 20;
86
87
    /** @deprecated Use PARTIAL_PROCESSINGINSTRUCTION instead */
88
    const PROCESSINGINSTRUCTION = 21;
89
90
    /** @deprecated Use PARTIAL_DECLARATION instead */
91
    const DECLARATION = 22;
92
93
    /** @deprecated Use PARTIAL_CDATA instead */
94
    const CDATA = 23;
95
96
    /** @deprecated Use PARTIAL_HTMLTAG instead */
97
    const HTMLTAG = 24;
98
99
    /** @deprecated Use PARTIAL_HTMLBLOCKOPEN instead */
100
    const HTMLBLOCKOPEN = 25;
101
102
    /** @deprecated Use PARTIAL_LINK_TITLE instead */
103
    const LINK_TITLE = 26;
104
105
    // Partial regular expressions (wrap with `/` on each side before use)
106
    const PARTIAL_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';
107
    const PARTIAL_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>[email protected][\\\\\]^_`{|}~-]';
108
    const PARTIAL_ESCAPED_CHAR = '\\\\' . self::PARTIAL_ESCAPABLE;
109
    const PARTIAL_IN_DOUBLE_QUOTES = '"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"';
110
    const PARTIAL_IN_SINGLE_QUOTES = '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'';
111
    const PARTIAL_IN_PARENS = '\\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\\)';
112
    const PARTIAL_REG_CHAR = '[^\\\\()\x00-\x20]';
113
    const PARTIAL_IN_PARENS_NOSP = '\((' . self::PARTIAL_REG_CHAR . '|' . self::PARTIAL_ESCAPED_CHAR . '|\\\\)*\)';
114
    const PARTIAL_TAGNAME = '[A-Za-z][A-Za-z0-9-]*';
115
    const PARTIAL_BLOCKTAGNAME = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';
116
    const PARTIAL_ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
117
    const PARTIAL_UNQUOTEDVALUE = '[^"\'=<>`\x00-\x20]+';
118
    const PARTIAL_SINGLEQUOTEDVALUE = '\'[^\']*\'';
119
    const PARTIAL_DOUBLEQUOTEDVALUE = '"[^"]*"';
120
    const PARTIAL_ATTRIBUTEVALUE = '(?:' . self::PARTIAL_UNQUOTEDVALUE . '|' . self::PARTIAL_SINGLEQUOTEDVALUE . '|' . self::PARTIAL_DOUBLEQUOTEDVALUE . ')';
121
    const PARTIAL_ATTRIBUTEVALUESPEC = '(?:' . '\s*=' . '\s*' . self::PARTIAL_ATTRIBUTEVALUE . ')';
122
    const PARTIAL_ATTRIBUTE = '(?:' . '\s+' . self::PARTIAL_ATTRIBUTENAME . self::PARTIAL_ATTRIBUTEVALUESPEC . '?)';
123
    const PARTIAL_OPENTAG = '<' . self::PARTIAL_TAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
124
    const PARTIAL_CLOSETAG = '<\/' . self::PARTIAL_TAGNAME . '\s*[>]';
125
    const PARTIAL_OPENBLOCKTAG = '<' . self::PARTIAL_BLOCKTAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
126
    const PARTIAL_CLOSEBLOCKTAG = '<\/' . self::PARTIAL_BLOCKTAGNAME . '\s*[>]';
127
    const PARTIAL_HTMLCOMMENT = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
128
    const PARTIAL_PROCESSINGINSTRUCTION = '[<][?].*?[?][>]';
129
    const PARTIAL_DECLARATION = '<![A-Z]+' . '\s+[^>]*>';
130
    const PARTIAL_CDATA = '<!\[CDATA\[[\s\S]*?]\]>';
131
    const PARTIAL_HTMLTAG = '(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . '|' . self::PARTIAL_HTMLCOMMENT . '|' .
132
        self::PARTIAL_PROCESSINGINSTRUCTION . '|' . self::PARTIAL_DECLARATION . '|' . self::PARTIAL_CDATA . ')';
133
    const PARTIAL_HTMLBLOCKOPEN = '<(?:' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s\/>]|$)' . '|' .
134
        '\/' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s>]|$)' . '|' . '[?!])';
135
    const PARTIAL_LINK_TITLE = '^(?:"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"' .
136
        '|' . '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'' .
137
        '|' . '\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\))';
138
139
    /** @deprecated Use PARTIAL_ESCAPABLE instead */
140
    const REGEX_ESCAPABLE = self::PARTIAL_ESCAPABLE;
141
142
    /** @deprecated Use PARTIAL_ENTITY instead */
143
    const REGEX_ENTITY = self::PARTIAL_ENTITY;
144
145
    const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\[email protected]\[\]\^_`\{\|\}~]/u';
146
    const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
147
    const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';
148
    const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/';
149
150
    const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/';
151
    const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/';
152
    const REGEX_UNICODE_WHITESPACE_CHAR = '/^\pZ|\s/u';
153
    const REGEX_THEMATIC_BREAK = '/^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$/';
154
    const REGEX_LINK_DESTINATION_BRACES = '/^(?:' . '[<](?:[^ <>\\t\\n\\\\\\x00]' . '|' . self::PARTIAL_ESCAPED_CHAR . '|' . '\\\\)*[>]' . ')/';
155
156
    /**
157
     * @param string $character
158
     *
159
     * @return bool
160
     */
161 96
    public static function isEscapable(string $character): bool
162
    {
163 96
        return preg_match('/' . self::PARTIAL_ESCAPABLE . '/', $character) === 1;
164
    }
165
166
    /**
167
     * Attempt to match a regex in string s at offset offset
168
     *
169
     * @param string $regex
170
     * @param string $string
171
     * @param int    $offset
172
     *
173
     * @return int|null Index of match, or null
174
     */
175 1776
    public static function matchAt(string $regex, string $string, int $offset = 0): ?int
176
    {
177 1776
        $matches = [];
178 1776
        $string = mb_substr($string, $offset, null, 'utf-8');
179 1776
        if (!preg_match($regex, $string, $matches, PREG_OFFSET_CAPTURE)) {
180 1716
            return null;
181
        }
182
183
        // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying
184 294
        $charPos = mb_strlen(mb_strcut($string, 0, $matches[0][1], 'utf-8'), 'utf-8');
185
186 294
        return $offset + $charPos;
187
    }
188
189
    /**
190
     * Functional wrapper around preg_match_all
191
     *
192
     * @param string $pattern
193
     * @param string $subject
194
     * @param int    $offset
195
     *
196
     * @return array|null
197
     */
198 1875
    public static function matchAll(string $pattern, string $subject, int $offset = 0): ?array
199
    {
200 1875
        $subject = substr($subject, $offset);
201 1875
        preg_match_all($pattern, $subject, $matches, PREG_PATTERN_ORDER);
202
203 1875
        $fullMatches = reset($matches);
204 1875
        if (empty($fullMatches)) {
205 1830
            return null;
206
        }
207
208 288
        if (count($fullMatches) === 1) {
209 288
            foreach ($matches as &$match) {
210 288
                $match = reset($match);
211
            }
212
        }
213
214 288
        if (!empty($matches)) {
215 288
            return $matches;
216
        }
217
218
        return null;
219
    }
220
221
    /**
222
     * Replace backslash escapes with literal characters
223
     *
224
     * @param string $string
225
     *
226
     * @return string
227
     */
228 498
    public static function unescape(string $string): string
229
    {
230 498
        $allEscapedChar = '/\\\\(' . self::PARTIAL_ESCAPABLE . ')/';
231
232 498
        $escaped = preg_replace($allEscapedChar, '$1', $string);
233
        $replaced = preg_replace_callback('/' . self::PARTIAL_ENTITY . '/i', function ($e) {
234 15
            return Html5Entities::decodeEntity($e[0]);
235 498
        }, $escaped);
236
237 498
        return $replaced;
238
    }
239
240
    /**
241
     * @param int $type HTML block type
242
     *
243
     * @return string|null
244
     */
245 279
    public static function getHtmlBlockOpenRegex(int $type): ?string
246
    {
247
        switch ($type) {
248 279
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
249 279
                return '/^<(?:script|pre|style)(?:\s|>|$)/i';
250 261
            case HtmlBlock::TYPE_2_COMMENT:
251 261
                return '/^<!--/';
252 246
            case HtmlBlock::TYPE_3:
253 246
                return '/^<[?]/';
254 243
            case HtmlBlock::TYPE_4:
255 243
                return '/^<![A-Z]/';
256 240
            case HtmlBlock::TYPE_5_CDATA:
257 240
                return '/^<!\[CDATA\[/';
258 237
            case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
259 237
                return '%^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)%i';
260 156
            case HtmlBlock::TYPE_7_MISC_ELEMENT:
261 156
                return '/^(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . ')\\s*$/i';
262
        }
263
264
        return null;
265
    }
266
267
    /**
268
     * @param int $type HTML block type
269
     *
270
     * @return string|null
271
     */
272 60
    public static function getHtmlBlockCloseRegex(int $type): ?string
273
    {
274
        switch ($type) {
275 60
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
276 36
                return '%<\/(?:script|pre|style)>%i';
277 24
            case HtmlBlock::TYPE_2_COMMENT:
278 15
                return '/-->/';
279 9
            case HtmlBlock::TYPE_3:
280 3
                return '/\?>/';
281 6
            case HtmlBlock::TYPE_4:
282 3
                return '/>/';
283 3
            case HtmlBlock::TYPE_5_CDATA:
284 3
                return '/\]\]>/';
285
        }
286
287
        return null;
288
    }
289
290
    /**
291
     * @param string $url
292
     *
293
     * @return bool
294
     */
295 33
    public static function isLinkPotentiallyUnsafe(string $url): bool
296
    {
297 33
        return preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) !== 0 && preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
298
    }
299
}
300