Completed
Push — master ( f73407...9eba7f )
by Colin
02:44
created

RegexHelper::getHtmlBlockOpenRegex()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 21

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 8.0155

Importance

Changes 0
Metric Value
dl 0
loc 21
ccs 15
cts 16
cp 0.9375
rs 8.4444
c 0
b 0
f 0
cc 8
nc 8
nop 1
crap 8.0155
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark package.
5
 *
6
 * (c) Colin O'Dell <[email protected]>
7
 *
8
 * Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
9
 *  - (c) John MacFarlane
10
 *
11
 * For the full copyright and license information, please view the LICENSE
12
 * file that was distributed with this source code.
13
 */
14
15
namespace League\CommonMark\Util;
16
17
use League\CommonMark\Block\Element\HtmlBlock;
18
19
/**
 * Provides regular expressions and utilities for parsing Markdown.
 *
 * The PARTIAL_* constants are regex fragments without delimiters; the REGEX_*
 * constants (other than the deprecated aliases) are complete, delimited patterns.
 */
final class RegexHelper
{
    /** @deprecated Use PARTIAL_ESCAPABLE instead */
    public const ESCAPABLE = 0;

    /** @deprecated Use PARTIAL_ESCAPED_CHAR instead */
    public const ESCAPED_CHAR = 1;

    /** @deprecated Use PARTIAL_IN_DOUBLE_QUOTES instead */
    public const IN_DOUBLE_QUOTES = 2;

    /** @deprecated Use PARTIAL_IN_SINGLE_QUOTES instead */
    public const IN_SINGLE_QUOTES = 3;

    /** @deprecated Use PARTIAL_IN_PARENS instead */
    public const IN_PARENS = 4;

    /** @deprecated Use PARTIAL_REG_CHAR instead */
    public const REG_CHAR = 5;

    /** @deprecated Use PARTIAL_IN_PARENS_NOSP instead */
    public const IN_PARENS_NOSP = 6;

    /** @deprecated Use PARTIAL_TAGNAME instead */
    public const TAGNAME = 7;

    /** @deprecated Use PARTIAL_BLOCKTAGNAME instead */
    public const BLOCKTAGNAME = 8;

    /** @deprecated Use PARTIAL_ATTRIBUTENAME instead */
    public const ATTRIBUTENAME = 9;

    /** @deprecated Use PARTIAL_UNQUOTEDVALUE instead */
    public const UNQUOTEDVALUE = 10;

    /** @deprecated Use PARTIAL_SINGLEQUOTEDVALUE instead */
    public const SINGLEQUOTEDVALUE = 11;

    /** @deprecated Use PARTIAL_DOUBLEQUOTEDVALUE instead */
    public const DOUBLEQUOTEDVALUE = 12;

    /** @deprecated Use PARTIAL_ATTRIBUTEVALUE instead */
    public const ATTRIBUTEVALUE = 13;

    /** @deprecated Use PARTIAL_ATTRIBUTEVALUESPEC instead */
    public const ATTRIBUTEVALUESPEC = 14;

    /** @deprecated Use PARTIAL_ATTRIBUTE instead */
    public const ATTRIBUTE = 15;

    /** @deprecated Use PARTIAL_OPENTAG instead */
    public const OPENTAG = 16;

    /** @deprecated Use PARTIAL_CLOSETAG instead */
    public const CLOSETAG = 17;

    /** @deprecated Use PARTIAL_OPENBLOCKTAG instead */
    public const OPENBLOCKTAG = 18;

    /** @deprecated Use PARTIAL_CLOSEBLOCKTAG instead */
    public const CLOSEBLOCKTAG = 19;

    /** @deprecated Use PARTIAL_HTMLCOMMENT instead */
    public const HTMLCOMMENT = 20;

    /** @deprecated Use PARTIAL_PROCESSINGINSTRUCTION instead */
    public const PROCESSINGINSTRUCTION = 21;

    /** @deprecated Use PARTIAL_DECLARATION instead */
    public const DECLARATION = 22;

    /** @deprecated Use PARTIAL_CDATA instead */
    public const CDATA = 23;

    /** @deprecated Use PARTIAL_HTMLTAG instead */
    public const HTMLTAG = 24;

    /** @deprecated Use PARTIAL_HTMLBLOCKOPEN instead */
    public const HTMLBLOCKOPEN = 25;

    /** @deprecated Use PARTIAL_LINK_TITLE instead */
    public const LINK_TITLE = 26;

    // Partial regular expressions (wrap with `/` on each side before use)
    public const PARTIAL_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';
    public const PARTIAL_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]';
    public const PARTIAL_ESCAPED_CHAR = '\\\\' . self::PARTIAL_ESCAPABLE;
    public const PARTIAL_IN_DOUBLE_QUOTES = '"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"';
    public const PARTIAL_IN_SINGLE_QUOTES = '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'';
    public const PARTIAL_IN_PARENS = '\\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\\)';
    public const PARTIAL_REG_CHAR = '[^\\\\()\x00-\x20]';
    public const PARTIAL_IN_PARENS_NOSP = '\((' . self::PARTIAL_REG_CHAR . '|' . self::PARTIAL_ESCAPED_CHAR . '|\\\\)*\)';
    public const PARTIAL_TAGNAME = '[A-Za-z][A-Za-z0-9-]*';
    // Block-level tag names. Covers h1 through h6 (previously only h1 was listed here, which
    // disagreed with the list used by getHtmlBlockOpenRegex()) and lists `title` once.
    public const PARTIAL_BLOCKTAGNAME = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[1-6]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';
    public const PARTIAL_ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
    public const PARTIAL_UNQUOTEDVALUE = '[^"\'=<>`\x00-\x20]+';
    public const PARTIAL_SINGLEQUOTEDVALUE = '\'[^\']*\'';
    public const PARTIAL_DOUBLEQUOTEDVALUE = '"[^"]*"';
    public const PARTIAL_ATTRIBUTEVALUE = '(?:' . self::PARTIAL_UNQUOTEDVALUE . '|' . self::PARTIAL_SINGLEQUOTEDVALUE . '|' . self::PARTIAL_DOUBLEQUOTEDVALUE . ')';
    public const PARTIAL_ATTRIBUTEVALUESPEC = '(?:' . '\s*=' . '\s*' . self::PARTIAL_ATTRIBUTEVALUE . ')';
    public const PARTIAL_ATTRIBUTE = '(?:' . '\s+' . self::PARTIAL_ATTRIBUTENAME . self::PARTIAL_ATTRIBUTEVALUESPEC . '?)';
    public const PARTIAL_OPENTAG = '<' . self::PARTIAL_TAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
    public const PARTIAL_CLOSETAG = '<\/' . self::PARTIAL_TAGNAME . '\s*[>]';
    public const PARTIAL_OPENBLOCKTAG = '<' . self::PARTIAL_BLOCKTAGNAME . self::PARTIAL_ATTRIBUTE . '*' . '\s*\/?>';
    public const PARTIAL_CLOSEBLOCKTAG = '<\/' . self::PARTIAL_BLOCKTAGNAME . '\s*[>]';
    public const PARTIAL_HTMLCOMMENT = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
    public const PARTIAL_PROCESSINGINSTRUCTION = '[<][?].*?[?][>]';
    public const PARTIAL_DECLARATION = '<![A-Z]+' . '\s+[^>]*>';
    public const PARTIAL_CDATA = '<!\[CDATA\[[\s\S]*?]\]>';
    public const PARTIAL_HTMLTAG = '(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . '|' . self::PARTIAL_HTMLCOMMENT . '|' .
        self::PARTIAL_PROCESSINGINSTRUCTION . '|' . self::PARTIAL_DECLARATION . '|' . self::PARTIAL_CDATA . ')';
    public const PARTIAL_HTMLBLOCKOPEN = '<(?:' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s\/>]|$)' . '|' .
        '\/' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s>]|$)' . '|' . '[?!])';
    public const PARTIAL_LINK_TITLE = '^(?:"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"' .
        '|' . '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'' .
        '|' . '\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\))';

    /** @deprecated Use PARTIAL_ESCAPABLE instead */
    public const REGEX_ESCAPABLE = self::PARTIAL_ESCAPABLE;

    /** @deprecated Use PARTIAL_ENTITY instead */
    public const REGEX_ENTITY = self::PARTIAL_ENTITY;

    public const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u';
    // NOTE(review): `^` binds only to the first alternative (`javascript:`); `vbscript:`,
    // `file:` and `data:` match anywhere in the subject. Left unchanged so that the set of
    // URLs flagged by isLinkPotentiallyUnsafe() stays exactly as before.
    public const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
    public const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';
    public const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/';

    public const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/';
    public const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/';
    // NOTE(review): `^` binds only to `\pZ`; the `\s` alternative is unanchored. This is
    // equivalent to an anchored pattern only when the subject is a single character.
    public const REGEX_UNICODE_WHITESPACE_CHAR = '/^\pZ|\s/u';
    public const REGEX_THEMATIC_BREAK = '/^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$/';
    public const REGEX_LINK_DESTINATION_BRACES = '/^(?:' . '[<](?:[^ <>\\t\\n\\\\\\x00]' . '|' . self::PARTIAL_ESCAPED_CHAR . '|' . '\\\\)*[>]' . ')/';

    /**
     * Checks whether the given string contains a backslash-escapable character.
     *
     * The pattern is not anchored, so any occurrence of an escapable character
     * within $character yields true.
     *
     * @param string $character
     *
     * @return bool
     */
    public static function isEscapable(string $character): bool
    {
        return preg_match('/' . self::PARTIAL_ESCAPABLE . '/', $character) === 1;
    }

    /**
     * Attempt to match a regex in a string, starting at a character offset.
     *
     * @param string $regex  Complete (delimited) pattern
     * @param string $string UTF-8 subject
     * @param int    $offset Offset in characters (not bytes) at which to start searching
     *
     * @return int|null Character index of the match within the original string, or null
     */
    public static function matchAt(string $regex, string $string, int $offset = 0): ?int
    {
        $matches = [];
        $string = mb_substr($string, $offset, null, 'utf-8');
        if (!preg_match($regex, $string, $matches, PREG_OFFSET_CAPTURE)) {
            return null;
        }

        // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, so the
        // prefix up to the match is cut by bytes and then measured in characters.
        $charPos = mb_strlen(mb_strcut($string, 0, $matches[0][1], 'utf-8'), 'utf-8');

        return $offset + $charPos;
    }

    /**
     * Functional wrapper around preg_match_all
     *
     * When the pattern matches exactly once, every group is flattened to its single
     * value (e.g. `['ab', 'b']`); otherwise the PREG_PATTERN_ORDER structure is returned.
     *
     * @param string $pattern Complete (delimited) pattern
     * @param string $subject
     * @param int    $offset  Offset in bytes (applied with substr) at which to start searching
     *
     * @return array|null Matches, or null when nothing matched
     */
    public static function matchAll(string $pattern, string $subject, int $offset = 0): ?array
    {
        if ($offset !== 0) {
            $subject = substr($subject, $offset);
        }

        preg_match_all($pattern, $subject, $matches, PREG_PATTERN_ORDER);

        $fullMatches = reset($matches);
        if (empty($fullMatches)) {
            return null;
        }

        if (count($fullMatches) === 1) {
            // With PREG_PATTERN_ORDER every group holds one entry per overall match, so
            // index 0 is the only element here. array_map keeps the group keys (including
            // named groups) and avoids leaving a by-reference loop variable behind.
            $matches = array_map(static function (array $group) {
                return $group[0];
            }, $matches);
        }

        return $matches ?: null;
    }

    /**
     * Replace backslash escapes with literal characters
     *
     * After removing the escaping backslashes, entities matching PARTIAL_ENTITY
     * (case-insensitively) are passed to Html5Entities::decodeEntity().
     *
     * @param string $string
     *
     * @return string
     */
    public static function unescape(string $string): string
    {
        $allEscapedChar = '/\\\\(' . self::PARTIAL_ESCAPABLE . ')/';

        $escaped = preg_replace($allEscapedChar, '$1', $string);
        $replaced = preg_replace_callback('/' . self::PARTIAL_ENTITY . '/i', static function ($e) {
            return Html5Entities::decodeEntity($e[0]);
        }, $escaped);

        return $replaced;
    }

    /**
     * Returns the pattern that recognises the opening line of an HTML block.
     *
     * @param int $type HTML block type (one of the HtmlBlock::TYPE_* constants)
     *
     * @return string|null Complete (delimited) pattern, or null for an unknown type
     */
    public static function getHtmlBlockOpenRegex(int $type): ?string
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '/^<(?:script|pre|style)(?:\s|>|$)/i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/^<!--/';
            case HtmlBlock::TYPE_3:
                return '/^<[?]/';
            case HtmlBlock::TYPE_4:
                return '/^<![A-Z]/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/^<!\[CDATA\[/';
            case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
                // Reuses the shared tag list instead of repeating it; `%` is the delimiter
                // so the `/` characters in the pattern need no escaping.
                return '%^<[/]?' . self::PARTIAL_BLOCKTAGNAME . '(?:\s|[/]?[>]|$)%i';
            case HtmlBlock::TYPE_7_MISC_ELEMENT:
                return '/^(?:' . self::PARTIAL_OPENTAG . '|' . self::PARTIAL_CLOSETAG . ')\\s*$/i';
        }

        return null;
    }

    /**
     * Returns the pattern that recognises the end condition of an HTML block.
     *
     * Only types 1 through 5 have a closing pattern here; every other type yields null.
     *
     * @param int $type HTML block type (one of the HtmlBlock::TYPE_* constants)
     *
     * @return string|null Complete (delimited) pattern, or null when the type has none
     */
    public static function getHtmlBlockCloseRegex(int $type): ?string
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '%<\/(?:script|pre|style)>%i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/-->/';
            case HtmlBlock::TYPE_3:
                return '/\?>/';
            case HtmlBlock::TYPE_4:
                return '/>/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/\]\]>/';
        }

        return null;
    }

    /**
     * Checks whether a URL matches REGEX_UNSAFE_PROTOCOL without also matching
     * REGEX_SAFE_DATA_PROTOCOL (which exempts `data:image/` png, gif, jpeg and webp URLs).
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isLinkPotentiallyUnsafe(string $url): bool
    {
        return preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) !== 0 && preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
    }
}
299