Completed
Push — master ( d2636b...e88807 )
by Colin
02:51
created

RegexHelper::unescape()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 11
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 11
ccs 7
cts 7
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 7
nc 1
nop 1
crap 1
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark package.
5
 *
6
 * (c) Colin O'Dell <[email protected]>
7
 *
8
 * Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
9
 *  - (c) John MacFarlane
10
 *
11
 * For the full copyright and license information, please view the LICENSE
12
 * file that was distributed with this source code.
13
 */
14
15
namespace League\CommonMark\Util;
16
17
use League\CommonMark\Block\Element\HtmlBlock;
18
19
/**
20
 * Provides regular expressions and utilities for parsing Markdown
21
 *
22
 * Singletons are generally bad, but it allows us to build the regexes once (and only once).
23
 */
24
class RegexHelper
25
{
26
    const ESCAPABLE = 0;
27
    const ESCAPED_CHAR = 1;
28
    const IN_DOUBLE_QUOTES = 2;
29
    const IN_SINGLE_QUOTES = 3;
30
    const IN_PARENS = 4;
31
    const REG_CHAR = 5;
32
    const IN_PARENS_NOSP = 6;
33
    const TAGNAME = 7;
34
    const BLOCKTAGNAME = 8;
35
    const ATTRIBUTENAME = 9;
36
    const UNQUOTEDVALUE = 10;
37
    const SINGLEQUOTEDVALUE = 11;
38
    const DOUBLEQUOTEDVALUE = 12;
39
    const ATTRIBUTEVALUE = 13;
40
    const ATTRIBUTEVALUESPEC = 14;
41
    const ATTRIBUTE = 15;
42
    const OPENTAG = 16;
43
    const CLOSETAG = 17;
44
    const OPENBLOCKTAG = 18;
45
    const CLOSEBLOCKTAG = 19;
46
    const HTMLCOMMENT = 20;
47
    const PROCESSINGINSTRUCTION = 21;
48
    const DECLARATION = 22;
49
    const CDATA = 23;
50
    const HTMLTAG = 24;
51
    const HTMLBLOCKOPEN = 25;
52
    const LINK_TITLE = 26;
53
54
    const REGEX_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]';
55
    const REGEX_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';
56
    const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u';
57
    const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
58
    const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';
59
    const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/';
60
61
    const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/';
62
    const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/';
63
    const REGEX_UNICODE_WHITESPACE_CHAR = '/^\pZ|\s/u';
64
65
    /**
66
     * @deprecated
67
     */
68
    const REGEX_UNICODE_WHITESPACE = '/\pZ|\s/u';
69
70
    protected $regex = [];
71
72
    protected static $instance;
73
74
    /**
75
     * Constructor
76
     */
77 3
    protected function __construct()
    {
        // Not public: inside this class, instances are created by getInstance().
        // All partial patterns are assembled once, at construction time.
        $this->buildRegexPatterns();
    }
81
82
    /**
83
     * @return RegexHelper
84
     */
85 1815
    public static function getInstance()
    {
        // Create the shared instance lazily, the first time it is requested
        if (!isset(self::$instance)) {
            self::$instance = new self();
        }

        return self::$instance;
    }
93
94
    /**
95
     * Builds the regular expressions required to parse Markdown
96
     *
97
     * We could hard-code them all as pre-built constants, but that would be more difficult to manage.
98
     */
99 3
    protected function buildRegexPatterns()
    {
        // Backslash escapes and the quoted / parenthesised spans built on them
        $escapable = self::REGEX_ESCAPABLE;
        $escapedChar = '\\\\' . $escapable;
        $inDoubleQuotes = '"(' . $escapedChar . '|[^"\x00])*"';
        $inSingleQuotes = '\'(' . $escapedChar . '|[^\'\x00])*\'';
        $inParens = '\\((' . $escapedChar . '|[^)\x00])*\\)';
        $regChar = '[^\\\\()\x00-\x20]';
        $inParensNoSp = '\((' . $regChar . '|' . $escapedChar . '|\\\\)*\)';

        // HTML tag and attribute pieces
        $tagName = '[A-Za-z][A-Za-z0-9-]*';
        $blockTagName = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';
        $attributeName = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
        $unquotedValue = '[^"\'=<>`\x00-\x20]+';
        $singleQuotedValue = '\'[^\']*\'';
        $doubleQuotedValue = '"[^"]*"';
        $attributeValue = '(?:' . implode('|', [$unquotedValue, $singleQuotedValue, $doubleQuotedValue]) . ')';
        $attributeValueSpec = '(?:\s*=\s*' . $attributeValue . ')';
        $attribute = '(?:\s+' . $attributeName . $attributeValueSpec . '?)';

        // Complete tags (generic and block-level variants)
        $openTag = '<' . $tagName . $attribute . '*\s*\/?>';
        $closeTag = '<\/' . $tagName . '\s*[>]';
        $openBlockTag = '<' . $blockTagName . $attribute . '*\s*\/?>';
        $closeBlockTag = '<\/' . $blockTagName . '\s*[>]';

        // Other raw-HTML constructs
        $htmlComment = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
        $processingInstruction = '[<][?].*?[?][>]';
        $declaration = '<![A-Z]+\s+[^>]*>';
        $cdata = '<!\[CDATA\[[\s\S]*?]\]>';

        $htmlTag = '(?:' . implode('|', [
            $openTag,
            $closeTag,
            $htmlComment,
            $processingInstruction,
            $declaration,
            $cdata,
        ]) . ')';

        $htmlBlockOpen = '<(?:' . $blockTagName . '(?:[\s\/>]|$)'
            . '|\/' . $blockTagName . '(?:[\s>]|$)'
            . '|[?!])';

        // A link title is any one of the three delimited spans defined above
        $linkTitle = '^(?:' . implode('|', [$inDoubleQuotes, $inSingleQuotes, $inParens]) . ')';

        // Keys are listed in ascending constant order, matching the order of insertion used before
        $this->regex = [
            self::ESCAPABLE             => $escapable,
            self::ESCAPED_CHAR          => $escapedChar,
            self::IN_DOUBLE_QUOTES      => $inDoubleQuotes,
            self::IN_SINGLE_QUOTES      => $inSingleQuotes,
            self::IN_PARENS             => $inParens,
            self::REG_CHAR              => $regChar,
            self::IN_PARENS_NOSP        => $inParensNoSp,
            self::TAGNAME               => $tagName,
            self::BLOCKTAGNAME          => $blockTagName,
            self::ATTRIBUTENAME         => $attributeName,
            self::UNQUOTEDVALUE         => $unquotedValue,
            self::SINGLEQUOTEDVALUE     => $singleQuotedValue,
            self::DOUBLEQUOTEDVALUE     => $doubleQuotedValue,
            self::ATTRIBUTEVALUE        => $attributeValue,
            self::ATTRIBUTEVALUESPEC    => $attributeValueSpec,
            self::ATTRIBUTE             => $attribute,
            self::OPENTAG               => $openTag,
            self::CLOSETAG              => $closeTag,
            self::OPENBLOCKTAG          => $openBlockTag,
            self::CLOSEBLOCKTAG         => $closeBlockTag,
            self::HTMLCOMMENT           => $htmlComment,
            self::PROCESSINGINSTRUCTION => $processingInstruction,
            self::DECLARATION           => $declaration,
            self::CDATA                 => $cdata,
            self::HTMLTAG               => $htmlTag,
            self::HTMLBLOCKOPEN         => $htmlBlockOpen,
            self::LINK_TITLE            => $linkTitle,
        ];
    }
136
137
    /**
138
     * Returns a partial regex
139
     *
140
     * It'll need to be wrapped with /.../ before use
141
     *
142
     * @param int $const
143
     *
144
     * @return string
145
     */
146 615
    public function getPartialRegex($const)
    {
        // $const is one of the integer pattern keys declared on this class.
        // The returned fragment has no delimiters or flags.
        $fragment = $this->regex[$const];

        return $fragment;
    }
150
151
    /**
152
     * @return string
153
     */
154 123
    public function getHtmlTagRegex()
    {
        // Anchor the HTMLTAG fragment at the start of the subject, case-insensitively
        return sprintf('/^%s/i', $this->regex[self::HTMLTAG]);
    }
158
159
    /**
160
     * @return string
161
     */
162 270
    public function getLinkTitleRegex()
    {
        // The LINK_TITLE fragment already begins with '^'; only delimiters are added here
        return sprintf('/%s/', $this->regex[self::LINK_TITLE]);
    }
166
167
    /**
168
     * @return string
169
     *
170
     * @deprecated
171
     */
172
    public function getLinkDestinationRegex()
    {
        // Deprecated: emit a (suppressed) deprecation notice on every call
        @trigger_error('RegexHelper::getLinkDestinationRegex() is no longer used and will be removed in a future 0.x release.', E_USER_DEPRECATED);

        // Each alternative is one unit the destination may be built from
        $alternatives = [
            $this->regex[self::REG_CHAR] . '+',
            $this->regex[self::ESCAPED_CHAR],
            '\\\\',
            $this->regex[self::IN_PARENS_NOSP],
        ];

        return '/^(?:' . implode('|', $alternatives) . ')*/';
    }
178
179
    /**
180
     * @return string
181
     */
182 387
    public function getLinkDestinationBracesRegex()
    {
        // What may appear between '<' and '>': an ordinary character, a
        // backslash escape, or a lone backslash
        $inner = implode('|', [
            '[^ <>\\t\\n\\\\\\x00]',
            $this->regex[self::ESCAPED_CHAR],
            '\\\\',
        ]);

        return '/^(?:[<](?:' . $inner . ')*[>])/';
    }
186
187
    /**
188
     * @return string
189
     */
190 1689
    public function getThematicBreakRegex()
    {
        // Three or more of the same marker (*, _ or -), each optionally
        // followed by spaces/tabs, and nothing else on the line
        $runs = [
            '(?:\*[ \t]*){3,}',
            '(?:_[ \t]*){3,}',
            '(?:-[ \t]*){3,}',
        ];

        return '/^(?:' . implode('|', $runs) . ')[ \t]*$/';
    }
194
195
    /**
196
     * Attempt to match a regex in string s at offset offset
197
     *
198
     * @param string $regex
199
     * @param string $string
200
     * @param int    $offset
201
     *
202
     * @return int|null Index of match, or null
203
     */
204 1776
    public static function matchAt($regex, $string, $offset = 0)
    {
        // Only the part of the subject from character $offset onwards is searched
        $tail = mb_substr($string, $offset, null, 'utf-8');

        $found = [];
        if (!preg_match($regex, $tail, $found, PREG_OFFSET_CAPTURE)) {
            return null;
        }

        // PREG_OFFSET_CAPTURE reports a byte offset; count the characters in
        // the bytes preceding the match to turn it into a character offset
        $byteOffset = $found[0][1];
        $prefix = mb_strcut($tail, 0, $byteOffset, 'utf-8');

        return $offset + mb_strlen($prefix, 'utf-8');
    }
217
218
    /**
219
     * Functional wrapper around preg_match_all
220
     *
221
     * @param string $pattern
222
     * @param string $subject
223
     * @param int    $offset
224
     *
225
     * @return array|null
226
     */
227 1875
    public static function matchAll($pattern, $subject, $offset = 0)
    {
        // Runs preg_match_all() on $subject starting at $offset and returns:
        //  - null when the pattern does not match at all;
        //  - for exactly one match, a flat array: [fullMatch, group1, ...];
        //  - for several matches, the PREG_PATTERN_ORDER array of arrays.
        $matches = [];

        // substr() is byte-based, so $offset is a byte offset here
        $subject = substr($subject, $offset);
        preg_match_all($pattern, $subject, $matches, PREG_PATTERN_ORDER);

        $fullMatches = reset($matches);
        if (empty($fullMatches)) {
            return null;
        }

        if (count($fullMatches) === 1) {
            // Assign by key rather than by reference so no reference to the
            // last element outlives the loop
            foreach ($matches as $group => $captures) {
                $matches[$group] = reset($captures);
            }
        }

        // $matches cannot be empty here: its first element was just verified
        return $matches;
    }
248
249
    /**
250
     * Replace backslash escapes with literal characters
251
     *
252
     * @param string $string
253
     *
254
     * @return string
255
     */
256 498
    public static function unescape($string)
    {
        // Backslash escapes and entities are handled in ONE pass. Doing them in
        // two passes would first turn "\&amp;" into "&amp;" and then decode
        // that into "&", although the backslash was meant to keep the "&"
        // literal. With a single alternation the escaped "&" is consumed
        // together with its backslash and is never re-read as an entity start.
        $escapedCharOrEntity = '/\\\\(' . self::REGEX_ESCAPABLE . ')|' . self::REGEX_ENTITY . '/i';

        return preg_replace_callback($escapedCharOrEntity, function ($match) {
            // A match starting with a backslash is an escape: keep only the
            // escaped character (capture group 1)
            if ($match[0][0] === '\\') {
                return $match[1];
            }

            // Otherwise the match is an entity reference
            return Html5Entities::decodeEntity($match[0]);
        }, $string);
    }
267
268
    /**
269
     * @param int $type HTML block type
270
     *
271
     * @return string|null
272
     */
273 279
    public static function getHtmlBlockOpenRegex($type)
    {
        // Stays null when $type is not one of the HtmlBlock types handled below
        $pattern = null;

        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                $pattern = '/^<(?:script|pre|style)(?:\s|>|$)/i';
                break;
            case HtmlBlock::TYPE_2_COMMENT:
                $pattern = '/^<!--/';
                break;
            case HtmlBlock::TYPE_3:
                $pattern = '/^<[?]/';
                break;
            case HtmlBlock::TYPE_4:
                $pattern = '/^<![A-Z]/';
                break;
            case HtmlBlock::TYPE_5_CDATA:
                $pattern = '/^<!\[CDATA\[/';
                break;
            case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
                $pattern = '%^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)%i';
                break;
            case HtmlBlock::TYPE_7_MISC_ELEMENT:
                // Only this type needs the pre-built partials, so the shared
                // instance is fetched here and nowhere else
                $helper = self::getInstance();
                $openTag = $helper->getPartialRegex(self::OPENTAG);
                $closeTag = $helper->getPartialRegex(self::CLOSETAG);
                $pattern = '/^(?:' . $openTag . '|' . $closeTag . ')\\s*$/i';
                break;
        }

        return $pattern;
    }
294
295
    /**
296
     * @param int $type HTML block type
297
     *
298
     * @return string|null
299
     */
300 60
    public static function getHtmlBlockCloseRegex($type)
    {
        // Stays null for any type without a case below
        $pattern = null;

        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                $pattern = '%<\/(?:script|pre|style)>%i';
                break;
            case HtmlBlock::TYPE_2_COMMENT:
                $pattern = '/-->/';
                break;
            case HtmlBlock::TYPE_3:
                $pattern = '/\?>/';
                break;
            case HtmlBlock::TYPE_4:
                $pattern = '/>/';
                break;
            case HtmlBlock::TYPE_5_CDATA:
                $pattern = '/\]\]>/';
                break;
        }

        return $pattern;
    }
315
316
    /**
317
     * @param string $url
318
     *
319
     * @return bool
320
     */
321 30
    public static function isLinkPotentiallyUnsafe($url)
    {
        // NOTE(review): in REGEX_UNSAFE_PROTOCOL the '^' anchor binds only to
        // the first alternative (javascript:); the other protocols match
        // anywhere in the URL. Confirm this is intended before changing it.
        if (preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) === 0) {
            return false;
        }

        // Flagged by the unsafe pattern: still acceptable when it is one of
        // the allowed data:image/* types
        return preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
    }
325
}
326