Completed
Push — master ( 6c85d2...15e8d5 )
by Colin
04:57
created

RegexHelper::getThematicBreakRegex()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
/*
 * This file is part of the league/commonmark package.
 *
 * (c) Colin O'Dell <colinodell@gmail.com>
 *
 * Original code based on the CommonMark JS reference parser (http://bitly.com/commonmark-js)
 *  - (c) John MacFarlane
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
14
15
namespace League\CommonMark\Util;
16
17
use League\CommonMark\Block\Element\HtmlBlock;
18
19
/**
 * Provides regular expressions and utilities for parsing Markdown
 *
 * Singletons are generally bad, but it allows us to build the regexes once (and only once).
 */
class RegexHelper
{
    // Keys into the partial-regex table populated by buildRegexPatterns().
    // Pass one of these to getPartialRegex().
    const ESCAPABLE = 0;
    const ESCAPED_CHAR = 1;
    const IN_DOUBLE_QUOTES = 2;
    const IN_SINGLE_QUOTES = 3;
    const IN_PARENS = 4;
    const REG_CHAR = 5;
    const IN_PARENS_NOSP = 6;
    const TAGNAME = 7;
    const BLOCKTAGNAME = 8;
    const ATTRIBUTENAME = 9;
    const UNQUOTEDVALUE = 10;
    const SINGLEQUOTEDVALUE = 11;
    const DOUBLEQUOTEDVALUE = 12;
    const ATTRIBUTEVALUE = 13;
    const ATTRIBUTEVALUESPEC = 14;
    const ATTRIBUTE = 15;
    const OPENTAG = 16;
    const CLOSETAG = 17;
    const OPENBLOCKTAG = 18;
    const CLOSEBLOCKTAG = 19;
    const HTMLCOMMENT = 20;
    const PROCESSINGINSTRUCTION = 21;
    const DECLARATION = 22;
    const CDATA = 23;
    const HTMLTAG = 24;
    const HTMLBLOCKOPEN = 25;
    const LINK_TITLE = 26;

    // Partial patterns (no delimiters): wrap them before handing them to preg_*.
    const REGEX_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]';
    const REGEX_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';

    // Complete patterns (delimiters and flags included).
    const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u';
    // NOTE(review): only the "javascript:" alternative is anchored by "^"; the
    // other three match anywhere in the subject. Left unchanged on purpose,
    // because tightening it would alter which URLs are reported as unsafe.
    const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
    const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';

    /**
     * Partial regexes, indexed by the integer constants declared above.
     */
    protected $regex = [];

    /**
     * Lazily-created shared instance returned by getInstance().
     */
    protected static $instance;

    /**
     * Constructor
     *
     * Protected so that instances are obtained through getInstance().
     */
    protected function __construct()
    {
        $this->buildRegexPatterns();
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return RegexHelper
     */
    public static function getInstance()
    {
        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Builds the regular expressions required to parse Markdown
     *
     * We could hard-code them all as pre-built constants, but that would be more difficult to manage.
     *
     * Later entries are composed from earlier ones, so the assignment order matters.
     */
    protected function buildRegexPatterns()
    {
        $regex = [];
        $regex[self::ESCAPABLE] = self::REGEX_ESCAPABLE;
        $regex[self::ESCAPED_CHAR] = '\\\\' . $regex[self::ESCAPABLE];
        $regex[self::IN_DOUBLE_QUOTES] = '"(' . $regex[self::ESCAPED_CHAR] . '|[^"\x00])*"';
        $regex[self::IN_SINGLE_QUOTES] = '\'(' . $regex[self::ESCAPED_CHAR] . '|[^\'\x00])*\'';
        $regex[self::IN_PARENS] = '\\((' . $regex[self::ESCAPED_CHAR] . '|[^)\x00])*\\)';
        $regex[self::REG_CHAR] = '[^\\\\()\x00-\x20]';
        $regex[self::IN_PARENS_NOSP] = '\((' . $regex[self::REG_CHAR] . '|' . $regex[self::ESCAPED_CHAR] . '|\\\\)*\)';
        $regex[self::TAGNAME] = '[A-Za-z][A-Za-z0-9-]*';
        $regex[self::BLOCKTAGNAME] = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';
        $regex[self::ATTRIBUTENAME] = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
        $regex[self::UNQUOTEDVALUE] = '[^"\'=<>`\x00-\x20]+';
        $regex[self::SINGLEQUOTEDVALUE] = '\'[^\']*\'';
        $regex[self::DOUBLEQUOTEDVALUE] = '"[^"]*"';
        $regex[self::ATTRIBUTEVALUE] = '(?:' . $regex[self::UNQUOTEDVALUE] . '|' . $regex[self::SINGLEQUOTEDVALUE] . '|' . $regex[self::DOUBLEQUOTEDVALUE] . ')';
        $regex[self::ATTRIBUTEVALUESPEC] = '(?:\s*=\s*' . $regex[self::ATTRIBUTEVALUE] . ')';
        $regex[self::ATTRIBUTE] = '(?:\s+' . $regex[self::ATTRIBUTENAME] . $regex[self::ATTRIBUTEVALUESPEC] . '?)';
        $regex[self::OPENTAG] = '<' . $regex[self::TAGNAME] . $regex[self::ATTRIBUTE] . '*\s*\/?>';
        $regex[self::CLOSETAG] = '<\/' . $regex[self::TAGNAME] . '\s*[>]';
        $regex[self::OPENBLOCKTAG] = '<' . $regex[self::BLOCKTAGNAME] . $regex[self::ATTRIBUTE] . '*\s*\/?>';
        $regex[self::CLOSEBLOCKTAG] = '<\/' . $regex[self::BLOCKTAGNAME] . '\s*[>]';
        $regex[self::HTMLCOMMENT] = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
        $regex[self::PROCESSINGINSTRUCTION] = '[<][?].*?[?][>]';
        $regex[self::DECLARATION] = '<![A-Z]+\s+[^>]*>';
        $regex[self::CDATA] = '<!\[CDATA\[[\s\S]*?]\]>';
        $regex[self::HTMLTAG] = '(?:' . $regex[self::OPENTAG] . '|' . $regex[self::CLOSETAG] . '|' . $regex[self::HTMLCOMMENT] . '|' .
            $regex[self::PROCESSINGINSTRUCTION] . '|' . $regex[self::DECLARATION] . '|' . $regex[self::CDATA] . ')';
        $regex[self::HTMLBLOCKOPEN] = '<(?:' . $regex[self::BLOCKTAGNAME] . '(?:[\s\/>]|$)|' .
            '\/' . $regex[self::BLOCKTAGNAME] . '(?:[\s>]|$)|[?!])';
        // A link title is one of the three quoted forms, anchored at the start.
        // Composed from the fragments above instead of spelling them out again;
        // the resulting pattern text is identical.
        $regex[self::LINK_TITLE] = '^(?:' . $regex[self::IN_DOUBLE_QUOTES] .
            '|' . $regex[self::IN_SINGLE_QUOTES] .
            '|' . $regex[self::IN_PARENS] . ')';

        $this->regex = $regex;
    }

    /**
     * Returns a partial regex
     *
     * It'll need to be wrapped with /.../ before use
     *
     * @param int $const One of the integer constants declared on this class
     *
     * @return string
     */
    public function getPartialRegex($const)
    {
        return $this->regex[$const];
    }

    /**
     * Returns the complete, start-anchored, case-insensitive HTML tag pattern.
     *
     * @return string
     */
    public function getHtmlTagRegex()
    {
        return '/^' . $this->regex[self::HTMLTAG] . '/i';
    }

    /**
     * Returns the complete link title pattern (the partial is already start-anchored).
     *
     * @return string
     */
    public function getLinkTitleRegex()
    {
        return '/' . $this->regex[self::LINK_TITLE] . '/';
    }

    /**
     * Returns the complete, start-anchored pattern for a link destination that
     * is not wrapped in angle brackets.
     *
     * @return string
     */
    public function getLinkDestinationRegex()
    {
        return '/^(?:' . $this->regex[self::REG_CHAR] . '+|' . $this->regex[self::ESCAPED_CHAR] . '|\\\\|' . $this->regex[self::IN_PARENS_NOSP] . ')*/';
    }

    /**
     * Returns the complete, start-anchored pattern for a link destination
     * wrapped in angle brackets (<...>).
     *
     * @return string
     */
    public function getLinkDestinationBracesRegex()
    {
        return '/^(?:[<](?:[^<>\\n\\\\\\x00]|' . $this->regex[self::ESCAPED_CHAR] . '|\\\\)*[>])/';
    }

    /**
     * Returns the complete pattern matching a whole thematic-break line:
     * three or more of the same marker (*, _ or -), optionally separated by spaces.
     *
     * @return string
     */
    public function getThematicBreakRegex()
    {
        return '/^(?:(?:\* *){3,}|(?:_ *){3,}|(?:- *){3,}) *$/';
    }

    /**
     * Attempt to match a regex in string s at offset offset
     *
     * Both the offset and the returned index are counted in UTF-8 characters,
     * not bytes.
     *
     * @param string $regex
     * @param string $string
     * @param int    $offset
     *
     * @return int|null Index of match, or null
     */
    public static function matchAt($regex, $string, $offset = 0)
    {
        $matches = [];
        $string = mb_substr($string, $offset, null, 'utf-8');
        if (!preg_match($regex, $string, $matches, PREG_OFFSET_CAPTURE)) {
            return null;
        }

        // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying
        $charPos = mb_strlen(mb_strcut($string, 0, $matches[0][1], 'utf-8'), 'utf-8');

        return $offset + $charPos;
    }

    /**
     * Functional wrapper around preg_match_all
     *
     * When the pattern matches exactly once, every group is collapsed to its
     * single string, so the result looks like that of preg_match(). With several
     * matches the PREG_PATTERN_ORDER structure is returned unchanged.
     *
     * @param string $pattern
     * @param string $subject
     * @param int    $offset  Start position, applied with substr() (i.e. in bytes)
     *
     * @return array|null Matches, or null when the pattern does not match
     */
    public static function matchAll($pattern, $subject, $offset = 0)
    {
        $matches = [];
        $subject = substr($subject, $offset);
        preg_match_all($pattern, $subject, $matches, PREG_PATTERN_ORDER);

        $fullMatches = reset($matches);
        if (empty($fullMatches)) {
            return null;
        }

        if (count($fullMatches) === 1) {
            // Index by key rather than iterating by reference, so no reference
            // is left bound to the last element of the returned array.
            foreach ($matches as $group => $values) {
                $matches[$group] = reset($values);
            }
        }

        return $matches;
    }

    /**
     * Replace backslash escapes with literal characters
     *
     * Backslash escapes are removed first; entities matching REGEX_ENTITY are
     * then passed to Html5Entities::decodeEntity().
     *
     * @param string $string
     *
     * @return string
     */
    public static function unescape($string)
    {
        $allEscapedChar = '/\\\\(' . self::REGEX_ESCAPABLE . ')/';

        $escaped = preg_replace($allEscapedChar, '$1', $string);

        return preg_replace_callback('/' . self::REGEX_ENTITY . '/i', function ($e) {
            return Html5Entities::decodeEntity($e[0]);
        }, $escaped);
    }

    /**
     * Returns the pattern that recognises the opening line of an HTML block.
     *
     * @param int $type HTML block type
     *
     * @return string|null Null when $type is not one of the HtmlBlock::TYPE_* cases handled here
     */
    public static function getHtmlBlockOpenRegex($type)
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '/^<(?:script|pre|style)(?:\s|>|$)/i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/^<!--/';
            case HtmlBlock::TYPE_3:
                return '/^<[?]/';
            case HtmlBlock::TYPE_4:
                return '/^<![A-Z]/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/^<!\[CDATA\[/';
            case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
                return '%^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|pre|section|source|title|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)%i';
            case HtmlBlock::TYPE_7_MISC_ELEMENT:
                $self = self::getInstance();

                return '/^(?:' . $self->getPartialRegex(self::OPENTAG) . '|' . $self->getPartialRegex(self::CLOSETAG) . ')\\s*$/i';
        }

        return null;
    }

    /**
     * Returns the pattern that recognises the end of an HTML block.
     *
     * Only block types 1 through 5 have a closing pattern here.
     *
     * @param int $type HTML block type
     *
     * @return string|null Null for any other type
     */
    public static function getHtmlBlockCloseRegex($type)
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '%<\/(?:script|pre|style)>%i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/-->/';
            case HtmlBlock::TYPE_3:
                return '/\?>/';
            case HtmlBlock::TYPE_4:
                return '/>/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/\]\]>/';
        }

        return null;
    }

    /**
     * Reports whether a URL matches REGEX_UNSAFE_PROTOCOL without also matching
     * the REGEX_SAFE_DATA_PROTOCOL (data:image/...) exception.
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isLinkPotentiallyUnsafe($url)
    {
        return preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) !== 0 && preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
    }
}
312