GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.

RegexHelper   A
last analyzed

↳ Parent: Project

Coupling/Cohesion

Components 1
Dependencies 1

Complexity

Total Complexity 34

Size/Duplication

Total Lines 294
Duplicated Lines 0 %

Test Coverage

Coverage 97.39%

Importance

Changes 3
Bugs 0 Features 0
Metric Value
wmc 34
c 3
b 0
f 0
lcom 1
cbo 1
dl 0
loc 294
ccs 112
cts 115
cp 0.9739
rs 9.2

15 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 4 1
A getInstance() 0 8 2
B buildRegexPatterns() 0 37 1
A getPartialRegex() 0 4 1
A getHtmlTagRegex() 0 4 1
A getLinkTitleRegex() 0 4 1
A getLinkDestinationRegex() 0 4 1
A getLinkDestinationBracesRegex() 0 4 1
A getThematicBreakRegex() 0 4 1
A matchAt() 0 13 2
B matchAll() 0 21 5
A unescape() 0 11 1
B getHtmlBlockOpenRegex() 0 21 8
B getHtmlBlockCloseRegex() 0 15 6
A isLinkPotentiallyUnsafe() 0 4 2
1
<?php
2
3
/*
4
 * This file is part of the league/commonmark package.
5
 *
6
 * (c) Colin O'Dell <[email protected]>
7
 *
8
 * Original code based on the CommonMark JS reference parser (https://bitly.com/commonmark-js)
9
 *  - (c) John MacFarlane
10
 *
11
 * For the full copyright and license information, please view the LICENSE
12
 * file that was distributed with this source code.
13
 */
14
15
namespace League\CommonMark\Util;
16
17
use League\CommonMark\Block\Element\HtmlBlock;
18
19
/**
 * Provides regular expressions and utilities for parsing Markdown
 *
 * Singletons are generally bad, but it allows us to build the regexes once (and only once).
 */
class RegexHelper
{
    // Keys into the $regex array populated by buildRegexPatterns().
    // Each one identifies a partial (undelimited) pattern; see getPartialRegex().
    const ESCAPABLE = 0;
    const ESCAPED_CHAR = 1;
    const IN_DOUBLE_QUOTES = 2;
    const IN_SINGLE_QUOTES = 3;
    const IN_PARENS = 4;
    const REG_CHAR = 5;
    const IN_PARENS_NOSP = 6;
    const TAGNAME = 7;
    const BLOCKTAGNAME = 8;
    const ATTRIBUTENAME = 9;
    const UNQUOTEDVALUE = 10;
    const SINGLEQUOTEDVALUE = 11;
    const DOUBLEQUOTEDVALUE = 12;
    const ATTRIBUTEVALUE = 13;
    const ATTRIBUTEVALUESPEC = 14;
    const ATTRIBUTE = 15;
    const OPENTAG = 16;
    const CLOSETAG = 17;
    const OPENBLOCKTAG = 18;
    const CLOSEBLOCKTAG = 19;
    const HTMLCOMMENT = 20;
    const PROCESSINGINSTRUCTION = 21;
    const DECLARATION = 22;
    const CDATA = 23;
    const HTMLTAG = 24;
    const HTMLBLOCKOPEN = 25;
    const LINK_TITLE = 26;

    // Partial patterns (no delimiters): they must be embedded in a delimited regex before use.
    const REGEX_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]';
    const REGEX_ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});';

    // Complete, delimited patterns that can be passed straight to the preg_* functions.
    const REGEX_PUNCTUATION = '/^[\x{2000}-\x{206F}\x{2E00}-\x{2E7F}\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\\\\\'!"#\$%&\(\)\*\+,\-\.\\/:;<=>\?@\[\]\^_`\{\|\}~]/u';
    // NOTE(review): the leading ^ only anchors the "javascript:" alternative; the other
    // protocols match anywhere in the string. Left untouched on purpose, because tightening
    // it would narrow a security filter - confirm the intended behaviour before changing.
    const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i';
    const REGEX_SAFE_DATA_PROTOCOL = '/^data:image\/(?:png|gif|jpeg|webp)/i';
    const REGEX_NON_SPACE = '/[^ \t\f\v\r\n]/';

    const REGEX_WHITESPACE_CHAR = '/^[ \t\n\x0b\x0c\x0d]/';
    const REGEX_WHITESPACE = '/[ \t\n\x0b\x0c\x0d]+/';
    // The alternation is grouped so that ^ applies to both \pZ and \s, mirroring the
    // anchored REGEX_WHITESPACE_CHAR above. Single-character subjects match exactly as before.
    const REGEX_UNICODE_WHITESPACE_CHAR = '/^(?:\pZ|\s)/u';
    const REGEX_UNICODE_WHITESPACE = '/\pZ|\s/u';

    /**
     * Partial regexes, keyed by the integer constants declared at the top of this class
     *
     * @var string[]
     */
    protected $regex = [];

    /**
     * Lazily-created shared instance returned by getInstance()
     *
     * @var RegexHelper|null
     */
    protected static $instance;

    /**
     * Constructor
     *
     * Protected so that instances are obtained through getInstance(); builds every partial regex up-front.
     */
    protected function __construct()
    {
        $this->buildRegexPatterns();
    }

    /**
     * Returns the shared instance, creating it on first use
     *
     * @return RegexHelper
     */
    public static function getInstance()
    {
        if (self::$instance === null) {
            self::$instance = new self();
        }

        return self::$instance;
    }

    /**
     * Builds the regular expressions required to parse Markdown
     *
     * We could hard-code them all as pre-built constants, but that would be more difficult to manage.
     *
     * Order matters: later entries are concatenated from earlier ones.
     */
    protected function buildRegexPatterns()
    {
        $regex = [];
        $regex[self::ESCAPABLE] = self::REGEX_ESCAPABLE;
        $regex[self::ESCAPED_CHAR] = '\\\\' . $regex[self::ESCAPABLE];
        $regex[self::IN_DOUBLE_QUOTES] = '"(' . $regex[self::ESCAPED_CHAR] . '|[^"\x00])*"';
        $regex[self::IN_SINGLE_QUOTES] = '\'(' . $regex[self::ESCAPED_CHAR] . '|[^\'\x00])*\'';
        $regex[self::IN_PARENS] = '\\((' . $regex[self::ESCAPED_CHAR] . '|[^)\x00])*\\)';
        $regex[self::REG_CHAR] = '[^\\\\()\x00-\x20]';
        $regex[self::IN_PARENS_NOSP] = '\((' . $regex[self::REG_CHAR] . '|' . $regex[self::ESCAPED_CHAR] . '|\\\\)*\)';
        $regex[self::TAGNAME] = '[A-Za-z][A-Za-z0-9-]*';
        // A duplicated "title" alternative was dropped here; the set of matched tag names is unchanged.
        $regex[self::BLOCKTAGNAME] = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)';
        $regex[self::ATTRIBUTENAME] = '[a-zA-Z_:][a-zA-Z0-9:._-]*';
        $regex[self::UNQUOTEDVALUE] = '[^"\'=<>`\x00-\x20]+';
        $regex[self::SINGLEQUOTEDVALUE] = '\'[^\']*\'';
        $regex[self::DOUBLEQUOTEDVALUE] = '"[^"]*"';
        $regex[self::ATTRIBUTEVALUE] = '(?:' . $regex[self::UNQUOTEDVALUE] . '|' . $regex[self::SINGLEQUOTEDVALUE] . '|' . $regex[self::DOUBLEQUOTEDVALUE] . ')';
        $regex[self::ATTRIBUTEVALUESPEC] = '(?:' . '\s*=' . '\s*' . $regex[self::ATTRIBUTEVALUE] . ')';
        $regex[self::ATTRIBUTE] = '(?:' . '\s+' . $regex[self::ATTRIBUTENAME] . $regex[self::ATTRIBUTEVALUESPEC] . '?)';
        $regex[self::OPENTAG] = '<' . $regex[self::TAGNAME] . $regex[self::ATTRIBUTE] . '*' . '\s*\/?>';
        $regex[self::CLOSETAG] = '<\/' . $regex[self::TAGNAME] . '\s*[>]';
        $regex[self::OPENBLOCKTAG] = '<' . $regex[self::BLOCKTAGNAME] . $regex[self::ATTRIBUTE] . '*' . '\s*\/?>';
        $regex[self::CLOSEBLOCKTAG] = '<\/' . $regex[self::BLOCKTAGNAME] . '\s*[>]';
        $regex[self::HTMLCOMMENT] = '<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->';
        $regex[self::PROCESSINGINSTRUCTION] = '[<][?].*?[?][>]';
        $regex[self::DECLARATION] = '<![A-Z]+' . '\s+[^>]*>';
        $regex[self::CDATA] = '<!\[CDATA\[[\s\S]*?]\]>';
        $regex[self::HTMLTAG] = '(?:' . $regex[self::OPENTAG] . '|' . $regex[self::CLOSETAG] . '|' . $regex[self::HTMLCOMMENT] . '|' .
            $regex[self::PROCESSINGINSTRUCTION] . '|' . $regex[self::DECLARATION] . '|' . $regex[self::CDATA] . ')';
        $regex[self::HTMLBLOCKOPEN] = '<(?:' . $regex[self::BLOCKTAGNAME] . '(?:[\s\/>]|$)' . '|' .
            '\/' . $regex[self::BLOCKTAGNAME] . '(?:[\s>]|$)' . '|' . '[?!])';
        $regex[self::LINK_TITLE] = '^(?:"(' . $regex[self::ESCAPED_CHAR] . '|[^"\x00])*"' .
            '|' . '\'(' . $regex[self::ESCAPED_CHAR] . '|[^\'\x00])*\'' .
            '|' . '\((' . $regex[self::ESCAPED_CHAR] . '|[^)\x00])*\))';

        $this->regex = $regex;
    }

    /**
     * Returns a partial regex
     *
     * It'll need to be wrapped with /.../ before use
     *
     * @param int $const One of the integer pattern constants of this class (e.g. self::OPENTAG)
     *
     * @return string
     */
    public function getPartialRegex($const)
    {
        return $this->regex[$const];
    }

    /**
     * Returns a case-insensitive regex matching the HTMLTAG pattern at the start of the subject
     *
     * @return string
     */
    public function getHtmlTagRegex()
    {
        return '/^' . $this->regex[self::HTMLTAG] . '/i';
    }

    /**
     * Returns the delimited LINK_TITLE regex (the partial pattern is already anchored with ^)
     *
     * @return string
     */
    public function getLinkTitleRegex()
    {
        return '/' . $this->regex[self::LINK_TITLE] . '/';
    }

    /**
     * Returns an anchored regex built from the REG_CHAR, ESCAPED_CHAR and IN_PARENS_NOSP fragments
     *
     * @return string
     */
    public function getLinkDestinationRegex()
    {
        return '/^' . '(?:' . $this->regex[self::REG_CHAR] . '+|' . $this->regex[self::ESCAPED_CHAR] . '|\\\\|' . $this->regex[self::IN_PARENS_NOSP] . ')*' . '/';
    }

    /**
     * Returns an anchored regex matching a <...> delimited run at the start of the subject
     *
     * @return string
     */
    public function getLinkDestinationBracesRegex()
    {
        return '/^(?:' . '[<](?:[^ <>\\t\\n\\\\\\x00]' . '|' . $this->regex[self::ESCAPED_CHAR] . '|' . '\\\\)*[>]' . ')/';
    }

    /**
     * Returns a regex matching a whole line made of three or more *, _ or - characters,
     * each optionally followed by spaces or tabs
     *
     * @return string
     */
    public function getThematicBreakRegex()
    {
        return '/^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$/';
    }

    /**
     * Attempt to match a regex in string s at offset offset
     *
     * The offset is counted in UTF-8 characters, and so is the returned index.
     *
     * @param string $regex
     * @param string $string
     * @param int    $offset
     *
     * @return int|null Index of match, or null
     */
    public static function matchAt($regex, $string, $offset = 0)
    {
        $matches = [];
        $string = mb_substr($string, $offset, null, 'utf-8');
        if (!preg_match($regex, $string, $matches, PREG_OFFSET_CAPTURE)) {
            return null;
        }

        // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying
        $charPos = mb_strlen(mb_strcut($string, 0, $matches[0][1], 'utf-8'), 'utf-8');

        return $offset + $charPos;
    }

    /**
     * Functional wrapper around preg_match_all
     *
     * When the pattern matches exactly once, every group is collapsed to its single value, so the
     * result looks like a preg_match() result; otherwise the PREG_PATTERN_ORDER structure is returned.
     *
     * @param string $pattern
     * @param string $subject
     * @param int    $offset  Byte offset (the subject is cut with substr())
     *
     * @return array|null The matches, or null when nothing matched
     */
    public static function matchAll($pattern, $subject, $offset = 0)
    {
        $matches = [];
        $subject = substr($subject, $offset);
        preg_match_all($pattern, $subject, $matches, PREG_PATTERN_ORDER);

        $fullMatches = reset($matches);
        if (empty($fullMatches)) {
            return null;
        }

        if (count($fullMatches) === 1) {
            // Assign by key instead of iterating by reference so no reference outlives the loop.
            foreach ($matches as $group => $values) {
                $matches[$group] = reset($values);
            }
        }

        return $matches;
    }

    /**
     * Replace backslash escapes with literal characters
     *
     * Entities matching REGEX_ENTITY are then decoded through Html5Entities::decodeEntity().
     *
     * @param string $string
     *
     * @return string
     */
    public static function unescape($string)
    {
        $allEscapedChar = '/\\\\(' . self::REGEX_ESCAPABLE . ')/';

        $escaped = preg_replace($allEscapedChar, '$1', $string);
        $replaced = preg_replace_callback('/' . self::REGEX_ENTITY . '/i', function ($e) {
            return Html5Entities::decodeEntity($e[0]);
        }, $escaped);

        return $replaced;
    }

    /**
     * Returns the regex that detects the start of the given HTML block type
     *
     * @param int $type HTML block type
     *
     * @return string|null The regex, or null for an unrecognised type
     */
    public static function getHtmlBlockOpenRegex($type)
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '/^<(?:script|pre|style)(?:\s|>|$)/i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/^<!--/';
            case HtmlBlock::TYPE_3:
                return '/^<[?]/';
            case HtmlBlock::TYPE_4:
                return '/^<![A-Z]/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/^<!\[CDATA\[/';
            case HtmlBlock::TYPE_6_BLOCK_ELEMENT:
                // "iframe" added so this list agrees with BLOCKTAGNAME and the CommonMark
                // type 6 start condition; a duplicated "title" alternative was dropped.
                return '%^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|pre|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)%i';
            case HtmlBlock::TYPE_7_MISC_ELEMENT:
                $self = self::getInstance();

                return '/^(?:' . $self->getPartialRegex(self::OPENTAG) . '|' . $self->getPartialRegex(self::CLOSETAG) . ')\\s*$/i';
        }

        return null;
    }

    /**
     * Returns the regex that detects the end of the given HTML block type
     *
     * Only types 1 to 5 have a closing pattern here.
     *
     * @param int $type HTML block type
     *
     * @return string|null The regex, or null when the type has no closing pattern
     */
    public static function getHtmlBlockCloseRegex($type)
    {
        switch ($type) {
            case HtmlBlock::TYPE_1_CODE_CONTAINER:
                return '%<\/(?:script|pre|style)>%i';
            case HtmlBlock::TYPE_2_COMMENT:
                return '/-->/';
            case HtmlBlock::TYPE_3:
                return '/\?>/';
            case HtmlBlock::TYPE_4:
                return '/>/';
            case HtmlBlock::TYPE_5_CDATA:
                return '/\]\]>/';
        }

        return null;
    }

    /**
     * Tells whether a URL matches REGEX_UNSAFE_PROTOCOL without matching REGEX_SAFE_DATA_PROTOCOL
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isLinkPotentiallyUnsafe($url)
    {
        return preg_match(self::REGEX_UNSAFE_PROTOCOL, $url) !== 0 && preg_match(self::REGEX_SAFE_DATA_PROTOCOL, $url) === 0;
    }
}
318