Failed Conditions
Push — stable ( 017e16...b83837 )
by
unknown
07:54 queued 02:55
created

inc/Parsing/Lexer/Lexer.php (3 issues)

Labels
Severity

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
/**
3
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4
 * For an intro to the Lexer see:
5
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6
 *
7
 * @author Marcus Baker http://www.lastcraft.com
8
 */
9
10
namespace dokuwiki\Parsing\Lexer;
11
12
// FIXME move elsewhere

// Lexer states. These are global constants (define() ignores the current
// namespace) and are used as the state flag in handler calls.

/** A pattern matched that enters a new mode. */
define('DOKU_LEXER_ENTER', 1);
/** A pattern matched inside the current mode without changing it. */
define('DOKU_LEXER_MATCHED', 2);
/** Text that no pattern of the current mode matched. */
define('DOKU_LEXER_UNMATCHED', 3);
/** A pattern matched that leaves the current mode. */
define('DOKU_LEXER_EXIT', 4);
/** A pattern matched that is handled in a special mode for this one token. */
define('DOKU_LEXER_SPECIAL', 5);
19
20
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes.
 */
class Lexer
{
    /** @var ParallelRegex[] one compound regex per mode, indexed by mode name */
    protected $regexes;
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the reserved label recognised by isModeEnd()
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks the label as special, see isSpecialMode()
        $this->regexForMode($mode)->addPattern($pattern, '_' . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed part of $raw on every successful round
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (! $parsed) {
            return false;
        }
        // whatever is left did not match anything and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string (boolean) mode causes no change.
     * @param int $initialPos Byte index in the raw document at which $unmatched starts
     * @param int $matchPos Byte index in the raw document at which $matched starts
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }

        // Only string modes can change the mode. Checking the type up front keeps
        // non-string values away from the string-only helper methods below.
        if (! is_string($mode)) {
            return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
        }

        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        $this->modeStack->enter($mode);
        return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param string $mode    Mode to test.
     * @return boolean        True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param string $mode    Mode to test.
     * @return boolean        True if the mode name carries the leading underscore of a special mode.
     */
    protected function isSpecialMode($mode)
    {
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match   Lexer state of the content, one of the DOKU_LEXER_* constants.
     * @param int $pos        Current byte index location in raw doc
     *                        that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            // splits into the method name "plugin" and the remainder of the mode name
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        // $split is filled by split(): leading unparsed text, the match, the remaining text
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every one of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One pass over a character class; "\\\\$0" is a literal backslash followed by the matched character.
        return preg_replace('/[\\\\.+*?\\[\\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the compound regex of the given mode, creating it on first use.
     *
     * @param string $mode    Mode whose patterns are wanted.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
358