Failed Conditions
Push — psr2 ( ffc2cc...de2261 )
by Andreas
05:28
created

inc/Lexer/Lexer.php (3 issues)

Labels
Severity

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
/**
3
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4
 * For an intro to the Lexer see:
5
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6
 *
7
 * @author Marcus Baker http://www.lastcraft.com
8
 */
9
10
namespace dokuwiki\Lexer;
11
12
// FIXME move elsewhere
13
// Token type codes. The lexer passes one of these as the second argument of
// every handler call so the handler can tell how the text was recognised.
// They are registered with define() so that they stay global constants even
// though this file declares a namespace.
define('DOKU_LEXER_ENTER', 1);     // token that opened a new mode
define('DOKU_LEXER_MATCHED', 2);   // token matched without a mode change
define('DOKU_LEXER_UNMATCHED', 3); // text that no pattern matched
define('DOKU_LEXER_EXIT', 4);      // token that closed the current mode
define('DOKU_LEXER_SPECIAL', 5);   // single token handled in its own mode
18
19
/**
20
 * Accepts text and breaks it into tokens.
21
 *
22
 * Some optimisation to make sure the content is only scanned by the PHP regex
23
 * parser once. Lexer modes must not start with leading underscores.
24
 */
25
class Lexer
{
    /** @var ParallelRegex[] one compound regex per mode, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler object whose methods are called for every token */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $mode;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->mode = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        // the regex collection of a mode is created lazily on first use
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        $this->regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        $this->regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        // "__exit" is the reserved label recognised by isModeEnd()
        $this->regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        // the leading underscore marks the label as a single token mode, see isSpecialMode()
        $this->regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw HTML text.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed text from the front of $raw on every round
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset at which the matched token starts in the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (! $parsed) {
            return false;
        }
        // whatever is left over did not match any pattern
        return $this->invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * All five arguments are required. The former "= false" default on $mode could never
     * be used because required parameters follow it; such a declaration is deprecated
     * since PHP 8.0.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. Any non-string value causes no mode change.
     * @param int $initialPos Byte index in the raw document at which $unmatched starts
     * @param int $matchPos Byte index in the raw document at which $matched starts
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }

        // Only string labels can name a mode; booleans must never reach the
        // string based helpers isSpecialMode() and decodeSpecial().
        if (is_string($mode)) {
            // must be tested before the special check, "__exit" starts with an underscore too
            if ($this->isModeEnd($mode)) {
                if (! $this->invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
                    return false;
                }
                return $this->mode->leave();
            }

            if ($this->isSpecialMode($mode)) {
                // enter the mode for this single token only and leave it right away
                $this->mode->enter($this->decodeSpecial($mode));
                if (! $this->invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                    return false;
                }
                return $this->mode->leave();
            }

            $this->mode->enter($mode);
            return $this->invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
        }

        return $this->invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param string $mode    Mode to test.
     * @return boolean        True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Non-string values are never special modes.
     *
     * @param string $mode    Mode to test.
     * @return boolean        True if this is a single token ("special") mode.
     */
    protected function isSpecialMode($mode)
    {
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match Token type, one of the DOKU_LEXER_* constants
     * @param int $pos Current byte index location in raw doc
     *                             that's being parsed
     * @return bool
     */
    protected function invokeParser($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->mode->getCurrent();
        // apply a remapping registered through mapHandler()
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            // "plugin_foo_bar" is split into the method "plugin" and the argument "foo_bar"
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->mode->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // $raw is overwritten with the text remaining after the match
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of the characters \ . + * ? [ ^ ] $ { } = ! < > | : gets a backslash prepended.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // A single pass over one character class gives the same result as
        // replacing the backslash first and every other character afterwards.
        return preg_replace('/[\\\\.+*?\[^\]${}=!<>|:]/', '\\\\$0', $str);
    }
}
347