1
|
|
|
<?php |
2
|
|
|
declare(strict_types=1); |
3
|
|
|
|
4
|
|
|
namespace TYPO3Fluid\Fluid\Core\Parser; |
5
|
|
|
|
6
|
|
|
/** |
7
|
|
|
* Splitter |
8
|
|
|
* |
9
|
|
|
* Byte-based calculations to perform splitting on Fluid template sources. |
10
|
|
|
* Uses (64bit) bit masking to detect characters that may split a template, |
11
|
|
|
* by grouping "interesting" bytes which have ordinal values within a value |
12
|
|
|
* range of maximum 64 and comparing the bit mask of this and the byte being |
13
|
|
|
* analysed. |
14
|
|
|
* |
15
|
|
|
* Contains the methods needed to iterate and match bytes based on (mutating) |
16
|
|
|
* bit-masks, and a couple of shorthand "peek" type methods to determine if |
17
|
|
|
* the current yield should be a certain type or another. |
18
|
|
|
* |
19
|
|
|
* The logic is essentially the equivalent of: |
20
|
|
|
* |
21
|
|
|
* - Using arrays of possible byte values |
22
|
|
|
* - Iterating characters and checking against the must-match bytes |
23
|
|
|
* - Using "substr" to extract relevant bits of template code |
24
|
|
|
* |
25
|
|
|
* The difference is that the method in this class is considerably faster than |
26
|
|
|
* any array-based counterpart and consumes orders of magnitude less memory. |
27
|
|
|
* It also means the opcode optimised version of the loop and comparisons use |
28
|
|
|
* ideal CPU instructions at the bit-level instead, making them both smaller |
29
|
|
|
* and even more efficient when compiled. |
30
|
|
|
* |
31
|
|
|
* Works by: |
32
|
|
|
* |
33
|
|
|
* - Iterating a byte value array while maintaining an internal pointer |
34
|
|
|
* - Yielding byte and position (which contains captured text since last yield) |
35
|
|
|
* - When yielding, reload the bit masks used in the next iteration |
36
|
|
|
*/ |
37
|
|
|
class Splitter
{
    public const MAX_NAMESPACE_LENGTH = 10;

    public const BYTE_NULL = 0; // Zero-byte for terminating documents
    public const BYTE_INLINE = 123; // The "{" character indicating an inline expression started
    public const BYTE_INLINE_END = 125; // The "}" character indicating an inline expression ended
    public const BYTE_PIPE = 124; // The "|" character indicating an inline expression pass operation
    public const BYTE_MINUS = 45; // The "-" character (for legacy pass operations)
    public const BYTE_TAG = 60; // The "<" character indicating a tag has started
    public const BYTE_TAG_END = 62; // The ">" character indicating a tag has ended
    public const BYTE_TAG_CLOSE = 47; // The "/" character indicating a tag is a closing tag
    public const BYTE_QUOTE_DOUBLE = 34; // The " (standard double-quote) character
    public const BYTE_QUOTE_SINGLE = 39; // The ' (standard single-quote) character
    public const BYTE_WHITESPACE_SPACE = 32; // A standard space character
    public const BYTE_WHITESPACE_TAB = 9; // A standard tab character
    public const BYTE_WHITESPACE_RETURN = 13; // A standard carriage-return character
    public const BYTE_WHITESPACE_EOL = 10; // A standard (UNIX) line-break character
    public const BYTE_SEPARATOR_EQUALS = 61; // The "=" character
    public const BYTE_SEPARATOR_COLON = 58; // The ":" character
    public const BYTE_SEPARATOR_COMMA = 44; // The "," character
    public const BYTE_SEPARATOR_PIPE = 124; // The "|" character
    public const BYTE_PARENTHESIS_START = 40; // The "(" character
    public const BYTE_PARENTHESIS_END = 41; // The ")" character
    public const BYTE_ARRAY_START = 91; // The "[" character
    public const BYTE_ARRAY_END = 93; // The "]" character
    public const BYTE_SLASH = 47; // The "/" character
    public const BYTE_BACKSLASH = 92; // The "\" character
    public const BYTE_BACKTICK = 96; // The "`" character

    // Amount subtracted from a byte value before it is tested against the secondary mask
    public const MAP_SHIFT = 64;

    // Bit masks with one bit set per matching byte value (all values are below 64)
    public const MASK_LINEBREAKS = 0 | (1 << self::BYTE_WHITESPACE_EOL) | (1 << self::BYTE_WHITESPACE_RETURN);
    public const MASK_WHITESPACE = 0 | self::MASK_LINEBREAKS | (1 << self::BYTE_WHITESPACE_SPACE) | (1 << self::BYTE_WHITESPACE_TAB);

    /** @var Source */
    public $source;

    /** @var Context */
    public $context;

    /** @var Contexts */
    public $contexts;

    /** @var \NoRewindIterator */
    public $sequence;

    /** @var int Key of the byte currently being inspected by the generator */
    public $index = 0;

    /** @var int Mask tested against byte values below 64 */
    private $primaryMask = 0;

    /** @var int Mask tested against byte values 65-127, after subtracting MAP_SHIFT */
    private $secondaryMask = 0;

    /**
     * Stores the source and context collection, activates the root context
     * and prepares the byte sequence iterator.
     *
     * @param Source $source
     * @param Contexts $contexts
     */
    public function __construct(Source $source, Contexts $contexts)
    {
        $this->source = $source;
        $this->contexts = $contexts;
        $this->switch($contexts->root);
        $this->sequence = $this->parse();
    }

    /**
     * Creates a dump, starting from the first line break before $position,
     * to the next line break from $position, counting the lines and characters
     * and inserting a marker pointing to the exact offending character.
     *
     * Is not very efficient - but adds bug tracing information. Should only
     * be called when exceptions are raised during sequencing.
     *
     * @param Position $position
     * @return string
     */
    public function extractSourceDumpOfLineAtPosition(Position $position): string
    {
        // Line number = number of line-break bytes from the first byte up to the position, plus one.
        // NOTE(review): MASK_LINEBREAKS matches both CR and LF, so CRLF line endings appear to be
        // counted twice - confirm whether that is intended.
        $lines = $this->countCharactersMatchingMask(self::MASK_LINEBREAKS, 1, $position->index) + 1;

        $offset = $this->findBytePositionBeforeOffset(self::MASK_LINEBREAKS, $position->index);
        $end = $this->findBytePositionAfterOffset(self::MASK_LINEBREAKS, $position->index);

        // substr() expects a length, not an end position. $end is never smaller than $offset because
        // the backward search starts at or before the position and the forward search starts at it.
        // The cast guards against substr() returning false on PHP 7, which trim() would reject under
        // strict types.
        $line = (string) substr($this->source->source, $offset, $end - $offset);

        $character = $position->index - $offset - 1;
        $indent = str_repeat(' ', max($character, 0));

        // NOTE(review): trim() also strips leading indentation of the line, which can shift the
        // markers relative to the offending character - confirm whether rtrim() is preferable.
        $string = 'Line ' . $lines . ' character ' . $character . PHP_EOL;
        $string .= PHP_EOL;
        $string .= $indent . 'v' . PHP_EOL;
        $string .= trim($line) . PHP_EOL;
        $string .= $indent . '^' . PHP_EOL;
        return $string;
    }

    /**
     * Creates (but does not throw) a SequencingException for the byte at the
     * current index. The message is extended with the byte value and a dump
     * of the source line on which the byte occurs.
     *
     * @param string $message
     * @param int $code
     * @return SequencingException
     */
    public function createErrorAtPosition(string $message, int $code): SequencingException
    {
        $position = new Position($this->context, $this->index);
        $ascii = (string) $this->source->bytes[$this->index];
        $message .= ' ASCII: ' . $ascii . ': ' . $this->extractSourceDumpOfLineAtPosition($position);
        return new SequencingException($message, $code);
    }

    /**
     * Creates (but does not throw) the error reported for an argument name
     * that is not among the keys of $definitions.
     *
     * @param string $argument Name of the unsupported argument
     * @param array $definitions Supported argument definitions, indexed by argument name
     * @return SequencingException
     */
    public function createUnsupportedArgumentError(string $argument, array $definitions): SequencingException
    {
        // The supported names are passed as a sprintf() argument instead of being concatenated
        // into the format string, so a "%" in a name cannot be read as a conversion specifier.
        return $this->createErrorAtPosition(
            sprintf(
                'Unsupported argument "%s". Supported: %s',
                $argument,
                implode(', ', array_keys($definitions))
            ),
            1558298976
        );
    }

    /**
     * Wraps the generator from createGenerator() in a \NoRewindIterator.
     *
     * @return \NoRewindIterator|string[]|null[]
     */
    public function parse(): \NoRewindIterator
    {
        return new \NoRewindIterator($this->createGenerator());
    }

    /**
     * Split a string by searching for recognized characters using at least one,
     * optionally two bit masks consisting of OR'ed bit values of each detectable
     * character (byte).
     *
     * Yields the matched byte as key and the text captured since the previous
     * yield (or null if nothing was captured) as value. The masks are read from
     * the object on every iteration, so a call to switch() between yields
     * changes which bytes match next. Remaining captured text is yielded with
     * BYTE_NULL as key; an empty source yields a single BYTE_NULL => null.
     *
     * @return \Generator|string[]|null[]
     */
    public function createGenerator(): \Generator
    {
        $bytes = &$this->source->bytes;
        $source = &$this->source->source;

        if (empty($bytes)) {
            yield self::BYTE_NULL => null;
            return;
        }

        $captured = null;

        foreach ($bytes as $this->index => $byte) {
            // Decide which byte we encountered by explicitly checking if the encountered byte was in the minimum
            // range (not-mapped match). Next check is if the matched byte is within 64-128 range in which case
            // it is a mapped match. Anything else (>128) will be non-ASCII that is always captured.
            // NOTE(review): byte value 64 ("@") is tested against neither mask and is therefore always captured.
            if ($byte < 64 && ($this->primaryMask & (1 << $byte))) {
                yield $byte => $captured;
                $captured = null;
            } elseif ($byte > 64 && $byte < 128 && ($this->secondaryMask & (1 << ($byte - static::MAP_SHIFT)))) {
                yield $byte => $captured;
                $captured = null;
            } else {
                // Append captured bytes from source, must happen after the conditions above so we avoid appending
                // tokens. The byte with key N is read from string offset N - 1. Square brackets are required:
                // the curly-brace offset syntax no longer parses on PHP 8.
                $captured .= $source[$this->index - 1];
            }
        }

        if ($captured !== null) {
            yield self::BYTE_NULL => $captured;
        }
    }

    /**
     * Makes $context the active context and loads its primary and secondary
     * masks into the splitter.
     *
     * @param Context $context
     * @return Context The previously active context, or $context itself if none was active
     */
    public function switch(Context $context): Context
    {
        $previous = $this->context;
        $this->context = $context;
        $this->primaryMask = $context->primaryMask;
        $this->secondaryMask = $context->secondaryMask;
        return $previous ?? $context;
    }

    /**
     * Counts the bytes matching $primaryMask within a window of $length bytes
     * starting at byte key $offset. Only byte values below 64 can match.
     *
     * @param int $primaryMask Bit mask with one bit set per byte value to count
     * @param int $offset Key of the first byte to inspect
     * @param int $length Maximum number of bytes to inspect
     * @return int
     */
    public function countCharactersMatchingMask(int $primaryMask, int $offset, int $length): int
    {
        $bytes = &$this->source->bytes;

        // Exclusive upper bound: stop at the end of the requested window, and never read past
        // the bound given by the source length.
        $end = min($offset + $length, $this->source->length);

        $counted = 0;
        for ($index = $offset; $index < $end; $index++) {
            $byte = $bytes[$index];
            if ($byte < 64 && ($primaryMask & (1 << $byte))) {
                $counted++;
            }
        }
        return $counted;
    }

    /**
     * Searches backwards from $offset (capped at the source length) for the
     * nearest byte matching $primaryMask. Only byte values below 64 can match.
     *
     * @param int $primaryMask Bit mask with one bit set per byte value to find
     * @param int $offset Key of the byte at which the backward search starts
     * @return int Key of the matching byte, or 0 if no byte matched
     */
    public function findBytePositionBeforeOffset(int $primaryMask, int $offset): int
    {
        $bytes = &$this->source->bytes;
        for ($index = min($offset, $this->source->length); $index > 0; $index--) {
            $byte = $bytes[$index];
            if ($byte < 64 && ($primaryMask & (1 << $byte))) {
                return $index;
            }
        }
        return 0;
    }

    /**
     * Searches forwards from $offset for the nearest byte matching
     * $primaryMask. Only byte values below 64 can match.
     *
     * @param int $primaryMask Bit mask with one bit set per byte value to find
     * @param int $offset Key of the byte at which the forward search starts
     * @return int Key of the matching byte, or the larger of source length and $offset if no byte matched
     */
    public function findBytePositionAfterOffset(int $primaryMask, int $offset): int
    {
        $bytes = &$this->source->bytes;
        for ($index = $offset; $index < $this->source->length; $index++) {
            $byte = $bytes[$index];
            if ($byte < 64 && ($primaryMask & (1 << $byte))) {
                return $index;
            }
        }
        return max($this->source->length, $offset);
    }
}
239
|
|
|
|
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable `$accountId` that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the `id` property of an instance of the `Account` class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.