|
1
|
|
|
<?php |
|
2
|
|
|
declare(strict_types=1); |
|
3
|
|
|
|
|
4
|
|
|
namespace TYPO3Fluid\Fluid\Core\Parser; |
|
5
|
|
|
|
|
6
|
|
|
/**
 * Splitter
 *
 * Byte-based calculations to perform splitting on Fluid template sources.
 * Uses (64bit) bit masking to detect characters that may split a template,
 * by grouping "interesting" bytes which have ordinal values within a value
 * range of maximum 64 and comparing the bit mask of this and the byte being
 * analysed.
 *
 * Contains the methods needed to iterate and match bytes based on (mutating)
 * bit-masks, and a couple of shorthand "peek" type methods to determine if
 * the current yield should be a certain type or another.
 *
 * The logic is essentially the equivalent of:
 *
 * - Using arrays of possible byte values
 * - Iterating characters and checking against the must-match bytes
 * - Using "substr" to extract relevant bits of template code
 *
 * The difference is that the method in this class is considerably faster than
 * any array-based counterpart and consumes far less memory, because matching
 * a byte is a single bit-wise AND instead of an array lookup.
 *
 * Works by:
 *
 * - Iterating a byte value array while maintaining an internal pointer
 * - Yielding the matched byte together with the text captured since last yield
 * - Re-reading the bit masks on every iteration, so that a call to switch()
 *   made between two yields takes effect for the next byte
 */
class Splitter
{
    public const MAX_NAMESPACE_LENGTH = 10;

    public const BYTE_NULL = 0; // Zero-byte for terminating documents
    public const BYTE_INLINE = 123; // The "{" character indicating an inline expression started
    public const BYTE_INLINE_END = 125; // The "}" character indicating an inline expression ended
    public const BYTE_PIPE = 124; // The "|" character indicating an inline expression pass operation
    public const BYTE_MINUS = 45; // The "-" character (for legacy pass operations)
    public const BYTE_TAG = 60; // The "<" character indicating a tag has started
    public const BYTE_TAG_END = 62; // The ">" character indicating a tag has ended
    public const BYTE_TAG_CLOSE = 47; // The "/" character indicating a tag is a closing tag
    public const BYTE_QUOTE_DOUBLE = 34; // The " (standard double-quote) character
    public const BYTE_QUOTE_SINGLE = 39; // The ' (standard single-quote) character
    public const BYTE_WHITESPACE_SPACE = 32; // A standard space character
    public const BYTE_WHITESPACE_TAB = 9; // A standard tab character
    public const BYTE_WHITESPACE_RETURN = 13; // A standard carriage-return character
    public const BYTE_WHITESPACE_EOL = 10; // A standard (UNIX) line-break character
    public const BYTE_SEPARATOR_EQUALS = 61; // The "=" character
    public const BYTE_SEPARATOR_COLON = 58; // The ":" character
    public const BYTE_SEPARATOR_COMMA = 44; // The "," character
    public const BYTE_SEPARATOR_PIPE = 124; // The "|" character
    public const BYTE_PARENTHESIS_START = 40; // The "(" character
    public const BYTE_PARENTHESIS_END = 41; // The ")" character
    public const BYTE_ARRAY_START = 91; // The "[" character
    public const BYTE_ARRAY_END = 93; // The "]" character
    public const BYTE_SLASH = 47; // The "/" character
    public const BYTE_BACKSLASH = 92; // The "\" character
    public const BYTE_BACKTICK = 96; // The "`" character
    public const MAP_SHIFT = 64; // Subtracted from bytes in the 64-127 range before testing the secondary mask
    public const MASK_LINEBREAKS = 0 | (1 << self::BYTE_WHITESPACE_EOL) | (1 << self::BYTE_WHITESPACE_RETURN);
    public const MASK_WHITESPACE = 0 | self::MASK_LINEBREAKS | (1 << self::BYTE_WHITESPACE_SPACE) | (1 << self::BYTE_WHITESPACE_TAB);

    /** @var Source */
    public $source;

    /** @var Context */
    public $context;

    /** @var int Key of the byte most recently visited by parse() */
    public $index = 0;

    /** @var int Bit mask of split bytes with ordinal values 0-63 */
    private $primaryMask = 0;

    /** @var int Bit mask of split bytes with ordinal values 64-127, shifted down by MAP_SHIFT */
    private $secondaryMask = 0;

    /**
     * @param Source $source The template source to split
     * @param Contexts $contexts Context collection; its "root" context becomes the initial context
     */
    public function __construct(Source $source, Contexts $contexts)
    {
        $this->source = $source;
        $this->switch($contexts->root);
    }

    /**
     * Split the source by searching for recognized bytes using two bit masks
     * consisting of OR'ed bit values of each detectable byte: the primary
     * mask covers byte values 0-63, the secondary mask covers byte values
     * 64-127 (shifted down by MAP_SHIFT). Bytes with values of 128 and above
     * never split and are always captured.
     *
     * Each yield has the matched byte as key and the text captured since the
     * previous yield (or null if nothing was captured) as value. After the
     * last byte, any remaining captured text is yielded with BYTE_NULL as
     * key. An empty source yields a single BYTE_NULL => null pair.
     *
     * The masks are read from this instance on every iteration, so calling
     * switch() while consuming the generator changes which bytes split next.
     *
     * @return \Generator<int, string|null>
     */
    public function parse(): \Generator
    {
        $bytes = &$this->source->bytes;
        $source = &$this->source->source;

        if (empty($bytes)) {
            yield self::BYTE_NULL => null;
            return;
        }

        $captured = null;

        foreach ($bytes as $this->index => $byte) {
            // Decide which byte we encountered by explicitly checking if the encountered byte was in the minimum
            // range (not-mapped match). Next check is if the matched byte is within the 64-127 range in which case
            // it is a mapped match. Anything else (>= 128) will be non-ASCII that is always captured.
            if ($byte < 64 && ($this->primaryMask & (1 << $byte))) {
                yield $byte => $captured;
                $captured = null;
            } elseif ($byte >= static::MAP_SHIFT && $byte < 128 && ($this->secondaryMask & (1 << ($byte - static::MAP_SHIFT)))) {
                // The lower bound is inclusive so that byte 64 (bit zero of the secondary mask) can be matched too.
                yield $byte => $captured;
                $captured = null;
            } else {
                // Append captured bytes from source, must happen after the conditions above so we avoid appending
                // tokens. The byte keys run one ahead of the string offsets, hence the "- 1" (presumably because
                // the byte array is one-indexed, e.g. created by unpack() - confirm against Source).
                $captured .= $source[$this->index - 1];
            }
        }

        if ($captured !== null) {
            yield self::BYTE_NULL => $captured;
        }
    }

    /**
     * Switch to a different context, loading its primary and secondary bit
     * masks as the masks used by parse().
     *
     * @param Context $context The context to activate
     * @return Context The previously active context, or $context itself if none was active before
     */
    public function switch(Context $context): Context
    {
        $previous = $this->context;
        $this->context = $context;
        $this->primaryMask = $context->primaryMask;
        $this->secondaryMask = $context->secondaryMask;
        return $previous ?? $context;
    }

    /**
     * Count the bytes that match the given (primary range, 0-63) bit mask
     * within at most $length bytes starting at $offset. The scan never
     * continues past the upper bound given by the source length.
     *
     * @param int $primaryMask Bit mask of byte values (0-63) to count
     * @param int $offset Key of the first byte to inspect
     * @param int $length Maximum number of bytes to inspect
     * @return int Number of matching bytes found
     */
    public function countCharactersMatchingMask(int $primaryMask, int $offset, int $length): int
    {
        $bytes = &$this->source->bytes;
        $counted = 0;
        // Honor the requested length but keep the scan clamped to the source.
        $end = min($offset + $length, $this->source->length);
        for ($index = $offset; $index < $end; $index++) {
            $byte = $bytes[$index];
            if ($byte < 64 && ($primaryMask & (1 << $byte))) {
                $counted++;
            }
        }
        return $counted;
    }

    /**
     * Scan backwards, starting at $offset (clamped to the source length),
     * for the nearest byte that matches the given (primary range, 0-63) mask.
     *
     * @param int $primaryMask Bit mask of byte values (0-63) to search for
     * @param int $offset Key of the byte at which the backwards scan starts
     * @return int Key of the matching byte, or 0 if no byte matched
     */
    public function findBytePositionBeforeOffset(int $primaryMask, int $offset): int
    {
        $bytes = &$this->source->bytes;
        for ($index = min($offset, $this->source->length); $index > 0; $index--) {
            $byte = $bytes[$index];
            if ($byte < 64 && ($primaryMask & (1 << $byte))) {
                return $index;
            }
        }
        return 0;
    }

    /**
     * Scan forwards, starting at $offset, for the nearest byte that matches
     * the given (primary range, 0-63) mask.
     *
     * @param int $primaryMask Bit mask of byte values (0-63) to search for
     * @param int $offset Key of the byte at which the forwards scan starts
     * @return int Key of the matching byte, or the larger of the source length and $offset if no byte matched
     */
    public function findBytePositionAfterOffset(int $primaryMask, int $offset): int
    {
        $bytes = &$this->source->bytes;
        for ($index = $offset; $index < $this->source->length; $index++) {
            $byte = $bytes[$index];
            if ($byte < 64 && ($primaryMask & (1 << $byte))) {
                return $index;
            }
        }
        return max($this->source->length, $offset);
    }
}
|
169
|
|
|
|
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.