1
|
|
|
<?php |
2
|
|
|
declare(strict_types=1); |
3
|
|
|
|
4
|
|
|
namespace TYPO3Fluid\Fluid\Core\Parser; |
5
|
|
|
|
6
|
|
|
/** |
7
|
|
|
* Splitter |
8
|
|
|
* |
9
|
|
|
* Byte-based calculations to perform splitting on Fluid template sources. |
10
|
|
|
* Uses (64bit) bit masking to detect characters that may split a template, |
11
|
|
|
* by grouping "interesting" bytes which have ordinal values within a value |
12
|
|
|
* range of maximum 64 and comparing the bit mask of this and the byte being |
13
|
|
|
* analysed. |
14
|
|
|
* |
15
|
|
|
* Contains the methods needed to iterate and match bytes based on (mutating) |
16
|
|
|
* bit-masks, and a couple of shorthand "peek" type methods to determine if |
17
|
|
|
* the current yield should be a certain type or another. |
18
|
|
|
* |
19
|
|
|
* The logic is essentially the equivalent of: |
20
|
|
|
* |
21
|
|
|
* - Using arrays of possible byte values |
22
|
|
|
* - Iterating characters and checking against the must-match bytes |
23
|
|
|
* - Using "substr" to extract relevant bits of template code |
24
|
|
|
* |
25
|
|
|
* The difference is that the method in this class is excessively faster than |
26
|
|
|
* any array-based counterpart and consumes orders of magnitude less memory. |
27
|
|
|
* It also means the opcode optimised version of the loop and comparisons use |
28
|
|
|
* ideal CPU instructions at the bit-level instead, making them both smaller |
29
|
|
|
* and even more efficient when compiled. |
30
|
|
|
* |
31
|
|
|
* Works by: |
32
|
|
|
* |
33
|
|
|
* - Iterating a byte value array while maintaining an internal pointer |
34
|
|
|
* - Yielding byte and position (which contains captured text since last yield) |
35
|
|
|
* - When yielding, reload the bit masks used in the next iteration |
36
|
|
|
*/ |
37
|
|
|
class Splitter
{
    /** Maximum length of a namespace prefix (not referenced inside this class). */
    public const MAX_NAMESPACE_LENGTH = 10;

    public const BYTE_NULL = 0; // Zero-byte for terminating documents
    public const BYTE_INLINE = 123; // The "{" character indicating an inline expression started
    public const BYTE_INLINE_END = 125; // The "}" character indicating an inline expression ended
    public const BYTE_PIPE = 124; // The "|" character indicating an inline expression pass operation
    public const BYTE_MINUS = 45; // The "-" character (for legacy pass operations)
    public const BYTE_TAG = 60; // The "<" character indicating a tag has started
    public const BYTE_TAG_END = 62; // The ">" character indicating a tag has ended
    public const BYTE_TAG_CLOSE = 47; // The "/" character indicating a tag is a closing tag
    public const BYTE_QUOTE_DOUBLE = 34; // The " (standard double-quote) character
    public const BYTE_QUOTE_SINGLE = 39; // The ' (standard single-quote) character
    public const BYTE_WHITESPACE_SPACE = 32; // A standard space character
    public const BYTE_WHITESPACE_TAB = 9; // A standard tab character
    public const BYTE_WHITESPACE_RETURN = 13; // A standard carriage-return character
    public const BYTE_WHITESPACE_EOL = 10; // A standard (UNIX) line-break character
    public const BYTE_SEPARATOR_EQUALS = 61; // The "=" character
    public const BYTE_SEPARATOR_COLON = 58; // The ":" character
    public const BYTE_SEPARATOR_COMMA = 44; // The "," character
    public const BYTE_SEPARATOR_PIPE = 124; // The "|" character
    public const BYTE_PARENTHESIS_START = 40; // The "(" character
    public const BYTE_PARENTHESIS_END = 41; // The ")" character
    public const BYTE_ARRAY_START = 91; // The "[" character
    public const BYTE_ARRAY_END = 93; // The "]" character
    public const BYTE_SLASH = 47; // The "/" character
    public const BYTE_BACKSLASH = 92; // The "\" character
    public const BYTE_BACKTICK = 96; // The "`" character

    /** Amount subtracted from bytes in the upper range before they are tested against the secondary mask. */
    public const MAP_SHIFT = 64;

    /** Bit mask matching the line-feed and carriage-return bytes. */
    public const MASK_LINEBREAKS = (1 << self::BYTE_WHITESPACE_EOL) | (1 << self::BYTE_WHITESPACE_RETURN);

    /** Bit mask matching line breaks plus the space and tab bytes. */
    public const MASK_WHITESPACE = self::MASK_LINEBREAKS
        | (1 << self::BYTE_WHITESPACE_SPACE)
        | (1 << self::BYTE_WHITESPACE_TAB);

    /** @var Source */
    public $source;

    /** @var Context */
    public $context;

    /** @var int Key of the byte currently being inspected by parse() */
    public $index = 0;

    /** @var int Mask tested against bytes with an ordinal value below 64 */
    private $primaryMask = 0;

    /** @var int Mask tested against bytes in the upper ASCII range, after subtracting MAP_SHIFT */
    private $secondaryMask = 0;

    /**
     * Stores the source and activates the root context's bit masks.
     *
     * @param Source $source
     * @param Contexts $contexts
     */
    public function __construct(Source $source, Contexts $contexts)
    {
        $this->source = $source;
        $this->switch($contexts->root);
    }

    /**
     * Split the source by searching for recognized characters using the
     * currently active primary and secondary bit masks.
     *
     * Every time a byte matches one of the masks, that byte is yielded as key
     * and the text captured since the previous yield (or null if there was
     * none) is yielded as value. The masks are re-read on every iteration, so
     * a consumer calling switch() between yields changes what is matched next.
     * Once all bytes are consumed, any remaining captured text is yielded with
     * BYTE_NULL as key. An empty source yields a single BYTE_NULL => null pair.
     *
     * @return \Generator<int, string|null>
     */
    public function parse(): \Generator
    {
        $bytes = &$this->source->bytes;
        $source = &$this->source->source;

        if (empty($bytes)) {
            yield self::BYTE_NULL => null;
            return;
        }

        $captured = null;

        foreach ($bytes as $this->index => $byte) {
            // Bytes below 64 are tested directly against the primary mask. Bytes in the upper ASCII
            // range are shifted down by MAP_SHIFT and tested against the secondary mask. Anything
            // else (including non-ASCII bytes) is always captured.
            // NOTE(review): byte 64 ("@") satisfies neither range check and can therefore never be
            // matched by either mask - confirm whether that is intentional.
            if ($byte < 64 && ($this->primaryMask & (1 << $byte))) {
                yield $byte => $captured;
                $captured = null;
            } elseif ($byte > 64 && $byte < 128 && ($this->secondaryMask & (1 << ($byte - static::MAP_SHIFT)))) {
                yield $byte => $captured;
                $captured = null;
            } else {
                // Append the captured byte from the source string. Must happen after the conditions
                // above so matched tokens are not appended. The byte array key is one higher than the
                // matching string offset, hence the "- 1". Square brackets are used because the
                // curly-brace string offset syntax is no longer supported by PHP 8.
                $captured .= $source[$this->index - 1];
            }
        }

        if ($captured !== null) {
            yield self::BYTE_NULL => $captured;
        }
    }

    /**
     * Activates the given context and copies its primary and secondary bit
     * masks into this splitter.
     *
     * @param Context $context The context to make active
     * @return Context The context that was active before the call, or the given context if none was active yet
     */
    public function switch(Context $context): Context
    {
        $previous = $this->context;
        $this->context = $context;
        $this->primaryMask = $context->primaryMask;
        $this->secondaryMask = $context->secondaryMask;
        return $previous ?? $context;
    }

    /**
     * Counts the bytes matching the given mask within a window of the source.
     *
     * Only bytes with an ordinal value below 64 can match.
     *
     * @param int $primaryMask Bit mask of the byte values to count
     * @param int $offset Byte index to start counting at
     * @param int $length Maximum number of bytes to inspect, starting at $offset
     * @return int Number of matching bytes found
     */
    public function countCharactersMatchingMask(int $primaryMask, int $offset, int $length): int
    {
        // Honour $length, but never scan further than the original upper bound of the loop.
        // NOTE(review): parse() indicates the byte array keys are offset by one from string offsets;
        // if the source length is the string length, this bound skips the last byte - confirm.
        $end = min($offset + $length, $this->source->length);
        $counted = 0;
        for ($index = $offset; $index < $end; $index++) {
            if ($this->byteMatchesPrimaryMask($primaryMask, $index)) {
                $counted++;
            }
        }
        return $counted;
    }

    /**
     * Searches backwards from the given offset for a byte matching the mask.
     *
     * The search starts at $offset (capped to the source length) and stops
     * before index zero, which is never inspected.
     *
     * @param int $primaryMask Bit mask of the byte values to look for
     * @param int $offset Byte index to start searching from
     * @return int Index of the closest matching byte, or 0 if none was found
     */
    public function findBytePositionBeforeOffset(int $primaryMask, int $offset): int
    {
        for ($index = min($offset, $this->source->length); $index > 0; $index--) {
            if ($this->byteMatchesPrimaryMask($primaryMask, $index)) {
                return $index;
            }
        }
        return 0;
    }

    /**
     * Searches forwards from the given offset for a byte matching the mask.
     *
     * @param int $primaryMask Bit mask of the byte values to look for
     * @param int $offset Byte index to start searching from
     * @return int Index of the first matching byte, or the larger of the source length and $offset if none was found
     */
    public function findBytePositionAfterOffset(int $primaryMask, int $offset): int
    {
        $length = $this->source->length;
        for ($index = $offset; $index < $length; $index++) {
            if ($this->byteMatchesPrimaryMask($primaryMask, $index)) {
                return $index;
            }
        }
        return max($length, $offset);
    }

    /**
     * Tells whether the byte stored at the given index exists, has an ordinal
     * value below 64 and has its bit set in the given mask.
     *
     * The range check runs before the shift so the shift amount never exceeds
     * the width of the mask, and missing indexes are treated as "no match"
     * instead of being read as an undefined array offset.
     *
     * @param int $primaryMask Bit mask to test against
     * @param int $index Key into the source's byte array
     * @return bool
     */
    private function byteMatchesPrimaryMask(int $primaryMask, int $index): bool
    {
        $byte = $this->source->bytes[$index] ?? null;
        return $byte !== null && $byte < 64 && ($primaryMask & (1 << $byte)) !== 0;
    }
}
169
|
|
|
|
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.