|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* This file is part of NACL. |
|
4
|
|
|
* |
|
5
|
|
|
* For the full copyright and license information, please view the LICENSE |
|
6
|
|
|
* file that was distributed with this source code. |
|
7
|
|
|
* |
|
8
|
|
|
* @copyright 2019 Nuglif (2018) Inc. |
|
9
|
|
|
* @license http://www.opensource.org/licenses/mit-license.html MIT License |
|
10
|
|
|
* @author Pierrick Charron <[email protected]> |
|
11
|
|
|
* @author Charle Demers <[email protected]> |
|
12
|
|
|
*/ |
|
13
|
|
|
|
|
14
|
|
|
declare(strict_types=1); |
|
15
|
|
|
|
|
16
|
|
|
namespace Nuglif\Nacl; |
|
17
|
|
|
|
|
18
|
|
|
/**
 * Lexer rule set for NACL source text.
 *
 * This class only declares the token patterns and the callbacks attached to
 * them; the scanning loop itself, the cursor (`$this->count`), the line
 * counter (`$this->line`), the input (`$this->content`) and the `begin()` /
 * `error()` helpers are inherited from {@see AbstractLexer}.
 *
 * NOTE(review): several patterns start with a bare `?:`; this presumably
 * relies on AbstractLexer wrapping every pattern in parentheses so that the
 * inner capture group becomes the value handed to the callback - confirm
 * against AbstractLexer before touching those patterns.
 */
class Lexer extends AbstractLexer
{
    /** State entered on an opening double quote and left on the closing one. */
    protected const STATE_INSTRING = 1;

    /** Heredoc state identifier; not referenced by any rule declared in this class. */
    protected const STATE_INHEREDOC = 2;

    /** One or more spaces, tabs, line feeds or carriage returns. */
    public const REGEX_SPACE = '[ \t\n\r]+';

    /** Single-line comment introduced by `//` or `#`, up to the end of the line. */
    public const REGEX_COMMENT = '(?://|\#).*';

    /** Opening delimiter of a multi-line comment. */
    public const REGEX_COMMENT_ML = '/\*';

    /** Identifier: a letter or underscore followed by letters, digits, `_` or `-`. */
    public const REGEX_NAME = '[A-Za-z_][A-Za-z0-9_-]*';

    /** Variable reference of the form `${name}`; the name is captured. */
    public const REGEX_VAR = '?:\${([A-Za-z0-9_]+)}';

    /**
     * Number with optional exponent and optional unit suffix
     * (`min`, `ms`, `k`/`m`/`g` with optional `b`, or one of `b s h d w y`).
     *
     * The single-letter suffix class deliberately contains no `|`: inside a
     * character class a pipe is a literal, and accepting it would swallow the
     * `|` operator that directly follows a number.
     */
    public const REGEX_NUM = '(?:[0-9]*\.?[0-9]+|[0-9]+\.)(?:[eE](?:\+|-)?[0-9]+)?(?:m(?:in|s)|[KkGgMm][Bb]?|[bshdwy])?';

    /** Double quote, used both to open and to close a string. */
    public const REGEX_DQUOTE = '"';

    /** Heredoc opener `<<<LABEL` followed by a line feed; the label is captured. */
    public const REGEX_HEREDOC = '?:<<<([A-Za-z0-9_]+)\n';

    /** Boolean keyword followed by a word boundary. */
    public const REGEX_BOOL = '(?:true|false|yes|no|on|off)\b';

    /** The `null` keyword followed by a word boundary. */
    public const REGEX_NULL = 'null\b';

    /** Single-character punctuation/operators plus the two-character `<<` and `>>`. */
    public const REGEX_TOKEN = '[\[\]=:{};,.()&|%^/*+-]|<<|>>';

    /** Any single character; last-resort rule that reports an error. */
    public const REGEX_ANY = '.';

    /** Text accumulated for the string literal currently being scanned. */
    private string $textBuffer = '';

    /**
     * Builds the rule table, grouped by lexer state.
     *
     * Each state maps a pattern to either `false` or a closure. Closures
     * receive the matched value as `$yylval` (by reference when they rewrite
     * it) and return a token identifier, or nothing when no token is produced.
     * The order of the entries is preserved from the original declaration.
     *
     * NOTE(review): how `false` entries and null returns are interpreted is
     * decided by AbstractLexer (presumably "skip, emit nothing") - confirm.
     *
     * @return array<int|string, array<string, \Closure|false>>
     */
    protected function getRules(): array
    {
        return [
            self::STATE_INITIAL => [
                self::REGEX_SPACE => false,
                self::REGEX_COMMENT => false,
                self::REGEX_COMMENT_ML => function (): void {
                    $end = strpos($this->content, '*/', $this->count);
                    if (false === $end) {
                        // Count the remaining lines first so the error is
                        // reported at the end of the input.
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        $this->error('Unterminated multiline comment');
                    }

                    // Skip the comment body and its closing `*/`, keeping the
                    // line counter in sync with the skipped text.
                    $skipped = substr($this->content, $this->count, (int) $end - $this->count + 2);
                    $this->line += substr_count($skipped, "\n");
                    $this->count = (int) $end + 2;
                },
                self::REGEX_DQUOTE => function (): void {
                    $this->begin(self::STATE_INSTRING);
                    $this->textBuffer = '';
                },
                self::REGEX_BOOL => function (mixed &$yylval): int {
                    $yylval = TypeCaster::toBool($yylval);

                    return Token::T_BOOL;
                },
                self::REGEX_NULL => function (mixed &$yylval): int {
                    $yylval = null;

                    return Token::T_NULL;
                },
                self::REGEX_NUM => function (mixed &$yylval): int {
                    $yylval = TypeCaster::toNum($yylval);

                    return Token::T_NUM;
                },
                self::REGEX_NAME => fn(): int => Token::T_NAME,
                self::REGEX_HEREDOC => function (mixed &$yylval): int {
                    // The body ends at the first line feed followed by the label.
                    $terminator = "\n" . $yylval;
                    $end = strpos($this->content, $terminator, $this->count);
                    if (false === $end) {
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        $this->error('Unterminated HEREDOC');
                    }

                    $yylval = substr($this->content, $this->count, (int) $end - $this->count);
                    // +1 accounts for the line feed that belongs to the terminator.
                    $this->line += substr_count($yylval, "\n") + 1;
                    $this->count += strlen($yylval) + strlen($terminator);

                    return Token::T_END_STR;
                },
                self::REGEX_TOKEN => fn(mixed $yylval): string => $yylval,
                self::REGEX_VAR => fn(): int => Token::T_VAR,
                self::REGEX_ANY => function (mixed $yylval): void {
                    $this->error('Unexpected char \'' . $yylval . '\'');
                },
                self::EOF => fn(): int => Token::T_EOF,
            ],
            self::STATE_INSTRING => [
                // Plain text: anything except a backslash, a double quote or `$`.
                '[^\\\"$]+' => function (mixed &$yylval): ?int {
                    $this->textBuffer .= $yylval;
                    if ('$' !== substr($this->content, $this->count, 1)) {
                        return null;
                    }

                    // A `$` follows: emit what has been buffered so far as its
                    // own token before the `$` rule runs.
                    $yylval = $this->textBuffer;
                    $this->textBuffer = '';

                    return Token::T_STRING;
                },
                // Backslash escape; the escaped character is captured.
                '?:\\\(.)' => function (mixed $yylval): void {
                    $this->textBuffer .= match ($yylval) {
                        'n' => "\n",
                        't' => "\t",
                        '\\', '/', '"' => $yylval,
                        // A malformed \u escape is kept verbatim, like any
                        // other unknown escape.
                        'u' => $this->consumeUnicodeEscape() ?? ('\\' . $yylval),
                        default => '\\' . $yylval,
                    };
                },
                '\$' => function (mixed &$yylval): ?int {
                    $rest = substr($this->content, $this->count);
                    if (preg_match('/^{([A-Za-z0-9_]+)}/', $rest, $matches)) {
                        $this->count += strlen($matches[0]);
                        $yylval = $matches[1];

                        return Token::T_ENCAPSED_VAR;
                    }

                    // A `$` that does not start `${name}` is ordinary text.
                    $this->textBuffer .= $yylval;

                    return null;
                },
                self::REGEX_DQUOTE => function (mixed &$yylval): int {
                    $yylval = $this->textBuffer;
                    $this->begin(self::STATE_INITIAL);

                    return Token::T_END_STR;
                },
                self::EOF => function (): void {
                    $this->error('Unterminated string');
                },
            ],
        ];
    }

    /**
     * Decodes the hexadecimal part of a `\u` escape located at the cursor.
     *
     * Expects the cursor to sit right after `\u`. When four hex digits follow,
     * they are consumed; if they form a UTF-16 high surrogate (D800-DBFF)
     * immediately followed by a `\uDC00`-`\uDFFF` escape, that second escape is
     * consumed as well and both halves are combined into one code point. A
     * high surrogate without such a partner is encoded as-is.
     *
     * @return string|null the encoded character, or null (cursor untouched)
     *                     when the four characters are not all hex digits
     */
    private function consumeUnicodeEscape(): ?string
    {
        $hex = substr($this->content, $this->count, 4);
        if (1 !== preg_match('/\A[A-Fa-f0-9]{4}\z/', $hex)) {
            return null;
        }

        $codePoint = (int) hexdec($hex);
        $this->count += 4;

        $isHighSurrogate = $codePoint >= 0xD800 && $codePoint <= 0xDBFF;
        if (
            $isHighSurrogate
            && preg_match('/^\\\\u[dD][c-fC-F][0-9a-fA-F][0-9a-fA-F]/', substr($this->content, $this->count, 6), $matches)
        ) {
            // UTF-16 surrogate pair: merge the two 10-bit halves.
            $lowSurrogate = (int) hexdec(substr($matches[0], -4));
            $codePoint = (($codePoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF) + 0x10000;
            $this->count += 6;
        }

        return $this->fromCharCode($codePoint);
    }

    /**
     * Encodes a code point as a 1- to 4-byte UTF-8 sequence.
     *
     * No validation is performed: the value is packed according to its
     * magnitude only. The caller passes non-negative values.
     *
     * @param int $codePoint code point to encode
     *
     * @return string the byte sequence
     */
    private function fromCharCode(int $codePoint): string
    {
        return match (true) {
            $codePoint <= 0x7F => chr($codePoint),
            $codePoint <= 0x07FF => chr(0xC0 | ($codePoint >> 6))
                . chr(0x80 | ($codePoint & 0x3F)),
            $codePoint <= 0xFFFF => chr(0xE0 | ($codePoint >> 12))
                . chr(0x80 | (($codePoint >> 6) & 0x3F))
                . chr(0x80 | ($codePoint & 0x3F)),
            default => chr(0xF0 | ($codePoint >> 18))
                . chr(0x80 | (($codePoint >> 12) & 0x3F))
                . chr(0x80 | (($codePoint >> 6) & 0x3F))
                . chr(0x80 | ($codePoint & 0x3F)),
        };
    }
}
|
176
|
|
|
|