1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* This file is part of NACL. |
4
|
|
|
* |
5
|
|
|
* For the full copyright and license information, please view the LICENSE |
6
|
|
|
* file that was distributed with this source code. |
7
|
|
|
* |
8
|
|
|
* @copyright 2019 Nuglif (2018) Inc. |
9
|
|
|
* @license http://www.opensource.org/licenses/mit-license.html MIT License |
10
|
|
|
* @author Pierrick Charron <[email protected]> |
11
|
|
|
* @author Charle Demers <[email protected]> |
12
|
|
|
*/ |
13
|
|
|
|
14
|
|
|
declare(strict_types=1); |
15
|
|
|
|
16
|
|
|
namespace Nuglif\Nacl; |
17
|
|
|
|
18
|
|
|
/**
 * Lexer for NACL source text.
 *
 * Supplies the rule table consumed by AbstractLexer: for each lexer state, an
 * ordered map of regular expression => action. An action is either `false`
 * or a closure that may rewrite the matched value ($yylval, taken by
 * reference where it is modified) and return a token.
 *
 * NOTE(review): the matching loop, `begin()`, `error()`, `$content`, `$count`
 * and `$line` live in AbstractLexer, which is not visible here. Comments
 * below about how they behave are assumptions to be confirmed there.
 */
class Lexer extends AbstractLexer
{
    /** State entered after an opening double quote. */
    protected const STATE_INSTRING = 1;

    /** Declared but not referenced by any rule in this class. */
    protected const STATE_INHEREDOC = 2;

    public const REGEX_SPACE = '[ \t\n\r]+';
    public const REGEX_COMMENT = '(?://|\#).*';
    public const REGEX_COMMENT_ML = '/\*';
    public const REGEX_NAME = '[A-Za-z_][A-Za-z0-9_-]*';

    // The leading "?:" presumably makes the group AbstractLexer wraps around
    // each rule non-capturing, so the inner (...) becomes the value handed to
    // the action - TODO confirm against AbstractLexer.
    public const REGEX_VAR = '?:\${([A-Za-z0-9_]+)}';

    // Number with optional exponent and optional unit suffix. The final
    // alternative is a plain character class: it must not contain "|", which
    // is a literal inside [...] and would let the number swallow a following
    // "|" operator (e.g. in "1|2").
    public const REGEX_NUM = '(?:[0-9]*\.?[0-9]+|[0-9]+\.)(?:[eE](?:\+|-)?[0-9]+)?(?:m(?:in|s)|[KkGgMm][Bb]?|[bshdwy])?';

    public const REGEX_DQUOTE = '"';
    public const REGEX_HEREDOC = '?:<<<([A-Za-z0-9_]+)\n';
    public const REGEX_BOOL = '(?:true|false|yes|no|on|off)\b';
    public const REGEX_NULL = 'null\b';
    public const REGEX_TOKEN = '[\[\]=:{};,.()&|%^/*+-]|<<|>>';
    public const REGEX_ANY = '.';

    /** Accumulates the decoded content of the double-quoted string being lexed. */
    private string $textBuffer = '';

    /**
     * Build the rule table, keyed by lexer state.
     *
     * The order of the rules is significant and is preserved exactly
     * (presumably the first matching rule wins - confirm in AbstractLexer).
     *
     * @return array<int, array<string, \Closure|false>>
     */
    protected function getRules(): array
    {
        return [
            self::STATE_INITIAL => [
                // `false` actions: presumably "match and discard" - TODO confirm.
                self::REGEX_SPACE => false,
                self::REGEX_COMMENT => false,

                // "/*" was matched: skip forward to the closing "*/".
                self::REGEX_COMMENT_ML => function (): void {
                    $pos = strpos($this->content, '*/', $this->count);
                    if (false === $pos) {
                        // Account for the remaining lines so the error is
                        // reported at the end of the input.
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        // NOTE(review): error() is assumed not to return.
                        $this->error('Unterminated multiline comment');
                    }
                    // Count the newlines inside the comment body (including
                    // the two characters of the terminator), then jump past it.
                    $this->line += substr_count(substr($this->content, $this->count, (int) $pos - $this->count + 2), "\n");
                    $this->count = (int) $pos + 2;
                },

                // Opening quote: switch state and start with an empty buffer.
                self::REGEX_DQUOTE => function (): void {
                    $this->begin(self::STATE_INSTRING);
                    $this->textBuffer = '';
                },

                self::REGEX_BOOL => function (mixed &$yylval): int {
                    $yylval = TypeCaster::toBool($yylval);

                    return Token::T_BOOL;
                },

                self::REGEX_NULL => function (mixed &$yylval): int {
                    $yylval = null;

                    return Token::T_NULL;
                },

                self::REGEX_NUM => function (mixed &$yylval): int {
                    $yylval = TypeCaster::toNum($yylval);

                    return Token::T_NUM;
                },

                self::REGEX_NAME => fn() => Token::T_NAME,

                // "<<<LABEL\n" was matched; $yylval holds LABEL. The body runs
                // up to the next "\nLABEL" and is emitted as one T_END_STR.
                self::REGEX_HEREDOC => function (mixed &$yylval): int {
                    $needle = "\n" . $yylval;
                    $pos = strpos($this->content, $needle, $this->count);
                    if (false === $pos) {
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        // NOTE(review): error() is assumed not to return.
                        $this->error('Unterminated HEREDOC');
                    }

                    $yylval = substr($this->content, $this->count, (int) $pos - $this->count);
                    // +1 for the newline that is part of the terminator needle.
                    $this->line += substr_count($yylval, "\n") + 1;
                    $this->count += strlen($yylval) + strlen($needle);

                    return Token::T_END_STR;
                },

                // Punctuation and operators are returned as themselves.
                self::REGEX_TOKEN => fn(mixed $yylval): string => $yylval,
                self::REGEX_VAR => fn(): int => Token::T_VAR,

                // Catch-all: anything not matched above is an error.
                self::REGEX_ANY => function (mixed $yylval): void {
                    $this->error('Unexpected char \'' . $yylval . '\'');
                },
                self::EOF => fn(): int => Token::T_EOF,
            ],
            self::STATE_INSTRING => [
                // Run of ordinary characters (anything but backslash, quote, "$").
                '[^\\\"$]+' => function (mixed &$yylval) {
                    $this->textBuffer .= $yylval;
                    // If a "$" follows, flush what has been buffered as a
                    // T_STRING before the "$" rule gets a chance to run.
                    if ('$' === substr($this->content, $this->count, 1)) {
                        $yylval = $this->textBuffer;
                        $this->textBuffer = '';

                        return Token::T_STRING;
                    }
                },

                // Backslash escape; $yylval is the character after the backslash.
                '?:\\\(.)' => function (mixed $yylval) {
                    switch ($yylval) {
                        case 'n':
                            $this->textBuffer .= "\n";
                            break;
                        case 't':
                            $this->textBuffer .= "\t";
                            break;
                        case '\\':
                        case '/':
                        case '"':
                            $this->textBuffer .= $yylval;
                            break;
                        case 'u':
                            // \uXXXX: the four hex digits have not been consumed yet.
                            $utfCode = substr($this->content, $this->count, 4);
                            if (preg_match('/^[A-Fa-f0-9]{4}\z/', $utfCode)) {
                                $utf = hexdec($utfCode);
                                $this->count += 4;
                                // A high surrogate directly followed by an
                                // escaped low surrogate (\uDC00-\uDFFF) encodes
                                // one code point above U+FFFF.
                                if ($utf >= 0xD800 && $utf <= 0xDBFF && preg_match('/^\\\\u[dD][c-fC-F][0-9a-fA-F][0-9a-fA-F]/', substr($this->content, $this->count, 6), $matches)) {
                                    $lowSurrogate = hexdec(substr($matches[0], -4));
                                    $utf = (($utf & 0x3FF) << 10) + ($lowSurrogate & 0x3FF) + 0x10000;
                                    $this->count += 6;
                                }
                                $this->textBuffer .= $this->fromCharCode($utf);

                                break;
                            }
                            // Not followed by four hex digits: keep "\u" verbatim
                            // by falling through to the default branch.
                            /* no break */
                        default:
                            // Unknown escape: keep the backslash and the character.
                            $this->textBuffer .= '\\' . $yylval;
                            break;
                    }
                },

                // "$": either the start of "${name}" or a literal dollar sign.
                '\$' => function (mixed &$yylval) {
                    if (preg_match('/^{([A-Za-z0-9_]+)}/', substr($this->content, $this->count), $matches)) {
                        $this->count += strlen($matches[0]);
                        $yylval = $matches[1];

                        return Token::T_ENCAPSED_VAR;
                    }

                    $this->textBuffer .= $yylval;
                },

                // Closing quote: emit the buffered text and leave string state.
                self::REGEX_DQUOTE => function (mixed &$yylval) {
                    $yylval = $this->textBuffer;
                    $this->begin(self::STATE_INITIAL);

                    return Token::T_END_STR;
                },

                self::EOF => function (): void {
                    $this->error('Unterminated string');
                },
            ],
        ];
    }

    /**
     * Encode a code point as a UTF-8 byte sequence.
     *
     * The number of bytes (1 to 4) is chosen from the number of significant
     * bits in the value. No validation is performed: surrogate values are
     * encoded as ordinary three-byte sequences.
     *
     * @param int $bytes Code point value (despite the name, not a byte count).
     *
     * @return string The one- to four-byte encoding.
     */
    private function fromCharCode(int $bytes): string
    {
        return match (true) {
            // Fits in 7 bits: single byte.
            (0x7F & $bytes) === $bytes => chr($bytes),
            // Fits in 11 bits: two bytes.
            (0x07FF & $bytes) === $bytes => chr(0xc0 | ($bytes >> 6))
                . chr(0x80 | ($bytes & 0x3F)),
            // Fits in 16 bits: three bytes.
            (0xFFFF & $bytes) === $bytes => chr(0xe0 | ($bytes >> 12))
                . chr(0x80 | (($bytes >> 6) & 0x3F))
                . chr(0x80 | ($bytes & 0x3F)),
            // Everything else: four bytes.
            default => chr(0xF0 | ($bytes >> 18))
                . chr(0x80 | (($bytes >> 12) & 0x3F))
                . chr(0x80 | (($bytes >> 6) & 0x3F))
                . chr(0x80 | ($bytes & 0x3F)),
        };
    }
}
176
|
|
|
|