|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* This file is part of NACL. |
|
4
|
|
|
* |
|
5
|
|
|
* For the full copyright and license information, please view the LICENSE |
|
6
|
|
|
* file that was distributed with this source code. |
|
7
|
|
|
* |
|
8
|
|
|
* @copyright 2019 Nuglif (2018) Inc. |
|
9
|
|
|
* @license http://www.opensource.org/licenses/mit-license.html MIT License |
|
10
|
|
|
* @author Pierrick Charron <[email protected]> |
|
11
|
|
|
* @author Charle Demers <[email protected]> |
|
12
|
|
|
*/ |
|
13
|
|
|
|
|
14
|
|
|
declare(strict_types=1); |
|
15
|
|
|
|
|
16
|
|
|
namespace Nuglif\Nacl; |
|
17
|
|
|
|
|
18
|
|
|
/**
 * Lexer for the NACL language.
 *
 * The class only declares the lexing rules: a map of lexer state to
 * "regex => action" pairs returned by getRules(). Applying those rules to the
 * input is done by AbstractLexer (not shown here).
 *
 * Conventions used by the rules below, as far as they can be read from this class:
 *  - an action of `false` produces no token for the matched text;
 *  - an action closure may return a token identifier, or nothing to emit no token;
 *  - closures that take `&$yylval` overwrite it to set the value of the token;
 *  - `$this->content` is the input, `$this->count` the offset just after the
 *    current match and `$this->line` the current line number.
 *
 * NOTE(review): several patterns start with a bare `?:` (e.g. REGEX_VAR). This
 * presumably relies on AbstractLexer wrapping every pattern in parentheses, so
 * that the wrapper becomes non-capturing and the inner group is what ends up in
 * $yylval - confirm against AbstractLexer before touching those patterns.
 */
class Lexer extends AbstractLexer
{
    /** Default state: outside of any string. */
    const STATE_INITIAL = 0;
    /** State used between an opening and a closing double quote. */
    const STATE_INSTRING = 1;
    /** Declared but not referenced by any rule in this class. */
    const STATE_INHEREDOC = 2;

    const REGEX_SPACE = '[ \t\n\r]+';
    const REGEX_COMMENT = '(?://|\#).*';
    const REGEX_COMMENT_ML = '/\*';
    const REGEX_NAME = '[A-Za-z_][A-Za-z0-9_-]*';
    const REGEX_VAR = '?:\${([A-Za-z0-9_]+)}';
    /**
     * Number with an optional exponent and an optional suffix
     * (min, ms, K/M/G with optional B, or one of b s h d w y).
     *
     * The single-letter suffixes are written as the plain character class
     * [bshdwy]: inside brackets "|" is a literal pipe, not an alternation, so
     * spelling it "[b|s|h|d|w|y]" would swallow the "|" operator that directly
     * follows a number (e.g. in "1|2").
     */
    const REGEX_NUM = '(?:[0-9]*\.?[0-9]+|[0-9]+\.)(?:[eE](?:\+|-)?[0-9]+)?(?:m(?:in|s)|[KkGgMm][Bb]?|[bshdwy])?';
    const REGEX_DQUOTE = '"';
    const REGEX_HEREDOC = '?:<<<([A-Za-z0-9_]+)\n';
    const REGEX_BOOL = '(?:true|false|yes|no|on|off)\b';
    const REGEX_NULL = 'null\b';
    const REGEX_TOKEN = '[\[\]=:{};,.()&|%^/*+-]|<<|>>';
    const REGEX_ANY = '.';

    /**
     * Text accumulated for the string literal currently being lexed.
     *
     * @var string|null
     */
    private $textBuffer;

    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Returns the lexing rules, indexed by lexer state.
     *
     * @return array map of state => (pattern => false|\Closure)
     */
    protected function getRules()
    {
        return [
            self::STATE_INITIAL => [
                // Whitespace and single-line comments produce no token.
                self::REGEX_SPACE => false,
                self::REGEX_COMMENT => false,

                // Multi-line comment: skip up to and including the closing "*/".
                self::REGEX_COMMENT_ML => function () {
                    $end = strpos($this->content, '*/', $this->count);
                    if (false === $end) {
                        // Advance the line counter to the end of input so the
                        // error is reported there.
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        // NOTE(review): the code below assumes error() does not
                        // return (presumably it throws) - confirm in AbstractLexer.
                        $this->error('Unterminated multiline comment');
                    }

                    $comment = substr($this->content, $this->count, $end - $this->count + 2);
                    $this->line += substr_count($comment, "\n");
                    $this->count = $end + 2;
                },

                // Opening quote: switch to the string state with an empty buffer.
                self::REGEX_DQUOTE => function () {
                    $this->begin(self::STATE_INSTRING);
                    $this->textBuffer = '';
                },

                self::REGEX_BOOL => function (&$yylval) {
                    $yylval = TypeCaster::toBool($yylval);

                    return Token::T_BOOL;
                },

                self::REGEX_NULL => function (&$yylval) {
                    $yylval = null;

                    return Token::T_NULL;
                },

                self::REGEX_NUM => function (&$yylval) {
                    $yylval = TypeCaster::toNum($yylval);

                    return Token::T_NUM;
                },

                self::REGEX_NAME => function () {
                    return Token::T_NAME;
                },

                // Heredoc: on entry $yylval is used as the heredoc identifier.
                // The body runs up to the next "\n<identifier>" and becomes the
                // token value.
                self::REGEX_HEREDOC => function (&$yylval) {
                    $needle = "\n" . $yylval;
                    $end = strpos($this->content, $needle, $this->count);
                    if (false === $end) {
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        // NOTE(review): assumed not to return, see the comment rule above.
                        $this->error('Unterminated HEREDOC');
                    }

                    $yylval = substr($this->content, $this->count, $end - $this->count);
                    // "+ 1" accounts for the newline that is part of the needle.
                    $this->line += substr_count($yylval, "\n") + 1;
                    $this->count += strlen($yylval) + strlen($needle);

                    return Token::T_END_STR;
                },

                // Punctuation and operators: the matched text is the token itself.
                self::REGEX_TOKEN => function ($yylval) {
                    return $yylval;
                },

                self::REGEX_VAR => function () {
                    return Token::T_VAR;
                },

                // Anything not matched by a previous rule is an error.
                self::REGEX_ANY => function ($yylval) {
                    $this->error('Unexpected char \'' . $yylval . '\'');
                },

                self::EOF => function () {
                    return Token::T_EOF;
                },
            ],

            self::STATE_INSTRING => [
                // Run of ordinary characters (no backslash, quote or dollar).
                '[^\\\"$]+' => function (&$yylval) {
                    $this->textBuffer .= $yylval;

                    // When a "$" comes next, flush the buffered text as its own
                    // token before the "$" rule gets to run.
                    if ('$' === substr($this->content, $this->count, 1)) {
                        $yylval = $this->textBuffer;
                        $this->textBuffer = '';

                        return Token::T_STRING;
                    }
                },

                // Escape sequence: $yylval is the character following the backslash.
                '?:\\\(.)' => function ($yylval) {
                    switch ($yylval) {
                        case 'n':
                            $this->textBuffer .= "\n";
                            break;
                        case 't':
                            $this->textBuffer .= "\t";
                            break;
                        case '\\':
                        case '/':
                        case '"':
                            $this->textBuffer .= $yylval;
                            break;
                        case 'u':
                            // \uXXXX: the four characters after the "u" must be hex digits.
                            $utfCode = substr($this->content, $this->count, 4);
                            if (preg_match('/^[A-Fa-f0-9]{4}$/D', $utfCode)) {
                                $utf = hexdec($utfCode);
                                $this->count += 4;

                                // A high surrogate (D800-DBFF) directly followed by an
                                // escaped low surrogate (\uDC00-\uDFFF) is a UTF-16
                                // surrogate pair: merge both into a single code point.
                                $next = substr($this->content, $this->count, 6);
                                if ($utf >= 0xD800 && $utf <= 0xDBFF && preg_match('/^\\\\u[dD][c-fC-F][0-9a-fA-F][0-9a-fA-F]/', $next, $matches)) {
                                    $lowSurrogate = hexdec(substr($matches[0], -4));
                                    $utf = (($utf & 0x3FF) << 10) + ($lowSurrogate & 0x3FF) + 0x10000;
                                    $this->count += 6;
                                }

                                $this->textBuffer .= $this->fromCharCode($utf);
                                break;
                            }
                            // Not a valid \uXXXX sequence: keep it verbatim.
                            /* no break */
                        default:
                            // Unknown escape: keep the backslash and the character as-is.
                            $this->textBuffer .= '\\' . $yylval;
                            break;
                    }
                },

                // "$": either the start of a "${name}" interpolation or a plain dollar.
                '\$' => function (&$yylval) {
                    $rest = substr($this->content, $this->count);
                    if (preg_match('/^{([A-Za-z0-9_]+)}/', $rest, $matches)) {
                        $this->count += strlen($matches[0]);
                        $yylval = $matches[1];

                        return Token::T_ENCAPSED_VAR;
                    }

                    $this->textBuffer .= $yylval;
                },

                // Closing quote: emit the buffered text and leave the string state.
                self::REGEX_DQUOTE => function (&$yylval) {
                    $yylval = $this->textBuffer;
                    $this->begin(self::STATE_INITIAL);

                    return Token::T_END_STR;
                },

                self::EOF => function () {
                    $this->error('Unterminated string');
                },
            ],
        ];
    }

    /**
     * Encodes a code point as a UTF-8 byte sequence.
     *
     * Values up to 0x7F yield one byte, up to 0x7FF two bytes, up to 0xFFFF
     * three bytes and anything larger four bytes.
     *
     * @param int $bytes code point to encode
     *
     * @return string the UTF-8 encoded character
     */
    private function fromCharCode(int $bytes): string
    {
        if ((0x7F & $bytes) === $bytes) {
            return chr($bytes);
        }

        if ((0x07FF & $bytes) === $bytes) {
            return chr(0xc0 | ($bytes >> 6))
                . chr(0x80 | ($bytes & 0x3F));
        }

        if ((0xFFFF & $bytes) === $bytes) {
            return chr(0xe0 | ($bytes >> 12))
                . chr(0x80 | (($bytes >> 6) & 0x3F))
                . chr(0x80 | ($bytes & 0x3F));
        }

        return chr(0xF0 | ($bytes >> 18))
            . chr(0x80 | (($bytes >> 12) & 0x3F))
            . chr(0x80 | (($bytes >> 6) & 0x3F))
            . chr(0x80 | ($bytes & 0x3F));
    }
}
|
197
|
|
|
|