1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* This file is part of NACL. |
4
|
|
|
* |
5
|
|
|
* For the full copyright and license information, please view the LICENSE |
6
|
|
|
* file that was distributed with this source code. |
7
|
|
|
* |
8
|
|
|
* @copyright 2019 Nuglif (2018) Inc. |
9
|
|
|
* @license http://www.opensource.org/licenses/mit-license.html MIT License |
10
|
|
|
* @author Pierrick Charron <[email protected]> |
11
|
|
|
* @author Charle Demers <[email protected]> |
12
|
|
|
*/ |
13
|
|
|
|
14
|
|
|
declare(strict_types=1); |
15
|
|
|
|
16
|
|
|
namespace Nuglif\Nacl; |
17
|
|
|
|
18
|
|
|
/**
 * Lexer for the NACL configuration language.
 *
 * The class only declares the lexing rules (one rule table per lexer state);
 * the scanning loop itself, as well as the `content`, `count` and `line`
 * properties and the `begin()` / `error()` helpers used below, are inherited
 * from AbstractLexer.
 *
 * NOTE(review): several patterns start with a bare `?:` (REGEX_VAR,
 * REGEX_HEREDOC and the escape rule of the string state). That is only valid
 * if AbstractLexer wraps every rule in parentheses, turning the wrapper into a
 * non-capturing group so that the inner capture becomes the token value -
 * confirm against AbstractLexer.
 */
class Lexer extends AbstractLexer
{
    // Lexer states. STATE_INHEREDOC is declared but has no rule table in this class.
    const STATE_INITIAL = 0;
    const STATE_INSTRING = 1;
    const STATE_INHEREDOC = 2;

    const REGEX_SPACE = '[ \t\n\r]+';
    // Single-line comment: `//` or `#` up to the end of the line.
    const REGEX_COMMENT = '(?://|\#).*';
    // Only the opening marker; the body is consumed by hand in the rule callback.
    const REGEX_COMMENT_ML = '/\*';
    const REGEX_NAME = '[A-Za-z_][A-Za-z0-9_-]*';
    const REGEX_VAR = '?:\${([A-Za-z0-9_]+)}';
    // Integer or decimal, optional exponent, optional unit suffix
    // (min, ms, K/M/G with optional B, or one of b s h d w y).
    // The last suffix class must not contain `|`: inside a character class it is
    // a literal pipe, which made input such as `1|2` lex as the number `1|`.
    const REGEX_NUM = '(?:[0-9]*\.?[0-9]+|[0-9]+\.)(?:[eE](?:\+|-)?[0-9]+)?(?:m(?:in|s)|[KkGgMm][Bb]?|[bshdwy])?';
    const REGEX_DQUOTE = '"';
    const REGEX_HEREDOC = '?:<<<([A-Za-z0-9_]+)\n';
    const REGEX_BOOL = '(?:true|false|yes|no|on|off)\b';
    const REGEX_NULL = 'null\b';
    const REGEX_TOKEN = '[\[\]=:{};,.()&|%^/*+-]|<<|>>';
    // Catch-all used to report any character no other rule accepted.
    const REGEX_ANY = '.';

    /** Accumulates the decoded text of the string literal currently being lexed. */
    private $textBuffer;

    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Builds the rule tables, indexed by lexer state.
     *
     * Each table maps a pattern to either `false` or a callback. Callbacks
     * that return a value return a Token::T_* constant or, for REGEX_TOKEN,
     * the matched text itself; callbacks taking `$yylval` by reference
     * replace the token value.
     * NOTE(review): `false` rules and callbacks returning nothing presumably
     * make the lexer skip the match and keep scanning - confirm against
     * AbstractLexer.
     *
     * @return array rule tables keyed by the STATE_* constants
     */
    protected function getRules()
    {
        return [
            self::STATE_INITIAL => [
                self::REGEX_SPACE => false,
                self::REGEX_COMMENT => false,
                self::REGEX_COMMENT_ML => function () {
                    // Look for the comment terminator after the opening marker.
                    $pos = strpos($this->content, '*/', $this->count);
                    if (false === $pos) {
                        // Account for the rest of the input so the error points at the last line.
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        // NOTE(review): error() is expected to abort lexing (presumably by
                        // throwing); the code below is not meaningful when $pos is false.
                        $this->error('Unterminated multiline comment');
                    }
                    // Count the newlines inside the comment, then jump past the terminator.
                    $this->line += substr_count(substr($this->content, $this->count, $pos - $this->count + 2), "\n");
                    $this->count = $pos + 2;
                },
                self::REGEX_DQUOTE => function () {
                    // Opening quote: switch to the string state with an empty buffer.
                    $this->begin(self::STATE_INSTRING);
                    $this->textBuffer = '';
                },
                self::REGEX_BOOL => function (&$yylval) {
                    $yylval = TypeCaster::toBool($yylval);

                    return Token::T_BOOL;
                },
                self::REGEX_NULL => function (&$yylval) {
                    $yylval = null;

                    return Token::T_NULL;
                },
                self::REGEX_NUM => function (&$yylval) {
                    $yylval = TypeCaster::toNum($yylval);

                    return Token::T_NUM;
                },
                self::REGEX_NAME => function () {
                    return Token::T_NAME;
                },
                self::REGEX_HEREDOC => function (&$yylval) {
                    // $yylval is used as the closing identifier (presumably the capture
                    // of REGEX_HEREDOC); the body ends at the first line starting with it.
                    $needle = "\n" . $yylval;
                    $pos = strpos($this->content, $needle, $this->count);
                    if (false === $pos) {
                        $this->line += substr_count(substr($this->content, $this->count), "\n");
                        // NOTE(review): as above, error() is expected not to return.
                        $this->error('Unterminated HEREDOC');
                    }

                    // The token value is the raw text between the opening line and the terminator.
                    $yylval = substr($this->content, $this->count, $pos - $this->count);
                    // +1 for the newline that is part of the terminator needle.
                    $this->line += substr_count($yylval, "\n") + 1;
                    $this->count += strlen($yylval) + strlen($needle);

                    return Token::T_END_STR;
                },
                self::REGEX_TOKEN => function ($yylval) {
                    // Punctuation and operators are their own token type.
                    return $yylval;
                },
                self::REGEX_VAR => function () {
                    return Token::T_VAR;
                },
                self::REGEX_ANY => function ($yylval) {
                    $this->error('Unexpected char \'' . $yylval . '\'');
                },
                self::EOF => function () {
                    return Token::T_EOF;
                },
            ],
            self::STATE_INSTRING => [
                // Plain run of characters: anything except backslash, double quote and `$`.
                '[^\\\"$]+' => function (&$yylval) {
                    $this->textBuffer .= $yylval;
                    // A `$` comes next: emit what has been buffered so far as its own
                    // token before the `$` rule gets to handle a possible variable.
                    if ('$' === substr($this->content, $this->count, 1)) {
                        $yylval = $this->textBuffer;
                        $this->textBuffer = '';

                        return Token::T_STRING;
                    }
                },
                // Escape sequence: backslash followed by a single character.
                '?:\\\(.)' => function ($yylval) {
                    switch ($yylval) {
                        case 'n':
                            $this->textBuffer .= "\n";
                            break;
                        case 't':
                            $this->textBuffer .= "\t";
                            break;
                        case '\\':
                        case '/':
                        case '"':
                            // These characters escape to themselves.
                            $this->textBuffer .= $yylval;
                            break;
                        case 'u':
                            // \uXXXX: the four hex digits are read directly from the input.
                            $utfCode = substr($this->content, $this->count, 4);
                            if (preg_match('/[A-Fa-f0-9]{4,4}/', $utfCode)) {
                                $utf = hexdec($utfCode);
                                $this->count += 4;
                                // UTF-32 ?
                                // A high surrogate followed by a \uDC00-\uDFFF escape is
                                // combined into a single code point above U+FFFF.
                                if ($utf >= 0xD800 && $utf <= 0xDBFF && preg_match('/^\\\\u[dD][c-fC-F][0-9a-fA-F][0-9a-fA-F]/', substr($this->content, $this->count, 6), $matches)) {
                                    $utf_hi = hexdec(substr($matches[0], -4));
                                    $utf = (($utf & 0x3FF) << 10) + ($utf_hi & 0x3FF) + 0x10000;
                                    $this->count += 6;
                                }
                                $this->textBuffer .= $this->fromCharCode($utf);
                                break;
                            }
                            // Not followed by four hex digits: keep the escape verbatim.
                            /* no break */
                        default:
                            // Unknown escape: keep the backslash and the character as written.
                            $this->textBuffer .= '\\' . $yylval;
                            break;
                    }
                },
                '\$' => function (&$yylval) {
                    // `${name}` inside a string is an embedded variable reference.
                    if (preg_match('/^{([A-Za-z0-9_]+)}/', substr($this->content, $this->count), $matches)) {
                        $this->count += strlen($matches[0]);
                        $yylval = $matches[1];

                        return Token::T_ENCAPSED_VAR;
                    }

                    // Otherwise the `$` is ordinary text.
                    $this->textBuffer .= $yylval;
                },
                self::REGEX_DQUOTE => function (&$yylval) {
                    // Closing quote: emit the buffered text and leave the string state.
                    $yylval = $this->textBuffer;
                    $this->begin(self::STATE_INITIAL);

                    return Token::T_END_STR;
                },
                self::EOF => function () {
                    $this->error('Unterminated string');
                },
            ],
        ];
    }

    /**
     * Encodes a code point as a UTF-8 byte string.
     *
     * The branch is chosen by the number of significant bits: up to 7 bits
     * gives one byte, up to 11 bits two bytes, up to 16 bits three bytes and
     * anything larger four bytes. No validation is performed, so a lone
     * surrogate value is encoded as a three-byte sequence.
     *
     * @param int $bytes the code point to encode (despite the parameter name)
     *
     * @return string the UTF-8 encoded character
     */
    private function fromCharCode($bytes)
    {
        switch (true) {
            case (0x7F & $bytes) == $bytes:
                return chr($bytes);

            case (0x07FF & $bytes) == $bytes:
                return chr(0xc0 | ($bytes >> 6))
                    . chr(0x80 | ($bytes & 0x3F));

            case (0xFFFF & $bytes) == $bytes:
                return chr(0xe0 | ($bytes >> 12))
                    . chr(0x80 | (($bytes >> 6) & 0x3F))
                    . chr(0x80 | ($bytes & 0x3F));

            default:
                return chr(0xF0 | ($bytes >> 18))
                    . chr(0x80 | (($bytes >> 12) & 0x3F))
                    . chr(0x80 | (($bytes >> 6) & 0x3F))
                    . chr(0x80 | ($bytes & 0x3F));
        }
    }
}
197
|
|
|
|