1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PeacefulBit\Slate\Parser; |
4
|
|
|
|
5
|
|
|
use function Nerd\Common\Arrays\append; |
6
|
|
|
use function Nerd\Common\Arrays\toHeadTail; |
7
|
|
|
use function Nerd\Common\Functional\tail; |
8
|
|
|
use function Nerd\Common\Strings\toArray; |
9
|
|
|
|
10
|
|
|
use PeacefulBit\Slate\Exceptions\TokenizerException; |
11
|
|
|
use PeacefulBit\Slate\Parser\Tokens; |
12
|
|
|
|
13
|
|
|
/**
 * Converts source text into a flat list of token objects.
 *
 * The tokenizer is written as a small state machine: each state is a closure
 * that consumes one character from the remaining input and hands control to
 * the next state (base, symbol, string, escape or comment).
 */
class Tokenizer
{
    const TOKEN_OPEN_BRACKET = '(';
    const TOKEN_CLOSE_BRACKET = ')';
    const TOKEN_DOUBLE_QUOTE = '"';
    const TOKEN_BACK_SLASH = '\\';
    const TOKEN_SEMICOLON = ';';

    const TOKEN_TAB = "\t";
    const TOKEN_SPACE = " ";
    const TOKEN_NEW_LINE = "\n";
    const TOKEN_CARRIAGE_RETURN = "\r";

    const CHAR_DIGITS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
    const CHAR_DOT = '.';

    /**
     * Tells whether the character has structural meaning of its own:
     * a bracket, a double quote or a semicolon.
     *
     * @param $char
     * @return bool
     */
    private function isStructural($char)
    {
        // Strict comparison: only the exact one-character strings match.
        return in_array($char, [
            self::TOKEN_OPEN_BRACKET,
            self::TOKEN_CLOSE_BRACKET,
            self::TOKEN_DOUBLE_QUOTE,
            self::TOKEN_SEMICOLON
        ], true);
    }

    /**
     * Tells whether the character is whitespace that separates tokens:
     * tab, carriage return, new line or space.
     *
     * @param $char
     * @return bool
     */
    private function isDelimiter($char)
    {
        return in_array($char, [
            self::TOKEN_TAB,
            self::TOKEN_CARRIAGE_RETURN,
            self::TOKEN_NEW_LINE,
            self::TOKEN_SPACE
        ], true);
    }

    /**
     * Tells whether the character may be part of a symbol, i.e. it is
     * neither a delimiter nor a structural character.
     *
     * @param $char
     * @return bool
     */
    private function isSymbol($char)
    {
        return !$this->isDelimiter($char) && !$this->isStructural($char);
    }

    /**
     * Tells whether the character is a single decimal digit.
     *
     * CHAR_DIGITS holds integers while characters are strings, so a loose
     * in_array() would rely on PHP's string-to-number juggling: before PHP 8
     * any non-numeric character equals 0, and strings such as "1.0" equal 1
     * on every version. The digits are therefore converted to strings and
     * compared strictly.
     *
     * @param $char
     * @return bool
     */
    private function isNumber($char)
    {
        return in_array((string) $char, array_map('strval', self::CHAR_DIGITS), true);
    }

    /**
     * Convert source code to array of tokens.
     *
     * @param string $code
     * @return Token[]
     * @throws TokenizerException when the input ends inside a string literal
     *                            or right after an escape character
     */
    public function tokenize($code)
    {
        // NOTE(review): tail() comes from Nerd\Common\Functional and is
        // presumably a tail-call trampoline; its exact semantics are not
        // visible in this file - confirm before changing how states are wrapped.

        // Initial state of parser
        $baseIter = tail(function ($rest, $acc) use (&$baseIter, &$symbolIter, &$stringIter, &$commentIter) {
            if (count($rest) === 0) {
                return $acc;
            }
            list($head, $tail) = toHeadTail($rest);
            switch ($head) {
                // We got '(', so we just add it to accumulator.
                case self::TOKEN_OPEN_BRACKET:
                    return $baseIter($tail, append($acc, new Tokens\OpenBracketToken));
                // We got ')', and doing the same as in previous case.
                case self::TOKEN_CLOSE_BRACKET:
                    return $baseIter($tail, append($acc, new Tokens\CloseBracketToken));
                // We got '"'. That means that we're in the beginning of the string.
                // So we switch our state to stringIter.
                case self::TOKEN_DOUBLE_QUOTE:
                    return $stringIter($tail, '', $acc);
                // We got ';'. And that means that comment is starting here. So we
                // change our state to commentIter.
                case self::TOKEN_SEMICOLON:
                    return $commentIter($tail, '', $acc);
                default:
                    // If current char is a delimiter, we ignore it.
                    if ($this->isDelimiter($head)) {
                        return $baseIter($tail, $acc);
                    }
                    // In all other cases we interpret current char as first char
                    // of symbol and change our state to symbolIter.
                    return $symbolIter($tail, $head, $acc);
            }
        });

        // State when parser parses any symbol
        $symbolIter = tail(function ($rest, $buffer, $acc) use (&$symbolIter, &$baseIter) {
            if (count($rest) > 0) {
                list($head, $tail) = toHeadTail($rest);
                if ($this->isSymbol($head)) {
                    return $symbolIter($tail, $buffer . $head, $acc);
                }
            }
            // The symbol is complete: either the input ended or the next char
            // is a delimiter / structural char. Classify what was collected.
            if (is_numeric($buffer)) {
                $symbolToken = new Tokens\NumericToken($buffer);
            } else {
                $symbolToken = new Tokens\IdentifierToken($buffer);
            }
            // $rest (not $tail) is passed on so the terminating character is
            // handled by the base state.
            return $baseIter($rest, append($acc, $symbolToken));
        });

        // State when parser parses string
        $stringIter = tail(function ($rest, $buffer, $acc) use (&$stringIter, &$baseIter, &$escapeIter) {
            if (count($rest) === 0) {
                throw new TokenizerException("Unexpected end of string");
            }
            list($head, $tail) = toHeadTail($rest);
            // Closing quote: emit the collected string and return to base state.
            if ($head === self::TOKEN_DOUBLE_QUOTE) {
                return $baseIter($tail, append($acc, new Tokens\StringToken($buffer)));
            }
            // Backslash: the next character is taken literally by escapeIter.
            if ($head === self::TOKEN_BACK_SLASH) {
                return $escapeIter($tail, $buffer, $acc);
            }
            return $stringIter($tail, $buffer . $head, $acc);
        });

        // State when parser parses escaped symbol
        $escapeIter = tail(function ($rest, $buffer, $acc) use (&$stringIter) {
            if (count($rest) === 0) {
                throw new TokenizerException("Unused escape character");
            }
            list($head, $tail) = toHeadTail($rest);
            // The escaped character is appended verbatim; sequences such as
            // \n are not translated.
            return $stringIter($tail, $buffer . $head, $acc);
        });

        // State when parser ignores comments.
        // The second argument is kept so the state has the same arity as the
        // other buffered states, but comment text is discarded, so nothing is
        // accumulated into it.
        // NOTE(review): unlike the other states this closure is not wrapped
        // in tail(); confirm whether that is intentional.
        $commentIter = function ($rest, $buffer, $acc) use (&$commentIter, &$baseIter) {
            if (count($rest) > 0) {
                list($head, $tail) = toHeadTail($rest);
                if ($head !== self::TOKEN_NEW_LINE) {
                    return $commentIter($tail, $buffer, $acc);
                }
            }
            // End of input or new line: resume normal tokenizing. The new
            // line itself is left in $rest and skipped by the base state.
            return $baseIter($rest, $acc);
        };

        return $baseIter(toArray($code), []);
    }
}
169
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.