<?php declare(strict_types = 1);

namespace Apicart\FQL\Tests\Tokenizer;

use Apicart\FQL\Token\Token\GroupBegin;
use Apicart\FQL\Token\Token\GroupBegin as GroupBeginToken;
use Apicart\FQL\Token\Token\Phrase as PhraseToken;
use Apicart\FQL\Token\Token\Range as RangeToken;
use Apicart\FQL\Token\Token\Tag as TagToken;
use Apicart\FQL\Token\Token\User as UserToken;
use Apicart\FQL\Token\Token\Word as WordToken;
use Apicart\FQL\Tokenizer\AbstractTokenExtractor;
use Apicart\FQL\Tokenizer\Full;
use Apicart\FQL\Tokenizer\Tokenizer;
use Apicart\FQL\Value\Token;
use Apicart\FQL\Value\TokenSequence;
use PHPUnit\Framework\TestCase;
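
/**
 * Tokenizer tests run against the Full token extractor: each data-provider row pairs
 * an input string with the exact token sequence the tokenizer is expected to return.
 */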
class FullTest extends TestCase
{
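
    /**
     * Provides input strings that tokenize cleanly, together with the expected token sequence.
     */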
    public function providerForTestTokenize(): array
    {
        return [
            [" \n", [new Token(Tokenizer::TOKEN_WHITESPACE, " \n", 0)]],
            ['word', [new WordToken('word', 0, '', 'word')]],
            ["word\n", [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 4)]],
            ['word ', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 4)]],
            ['word(', [new WordToken('word', 0, '', 'word'), new GroupBeginToken('(', 4, '(', null)]],
            ['word)', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 4)]],
            ['šđčćž', [new WordToken('šđčćž', 0, '', 'šđčćž')]],
            [
                $jajeNaOko = mb_convert_encoding('🍳', 'UTF-8', 'HTML-ENTITIES'),
                [new WordToken($jajeNaOko, 0, '', $jajeNaOko)],
            ],
            [
                $blah = mb_convert_encoding(
                    '👩‍👩‍👧‍👧',
                    'UTF-8',
                    'HTML-ENTITIES'
                ),
                [new WordToken($blah, 0, '', $blah)],
            ],
            ['word-word', [new WordToken('word-word', 0, '', 'word-word')]],
            [
                "word\nword",
                [
                    new WordToken('word', 0, '', 'word'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 4),
                    new WordToken('word', 5, '', 'word'),
                ],
            ],
            [
                'word word',
                [
                    new WordToken('word', 0, '', 'word'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 4),
                    new WordToken('word', 5, '', 'word'),
                ],
            ],
            ['word\\ word', [new WordToken('word\\ word', 0, '', 'word word')]],
            ['[a TO b]', [new RangeToken('[a TO b]', 0, '', 'a', 'b', 'inclusive', 'inclusive')]],
            ['[a TO b}', [new RangeToken('[a TO b}', 0, '', 'a', 'b', 'inclusive', 'exclusive')]],
            ['{a TO b}', [new RangeToken('{a TO b}', 0, '', 'a', 'b', 'exclusive', 'exclusive')]],
            ['{a TO b]', [new RangeToken('{a TO b]', 0, '', 'a', 'b', 'exclusive', 'inclusive')]],
            [
                '[2017-01-01 TO 2017-01-05]',
                [
                    new RangeToken(
                        '[2017-01-01 TO 2017-01-05]',
                        0,
                        '',
                        '2017-01-01',
                        '2017-01-05',
                        'inclusive',
                        'inclusive'
                    ),
                ],
            ],
            ['[20 TO *]', [new RangeToken('[20 TO *]', 0, '', '20', '*', 'inclusive', 'inclusive')]],
            ['[* TO 20]', [new RangeToken('[* TO 20]', 0, '', '*', '20', 'inclusive', 'inclusive')]],
            ['"phrase"', [new PhraseToken('"phrase"', 0, '', '"', 'phrase')]],
            [
                '"phrase" "phrase"',
                [
                    new PhraseToken('"phrase"', 0, '', '"', 'phrase'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 8),
                    new PhraseToken('"phrase"', 9, '', '"', 'phrase'),
                ],
            ],
            ["\"phrase\nphrase\"", [new PhraseToken("\"phrase\nphrase\"", 0, '', '"', "phrase\nphrase")]],
            ["'phrase'", [new WordToken("'phrase'", 0, '', "'phrase'")]],
            [
                "'phrase' 'phrase'",
                [
                    new WordToken("'phrase'", 0, '', "'phrase'"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 8),
                    new WordToken("'phrase'", 9, '', "'phrase'"),
                ],
            ],
            [
                "'phrase\nphrase'",
                [
                    new WordToken("'phrase", 0, '', "'phrase"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 7),
                    new WordToken("phrase'", 8, '', "phrase'"),
                ],
            ],
            ['"phrase\"phrase"', [new PhraseToken('"phrase\"phrase"', 0, '', '"', 'phrase"phrase')]],
            ["'phrase\\'phrase'", [new WordToken("'phrase\\'phrase'", 0, '', "'phrase\\'phrase'")]],
            ['"phrase\'phrase"', [new PhraseToken('"phrase\'phrase"', 0, '', '"', 'phrase\'phrase')]],
            [
                "'phrase\"phrase'",
                [
                    new WordToken("'phrase", 0, '', "'phrase"),
                    new Token(Tokenizer::TOKEN_BAILOUT, '"', 7),
                    new WordToken("phrase'", 8, '', "phrase'"),
                ],
            ],
            ['\"not_phrase\"', [new WordToken('\"not_phrase\"', 0, '', '"not_phrase"')]],
            ["\\'not_phrase\\'", [new WordToken("\\'not_phrase\\'", 0, '', "\\'not_phrase\\'")]],
            [
                '"phrase + - ! ( ) AND OR NOT \\ phrase"',
                [
                    new PhraseToken(
                        '"phrase + - ! ( ) AND OR NOT \\ phrase"',
                        0,
                        '',
                        '"',
                        'phrase + - ! ( ) AND OR NOT \\ phrase'
                    ),
                ],
            ],
            [
                "'word + - ! ( ) AND OR NOT \\ word'",
                [
                    new WordToken("'word", 0, '', "'word"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
                    new Token(Tokenizer::TOKEN_MANDATORY, '+', 6),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 7),
                    new Token(Tokenizer::TOKEN_PROHIBITED, '-', 8),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 9),
                    new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 10),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 11),
                    new GroupBegin('(', 12, '(', ''),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 13),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 14),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 15),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 16),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 19),
                    new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 20),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 22),
                    new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 23),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 26),
                    new WordToken("\\ word'", 27, '', " word'"),
                ],
            ],
            [
                '"phrase \+ \- \! \( \) \AND \OR \NOT \\\\ phrase"',
                [
                    new PhraseToken(
                        '"phrase \+ \- \! \( \) \AND \OR \NOT \\\\ phrase"',
                        0,
                        '',
                        '"',
                        'phrase \+ \- \! \( \) \AND \OR \NOT \\\\ phrase'
                    ),
                ],
            ],
            [
                "'word \\+ \\- \\! \\( \\) \\AND \\OR \\NOT \\\\ word'",
                [
                    new WordToken("'word", 0, '', "'word"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
                    new WordToken('\\+', 6, '', '+'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 8),
                    new WordToken('\\-', 9, '', '-'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 11),
                    new WordToken('\\!', 12, '', '!'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 14),
                    new WordToken('\\(', 15, '', '('),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 17),
                    new WordToken('\\)', 18, '', ')'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 20),
                    new WordToken('\\AND', 21, '', '\AND'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 25),
                    new WordToken('\\OR', 26, '', '\OR'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 29),
                    new WordToken('\\NOT', 30, '', '\NOT'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 34),
                    new WordToken('\\\\', 35, '', '\\'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 37),
                    new WordToken("word'", 38, '', "word'"),
                ],
            ],
            ['#tag', [new TagToken('#tag', 0, '#', 'tag')]],
            ['\#tag', [new WordToken('\#tag', 0, '', '#tag')]],
            ['#tagšđčćž', [new WordToken('#tagšđčćž', 0, '', '#tagšđčćž')]],
            ['#_tag-tag', [new TagToken('#_tag-tag', 0, '#', '_tag-tag')]],
            ['#-not-tag', [new WordToken('#-not-tag', 0, '', '#-not-tag')]],
            ['#tag+', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_MANDATORY, '+', 4)]],
            ['#tag-', [new TagToken('#tag-', 0, '#', 'tag-')]],
            ['#tag!', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 4)]],
            ["#tag\n", [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 4)]],
            ['#tag ', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 4)]],
            ['#tag(', [new TagToken('#tag', 0, '#', 'tag'), new GroupBeginToken('(', 4, '(', null)]],
            ['#tag)', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 4)]],
            ['@user', [new UserToken('@user', 0, '@', 'user')]],
            ['@user.user', [new UserToken('@user.user', 0, '@', 'user.user')]],
            ['\@user', [new WordToken('\@user', 0, '', '@user')]],
            ['@useršđčćž', [new WordToken('@useršđčćž', 0, '', '@useršđčćž')]],
            ['@_user-user', [new UserToken('@_user-user', 0, '@', '_user-user')]],
            ['@-not-user', [new WordToken('@-not-user', 0, '', '@-not-user')]],
            ['@user+', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_MANDATORY, '+', 5)]],
            ['@user-', [new UserToken('@user-', 0, '@', 'user-')]],
            ['@user!', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 5)]],
            ["@user\n", [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 5)]],
            ['@user ', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5)]],
            ['@user(', [new UserToken('@user', 0, '@', 'user'), new GroupBeginToken('(', 5, '(', null)]],
            ['@user)', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 5)]],
            ['domain:', [new WordToken('domain:', 0, '', 'domain:')]],
            ['some.domain:', [new WordToken('some.domain:', 0, '', 'some.domain:')]],
            ['domain:domain:', [new WordToken('domain:domain:', 0, 'domain', 'domain:')]],
            ['some.domain:some.domain:', [new WordToken('some.domain:some.domain:', 0, 'some.domain', 'some.domain:')]],
            [
                'domain:domain:domain:domain',
                [new WordToken('domain:domain:domain:domain', 0, 'domain', 'domain:domain:domain')],
            ],
            ['domain\:', [new WordToken('domain\:', 0, '', 'domain:')]],
            ['domain\::', [new WordToken('domain\::', 0, '', 'domain::')]],
            ['domain:word', [new WordToken('domain:word', 0, 'domain', 'word')]],
            ['domain\:word', [new WordToken('domain\:word', 0, '', 'domain:word')]],
            ['domain:"phrase"', [new PhraseToken('domain:"phrase"', 0, 'domain', '"', 'phrase')]],
            ['some.domain:"phrase"', [new PhraseToken('some.domain:"phrase"', 0, 'some.domain', '"', 'phrase')]],
            [
                'domain\:"phrase"',
                [new WordToken('domain\:', 0, '', 'domain:'), new PhraseToken('"phrase"', 8, '', '"', 'phrase')],
            ],
            [
                'domain:(one)',
                [
                    new GroupBeginToken('domain:(', 0, '(', 'domain'),
                    new WordToken('one', 8, '', 'one'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 11),
                ],
            ],
            [
                'some.domain:(one)',
                [
                    new GroupBeginToken('some.domain:(', 0, '(', 'some.domain'),
                    new WordToken('one', 13, '', 'one'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 16),
                ],
            ],
            [
                'one AND two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 7),
                    new WordToken('two', 8, '', 'two'),
                ],
            ],
            [
                'one && two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, '&&', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 6),
                    new WordToken('two', 7, '', 'two'),
                ],
            ],
            [
                'one OR two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 6),
                    new WordToken('two', 7, '', 'two'),
                ],
            ],
            [
                'one || two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_OR, '||', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 6),
                    new WordToken('two', 7, '', 'two'),
                ],
            ],
            [
                'one NOT two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 7),
                    new WordToken('two', 8, '', 'two'),
                ],
            ],
            ['AND', [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0)]],
            ['ANDword', [new WordToken('ANDword', 0, '', 'ANDword')]],
            ['wordAND', [new WordToken('wordAND', 0, '', 'wordAND')]],
            [
                'AND+',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 3)],
            ],
            ['AND\+', [new WordToken('AND\+', 0, '', 'AND+')]],
            [
                '+AND',
                [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 1)],
            ],
            [
                'AND-',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 3)],
            ],
            ['AND\-', [new WordToken('AND\-', 0, '', 'AND-')]],
            [
                '-AND',
                [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 1)],
            ],
            [
                'AND!',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 3)],
            ],
            ['AND\!', [new WordToken('AND\!', 0, '', 'AND!')]],
            [
                '!AND',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 1)],
            ],
            [
                "AND\n",
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 3)],
            ],
            [
                'AND ',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3)],
            ],
            ['AND(', [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new GroupBeginToken('(', 3, '(', null)]],
            [
                'AND)',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_GROUP_END, ')', 3)],
            ],
            ['ORword', [new WordToken('ORword', 0, '', 'ORword')]],
            ['wordOR', [new WordToken('wordOR', 0, '', 'wordOR')]],
            ['OR', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0)]],
            ['OR+', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 2)]],
            ['OR\+', [new WordToken('OR\+', 0, '', 'OR+')]],
            ['+OR', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 1)]],
            ['OR-', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 2)]],
            ['OR\-', [new WordToken('OR\-', 0, '', 'OR-')]],
            ['-OR', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 1)]],
            [
                'OR!',
                [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 2)],
            ],
            ['OR\!', [new WordToken('OR\!', 0, '', 'OR!')]],
            [
                '!OR',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 1)],
            ],
            [
                "OR\n",
                [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 2)],
            ],
            ['OR ', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 2)]],
            ['OR(', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new GroupBeginToken('(', 2, '(', null)]],
            ['OR)', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_GROUP_END, ')', 2)]],
            ['NOT', [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0)]],
            ['NOTword', [new WordToken('NOTword', 0, '', 'NOTword')]],
            ['wordNOT', [new WordToken('wordNOT', 0, '', 'wordNOT')]],
            [
                'NOT+',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 3)],
            ],
            [
                '+NOT',
                [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 1)],
            ],
            [
                'NOT-',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 3)],
            ],
            [
                '-NOT',
                [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 1)],
            ],
            [
                'NOT!',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 3)],
            ],
            [
                '!NOT',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 1)],
            ],
            [
                "NOT\n",
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 3)],
            ],
            [
                'NOT ',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3)],
            ],
            ['NOT(', [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new GroupBeginToken('(', 3, '(', null)]],
            [
                'NOT)',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_GROUP_END, ')', 3)],
            ],
            ['+', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0)]],
            ['++', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 1)]],
            ['-', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0)]],
            ['--', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 1)]],
            ['!', [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0)]],
            [
                '!!',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 1)],
            ],
            ['+word', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new WordToken('word', 1, '', 'word')]],
            ['-word', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new WordToken('word', 1, '', 'word')]],
            ['!word', [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new WordToken('word', 1, '', 'word')]],
            ['(word', [new GroupBeginToken('(', 0, '(', null), new WordToken('word', 1, '', 'word')]],
            [')word', [new Token(Tokenizer::TOKEN_GROUP_END, ')', 0), new WordToken('word', 1, '', 'word')]],
            ['word+', [new WordToken('word+', 0, '', 'word+')]],
            ['word-', [new WordToken('word-', 0, '', 'word-')]],
            ['word!', [new WordToken('word!', 0, '', 'word!')]],
            ['word(', [new WordToken('word', 0, '', 'word'), new GroupBeginToken('(', 4, '(', null)]],
            ['word)', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 4)]],
            ['one+two+', [new WordToken('one+two+', 0, '', 'one+two+')]],
            ['one-two-', [new WordToken('one-two-', 0, '', 'one-two-')]],
            ['one!two!', [new WordToken('one!two!', 0, '', 'one!two!')]],
            [
                'one(two(',
                [
                    new WordToken('one', 0, '', 'one'),
                    new GroupBeginToken('(', 3, '(', null),
                    new WordToken('two', 4, '', 'two'),
                    new GroupBeginToken('(', 7, '(', null),
                ],
            ],
            [
                'one)two)',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 3),
                    new WordToken('two', 4, '', 'two'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 7),
                ],
            ],
            ['word\+', [new WordToken('word\+', 0, '', 'word+')]],
            ['word\-', [new WordToken('word\-', 0, '', 'word-')]],
            ['word\!', [new WordToken('word\!', 0, '', 'word!')]],
            ['word\(', [new WordToken('word\(', 0, '', 'word(')]],
            ['word\)', [new WordToken('word\)', 0, '', 'word)')]],
            ['\+word', [new WordToken('\+word', 0, '', '+word')]],
            ['\-word', [new WordToken('\-word', 0, '', '-word')]],
            ['\!word', [new WordToken('\!word', 0, '', '!word')]],
            ['\(word', [new WordToken('\(word', 0, '', '(word')]],
            ['\)word', [new WordToken('\)word', 0, '', ')word')]],
            ['one\+two\+', [new WordToken('one\+two\+', 0, '', 'one+two+')]],
            ['one\-two\-', [new WordToken('one\-two\-', 0, '', 'one-two-')]],
            ['one\!two\!', [new WordToken('one\!two\!', 0, '', 'one!two!')]],
            ['one\(two\(', [new WordToken('one\(two\(', 0, '', 'one(two(')]],
            ['one\)two\)', [new WordToken('one\)two\)', 0, '', 'one)two)')]],
            [
                'one\\\\\)two\\\\\(one\\\\\+two\\\\\-one\\\\\!two',
                [
                    new WordToken(
                        'one\\\\\)two\\\\\(one\\\\\+two\\\\\-one\\\\\!two',
                        0,
                        '',
                        'one\)two\(one\+two\-one\!two'
                    ),
                ],
            ],
            [
                'one\\\\)two\\\\(one\\\\+two\\\\-one\\\\!two',
                [
                    new WordToken('one\\\\', 0, '', 'one\\'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 5),
                    new WordToken('two\\\\', 6, '', 'two\\'),
                    new GroupBeginToken('(', 11, '(', null),
                    new WordToken('one\\\\+two\\\\-one\\\\!two', 12, '', 'one\+two\-one\!two'),
                ],
            ],
            ['one+two-one!two', [new WordToken('one+two-one!two', 0, '', 'one+two-one!two')]],
            ['one\\\'two', [new WordToken('one\\\'two', 0, '', "one\\'two")]],
            ['one\\"two', [new WordToken('one\\"two', 0, '', 'one"two')]],
            ['\\', [new WordToken('\\', 0, '', '\\')]],
            ['one\\two', [new WordToken('one\\two', 0, '', 'one\\two')]],
            ['one\\\\+\\-\\!\\(\\)two', [new WordToken('one\\\\+\\-\\!\\(\\)two', 0, '', 'one\\+-!()two')]],
            ['\\\\', [new WordToken('\\\\', 0, '', '\\')]],
            [
                '(type:)',
                [
                    new GroupBeginToken('(', 0, '(', null),
                    new WordToken('type:', 1, '', 'type:'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 6),
                ],
            ],
            [
                'type: AND',
                [
                    new WordToken('type:', 0, '', 'type:'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 6),
                ],
            ],
            ["word'", [new WordToken("word'", 0, '', "word'")]],
            ['one\'two', [new WordToken("one'two", 0, '', "one'two")]],
            ["AND'", [new WordToken("AND'", 0, '', "AND'")]],
            ["OR'", [new WordToken("OR'", 0, '', "OR'")]],
            ["NOT'", [new WordToken("NOT'", 0, '', "NOT'")]],
        ];
    }

    /**
     * @dataProvider providerForTestTokenize
     *
     * @param string $string
     * @param Token[] $expectedTokens
     */
    public function testTokenize($string, array $expectedTokens): void
    {
        $tokenExtractor = $this->getTokenExtractor();
        $tokenizer = new Tokenizer($tokenExtractor);
        $tokenSequence = $tokenizer->tokenize($string);
        self::assertInstanceOf(TokenSequence::class, $tokenSequence);
        self::assertEquals($expectedTokens, $tokenSequence->getTokens());
        self::assertEquals($string, $tokenSequence->getSource());
    }
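
    /**
     * Provides input strings containing a double quote that does not delimit a valid
     * phrase; the tokenizer is expected to emit a TOKEN_BAILOUT token for that quote.
     */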
    public function providerForTestTokenizeNotRecognized(): array
    {
        return [
            [
                (
                    $blah = mb_convert_encoding(
                        '👩‍👩‍👧‍👧',
                        'UTF-8',
                        'HTML-ENTITIES'
                    )
                ) . '"',
                [new WordToken($blah, 0, '', $blah), new Token(Tokenizer::TOKEN_BAILOUT, '"', 7)],
            ],
            ['"' . $blah, [new Token(Tokenizer::TOKEN_BAILOUT, '"', 0), new WordToken($blah, 1, '', $blah)]],
            ['word"', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_BAILOUT, '"', 4)]],
            [
                'one"two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_BAILOUT, '"', 3),
                    new WordToken('two', 4, '', 'two'),
                ],
            ],
            [
                'šđ"čćž',
                [
                    new WordToken('šđ', 0, '', 'šđ'),
                    new Token(Tokenizer::TOKEN_BAILOUT, '"', 2),
                    new WordToken('čćž', 3, '', 'čćž'),
                ],
            ],
            ['AND"', [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 3)]],
            ['OR"', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 2)]],
            ['NOT"', [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 3)]],
        ];
    }

    /**
     * @dataProvider providerForTestTokenizeNotRecognized
     *
     * @param string $string
     * @param Token[] $expectedTokens
     */
    public function testTokenizeNotRecognized($string, array $expectedTokens): void
    {
        $tokenExtractor = $this->getTokenExtractor();
        $tokenizer = new Tokenizer($tokenExtractor);
        $tokenSequence = $tokenizer->tokenize($string);
        self::assertInstanceOf(TokenSequence::class, $tokenSequence);
        self::assertEquals($expectedTokens, $tokenSequence->getTokens());
        self::assertEquals($string, $tokenSequence->getSource());
    }
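
    /**
     * The token extractor exercised by the tests above.
     */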
    protected function getTokenExtractor(): AbstractTokenExtractor
    {
        return new Full;
    }

}