<?php declare(strict_types = 1);

namespace Apicart\FQL\Tests\Tokenizer;

use Apicart\FQL\Token\Token\GroupBegin;
use Apicart\FQL\Token\Token\GroupBegin as GroupBeginToken;
use Apicart\FQL\Token\Token\Phrase as PhraseToken;
use Apicart\FQL\Token\Token\Range as RangeToken;
use Apicart\FQL\Token\Token\Tag as TagToken;
use Apicart\FQL\Token\Token\User as UserToken;
use Apicart\FQL\Token\Token\Word as WordToken;
use Apicart\FQL\Tokenizer\AbstractTokenExtractor;
use Apicart\FQL\Tokenizer\Full;
use Apicart\FQL\Tokenizer\Tokenizer;
use Apicart\FQL\Value\Token;
use Apicart\FQL\Value\TokenSequence;
use PHPUnit\Framework\TestCase;

class FullTest extends TestCase
{

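    /**
     * Each case is a pair of input string and the expected token sequence.
     */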
    public function providerForTestTokenize(): array
    {
        return [
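            // Whitespace, plain words and group delimiters.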
            [" \n", [new Token(Tokenizer::TOKEN_WHITESPACE, " \n", 0)]],
            ['word', [new WordToken('word', 0, '', 'word')]],
            ["word\n", [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 4)]],
            ['word ', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 4)]],
            ['word(', [new WordToken('word', 0, '', 'word'), new GroupBeginToken('(', 4, '(', null)]],
            ['word)', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 4)]],
            ['šđčćž', [new WordToken('šđčćž', 0, '', 'šđčćž')]],
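            // Multi-byte UTF-8 words (emoji decoded via mb_convert_encoding).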
            [
                $jajeNaOko = mb_convert_encoding('🍳', 'UTF-8', 'HTML-ENTITIES'),
                [new WordToken($jajeNaOko, 0, '', $jajeNaOko)],
            ],
            [
                $blah = mb_convert_encoding(
                    '👩‍👩‍👧‍👧',
                    'UTF-8',
                    'HTML-ENTITIES'
                ),
                [new WordToken($blah, 0, '', $blah)],
            ],
            ['word-word', [new WordToken('word-word', 0, '', 'word-word')]],
            [
                "word\nword",
                [
                    new WordToken('word', 0, '', 'word'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 4),
                    new WordToken('word', 5, '', 'word'),
                ],
            ],
            [
                'word word',
                [
                    new WordToken('word', 0, '', 'word'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 4),
                    new WordToken('word', 5, '', 'word'),
                ],
            ],
            ['word\\ word', [new WordToken('word\\ word', 0, '', 'word word')]],
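            // Range tokens with inclusive ([ ]) and exclusive ({ }) bounds.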
            ['[a TO b]', [new RangeToken('[a TO b]', 0, '', 'a', 'b', 'inclusive', 'inclusive')]],
            ['[a TO b}', [new RangeToken('[a TO b}', 0, '', 'a', 'b', 'inclusive', 'exclusive')]],
            ['{a TO b}', [new RangeToken('{a TO b}', 0, '', 'a', 'b', 'exclusive', 'exclusive')]],
            ['{a TO b]', [new RangeToken('{a TO b]', 0, '', 'a', 'b', 'exclusive', 'inclusive')]],
            [
                '[2017-01-01 TO 2017-01-05]',
                [
                    new RangeToken(
                        '[2017-01-01 TO 2017-01-05]',
                        0,
                        '',
                        '2017-01-01',
                        '2017-01-05',
                        'inclusive',
                        'inclusive'
                    ),
                ],
            ],
            ['[20 TO *]', [new RangeToken('[20 TO *]', 0, '', '20', '*', 'inclusive', 'inclusive')]],
            ['[* TO 20]', [new RangeToken('[* TO 20]', 0, '', '*', '20', 'inclusive', 'inclusive')]],
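            // Phrases are delimited by double quotes; single quotes have no special meaning.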
            ['"phrase"', [new PhraseToken('"phrase"', 0, '', '"', 'phrase')]],
            [
                '"phrase" "phrase"',
                [
                    new PhraseToken('"phrase"', 0, '', '"', 'phrase'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 8),
                    new PhraseToken('"phrase"', 9, '', '"', 'phrase'),
                ],
            ],
            ["\"phrase\nphrase\"", [new PhraseToken("\"phrase\nphrase\"", 0, '', '"', "phrase\nphrase")]],
            ["'phrase'", [new WordToken("'phrase'", 0, '', "'phrase'")]],
            [
                "'phrase' 'phrase'",
                [
                    new WordToken("'phrase'", 0, '', "'phrase'"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 8),
                    new WordToken("'phrase'", 9, '', "'phrase'"),
                ],
            ],
            [
                "'phrase\nphrase'",
                [
                    new WordToken("'phrase", 0, '', "'phrase"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 7),
                    new WordToken("phrase'", 8, '', "phrase'"),
                ],
            ],
            ['"phrase\"phrase"', [new PhraseToken('"phrase\"phrase"', 0, '', '"', 'phrase"phrase')]],
            ["'phrase\\'phrase'", [new WordToken("'phrase\\'phrase'", 0, '', "'phrase\\'phrase'")]],
            ['"phrase\'phrase"', [new PhraseToken('"phrase\'phrase"', 0, '', '"', 'phrase\'phrase')]],
            [
                "'phrase\"phrase'",
                [
                    new WordToken("'phrase", 0, '', "'phrase"),
                    new Token(Tokenizer::TOKEN_BAILOUT, '"', 7),
                    new WordToken("phrase'", 8, '', "phrase'"),
                ],
            ],
            ['\"not_phrase\"', [new WordToken('\"not_phrase\"', 0, '', '"not_phrase"')]],
            ["\\'not_phrase\\'", [new WordToken("\\'not_phrase\\'", 0, '', "\\'not_phrase\\'")]],
            [
                '"phrase + - ! ( ) AND OR NOT \\ phrase"',
                [
                    new PhraseToken(
                        '"phrase + - ! ( ) AND OR NOT \\ phrase"',
                        0,
                        '',
                        '"',
                        'phrase + - ! ( ) AND OR NOT \\ phrase'
                    ),
                ],
            ],
            [
                "'word + - ! ( ) AND OR NOT \\ word'",
                [
                    new WordToken("'word", 0, '', "'word"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
                    new Token(Tokenizer::TOKEN_MANDATORY, '+', 6),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 7),
                    new Token(Tokenizer::TOKEN_PROHIBITED, '-', 8),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 9),
                    new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 10),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 11),
                    new GroupBegin('(', 12, '(', ''),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 13),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 14),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 15),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 16),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 19),
                    new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 20),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 22),
                    new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 23),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 26),
                    new WordToken("\\ word'", 27, '', " word'"),
                ],
            ],
            [
                '"phrase \+ \- \! \( \) \AND \OR \NOT \\\\ phrase"',
                [
                    new PhraseToken(
                        '"phrase \+ \- \! \( \) \AND \OR \NOT \\\\ phrase"',
                        0,
                        '',
                        '"',
                        'phrase \+ \- \! \( \) \AND \OR \NOT \\\\ phrase'
                    ),
                ],
            ],
            [
                "'word \\+ \\- \\! \\( \\) \\AND \\OR \\NOT \\\\ word'",
                [
                    new WordToken("'word", 0, '', "'word"),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
                    new WordToken('\\+', 6, '', '+'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 8),
                    new WordToken('\\-', 9, '', '-'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 11),
                    new WordToken('\\!', 12, '', '!'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 14),
                    new WordToken('\\(', 15, '', '('),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 17),
                    new WordToken('\\)', 18, '', ')'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 20),
                    new WordToken('\\AND', 21, '', '\AND'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 25),
                    new WordToken('\\OR', 26, '', '\OR'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 29),
                    new WordToken('\\NOT', 30, '', '\NOT'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 34),
                    new WordToken('\\\\', 35, '', '\\'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 37),
                    new WordToken("word'", 38, '', "word'"),
                ],
            ],
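            // Tag tokens (#tag).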
            ['#tag', [new TagToken('#tag', 0, '#', 'tag')]],
            ['\#tag', [new WordToken('\#tag', 0, '', '#tag')]],
            ['#tagšđčćž', [new WordToken('#tagšđčćž', 0, '', '#tagšđčćž')]],
            ['#_tag-tag', [new TagToken('#_tag-tag', 0, '#', '_tag-tag')]],
            ['#-not-tag', [new WordToken('#-not-tag', 0, '', '#-not-tag')]],
            ['#tag+', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_MANDATORY, '+', 4)]],
            ['#tag-', [new TagToken('#tag-', 0, '#', 'tag-')]],
            ['#tag!', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 4)]],
            ["#tag\n", [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 4)]],
            ['#tag ', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 4)]],
            ['#tag(', [new TagToken('#tag', 0, '#', 'tag'), new GroupBeginToken('(', 4, '(', null)]],
            ['#tag)', [new TagToken('#tag', 0, '#', 'tag'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 4)]],
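            // User tokens (@user).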
            ['@user', [new UserToken('@user', 0, '@', 'user')]],
            ['@user.user', [new UserToken('@user.user', 0, '@', 'user.user')]],
            ['\@user', [new WordToken('\@user', 0, '', '@user')]],
            ['@useršđčćž', [new WordToken('@useršđčćž', 0, '', '@useršđčćž')]],
            ['@_user-user', [new UserToken('@_user-user', 0, '@', '_user-user')]],
            ['@-not-user', [new WordToken('@-not-user', 0, '', '@-not-user')]],
            ['@user+', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_MANDATORY, '+', 5)]],
            ['@user-', [new UserToken('@user-', 0, '@', 'user-')]],
            ['@user!', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 5)]],
            ["@user\n", [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 5)]],
            ['@user ', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5)]],
            ['@user(', [new UserToken('@user', 0, '@', 'user'), new GroupBeginToken('(', 5, '(', null)]],
            ['@user)', [new UserToken('@user', 0, '@', 'user'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 5)]],
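            // Domain-prefixed words, phrases and groups.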
            ['domain:', [new WordToken('domain:', 0, '', 'domain:')]],
            ['some.domain:', [new WordToken('some.domain:', 0, '', 'some.domain:')]],
            ['domain:domain:', [new WordToken('domain:domain:', 0, 'domain', 'domain:')]],
            ['some.domain:some.domain:', [new WordToken('some.domain:some.domain:', 0, 'some.domain', 'some.domain:')]],
            [
                'domain:domain:domain:domain',
                [new WordToken('domain:domain:domain:domain', 0, 'domain', 'domain:domain:domain')],
            ],
            ['domain\:', [new WordToken('domain\:', 0, '', 'domain:')]],
            ['domain\::', [new WordToken('domain\::', 0, '', 'domain::')]],
            ['domain:word', [new WordToken('domain:word', 0, 'domain', 'word')]],
            ['domain\:word', [new WordToken('domain\:word', 0, '', 'domain:word')]],
            ['domain:"phrase"', [new PhraseToken('domain:"phrase"', 0, 'domain', '"', 'phrase')]],
            ['some.domain:"phrase"', [new PhraseToken('some.domain:"phrase"', 0, 'some.domain', '"', 'phrase')]],
            [
                'domain\:"phrase"',
                [new WordToken('domain\:', 0, '', 'domain:'), new PhraseToken('"phrase"', 8, '', '"', 'phrase')],
            ],
            [
                'domain:(one)',
                [
                    new GroupBeginToken('domain:(', 0, '(', 'domain'),
                    new WordToken('one', 8, '', 'one'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 11),
                ],
            ],
            [
                'some.domain:(one)',
                [
                    new GroupBeginToken('some.domain:(', 0, '(', 'some.domain'),
                    new WordToken('one', 13, '', 'one'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 16),
                ],
            ],
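            // Binary logical operators (AND/&&, OR/||, NOT) between words.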
            [
                'one AND two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 7),
                    new WordToken('two', 8, '', 'two'),
                ],
            ],
            [
                'one && two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, '&&', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 6),
                    new WordToken('two', 7, '', 'two'),
                ],
            ],
            [
                'one OR two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 6),
                    new WordToken('two', 7, '', 'two'),
                ],
            ],
            [
                'one || two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_OR, '||', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 6),
                    new WordToken('two', 7, '', 'two'),
                ],
            ],
            [
                'one NOT two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3),
                    new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 4),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 7),
                    new WordToken('two', 8, '', 'two'),
                ],
            ],
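            // AND, OR and NOT keywords adjacent to operators, whitespace and group delimiters.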
            ['AND', [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0)]],
            ['ANDword', [new WordToken('ANDword', 0, '', 'ANDword')]],
            ['wordAND', [new WordToken('wordAND', 0, '', 'wordAND')]],
            [
                'AND+',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 3)],
            ],
            ['AND\+', [new WordToken('AND\+', 0, '', 'AND+')]],
            [
                '+AND',
                [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 1)],
            ],
            [
                'AND-',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 3)],
            ],
            ['AND\-', [new WordToken('AND\-', 0, '', 'AND-')]],
            [
                '-AND',
                [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 1)],
            ],
            [
                'AND!',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 3)],
            ],
            ['AND\!', [new WordToken('AND\!', 0, '', 'AND!')]],
            [
                '!AND',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 1)],
            ],
            [
                "AND\n",
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 3)],
            ],
            [
                'AND ',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3)],
            ],
            ['AND(', [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new GroupBeginToken('(', 3, '(', null)]],
            [
                'AND)',
                [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_GROUP_END, ')', 3)],
            ],
            ['ORword', [new WordToken('ORword', 0, '', 'ORword')]],
            ['wordOR', [new WordToken('wordOR', 0, '', 'wordOR')]],
            ['OR', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0)]],
            ['OR+', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 2)]],
            ['OR\+', [new WordToken('OR\+', 0, '', 'OR+')]],
            ['+OR', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 1)]],
            ['OR-', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 2)]],
            ['OR\-', [new WordToken('OR\-', 0, '', 'OR-')]],
            ['-OR', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 1)]],
            [
                'OR!',
                [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 2)],
            ],
            ['OR\!', [new WordToken('OR\!', 0, '', 'OR!')]],
            [
                '!OR',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 1)],
            ],
            [
                "OR\n",
                [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 2)],
            ],
            ['OR ', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 2)]],
            ['OR(', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new GroupBeginToken('(', 2, '(', null)]],
            ['OR)', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_GROUP_END, ')', 2)]],
            ['NOT', [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0)]],
            ['NOTword', [new WordToken('NOTword', 0, '', 'NOTword')]],
            ['wordNOT', [new WordToken('wordNOT', 0, '', 'wordNOT')]],
            [
                'NOT+',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 3)],
            ],
            [
                '+NOT',
                [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 1)],
            ],
            [
                'NOT-',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 3)],
            ],
            [
                '-NOT',
                [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 1)],
            ],
            [
                'NOT!',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 3)],
            ],
            [
                '!NOT',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 1)],
            ],
            [
                "NOT\n",
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_WHITESPACE, "\n", 3)],
            ],
            [
                'NOT ',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 3)],
            ],
            ['NOT(', [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new GroupBeginToken('(', 3, '(', null)]],
            [
                'NOT)',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_GROUP_END, ')', 3)],
            ],
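            // Standalone +, -, ! operators and group delimiters next to words.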
            ['+', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0)]],
            ['++', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new Token(Tokenizer::TOKEN_MANDATORY, '+', 1)]],
            ['-', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0)]],
            ['--', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new Token(Tokenizer::TOKEN_PROHIBITED, '-', 1)]],
            ['!', [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0)]],
            [
                '!!',
                [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 1)],
            ],
            ['+word', [new Token(Tokenizer::TOKEN_MANDATORY, '+', 0), new WordToken('word', 1, '', 'word')]],
            ['-word', [new Token(Tokenizer::TOKEN_PROHIBITED, '-', 0), new WordToken('word', 1, '', 'word')]],
            ['!word', [new Token(Tokenizer::TOKEN_LOGICAL_NOT_2, '!', 0), new WordToken('word', 1, '', 'word')]],
            ['(word', [new GroupBeginToken('(', 0, '(', null), new WordToken('word', 1, '', 'word')]],
            [')word', [new Token(Tokenizer::TOKEN_GROUP_END, ')', 0), new WordToken('word', 1, '', 'word')]],
            ['word+', [new WordToken('word+', 0, '', 'word+')]],
            ['word-', [new WordToken('word-', 0, '', 'word-')]],
            ['word!', [new WordToken('word!', 0, '', 'word!')]],
            ['word(', [new WordToken('word', 0, '', 'word'), new GroupBeginToken('(', 4, '(', null)]],
            ['word)', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_GROUP_END, ')', 4)]],
            ['one+two+', [new WordToken('one+two+', 0, '', 'one+two+')]],
            ['one-two-', [new WordToken('one-two-', 0, '', 'one-two-')]],
            ['one!two!', [new WordToken('one!two!', 0, '', 'one!two!')]],
            [
                'one(two(',
                [
                    new WordToken('one', 0, '', 'one'),
                    new GroupBeginToken('(', 3, '(', null),
                    new WordToken('two', 4, '', 'two'),
                    new GroupBeginToken('(', 7, '(', null),
                ],
            ],
            [
                'one)two)',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 3),
                    new WordToken('two', 4, '', 'two'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 7),
                ],
            ],
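            // Backslash-escaped special characters inside words.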
            ['word\+', [new WordToken('word\+', 0, '', 'word+')]],
            ['word\-', [new WordToken('word\-', 0, '', 'word-')]],
            ['word\!', [new WordToken('word\!', 0, '', 'word!')]],
            ['word\(', [new WordToken('word\(', 0, '', 'word(')]],
            ['word\)', [new WordToken('word\)', 0, '', 'word)')]],
            ['\+word', [new WordToken('\+word', 0, '', '+word')]],
            ['\-word', [new WordToken('\-word', 0, '', '-word')]],
            ['\!word', [new WordToken('\!word', 0, '', '!word')]],
            ['\(word', [new WordToken('\(word', 0, '', '(word')]],
            ['\)word', [new WordToken('\)word', 0, '', ')word')]],
            ['one\+two\+', [new WordToken('one\+two\+', 0, '', 'one+two+')]],
            ['one\-two\-', [new WordToken('one\-two\-', 0, '', 'one-two-')]],
            ['one\!two\!', [new WordToken('one\!two\!', 0, '', 'one!two!')]],
            ['one\(two\(', [new WordToken('one\(two\(', 0, '', 'one(two(')]],
            ['one\)two\)', [new WordToken('one\)two\)', 0, '', 'one)two)')]],
            [
                'one\\\\\)two\\\\\(one\\\\\+two\\\\\-one\\\\\!two',
                [
                    new WordToken(
                        'one\\\\\)two\\\\\(one\\\\\+two\\\\\-one\\\\\!two',
                        0,
                        '',
                        'one\)two\(one\+two\-one\!two'
                    ),
                ],
            ],
            [
                'one\\\\)two\\\\(one\\\\+two\\\\-one\\\\!two',
                [
                    new WordToken('one\\\\', 0, '', 'one\\'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 5),
                    new WordToken('two\\\\', 6, '', 'two\\'),
                    new GroupBeginToken('(', 11, '(', null),
                    new WordToken('one\\\\+two\\\\-one\\\\!two', 12, '', 'one\+two\-one\!two'),
                ],
            ],
            ['one+two-one!two', [new WordToken('one+two-one!two', 0, '', 'one+two-one!two')]],
            ['one\\\'two', [new WordToken('one\\\'two', 0, '', "one\\'two")]],
            ['one\\"two', [new WordToken('one\\"two', 0, '', 'one"two')]],
            ['\\', [new WordToken('\\', 0, '', '\\')]],
            ['one\\two', [new WordToken('one\\two', 0, '', 'one\\two')]],
            ['one\\\\+\\-\\!\\(\\)two', [new WordToken('one\\\\+\\-\\!\\(\\)two', 0, '', 'one\\+-!()two')]],
            ['\\\\', [new WordToken('\\\\', 0, '', '\\')]],
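            // Words with a trailing colon inside a group or before an operator, and words with trailing apostrophes.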
            [
                '(type:)',
                [
                    new GroupBeginToken('(', 0, '(', null),
                    new WordToken('type:', 1, '', 'type:'),
                    new Token(Tokenizer::TOKEN_GROUP_END, ')', 6),
                ],
            ],
            [
                'type: AND',
                [
                    new WordToken('type:', 0, '', 'type:'),
                    new Token(Tokenizer::TOKEN_WHITESPACE, ' ', 5),
                    new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 6),
                ],
            ],
            ["word'", [new WordToken("word'", 0, '', "word'")]],
            ['one\'two', [new WordToken("one'two", 0, '', "one'two")]],
            ["AND'", [new WordToken("AND'", 0, '', "AND'")]],
            ["OR'", [new WordToken("OR'", 0, '', "OR'")]],
            ["NOT'", [new WordToken("NOT'", 0, '', "NOT'")]],
        ];
    }


    /**
     * @dataProvider providerForTestTokenize
     *
     * @param string $string
     * @param Token[] $expectedTokens
     */
    public function testTokenize($string, array $expectedTokens): void
    {
        $tokenExtractor = $this->getTokenExtractor();
        $tokenizer = new Tokenizer($tokenExtractor);
        $tokenSequence = $tokenizer->tokenize($string);
        self::assertInstanceOf(TokenSequence::class, $tokenSequence);
        self::assertEquals($expectedTokens, $tokenSequence->getTokens());
        self::assertEquals($string, $tokenSequence->getSource());
    }


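    /**
     * Each case contains an unpaired double quote, which should surface as a TOKEN_BAILOUT token.
     */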
    public function providerForTestTokenizeNotRecognized(): array
    {
        return [
            [
                (
                    $blah = mb_convert_encoding(
                        '👩‍👩‍👧‍👧',
                        'UTF-8',
                        'HTML-ENTITIES'
                    )
                ) . '"',
                [new WordToken($blah, 0, '', $blah), new Token(Tokenizer::TOKEN_BAILOUT, '"', 7)],
            ],
            ['"' . $blah, [new Token(Tokenizer::TOKEN_BAILOUT, '"', 0), new WordToken($blah, 1, '', $blah)]],
            ['word"', [new WordToken('word', 0, '', 'word'), new Token(Tokenizer::TOKEN_BAILOUT, '"', 4)]],
            [
                'one"two',
                [
                    new WordToken('one', 0, '', 'one'),
                    new Token(Tokenizer::TOKEN_BAILOUT, '"', 3),
                    new WordToken('two', 4, '', 'two'),
                ],
            ],
            [
                'šđ"čćž',
                [
                    new WordToken('šđ', 0, '', 'šđ'),
                    new Token(Tokenizer::TOKEN_BAILOUT, '"', 2),
                    new WordToken('čćž', 3, '', 'čćž'),
                ],
            ],
            ['AND"', [new Token(Tokenizer::TOKEN_LOGICAL_AND, 'AND', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 3)]],
            ['OR"', [new Token(Tokenizer::TOKEN_LOGICAL_OR, 'OR', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 2)]],
            ['NOT"', [new Token(Tokenizer::TOKEN_LOGICAL_NOT, 'NOT', 0), new Token(Tokenizer::TOKEN_BAILOUT, '"', 3)]],
        ];
    }


    /**
     * @dataProvider providerForTestTokenizeNotRecognized
     *
     * @param string $string
     * @param Token[] $expectedTokens
     */
    public function testTokenizeNotRecognized($string, array $expectedTokens): void
    {
        $tokenExtractor = $this->getTokenExtractor();
        $tokenizer = new Tokenizer($tokenExtractor);
        $tokenSequence = $tokenizer->tokenize($string);
        self::assertInstanceOf(TokenSequence::class, $tokenSequence);
        self::assertEquals($expectedTokens, $tokenSequence->getTokens());
        self::assertEquals($string, $tokenSequence->getSource());
    }


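    /**
     * Token extractor under test; kept in a method so a subclass could exercise
     * a different AbstractTokenExtractor implementation.
     */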
    protected function getTokenExtractor(): AbstractTokenExtractor
    {
        return new Full;
    }

}