GenericEngine::tokenizer()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * Licensed under The GPL-3.0 License
4
 * For full copyright and license information, please see the LICENSE.txt
5
 * Redistributions of files must retain the above copyright notice.
6
 *
7
 * @since    2.0.0
8
 * @author   Christopher Castro <[email protected]>
9
 * @link     http://www.quickappscms.org
10
 * @license  http://opensource.org/licenses/gpl-3.0.html GPL-3.0 License
11
 */
12
namespace Search\Engine\Generic;
13
14
use Cake\Cache\Cache;
15
use Cake\Core\InstanceConfigTrait;
16
use Cake\Datasource\EntityInterface;
17
use Cake\Error\FatalErrorException;
18
use Cake\Event\Event;
19
use Cake\Event\EventManager;
20
use Cake\ORM\Query;
21
use Cake\ORM\Table;
22
use Cake\ORM\TableRegistry;
23
use Cake\Utility\Hash;
24
use Cake\Utility\Inflector;
25
use Search\Engine\BaseEngine;
26
use Search\Engine\Generic\Exception\CompoundPrimaryKeyException;
27
use Search\Parser\MiniLanguage\MiniLanguageParser;
28
use Search\Parser\TokenInterface;
29
use \ArrayObject;
30
31
/**
32
 * This Search Engine allows entities to be searchable through an auto-generated
33
 * list of words.
34
 *
35
 * ## Using Generic Engine
36
 *
37
 * You must indicate Searchable behavior to use this engine, for example when
38
 * attaching Searchable behavior to `Articles` table:
39
 *
40
 * ```php
41
 * $this->addBehavior('Search.Searchable', [
42
 *     'engine' => [
43
 *         'className' => 'Search\Engine\Generic\GenericEngine',
44
 *         'config' => [
45
 *             'bannedWords' => []
46
 *         ]
47
 *     ]
48
 * ]);
49
 * ```
50
 *
51
 * This engine will apply a series of filters (converts to lowercase, remove line
52
 * breaks, etc) to words list extracted from each entity being indexed.
53
 *
54
 * ### Banned Words
55
 *
56
 * You can use the `bannedWords` option to tell which words should not be indexed by
57
 * this engine. For example:
58
 *
59
 * ```php
60
 * $this->addBehavior('Search.Searchable', [
61
 *     'engine' => [
62
 *         'className' => 'Search\Engine\Generic\GenericEngine',
63
 *         'config' => [
64
 *             'bannedWords' => ['of', 'the', 'and']
65
 *         ]
66
 *     ]
67
 * ]);
68
 * ```
69
 *
70
 * If you need to ban a really specific list of words you can set `bannedWords`
71
 * option as a callable method that should return true or false to tell if a word
72
 * should be indexed or not. For example:
73
 *
74
 * ```php
75
 * $this->addBehavior('Search.Searchable', [
76
 *     'engine' => [
77
 *         'className' => 'Search\Engine\Generic\GenericEngine',
78
 *         'config' => [
79
 *             'bannedWords' => function ($word) {
80
 *                 return strlen($word) > 3;
81
 *             }
82
 *         ]
83
 *     ]
84
 * ]);
85
 * ```
86
 *
87
 * - Returning TRUE indicates that the word is safe for indexing (not banned).
88
 * - Returning FALSE indicates that the word should NOT be indexed (banned).
89
 *
90
 * In the example above, any word of 4 or more characters will be indexed
91
 * (e.g. "home", "name", "quickapps", etc). Any word of 3 or less characters will
92
 * be banned (e.g. "and", "or", "the").
93
 *
94
 * ## Searching Entities
95
 *
96
 * When using this engine, every entity under your table gets a list of indexed
97
 * words. The idea behind this is that you can use this list of words to locate any
98
 * entity based on a customized search-criteria. A search-criteria looks as follows:
99
 *
100
 *     "this phrase" OR -"not this one" AND this
101
 *
102
 * ---
103
 *
104
 * Use wildcard searches to broaden results; asterisk (`*`) matches any one or
105
 * more characters, exclamation mark (`!`) matches any single character:
106
 *
107
 *     "this *rase" OR -"not th!! one" AND thi!
108
 *
109
 * Anything containing space (" ") characters must be wrapper between quotation
110
 * marks:
111
 *
112
 *     "this phrase" special_operator:"[100 to 500]" -word -"more words" -word_1 word_2
113
 *
114
 * The search criteria above will be treated as if it were composed by the
115
 * following parts:
116
 *
117
 * - `this phrase`
118
 * - `special_operator:[100 to 500]`
119
 * - `-word`
120
 * - `-more words`
121
 * - `-word_1`
122
 * - `word_2`
123
 *
124
 * ---
125
 *
126
 * Search criteria allows you to perform complex search conditions in a
127
 * human-readable way. Allows you, for example, create user-friendly search-forms,
128
 * or create some RSS feed just by creating a friendly URL using a search-criteria.
129
 * e.g.: `http://example.com/rss/category:art date:>2014-01-01`
130
 *
131
 * You must use the `search()` method to scope any query using a search-criteria.
132
 * For example, in one controller using `Users` model:
133
 *
134
 * ```php
135
 * $criteria = '"this phrase" OR -"not this one" AND this';
136
 * $query = $this->Users->find();
137
 * $query = $this->Users->search($criteria, $query);
138
 * ```
139
 *
140
 * The above will alter the given $query object according to the given criteria.
141
 * The second argument (query object) is optional, if not provided this Behavior
142
 * automatically generates a find-query for you. Previous example and the one
143
 * below are equivalent:
144
 *
145
 * ```php
146
 * $criteria = '"this phrase" OR -"not this one" AND this';
147
 * $query = $this->Users->search($criteria);
148
 * ```
149
 */
150
class GenericEngine extends BaseEngine
151
{
152
153
    /**
     * {@inheritDoc}
     *
     * - operators: A list of registered operators methods as `name` =>
     *   `methodName`.
     *
     * - strict: Used to filter any invalid word. Set to a string representing a
     *   regular expression describing which characters should be removed. Or set
     *   to TRUE to use the default discard criteria: only letters, digits and a
     *   few basic symbols (".", ",", "/", etc) are kept. Defaults to TRUE. Only
     *   honoured by the built-in `extractEntityWords()` extractor.
     *
     * - bannedWords: Array list of banned words, or a callable that should decide
     *   if the given word is banned or not. Defaults to empty array (allow
     *   everything). Only honoured by the built-in `extractEntityWords()`
     *   extractor.
     *
     * - fulltext: Whether to use FULLTEXT search whenever it is possible. Defaults to
     *   TRUE. This feature is only supported for MySQL InnoDB database engines.
     *
     * - datasetTable: Name of the MySQL table where words dataset should be stored and
     *   read from. This allows you to split large sets into different tables.
     *   Defaults to `search_datasets`.
     *
     * - wordsExtractor: Callable function used to extract words from each entity being
     *   indexed. Such functions will receive an Entity object as first argument, and
     *   should return a string of words. e.g. `lorem ipsum dolorem`. Declared as
     *   null here; the constructor replaces this default with a closure that
     *   calls the internal method `extractEntityWords()`.
     */
    protected $_defaultConfig = [
        'operators' => [],
        'strict' => true,
        'bannedWords' => [],
        'wordsExtractor' => null,
        'fulltext' => true,
        'datasetTable' => 'search_datasets',
    ];
188
189
    /**
     * Builds the engine for the given table and links that table to its words
     * dataset through a `SearchDatasets` hasOne association.
     *
     * The underscored table name (`tableAlias`) and the primary key column
     * (`pk`) are injected into the engine configuration. When `datasetTable`
     * differs from the default, a clone of the `Search.SearchDatasets` table
     * pointing at the custom table is used as association target.
     *
     * @param \Cake\ORM\Table $table The table being made searchable
     * @param array $config Engine configuration, merged over `$_defaultConfig`
     * @throws \Search\Engine\Generic\Exception\CompoundPrimaryKeyException When using
     *   compound primary keys
     */
    public function __construct(Table $table, array $config = [])
    {
        $tableAlias = (string)Inflector::underscore($table->table());
        $primaryKey = $table->primaryKey();

        // An array here means a compound key, which this engine does not support.
        if (is_array($primaryKey)) {
            throw new CompoundPrimaryKeyException($tableAlias);
        }

        $config['tableAlias'] = $tableAlias;
        $config['pk'] = $primaryKey;

        // The default extractor needs `$this`, so it cannot live in the
        // property declaration and is installed here instead.
        $this->_defaultConfig['wordsExtractor'] = function (EntityInterface $entity) {
            return $this->extractEntityWords($entity);
        };

        parent::__construct($table, $config);

        $association = [
            'foreignKey' => 'entity_id',
            'joinType' => 'INNER',
            'conditions' => [
                'SearchDatasets.table_alias' => $tableAlias,
            ],
            'dependent' => true
        ];

        $datasetTable = $this->config('datasetTable');
        if ($datasetTable != $this->_defaultConfig['datasetTable']) {
            // Clone so the shared registry instance keeps its own table name.
            $target = clone TableRegistry::get('Search.SearchDatasets');
            $target->table($datasetTable);
            $association['targetTable'] = $target;
        }

        $this->_table->hasOne('Search.SearchDatasets', $association);
    }
226
227
    /**
     * {@inheritDoc}
     *
     * Creates (or refreshes) the dataset row holding the searchable words of the
     * given entity. The row is identified by the entity's primary key value and
     * this engine's `tableAlias`.
     *
     * Words come from the configured `wordsExtractor`; when that option does not
     * hold a callable (e.g. it was explicitly configured as null) the built-in
     * `extractEntityWords()` is used.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity to index
     * @return bool True if the dataset row was saved, false otherwise
     */
    public function index(EntityInterface $entity)
    {
        $datasets = $this->_table->SearchDatasets;
        $identity = [
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ];

        $set = $datasets->find()
            ->where($identity)
            ->limit(1)
            ->first();

        if (!$set) {
            $set = $datasets->newEntity($identity + ['words' => '']);
        }

        $extractor = $this->config('wordsExtractor');
        $words = is_callable($extractor) ?
            $extractor($entity) :
            $this->extractEntityWords($entity);

        // Leading and trailing spaces allow matching with LIKE %something-to-match%
        $set = $datasets->patchEntity($set, [
            'words' => ' ' . $words . ' '
        ]);

        return (bool)$datasets->save($set);
    }
256
257
    /**
     * {@inheritDoc}
     *
     * Removes every dataset row stored for the given entity under this engine's
     * `tableAlias`.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity whose index
     *  information should be removed
     * @return bool Always true
     */
    public function delete(EntityInterface $entity)
    {
        $conditions = [
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ];
        $this->_table->SearchDatasets->deleteAll($conditions);

        return true;
    }
269
270
    /**
     * {@inheritDoc}
     *
     * Looks up the dataset row stored for the given entity under this engine's
     * `tableAlias`.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity to look up
     * @return mixed First matching dataset row, as returned by the query's
     *  `first()` method
     */
    public function get(EntityInterface $entity)
    {
        $conditions = [
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ];
        $lookup = $this->_table->SearchDatasets->find()
            ->where($conditions)
            ->limit(1);

        return $lookup->first();
    }
283
284
    /**
     * {@inheritDoc}
     *
     * Parses the search-criteria into tokens and applies each of them over the
     * query object. For example, given the criteria below:
     *
     *     "this phrase" -"and not this one"
     *
     * Alters the query object as follows:
     *
     * ```php
     * $query->where([
     *    'indexed_words LIKE' => '%this phrase%',
     *    'indexed_words NOT LIKE' => '%and not this one%'
     * ]);
     * ```
     *
     * The `AND` & `OR` keywords are allowed to create complex conditions. For
     * example:
     *
     *     "this phrase" OR -"and not this one" AND "this"
     *
     * Will produce something like:
     *
     * ```php
     * $query
     *     ->where(['indexed_words LIKE' => '%this phrase%'])
     *     ->orWhere(['indexed_words NOT LIKE' => '%and not this one%'])
     *     ->andWhere(['indexed_words LIKE' => '%this%']);
     * ```
     *
     * ### Options
     *
     * - `missingOperators`: Controls what to do when an undefined operator is found.
     *    Possible values are:
     *
     *    - `event` (default): The token is still handed to the table's operator
     *      handling, which is where missing operators are dealt with.
     *
     *    - `ignore`: Ignore any undefined operator.
     *
     *    - `words`: Converts operator information into a set of literal words.
     *
     * - `tokenDecorator`: Callable function which is applied to every token before it
     *   gets applied. Returning anything that is not a `TokenInterface` will skip that
     *   token from being used.
     *
     * @param string $criteria A search-criteria
     * @param \Cake\ORM\Query $query The query to scope
     * @param array $options Additional options, see above
     * @return \Cake\ORM\Query Scoped query
     */
    public function search($criteria, Query $query, array $options = [])
    {
        $options += [
            'missingOperators' => 'event',
            'tokenDecorator' => function ($t) {
                return $t;
            },
        ];

        $tokens = $this->tokenizer($criteria);
        if (empty($tokens)) {
            return $query;
        }

        $query->innerJoinWith('SearchDatasets');
        $decorate = $options['tokenDecorator'];
        $registered = $this->_table->behaviors()
            ->get('Searchable')
            ->config('operators');

        foreach ($tokens as $candidate) {
            $token = $decorate($candidate);
            if (!($token instanceof TokenInterface)) {
                continue;
            }

            $scoper = '_scopeWords';
            if ($token->isOperator()) {
                $scoper = '_scopeOperator';
                $name = mb_strtolower($token->operatorName());

                if (!isset($registered[$name])) {
                    $policy = $options['missingOperators'];

                    // Loose comparisons on purpose, checked in this order.
                    if ($policy == 'ignore') {
                        continue;
                    }
                    if ($policy == 'words') {
                        $scoper = '_scopeWords';
                    }
                    // `event` (or anything else): nothing to do here, the
                    // token is passed on as a regular operator.
                }
            }

            $query = $this->{$scoper}($query, $token);
        }

        return $query;
    }
387
388
    /**
     * Runs the mini-language parser over the given search criteria and returns
     * whatever tokens it produces.
     *
     * @param string $criteria A search criteria. e.g. `-hello +world`
     * @return array List of tokens found
     */
    public function tokenizer($criteria)
    {
        $parser = new MiniLanguageParser($criteria);
        $tokens = $parser->parse();

        return (array)$tokens;
    }
398
399
    /**
     * Scopes the given query using the given operator token, by delegating to
     * the table's `applySearchOperator()` method.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing an operator.
     *  e.g `-op_name:op_value`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeOperator(Query $query, TokenInterface $token)
    {
        $scoped = $this->_table->applySearchOperator($query, $token);

        return $scoped;
    }
410
411
    /**
     * Scopes the given query using the given words token.
     *
     * Hands over to `_scopeWordsInFulltext()` when fulltext search is enabled,
     * otherwise adds a `LIKE` / `NOT LIKE` condition on `SearchDatasets.words`.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing a words
     *  sequence. e.g `this is a phrase`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeWords(Query $query, TokenInterface $token)
    {
        if ($this->_isFullTextEnabled()) {
            return $this->_scopeWordsInFulltext($query, $token);
        }

        $comparison = $token->negated() ? 'NOT LIKE' : 'LIKE';

        // Criteria wildcards are mapped onto SQL ones: `*` => `%`, `!` => `_`.
        // NOTE(review): literal `%` / `_` typed by the user are passed through
        // unescaped and therefore also behave as wildcards - confirm intended.
        $needle = str_replace(['*', '!'], ['%', '_'], $token->value());
        $condition = ["SearchDatasets.words {$comparison}" => "%{$needle}%"];

        $conjunction = $token->where();
        $method = 'where';
        if ($conjunction === 'or') {
            $method = 'orWhere';
        } elseif ($conjunction === 'and') {
            $method = 'andWhere';
        }
        $query->{$method}($condition);

        return $query;
    }
443
444
    /**
     * Similar to "_scopeWords" but using MySQL's fulltext indexes.
     *
     * The search value is sent to the database as a bound parameter rather than
     * being written into the SQL text, so characters such as backslashes or
     * quotes typed by the user cannot alter the statement.
     *
     * Tokens whose value is empty, or is one of the storage engine's stopwords,
     * leave the query untouched.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing a words
     *  sequence. e.g `this is a phrase`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeWordsInFulltext(Query $query, TokenInterface $token)
    {
        // Both criteria wildcards (`*` and `!`) collapse into the `*` wildcard.
        $value = str_replace('!', '*', $token->value());
        $value = mb_strpos($value, '+') === 0 ? mb_substr($value, 1) : $value;

        if (empty($value) || in_array($value, $this->_stopWords(), true)) {
            return $query;
        }

        $not = $token->negated() ? 'NOT' : '';
        $value = str_replace(["'", '@'], ['"', ' '], $value);

        // A unique placeholder per token, several tokens may scope one query.
        $placeholder = $query->valueBinder()->placeholder('fulltext');
        $query->bind($placeholder, $value, 'string');
        $conditions = ["{$not} MATCH(SearchDatasets.words) AGAINST({$placeholder} IN BOOLEAN MODE) > 0"];

        $conjunction = $token->where();
        if ($conjunction === 'or') {
            $query->orWhere($conditions);
        } elseif ($conjunction === 'and') {
            $query->andWhere($conditions);
        } else {
            $query->where($conditions);
        }

        return $query;
    }
474
475
    /**
     * Whether FullText index is available or not and should be used.
     *
     * Fulltext is used only when the `fulltext` option is on, the table's
     * connection uses the MySQL driver, and the dataset table has an index of
     * type `fulltext` covering the `words` column.
     *
     * The outcome is memoised per connection and dataset table, so engines
     * attached to different connections or dataset tables do not share each
     * other's result.
     *
     * @return bool True if enabled and should be used, false otherwise
     */
    protected function _isFullTextEnabled()
    {
        if (!$this->config('fulltext')) {
            return false;
        }

        static $detected = [];
        $connection = $this->_table->connection();
        $cacheKey = $connection->configName() . '|' . $this->config('datasetTable');
        if (isset($detected[$cacheKey])) {
            return $detected[$cacheKey];
        }

        $enabled = false;
        list(, $driverClass) = namespaceSplit(strtolower(get_class($connection->driver())));
        if ($driverClass === 'mysql') {
            $schema = $this->_table->SearchDatasets->schema();
            foreach ($schema->indexes() as $index) {
                $info = $schema->index($index);
                if (in_array('words', $info['columns'], true) &&
                    strtolower($info['type']) === 'fulltext'
                ) {
                    $enabled = true;
                    break;
                }
            }
        }

        $detected[$cacheKey] = $enabled;

        return $enabled;
    }
514
515
    /**
     * Gets a list of storage engine's stopwords. That is, words considered
     * common or trivial enough to be omitted from the search index and ignored
     * in search queries.
     *
     * The list is read from `INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD` and
     * stored in the `_cake_model_` cache under a key derived from the
     * connection name. An empty list is cached too, so it is not queried again
     * on every call.
     *
     * @return array List of words
     */
    protected function _stopWords()
    {
        $conn = $this->_table->connection();
        $cacheKey = $conn->configName() . '_generic_engine_stopwords_list';

        // Cache::read() signals a miss with FALSE; an empty array is a valid hit.
        $cache = Cache::read($cacheKey, '_cake_model_');
        if ($cache !== false) {
            return (array)$cache;
        }

        $words = [];
        $rows = $conn
            ->execute('SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD')
            ->fetchAll('assoc');

        foreach ((array)$rows as $row) {
            if (!empty($row['value'])) {
                $words[] = $row['value'];
            }
        }

        Cache::write($cacheKey, $words, '_cake_model_');

        return $words;
    }
545
546
    /**
     * Reads the value of the entity's primary key column, as named by the `pk`
     * configuration option.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity
     * @return string
     * @deprecated Use direct access as `$entity->get($this->config('pk'))`
     */
    protected function _entityId(EntityInterface $entity)
    {
        $primaryKey = $this->config('pk');

        return $entity->get($primaryKey);
    }
557
558
    /**
     * Extracts a list of words to be indexed for given entity.
     *
     * Every string or numeric value found in the (flattened) entity is
     * collected, then the resulting text is normalised:
     *
     * 1. invalid UTF-8 sequences are dropped,
     * 2. line breaks are turned into spaces and HTML tags are stripped (their
     *    content is kept),
     * 3. when `strict` is on, every character not allowed is replaced by a
     *    space,
     * 4. whitespace runs are collapsed, text is lowercased and banned words are
     *    removed.
     *
     * NOTE: Words can be repeated, this allows to search phrases.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity for which generate
     *  the list of words
     * @return string Space-separated list of words. e.g. `cat dog this that`
     */
    public function extractEntityWords(EntityInterface $entity)
    {
        $text = '';
        foreach (Hash::flatten($entity->toArray()) as $value) {
            if (is_string($value) || is_numeric($value)) {
                $text .= " {$value}";
            }
        }

        // Sanitize encoding first: the `u`-flagged regex below yields NULL (and
        // so would wipe the whole text) when given malformed UTF-8.
        $text = iconv('UTF-8', 'UTF-8//IGNORE', mb_convert_encoding($text, 'UTF-8'));

        // Line breaks become spaces, otherwise the last word of a line would be
        // glued to the first word of the next one.
        $text = str_replace(["\r\n", "\n", "\r"], ' ', trim((string)$text));
        $text = strip_tags($text); // remove HTML tags, but keep their content
        $strict = $this->config('strict');

        if (!empty($strict)) {
            // only: space, digits (0-9), letters (any language), "@", ".", ",", "-", "_", "/", "\"
            // (four backslashes in a single-quoted string = one literal backslash for PCRE)
            $pattern = is_string($strict) ? $strict : '[^\p{L}\p{N}\s@.,\-_\/\\\\]';
            $text = preg_replace('/' . $pattern . '/ui', ' ', $text);
        }

        $text = trim(preg_replace('/\s+/', ' ', $text)); // any whitespace run => one space
        $text = mb_strtolower($text); // all to lowercase
        $text = $this->_filterText($text); // drop banned words

        return trim($text);
    }
595
596
    /**
     * Removes any invalid word from the given text.
     *
     * The `bannedWords` option drives the filtering:
     *
     * - When it is a callable, it is invoked with each word and must return
     *   TRUE if the word is safe for indexing, or FALSE if the word is banned
     *   and must be dropped.
     * - Otherwise it is treated as a list of banned words; a word is dropped
     *   when it appears in that list.
     *
     * Empty words are always dropped.
     *
     * @param string $text The text to filter, words separated by single spaces
     * @return string Filtered text
     */
    protected function _filterText($text)
    {
        $bannedWords = $this->config('bannedWords');

        if (is_callable($bannedWords)) {
            // The callable answers "may this word be indexed?", hence the negation.
            $isBanned = function ($word) use ($bannedWords) {
                return !$bannedWords($word);
            };
        } else {
            $bannedList = array_map('strval', (array)$bannedWords);
            $isBanned = function ($word) use ($bannedList) {
                return in_array($word, $bannedList, true);
            };
        }

        $allowed = [];
        foreach (explode(' ', $text) as $word) {
            // Explicit comparison: `empty()` would also discard the word "0".
            if ($word === '' || $isBanned($word)) {
                continue;
            }
            $allowed[] = $word;
        }

        return implode(' ', $allowed);
    }
626
}
627