Completed
Push — 2.0 ( a7a4d2...d55693 )
by Christopher
01:50
created

GenericEngine::_scopeOperator()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 2
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * Licensed under The GPL-3.0 License
4
 * For full copyright and license information, please see the LICENSE.txt
5
 * Redistributions of files must retain the above copyright notice.
6
 *
7
 * @since    2.0.0
8
 * @author   Christopher Castro <[email protected]>
9
 * @link     http://www.quickappscms.org
10
 * @license  http://opensource.org/licenses/gpl-3.0.html GPL-3.0 License
11
 */
12
namespace Search\Engine\Generic;
13
14
use Cake\Cache\Cache;
15
use Cake\Core\InstanceConfigTrait;
16
use Cake\Datasource\EntityInterface;
17
use Cake\Error\FatalErrorException;
18
use Cake\Event\Event;
19
use Cake\Event\EventManager;
20
use Cake\ORM\Query;
21
use Cake\ORM\Table;
22
use Cake\ORM\TableRegistry;
23
use Cake\Utility\Hash;
24
use Cake\Utility\Inflector;
25
use Search\Engine\BaseEngine;
26
use Search\Engine\Generic\Exception\CompoundPrimaryKeyException;
27
use Search\Parser\MiniLanguage\MiniLanguageParser;
28
use Search\Parser\TokenInterface;
29
use \ArrayObject;
30
31
/**
32
 * This Search Engine allows entities to be searchable through an auto-generated
33
 * list of words.
34
 *
35
 * ## Using Generic Engine
36
 *
37
 * You must indicate Searchable behavior to use this engine, for example when
38
 * attaching Searchable behavior to `Articles` table:
39
 *
40
 * ```php
41
 * $this->addBehavior('Search.Searchable', [
42
 *     'engine' => [
43
 *         'className' => 'Search\Engine\Generic\GenericEngine',
44
 *         'config' => [
45
 *             'bannedWords' => []
46
 *         ]
47
 *     ]
48
 * ]);
49
 * ```
50
 *
51
 * This engine will apply a series of filters (converts to lowercase, remove line
52
 * breaks, etc) to words list extracted from each entity being indexed.
53
 *
54
 * ### Banned Words
55
 *
56
 * You can use the `bannedWords` option to tell which words should not be indexed by
57
 * this engine. For example:
58
 *
59
 * ```php
60
 * $this->addBehavior('Search.Searchable', [
61
 *     'engine' => [
62
 *         'className' => 'Search\Engine\Generic\GenericEngine',
63
 *         'config' => [
64
 *             'bannedWords' => ['of', 'the', 'and']
65
 *         ]
66
 *     ]
67
 * ]);
68
 * ```
69
 *
70
 * If you need to ban a really specific list of words you can set `bannedWords`
71
 * option as a callable method that should return true or false to tell if a word
72
 * should be indexed or not. For example:
73
 *
74
 * ```php
75
 * $this->addBehavior('Search.Searchable', [
76
 *     'engine' => [
77
 *         'className' => 'Search\Engine\Generic\GenericEngine',
78
 *         'config' => [
79
 *             'bannedWords' => function ($word) {
80
 *                 return strlen($word) > 3;
81
 *             }
82
 *         ]
83
 *     ]
84
 * ]);
85
 * ```
86
 *
87
 * - Returning TRUE indicates that the word is safe for indexing (not banned).
88
 * - Returning FALSE indicates that the word should NOT be indexed (banned).
89
 *
90
 * In the example above, any word of 4 or more characters will be indexed
91
 * (e.g. "home", "name", "quickapps", etc). Any word of 3 or less characters will
92
 * be banned (e.g. "and", "or", "the").
93
 *
94
 * ## Searching Entities
95
 *
96
 * When using this engine, every entity under your table gets a list of indexed
97
 * words. The idea behind this is that you can use this list of words to locate any
98
 * entity based on a customized search-criteria. A search-criteria looks as follows:
99
 *
100
 *     "this phrase" OR -"not this one" AND this
101
 *
102
 * ---
103
 *
104
 * Use wildcard searches to broaden results; asterisk (`*`) matches any one or
105
 * more characters, exclamation mark (`!`) matches any single character:
106
 *
107
 *     "this *rase" OR -"not th!! one" AND thi!
108
 *
109
 * Anything containing space (" ") characters must be wrapped between quotation
110
 * marks:
111
 *
112
 *     "this phrase" special_operator:"[100 to 500]" -word -"more words" -word_1 word_2
113
 *
114
 * The search criteria above will be treated as if it were composed of the
115
 * following parts:
116
 *
117
 * - `this phrase`
118
 * - `special_operator:[100 to 500]`
119
 * - `-word`
120
 * - `-more words`
121
 * - `-word_1`
122
 * - `word_2`
123
 *
124
 * ---
125
 *
126
 * Search criteria allows you to perform complex search conditions in a
127
 * human-readable way. Allows you, for example, create user-friendly search-forms,
128
 * or create some RSS feed just by creating a friendly URL using a search-criteria.
129
 * e.g.: `http://example.com/rss/category:art date:>2014-01-01`
130
 *
131
 * You must use the `search()` method to scope any query using a search-criteria.
132
 * For example, in one controller using `Users` model:
133
 *
134
 * ```php
135
 * $criteria = '"this phrase" OR -"not this one" AND this';
136
 * $query = $this->Users->find();
137
 * $query = $this->Users->search($criteria, $query);
138
 * ```
139
 *
140
 * The above will alter the given $query object according to the given criteria.
141
 * The second argument (query object) is optional, if not provided this Behavior
142
 * automatically generates a find-query for you. Previous example and the one
143
 * below are equivalent:
144
 *
145
 * ```php
146
 * $criteria = '"this phrase" OR -"not this one" AND this';
147
 * $query = $this->Users->search($criteria);
148
 * ```
149
 */
150
class GenericEngine extends BaseEngine
151
{
152
153
    /**
154
     * {@inheritDoc}
155
     *
156
     * - operators: A list of registered operators methods as `name` =>
157
     *   `methodName`.
158
     *
159
     * - strict: Used to filter any invalid word. Set to a string representing a
160
     *   regular expression describing which characters should be removed. Or set
161
     *   to TRUE to use the default discard criteria: only letters, digits and few
162
     *   basic symbols (".", ",", "/", etc). Defaults to TRUE (custom filter
163
     *   regex). VALID ONLY when `wordsExtractor` is set to null.
164
     *
165
     * - bannedWords: Array list of banned words, or a callable that should decide
166
     *   if the given word is banned or not. Defaults to empty array (allow
167
     *   everything). VALID ONLY when `wordsExtractor` is set to null.
168
     *
169
     * - fulltext: Whether to use FULLTEXT search whenever it is possible. Defaults to
170
     *   TRUE. This feature is only supported for MySQL InnoDB database engines.
171
     *
172
     * - datasetTable: Name of the MySQL table where words dataset should be stored and
173
     *   read from. This allows you to split large sets into different tables.
174
     *
175
     * - wordsExtractor: Callable function used to extract words from each entity being
176
     *   indexed. Such functions will receive an Entity object as first argument, and
177
     *   should return a string of words. e.g. `lorem ipsum dolorem`. Defaults to internal
178
     *   method `extractEntityWords()`
179
     */
180
    protected $_defaultConfig = [
181
        'operators' => [],
182
        'strict' => true,
183
        'bannedWords' => [],
184
        'wordsExtractor' => null,
185
        'fulltext' => true,
186
        'datasetTable' => 'search_datasets',
187
    ];
188
189
    /**
     * {@inheritDoc}
     *
     * Registers the default words extractor, rejects tables that use a compound
     * primary key and attaches a `hasOne` association named `SearchDatasets`
     * scoped to this table's alias.
     *
     * @throws \Search\Engine\Generic\Exception\CompoundPrimaryKeyException When using
     *   compound primary keys
     */
    public function __construct(Table $table, array $config = [])
    {
        $alias = (string)Inflector::underscore($table->table());
        $primaryKey = $table->primaryKey();
        $config['tableAlias'] = $alias;
        $config['pk'] = $primaryKey;

        // The default extractor simply delegates to this engine's own method.
        $this->_defaultConfig['wordsExtractor'] = function (EntityInterface $entity) {
            return $this->extractEntityWords($entity);
        };

        if (is_array($primaryKey)) {
            throw new CompoundPrimaryKeyException($alias);
        }

        parent::__construct($table, $config);

        $association = [
            'foreignKey' => 'entity_id',
            'joinType' => 'INNER',
            'conditions' => ['SearchDatasets.table_alias' => $alias],
            'dependent' => true,
        ];

        // A non-default dataset table is served by a clone of the registry's
        // table object that gets pointed at the configured table name.
        $datasetTable = $this->config('datasetTable');
        if ($datasetTable != $this->_defaultConfig['datasetTable']) {
            $target = clone TableRegistry::get('Search.SearchDatasets');
            $target->table($datasetTable);
            $association['targetTable'] = $target;
        }

        $this->_table->hasOne('Search.SearchDatasets', $association);
    }
226
227
    /**
     * {@inheritDoc}
     *
     * Looks up the dataset row of the given entity (creating a new one when it
     * does not exist yet), stores the words returned by the configured
     * `wordsExtractor` in it and saves the row.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity to index
     * @return bool True if the dataset row was saved, false otherwise
     */
    public function index(EntityInterface $entity)
    {
        $datasets = $this->_table->SearchDatasets;

        // Direct primary key access replaces the deprecated `_entityId()` helper.
        $scope = [
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ];

        $set = $datasets->find()
            ->where($scope)
            ->limit(1)
            ->first();

        if (!$set) {
            $set = $datasets->newEntity($scope + ['words' => '']);
        }

        // We add starting and trailing space to allow LIKE %something-to-match%
        $extractor = $this->config('wordsExtractor');
        $set = $datasets->patchEntity($set, [
            'words' => ' ' . $extractor($entity) . ' ',
        ]);

        return (bool)$datasets->save($set);
    }
256
257
    /**
     * {@inheritDoc}
     *
     * Removes every dataset row that belongs to the given entity.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity whose index
     *  information should be removed
     * @return bool Always true, regardless of how many rows were deleted
     */
    public function delete(EntityInterface $entity)
    {
        // Direct primary key access replaces the deprecated `_entityId()` helper.
        $this->_table->SearchDatasets->deleteAll([
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ]);

        return true;
    }
269
270
    /**
     * {@inheritDoc}
     *
     * Fetches the dataset row holding the indexed words of the given entity.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity to look up
     * @return mixed Result of `first()` on the dataset query for this entity
     */
    public function get(EntityInterface $entity)
    {
        // Direct primary key access replaces the deprecated `_entityId()` helper.
        return $this->_table->SearchDatasets->find()
            ->where([
                'entity_id' => $entity->get($this->config('pk')),
                'table_alias' => $this->config('tableAlias'),
            ])
            ->limit(1)
            ->first();
    }
283
284
    /**
285
     * {@inheritDoc}
286
     *
287
     * It looks for search-criteria and applies them over the query object. For
288
     * example, given the criteria below:
289
     *
290
     *     "this phrase" -"and not this one"
291
     *
292
     * Alters the query object as follow:
293
     *
294
     * ```php
295
     * $query->where([
296
     *    'indexed_words LIKE' => '%this phrase%',
297
     *    'indexed_words NOT LIKE' => '%and not this one%'
298
     * ]);
299
     * ```
300
     *
301
     * The `AND` & `OR` keywords are allowed to create complex conditions. For
302
     * example:
303
     *
304
     *     "this phrase" OR -"and not this one" AND "this"
305
     *
306
     * Will produce something like:
307
     *
308
     * ```php
309
     * $query
310
     *     ->where(['indexed_words LIKE' => '%this phrase%'])
311
     *     ->orWhere(['indexed_words NOT LIKE' => '%and not this one%'])
312
     *     ->andWhere(['indexed_words LIKE' => '%this%']);
313
     * ```
314
     *
315
     * ### Options
316
     *
317
     * - `missingOperators`: Controls what to do when an undefined operator is found.
318
     *    Possible values are:
319
     *    
320
     *    - `event` (default): Triggers an event so other parts of the system can react
321
     *      to any missing operator.
322
     *
323
     *    - `ignore`: Ignore any undefined operator.
324
     *
325
     *    - `words`: Converts operator information into a set of literal words.
326
     *
327
     * - `tokenDecorator`: Callable function which is applied to every token before it
328
     *   gets applied. Returning anything that is not a `TokenInterface` will skip that
329
     *   token from being used.
330
     */
331
    public function search($criteria, Query $query, array $options = [])
    {
        $options += [
            'missingOperators' => 'event',
            'tokenDecorator' => function ($t) {
                return $t;
            },
        ];
        $tokens = $this->tokenizer($criteria);

        // Nothing to scope: the query is returned untouched (no join is added).
        if (empty($tokens)) {
            return $query;
        }

        $query->innerJoinWith('SearchDatasets');
        $decorate = $options['tokenDecorator'];
        $operators = $this->_table->behaviors()
            ->get('Searchable')
            ->config('operators');

        foreach ($tokens as $token) {
            $token = $decorate($token);

            // The decorator may veto a token by returning something else.
            if (!($token instanceof TokenInterface)) {
                continue;
            }

            $scoper = '_scopeWords';
            if ($token->isOperator()) {
                $scoper = '_scopeOperator';

                if (!isset($operators[$token->operatorName()])) {
                    switch ($options['missingOperators']) {
                        case 'ignore':
                            $scoper = null;
                            break;

                        case 'words':
                            $scoper = '_scopeWords';
                            break;

                        default:
                            // `event` (and any unknown value): the token is still
                            // handed to `_scopeOperator`, as missing operators are
                            // handled by Searchable Behavior by default.
                            break;
                    }
                }
            }

            if ($scoper) {
                $query = $this->$scoper($query, $token);
            }
        }

        return $query;
    }
386
387
    /**
     * Parses the given search criteria and returns the tokens it contains.
     *
     * @param string $criteria A search criteria. e.g. `-hello +world`
     * @return array List of tokens found
     */
    public function tokenizer($criteria)
    {
        $parser = new MiniLanguageParser($criteria);

        return (array)$parser->parse();
    }
397
398
    /**
     * Scopes the given query using the given operator token by delegating to
     * the table's `applySearchOperator()` method.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing an operator.
     *  e.g `-op_name:op_value`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeOperator(Query $query, TokenInterface $token)
    {
        $scoped = $this->_table->applySearchOperator($query, $token);

        return $scoped;
    }
409
410
    /**
     * Scopes the given query using the given words token.
     *
     * Delegates to `_scopeWordsInFulltext()` when fulltext search is enabled;
     * otherwise a `LIKE` / `NOT LIKE` condition over `SearchDatasets.words` is
     * added to the query.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing a words
     *  sequence. e.g `this is a phrase`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeWords(Query $query, TokenInterface $token)
    {
        if ($this->_isFullTextEnabled()) {
            return $this->_scopeWordsInFulltext($query, $token);
        }

        $comparison = $token->negated() ? 'NOT LIKE' : 'LIKE';

        // `*` (any one or more characters) becomes `%`,
        // `!` (any single character) becomes `_`.
        $pattern = '%' . str_replace(['*', '!'], ['%', '_'], $token->value()) . '%';
        $conditions = ["SearchDatasets.words {$comparison}" => $pattern];

        // Pick the query method matching the token's conjunction.
        $conjunction = $token->where();
        $method = 'where';
        if ($conjunction === 'or') {
            $method = 'orWhere';
        } elseif ($conjunction === 'and') {
            $method = 'andWhere';
        }
        $query->$method($conditions);

        return $query;
    }
442
443
    /**
     * Similar to "_scopeWords" but using MySQL's fulltext indexes.
     *
     * The search value is sent to the database as a bound parameter instead of
     * being interpolated into the SQL string, so quotes or backslashes typed by
     * the user cannot alter the generated statement.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing a words
     *  sequence. e.g `this is a phrase`
     * @return \Cake\ORM\Query Scoped query; returned untouched when the value is
     *  empty or is one of the storage engine's stopwords
     */
    protected function _scopeWordsInFulltext(Query $query, TokenInterface $token)
    {
        // Both wildcards (`*` and `!`) are expressed as `*` in the AGAINST value.
        $value = str_replace('!', '*', $token->value());

        // A leading `+` is dropped from the value.
        $value = mb_strpos($value, '+') === 0 ? mb_substr($value, 1) : $value;

        // Strict checks: the word "0" is a valid value, and stopwords must match
        // exactly rather than through PHP's loose string comparison.
        if ($value === '' || in_array($value, $this->_stopWords(), true)) {
            return $query;
        }

        $not = $token->negated() ? 'NOT' : '';

        // Ask the query's value binder for a unique placeholder and bind the raw
        // value to it; no manual quote mangling is needed anymore.
        $placeholder = $query->valueBinder()->placeholder('fulltext');
        $query->bind($placeholder, $value, 'string');
        $conditions = ["{$not} MATCH(SearchDatasets.words) AGAINST({$placeholder} IN BOOLEAN MODE) > 0"];

        $conjunction = $token->where();
        if ($conjunction === 'or') {
            $query->orWhere($conditions);
        } elseif ($conjunction === 'and') {
            $query->andWhere($conditions);
        } else {
            $query->where($conditions);
        }

        return $query;
    }
473
474
    /**
     * Whether FullText index is available or not and should be used.
     *
     * Fulltext is used only when the `fulltext` option is on, the table's
     * connection uses a driver class named `mysql` and the datasets table has a
     * fulltext index that includes the `words` column.
     *
     * The outcome is memoized per connection name and dataset table. A single
     * shared flag would make every engine instance reuse the answer computed
     * for the first table that asked.
     *
     * @return bool True if enabled and should be used, false otherwise
     */
    protected function _isFullTextEnabled()
    {
        if (!$this->config('fulltext')) {
            return false;
        }

        static $enabled = [];
        $connection = $this->_table->connection();
        $cacheKey = $connection->configName() . ':' . $this->config('datasetTable');
        if (isset($enabled[$cacheKey])) {
            return $enabled[$cacheKey];
        }

        $enabled[$cacheKey] = false;
        list(, $driverClass) = namespaceSplit(strtolower(get_class($connection->driver())));
        if ($driverClass != 'mysql') {
            return false;
        }

        $schema = $this->_table->SearchDatasets->schema();
        foreach ($schema->indexes() as $index) {
            $info = $schema->index($index);
            if (in_array('words', $info['columns']) &&
                strtolower($info['type']) == 'fulltext'
            ) {
                $enabled[$cacheKey] = true;
                break;
            }
        }

        return $enabled[$cacheKey];
    }
513
514
    /**
     * Gets a list of storage engine's stopwords. That is, words considered
     * common or trivial enough to be omitted from the search index and ignored
     * in search queries.
     *
     * The list is read from `INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD` and
     * stored in the `_cake_model_` cache under a key derived from the
     * connection name.
     *
     * @return array List of words
     */
    protected function _stopWords()
    {
        $conn = $this->_table->find()->connection();
        $cacheKey = $conn->configName() . '_generic_engine_stopwords_list';

        // Test for an array rather than for truthiness: an empty (but cached)
        // list is a valid hit and must not trigger a new query on every call.
        $cached = Cache::read($cacheKey, '_cake_model_');
        if (is_array($cached)) {
            return $cached;
        }

        $words = [];
        $rows = $conn
            ->execute('SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD')
            ->fetchAll('assoc');

        foreach ((array)$rows as $row) {
            if (!empty($row['value'])) {
                $words[] = $row['value'];
            }
        }

        Cache::write($cacheKey, $words, '_cake_model_');

        return $words;
    }
544
545
    /**
     * Reads the value of the entity's primary key, using the column name stored
     * in the `pk` configuration option.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity
     * @return string
     * @deprecated Use direct access as `$entity->get($this->config('pk'))`
     */
    protected function _entityId(EntityInterface $entity)
    {
        $primaryKey = $this->config('pk');

        return $entity->get($primaryKey);
    }
556
557
    /**
     * Extracts a list of words to be indexed for the given entity.
     *
     * Every string or numeric value found in the (flattened) entity is
     * collected, HTML tags and line breaks are removed, characters rejected by
     * the `strict` option are replaced by spaces, the text is lowercased and
     * banned words are dropped.
     *
     * NOTE: Words can be repeated, this allows to search phrases.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity for which generate
     *  the list of words
     * @return string Space-separated list of words. e.g. `cat dog this that`
     */
    public function extractEntityWords(EntityInterface $entity)
    {
        $text = '';
        foreach (Hash::flatten($entity->toArray()) as $value) {
            if (is_string($value) || is_numeric($value)) {
                $text .= " {$value}";
            }
        }

        // Line breaks and tag boundaries separate words, so they are turned into
        // spaces rather than removed; otherwise "foo\nbar" or "<p>foo</p><p>bar</p>"
        // would be indexed as the single word "foobar".
        $text = str_replace(["\n", "\r"], ' ', trim((string)$text));
        $text = strip_tags(str_replace('<', ' <', $text)); // remove HTML tags, but keep their content
        $strict = $this->config('strict');

        if (!empty($strict)) {
            // only: space, digits (0-9), letters (any language), "@", ".", ",", "-", "_", "/", "\"
            // The backslash must reach the regex engine as `\\`. Written as `\\0-9`
            // in a single-quoted string it became `\0-9`, i.e. a range from NUL to
            // "9" that also let through symbols such as `!`, `"`, `#`, `(` or `*`.
            $pattern = is_string($strict) ? $strict : '[^\p{L}\p{N}\s@.,\-_\/\\\\]';
            $text = preg_replace('/' . $pattern . '/ui', ' ', $text);
        }

        // Collapse every run of whitespace into one space, since words are later
        // split on single spaces.
        $text = trim(preg_replace('/\s+/', ' ', $text));
        $text = mb_strtolower($text); // all to lowercase
        $text = $this->_filterText($text); // filter
        $text = iconv('UTF-8', 'UTF-8//IGNORE', mb_convert_encoding($text, 'UTF-8')); // remove any invalid character

        return trim($text);
    }
594
595
    /**
     * Removes any invalid word from the given text.
     *
     * The `bannedWords` option is either a list of words to discard, or a
     * callable that receives a word and returns TRUE when the word is safe for
     * indexing and FALSE when it must be discarded, as described in the class
     * documentation. Empty words are always discarded.
     *
     * @param string $text The text to filter
     * @return string Filtered text
     */
    protected function _filterText($text)
    {
        $banned = $this->config('bannedWords');

        // In both closures, returning true means `yes, it's banned`.
        if (is_callable($banned)) {
            // The user callable answers "can this word be indexed?", so its result
            // has to be negated to obtain "is this word banned?".
            $isBanned = function ($word) use ($banned) {
                return $word === '' || !$banned($word);
            };
        } else {
            // Compare as strings so numeric-looking words are not matched loosely
            // (e.g. "1e3" against "1000").
            $list = array_map('strval', (array)$banned);
            $isBanned = function ($word) use ($list) {
                // `$word === ''` instead of `empty($word)` keeps the word "0".
                return $word === '' || in_array($word, $list, true);
            };
        }

        $words = explode(' ', $text);
        foreach ($words as $i => $w) {
            if ($isBanned($w)) {
                unset($words[$i]);
            }
        }

        return implode(' ', $words);
    }
625
}
626