Completed
Push — 2.0 ( 23edf6...56c8f7 )
by Christopher
05:42
created

GenericEngine::delete()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 5
nc 1
nop 1
dl 0
loc 9
rs 9.6666
c 0
b 0
f 0
1
<?php
2
/**
3
 * Licensed under The GPL-3.0 License
4
 * For full copyright and license information, please see the LICENSE.txt
5
 * Redistributions of files must retain the above copyright notice.
6
 *
7
 * @since    2.0.0
8
 * @author   Christopher Castro <[email protected]>
9
 * @link     http://www.quickappscms.org
10
 * @license  http://opensource.org/licenses/gpl-3.0.html GPL-3.0 License
11
 */
12
namespace Search\Engine\Generic;
13
14
use Cake\Cache\Cache;
15
use Cake\Core\InstanceConfigTrait;
16
use Cake\Datasource\EntityInterface;
17
use Cake\Error\FatalErrorException;
18
use Cake\Event\Event;
19
use Cake\Event\EventManager;
20
use Cake\ORM\Query;
21
use Cake\ORM\Table;
22
use Cake\ORM\TableRegistry;
23
use Cake\Utility\Hash;
24
use Cake\Utility\Inflector;
25
use Search\Engine\BaseEngine;
26
use Search\Engine\Generic\Exception\CompoundPrimaryKeyException;
27
use Search\Parser\MiniLanguage\MiniLanguageParser;
28
use Search\Parser\TokenInterface;
29
use \ArrayObject;
30
31
/**
32
 * This Search Engine allows entities to be searchable through an auto-generated
33
 * list of words.
34
 *
35
 * ## Using Generic Engine
36
 *
37
 * You must indicate Searchable behavior to use this engine, for example when
38
 * attaching Searchable behavior to `Articles` table:
39
 *
40
 * ```php
41
 * $this->addBehavior('Search.Searchable', [
42
 *     'engine' => [
43
 *         'className' => 'Search\Engine\Generic\GenericEngine',
44
 *         'config' => [
45
 *             'bannedWords' => []
46
 *         ]
47
 *     ]
48
 * ]);
49
 * ```
50
 *
51
 * This engine will apply a series of filters (converts to lowercase, remove line
52
 * breaks, etc) to words list extracted from each entity being indexed.
53
 *
54
 * ### Banned Words
55
 *
56
 * You can use the `bannedWords` option to tell which words should not be indexed by
57
 * this engine. For example:
58
 *
59
 * ```php
60
 * $this->addBehavior('Search.Searchable', [
61
 *     'engine' => [
62
 *         'className' => 'Search\Engine\Generic\GenericEngine',
63
 *         'config' => [
64
 *             'bannedWords' => ['of', 'the', 'and']
65
 *         ]
66
 *     ]
67
 * ]);
68
 * ```
69
 *
70
 * If you need to ban a really specific list of words you can set `bannedWords`
71
 * option as a callable method that should return true or false to tell if a word
72
 * should be indexed or not. For example:
73
 *
74
 * ```php
75
 * $this->addBehavior('Search.Searchable', [
76
 *     'engine' => [
77
 *         'className' => 'Search\Engine\Generic\GenericEngine',
78
 *         'config' => [
79
 *             'bannedWords' => function ($word) {
80
 *                 return strlen($word) > 3;
81
 *             }
82
 *         ]
83
 *     ]
84
 * ]);
85
 * ```
86
 *
87
 * - Returning TRUE indicates that the word is safe for indexing (not banned).
88
 * - Returning FALSE indicates that the word should NOT be indexed (banned).
89
 *
90
 * In the example above, any word of 4 or more characters will be indexed
91
 * (e.g. "home", "name", "quickapps", etc). Any word of 3 or less characters will
92
 * be banned (e.g. "and", "or", "the").
93
 *
94
 * ## Searching Entities
95
 *
96
 * When using this engine, every entity under your table gets a list of indexed
97
 * words. The idea behind this is that you can use this list of words to locate any
98
 * entity based on a customized search-criteria. A search-criteria looks as follows:
99
 *
100
 *     "this phrase" OR -"not this one" AND this
101
 *
102
 * ---
103
 *
104
 * Use wildcard searches to broaden results; asterisk (`*`) matches any one or
105
 * more characters, exclamation mark (`!`) matches any single character:
106
 *
107
 *     "this *rase" OR -"not th!! one" AND thi!
108
 *
109
 * Anything containing space (" ") characters must be wrapped between quotation
110
 * marks:
111
 *
112
 *     "this phrase" special_operator:"[100 to 500]" -word -"more words" -word_1 word_2
113
 *
114
 * The search criteria above will be treated as if it were composed of the
115
 * following parts:
116
 *
117
 * - `this phrase`
118
 * - `special_operator:[100 to 500]`
119
 * - `-word`
120
 * - `-more words`
121
 * - `-word_1`
122
 * - `word_2`
123
 *
124
 * ---
125
 *
126
 * Search criteria allows you to perform complex search conditions in a
127
 * human-readable way. Allows you, for example, create user-friendly search-forms,
128
 * or create some RSS feed just by creating a friendly URL using a search-criteria.
129
 * e.g.: `http://example.com/rss/category:art date:>2014-01-01`
130
 *
131
 * You must use the `search()` method to scope any query using a search-criteria.
132
 * For example, in one controller using `Users` model:
133
 *
134
 * ```php
135
 * $criteria = '"this phrase" OR -"not this one" AND this';
136
 * $query = $this->Users->find();
137
 * $query = $this->Users->search($criteria, $query);
138
 * ```
139
 *
140
 * The above will alter the given $query object according to the given criteria.
141
 * The second argument (query object) is optional, if not provided this Behavior
142
 * automatically generates a find-query for you. Previous example and the one
143
 * below are equivalent:
144
 *
145
 * ```php
146
 * $criteria = '"this phrase" OR -"not this one" AND this';
147
 * $query = $this->Users->search($criteria);
148
 * ```
149
 */
150
class GenericEngine extends BaseEngine
151
{
152
153
    /**
154
     * {@inheritDoc}
155
     *
156
     * - operators: A list of registered operators methods as `name` =>
157
     *   `methodName`.
158
     *
159
     * - strict: Used to filter any invalid word. Set to a string representing a
160
     *   regular expression describing which characters should be removed. Or set
161
     *   to TRUE to use the default discard criteria: only letters, digits and a few
162
     *   basic symbols (".", ",", "/", etc). Defaults to TRUE (custom filter
163
     *   regex). VALID ONLY when `wordsExtractor` is set to null.
164
     *
165
     * - bannedWords: Array list of banned words, or a callable that should decide
166
     *   if the given word is banned or not. Defaults to empty array (allow
167
     *   everything). VALID ONLY when `wordsExtractor` is set to null.
168
     *
169
     * - fulltext: Whether to use FULLTEXT search whenever it is possible. Defaults to
170
     *   TRUE. This feature is only supported for MySQL InnoDB database engines.
171
     *
172
     * - datasetTable: Name of the MySQL table where words dataset should be stored and
173
     *   read from. This allows you to split large sets into different tables.
174
     *
175
     * - wordsExtractor: Callable function used to extract words from each entity being
176
     *   indexed. Such functions will receive an Entity object as first argument, and
177
     *   should return a string of words. e.g. `lorem ipsum dolorem`. Defaults to internal
178
     *   method `extractEntityWords()`
179
     */
180
    protected $_defaultConfig = [
181
        'operators' => [],
182
        'strict' => true,
183
        'bannedWords' => [],
184
        'wordsExtractor' => null,
185
        'fulltext' => true,
186
        'datasetTable' => 'search_datasets',
187
    ];
188
189
    /**
     * {@inheritDoc}
     *
     * Forces the `tableAlias` and `pk` configuration keys from the given table,
     * installs the default words extractor and attaches a `SearchDatasets`
     * hasOne association to the table, optionally pointing at a custom dataset
     * table.
     *
     * @param \Cake\ORM\Table $table The table being made searchable
     * @param array $config Engine configuration
     * @throws \Search\Engine\Generic\Exception\CompoundPrimaryKeyException When using
     *   compound primary keys
     */
    public function __construct(Table $table, array $config = [])
    {
        $alias = (string)Inflector::underscore($table->table());
        $primaryKey = $table->primaryKey();

        // These two keys are always derived from the table, overriding anything given
        $config['tableAlias'] = $alias;
        $config['pk'] = $primaryKey;

        // Default extractor simply delegates to the built-in implementation
        $this->_defaultConfig['wordsExtractor'] = function (EntityInterface $entity) {
            return $this->extractEntityWords($entity);
        };

        if (is_array($primaryKey)) {
            throw new CompoundPrimaryKeyException($alias);
        }

        parent::__construct($table, $config);

        $association = [
            'foreignKey' => 'entity_id',
            'joinType' => 'INNER',
            'conditions' => ['SearchDatasets.table_alias' => $alias],
            'dependent' => true,
        ];

        // A non-default dataset table gets its own cloned table object, so the
        // registry's shared instance keeps pointing at the default table
        $datasetTable = $this->config('datasetTable');
        if ($datasetTable != $this->_defaultConfig['datasetTable']) {
            $target = clone TableRegistry::get('Search.SearchDatasets');
            $target->table($datasetTable);
            $association['targetTable'] = $target;
        }

        $this->_table->hasOne('Search.SearchDatasets', $association);
    }
226
227
    /**
     * {@inheritDoc}
     *
     * Looks up the dataset row belonging to the given entity (creating a new one
     * when none exists), stores the words produced by the configured
     * `wordsExtractor` in it and saves the row.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity to index
     * @return bool True if the dataset row was saved, false otherwise
     */
    public function index(EntityInterface $entity)
    {
        $datasets = $this->_table->SearchDatasets;

        // Direct primary key access replaces the deprecated `_entityId()` helper
        $identity = [
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ];

        $set = $datasets->find()
            ->where($identity)
            ->limit(1)
            ->first();

        if (!$set) {
            $set = $datasets->newEntity($identity + ['words' => '']);
        }

        // We add starting and trailing space to allow LIKE %something-to-match%
        $extractor = $this->config('wordsExtractor');
        $set = $datasets->patchEntity($set, [
            'words' => ' ' . $extractor($entity) . ' ',
        ]);

        return (bool)$datasets->save($set);
    }
256
257
    /**
     * {@inheritDoc}
     *
     * Removes every dataset row stored for the given entity.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity whose index
     *  information should be removed
     * @return bool Always true; the result of the underlying `deleteAll()` call
     *  is not inspected
     */
    public function delete(EntityInterface $entity)
    {
        // Direct primary key access replaces the deprecated `_entityId()` helper
        $this->_table->SearchDatasets->deleteAll([
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ]);

        return true;
    }
269
270
    /**
     * {@inheritDoc}
     *
     * Fetches the dataset row stored for the given entity.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity whose index
     *  information should be retrieved
     * @return mixed The first matching dataset row, as returned by `first()`
     */
    public function get(EntityInterface $entity)
    {
        // Direct primary key access replaces the deprecated `_entityId()` helper
        return $this->_table->SearchDatasets->find()
            ->where([
                'entity_id' => $entity->get($this->config('pk')),
                'table_alias' => $this->config('tableAlias'),
            ])
            ->limit(1)
            ->first();
    }
283
284
    /**
     * {@inheritDoc}
     *
     * It looks for search-criteria and applies them over the query object. For
     * example, given the criteria below:
     *
     *     "this phrase" -"and not this one"
     *
     * Alters the query object as follows (when LIKE matching is in use):
     *
     * ```php
     * $query->where([
     *    'SearchDatasets.words LIKE' => '%this phrase%',
     *    'SearchDatasets.words NOT LIKE' => '%and not this one%'
     * ]);
     * ```
     *
     * The `AND` & `OR` keywords are allowed to create complex conditions. For
     * example:
     *
     *     "this phrase" OR -"and not this one" AND "this"
     *
     * Will produce something like:
     *
     * ```php
     * $query
     *     ->where(['SearchDatasets.words LIKE' => '%this phrase%'])
     *     ->orWhere(['SearchDatasets.words NOT LIKE' => '%and not this one%'])
     *     ->andWhere(['SearchDatasets.words LIKE' => '%this%']);
     * ```
     *
     * ### Options
     *
     * - `tokenDecorator`: Callable function which is applied to every token before it
     *   gets applied. Returning anything that is not a `TokenInterface` will skip that
     *   token from being used.
     *
     * @param string $criteria A search-criteria
     * @param \Cake\ORM\Query $query The query to scope
     * @param array $options Additional options, see above
     * @return \Cake\ORM\Query The scoped query
     */
    public function search($criteria, Query $query, array $options = [])
    {
        $tokens = $this->tokenizer($criteria);
        $options += [
            'tokenDecorator' => function ($t) {
                return $t;
            },
        ];

        // Nothing to scope by: leave the query untouched (no join is added)
        if (empty($tokens)) {
            return $query;
        }

        // The decorator was previously referenced without ever being read from
        // the options, which made every non-empty search fail
        $decorator = $options['tokenDecorator'];
        $query->innerJoinWith('SearchDatasets');

        foreach ($tokens as $token) {
            $token = $decorator($token);

            // Decorators may veto a token by returning anything else
            if (!($token instanceof TokenInterface)) {
                continue;
            }

            if ($token->isOperator()) {
                $query = $this->_scopeOperator($query, $token);
            } else {
                $query = $this->_scopeWords($query, $token);
            }
        }

        return $query;
    }
350
351
    /**
     * Splits the given search criteria into its tokens using the mini-language
     * parser.
     *
     * @param string $criteria A search criteria. e.g. `-hello +world`
     * @return array List of tokens found
     */
    public function tokenizer($criteria)
    {
        $parser = new MiniLanguageParser($criteria);

        return (array)$parser->parse();
    }
361
362
    /**
     * Applies an operator token to the query by delegating to the table's
     * `applySearchOperator()` method.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing an operator.
     *  e.g `-op_name:op_value`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeOperator(Query $query, TokenInterface $token)
    {
        $scoped = $this->_table->applySearchOperator($query, $token);

        return $scoped;
    }
373
374
    /**
     * Applies a words token to the query, either through the fulltext variant
     * (when available) or through a LIKE / NOT LIKE condition on the indexed
     * words.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing a words sequence.
     *  e.g `this is a phrase`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeWords(Query $query, TokenInterface $token)
    {
        if ($this->_isFullTextEnabled()) {
            return $this->_scopeWordsInFulltext($query, $token);
        }

        $operator = $token->negated() ? 'NOT LIKE' : 'LIKE';

        // `*` matches any one or more characters, `!` matches any single character
        $pattern = '%' . str_replace(['*', '!'], ['%', '_'], $token->value()) . '%';
        $condition = ["SearchDatasets.words {$operator}" => $pattern];

        // Pick the conjunction method from the token; anything else is a plain where()
        $conjunctions = ['or' => 'orWhere', 'and' => 'andWhere'];
        $type = $token->where();
        $method = (is_string($type) && isset($conjunctions[$type])) ? $conjunctions[$type] : 'where';
        $query->{$method}($condition);

        return $query;
    }
406
407
    /**
     * Similar to "_scopeWords" but using MySQL's fulltext indexes.
     *
     * The search value comes straight from the user-provided criteria, so it is
     * sent to the database as a bound parameter rather than being written into
     * the SQL text.
     *
     * @param \Cake\ORM\Query $query The query to scope
     * @param \Search\Parser\TokenInterface $token Token describing a words sequence.
     *  e.g `this is a phrase`
     * @return \Cake\ORM\Query Scoped query
     */
    protected function _scopeWordsInFulltext(Query $query, TokenInterface $token)
    {
        // Both criteria wildcards (`*` and `!`) are expressed as `*` here
        $value = str_replace('!', '*', $token->value());

        // A leading `+` is dropped from the value
        $value = mb_strpos($value, '+') === 0 ? mb_substr($value, 1) : $value;

        // Empty values and storage-engine stopwords leave the query untouched
        if (empty($value) || in_array($value, $this->_stopWords(), true)) {
            return $query;
        }

        // A uniquely numbered placeholder lets several tokens scope the same
        // query. Binding also makes the former quote substitution unnecessary:
        // quotes and backslashes in the value can no longer alter the statement.
        $placeholder = $query->valueBinder()->placeholder('fulltext');
        $query->bind($placeholder, $value, 'string');

        $not = $token->negated() ? 'NOT ' : '';
        $conditions = ["{$not}MATCH(SearchDatasets.words) AGAINST({$placeholder} IN BOOLEAN MODE) > 0"];

        if ($token->where() === 'or') {
            $query->orWhere($conditions);
        } elseif ($token->where() === 'and') {
            $query->andWhere($conditions);
        } else {
            $query->where($conditions);
        }

        return $query;
    }
437
438
    /**
     * Whether FullText index is available or not and should be used.
     *
     * Fulltext is used only when the `fulltext` option is on, the connection's
     * driver class is named `mysql`, and the datasets schema has a `fulltext`
     * index that includes the `words` column. The outcome of the driver/schema
     * inspection is memoized per connection and dataset table, so engines bound
     * to different connections or dataset tables do not share each other's
     * answer.
     *
     * @return bool True if enabled and should be used, false otherwise
     */
    protected function _isFullTextEnabled()
    {
        if (!$this->config('fulltext')) {
            return false;
        }

        // Static locals are shared by every instance of the class, hence the key
        static $memo = [];
        $connection = $this->_table->connection();
        $key = $connection->configName() . ':' . $this->config('datasetTable');
        if (isset($memo[$key])) {
            return $memo[$key];
        }

        $enabled = false;
        list(, $driverClass) = namespaceSplit(strtolower(get_class($connection->driver())));

        if ($driverClass === 'mysql') {
            $schema = $this->_table->SearchDatasets->schema();
            foreach ($schema->indexes() as $index) {
                $info = $schema->index($index);
                if (in_array('words', $info['columns'], true) &&
                    strtolower($info['type']) === 'fulltext'
                ) {
                    $enabled = true;
                    break;
                }
            }
        }

        $memo[$key] = $enabled;

        return $enabled;
    }
477
478
    /**
     * Gets the storage engine's list of stopwords, that is, words considered
     * common or trivial enough to be omitted from the search index and ignored
     * in search queries.
     *
     * The list is read from `INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD` and
     * stored in the `_cake_model_` cache under a key derived from the
     * connection's configuration name.
     *
     * @return array List of words
     */
    protected function _stopWords()
    {
        $connection = $this->_table->find()->connection();
        $key = $connection->configName() . '_generic_engine_stopwords_list';

        $cached = Cache::read($key, '_cake_model_');
        if ($cached) {
            return (array)$cached;
        }

        $rows = $connection
            ->execute('SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD')
            ->fetchAll('assoc');

        $stopWords = [];
        foreach ((array)$rows as $row) {
            // Skip rows without a usable `value` column
            if (empty($row['value'])) {
                continue;
            }

            $stopWords[] = $row['value'];
        }

        Cache::write($key, $stopWords, '_cake_model_');

        return $stopWords;
    }
508
509
    /**
     * Reads the value of the configured primary key (`pk`) from the given entity.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity
     * @return string
     * @deprecated Use direct access as `$entity->get($this->config('pk'))`
     */
    protected function _entityId(EntityInterface $entity)
    {
        $primaryKey = $this->config('pk');

        return $entity->get($primaryKey);
    }
520
521
    /**
     * Extracts a list of words to be indexed for the given entity.
     *
     * Every string or numeric value found in the (flattened) entity is collected,
     * then the resulting text is stripped of line breaks and HTML tags, filtered
     * according to the `strict` option, lower-cased, cleaned of banned words and
     * finally of invalid UTF-8 sequences.
     *
     * NOTE: Words can be repeated, this allows to search phrases.
     *
     * @param \Cake\Datasource\EntityInterface $entity The entity for which generate
     *  the list of words
     * @return string Space-separated list of words. e.g. `cat dog this that`
     */
    public function extractEntityWords(EntityInterface $entity)
    {
        $text = '';
        foreach (Hash::flatten($entity->toArray()) as $value) {
            if (is_string($value) || is_numeric($value)) {
                $text .= " {$value}";
            }
        }

        // Line breaks become spaces (not nothing), otherwise the last word of a
        // line and the first word of the next one would be glued together
        $text = str_replace(["\n", "\r"], ' ', trim($text));
        $text = strip_tags($text); // remove HTML tags, but keep their content
        $strict = $this->config('strict');

        if (!empty($strict)) {
            // only: space, digits, letters (any language), "@", ".", ",", "-", "_", "/", "\"
            //
            // The backslash is written as four backslashes so the regex receives
            // an escaped literal backslash. The previous `\\0-9` reached PCRE as
            // `\0-9`, i.e. the character range NUL..9, which let most ASCII
            // punctuation through while still stripping backslashes.
            $pattern = is_string($strict) ? $strict : '[^\p{L}\p{N}\s\@\.\,\-\_\/\\\\]';
            $text = preg_replace('/' . $pattern . '/ui', ' ', $text);
        }

        $text = trim(preg_replace('/\s{2,}/i', ' ', $text)); // remove double spaces
        $text = mb_strtolower($text); // all to lowercase
        $text = $this->_filterText($text); // filter
        $text = iconv('UTF-8', 'UTF-8//IGNORE', mb_convert_encoding($text, 'UTF-8')); // remove any invalid character

        return trim($text);
    }
558
559
    /**
     * Removes any invalid word from the given text.
     *
     * Words are the space-separated pieces of the text. When the `bannedWords`
     * option is a callable, a word is removed when the callable returns a truthy
     * value for it; otherwise a word is removed when it is an empty string or
     * appears in the `bannedWords` list.
     *
     * NOTE(review): the class-level documentation states that a `bannedWords`
     * callable returning TRUE means "safe for indexing", which is the opposite of
     * what this method does. Behavior is intentionally left as is here; confirm
     * which of the two is the intended contract.
     *
     * @param string $text The text to filter
     * @return string Filtered text
     */
    protected function _filterText($text)
    {
        $banned = $this->config('bannedWords');

        // return true means `yes, it's banned`
        if (is_callable($banned)) {
            $isBanned = $banned;
        } else {
            $list = (array)$banned;
            $isBanned = function ($word) use ($list) {
                // Only the empty string counts as "no word"; `empty()` would also
                // discard the legitimate word "0"
                return $word === '' || in_array($word, $list);
            };
        }

        $kept = array_filter(explode(' ', $text), function ($word) use ($isBanned) {
            return !$isBanned($word);
        });

        return implode(' ', $kept);
    }
589
}
590