Completed
Push — 2.0 ( 51d40e...f6666e )
by Christopher
07:21
created

GenericEngine::_isFullTextEnabled()   C

Complexity

Conditions 7
Paths 6

Size

Total Lines 34
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 19
nc 6
nop 0
dl 0
loc 34
rs 6.7272
c 0
b 0
f 0
1
<?php
2
/**
3
 * Licensed under The GPL-3.0 License
4
 * For full copyright and license information, please see the LICENSE.txt
5
 * Redistributions of files must retain the above copyright notice.
6
 *
7
 * @since    2.0.0
8
 * @author   Christopher Castro <[email protected]>
9
 * @link     http://www.quickappscms.org
10
 * @license  http://opensource.org/licenses/gpl-3.0.html GPL-3.0 License
11
 */
12
namespace Search\Engine\Generic;
13
14
use Cake\Cache\Cache;
15
use Cake\Core\InstanceConfigTrait;
16
use Cake\Datasource\EntityInterface;
17
use Cake\Error\FatalErrorException;
18
use Cake\Event\Event;
19
use Cake\Event\EventManager;
20
use Cake\ORM\Query;
21
use Cake\ORM\Table;
22
use Cake\ORM\TableRegistry;
23
use Cake\Utility\Hash;
24
use Cake\Utility\Inflector;
25
use Search\Engine\BaseEngine;
26
use Search\Engine\Generic\Exception\CompoundPrimaryKeyException;
27
use Search\Parser\MiniLanguage\MiniLanguageParser;
28
use Search\Parser\TokenInterface;
29
use \ArrayObject;
30
31
/**
32
 * This Search Engine allows entities to be searchable through an auto-generated
33
 * list of words.
34
 *
35
 * ## Using Generic Engine
36
 *
37
 * You must indicate Searchable behavior to use this engine, for example when
38
 * attaching Searchable behavior to `Articles` table:
39
 *
40
 * ```php
41
 * $this->addBehavior('Search.Searchable', [
42
 *     'engine' => [
43
 *         'className' => 'Search\Engine\Generic\GenericEngine',
44
 *         'config' => [
45
 *             'bannedWords' => []
46
 *         ]
47
 *     ]
48
 * ]);
49
 * ```
50
 *
51
 * This engine will apply a series of filters (converts to lowercase, remove line
52
 * breaks, etc) to words list extracted from each entity being indexed.
53
 *
54
 * ### Banned Words
55
 *
56
 * You can use the `bannedWords` option to tell which words should not be indexed by
57
 * this engine. For example:
58
 *
59
 * ```php
60
 * $this->addBehavior('Search.Searchable', [
61
 *     'engine' => [
62
 *         'className' => 'Search\Engine\Generic\GenericEngine',
63
 *         'config' => [
64
 *             'bannedWords' => ['of', 'the', 'and']
65
 *         ]
66
 *     ]
67
 * ]);
68
 * ```
69
 *
70
 * If you need to ban a really specific list of words you can set `bannedWords`
71
 * option as a callable method that should return true or false to tell if a word
72
 * should be indexed or not. For example:
73
 *
74
 * ```php
75
 * $this->addBehavior('Search.Searchable', [
76
 *     'engine' => [
77
 *         'className' => 'Search\Engine\Generic\GenericEngine',
78
 *         'config' => [
79
 *             'bannedWords' => function ($word) {
80
 *                 return strlen($word) > 3;
81
 *             }
82
 *         ]
83
 *     ]
84
 * ]);
85
 * ```
86
 *
87
 * - Returning TRUE indicates that the word is safe for indexing (not banned).
88
 * - Returning FALSE indicates that the word should NOT be indexed (banned).
89
 *
90
 * In the example above, any word of 4 or more characters will be indexed
91
 * (e.g. "home", "name", "quickapps", etc). Any word of 3 or less characters will
92
 * be banned (e.g. "and", "or", "the").
93
 *
94
 * ## Searching Entities
95
 *
96
 * When using this engine, every entity under your table gets a list of indexed
97
 * words. The idea behind this is that you can use this list of words to locate any
98
 * entity based on a customized search-criteria. A search-criteria looks as follows:
99
 *
100
 *     "this phrase" OR -"not this one" AND this
101
 *
102
 * ---
103
 *
104
 * Use wildcard searches to broaden results; asterisk (`*`) matches any one or
105
 * more characters, exclamation mark (`!`) matches any single character:
106
 *
107
 *     "this *rase" OR -"not th!! one" AND thi!
108
 *
109
 * Anything containing space (" ") characters must be wrapped between quotation
110
 * marks:
111
 *
112
 *     "this phrase" special_operator:"[100 to 500]" -word -"more words" -word_1 word_2
113
 *
114
 * The search criteria above will be treated as if it were composed by the
115
 * following parts:
116
 *
117
 * - `this phrase`
118
 * - `special_operator:[100 to 500]`
119
 * - `-word`
120
 * - `-more words`
121
 * - `-word_1`
122
 * - `word_2`
123
 *
124
 * ---
125
 *
126
 * Search criteria allows you to perform complex search conditions in a
127
 * human-readable way. Allows you, for example, create user-friendly search-forms,
128
 * or create some RSS feed just by creating a friendly URL using a search-criteria.
129
 * e.g.: `http://example.com/rss/category:art date:>2014-01-01`
130
 *
131
 * You must use the `search()` method to scope any query using a search-criteria.
132
 * For example, in one controller using `Users` model:
133
 *
134
 * ```php
135
 * $criteria = '"this phrase" OR -"not this one" AND this';
136
 * $query = $this->Users->find();
137
 * $query = $this->Users->search($criteria, $query);
138
 * ```
139
 *
140
 * The above will alter the given $query object according to the given criteria.
141
 * The second argument (query object) is optional, if not provided this Behavior
142
 * automatically generates a find-query for you. Previous example and the one
143
 * below are equivalent:
144
 *
145
 * ```php
146
 * $criteria = '"this phrase" OR -"not this one" AND this';
147
 * $query = $this->Users->search($criteria);
148
 * ```
149
 */
150
class GenericEngine extends BaseEngine
151
{
152
153
    /**
154
     * {@inheritDoc}
155
     *
156
     * - operators: A list of registered operators methods as `name` =>
157
     *   `methodName`.
158
     *
159
     * - strict: Used to filter any invalid word. Set to a string representing a
160
     *   regular expression describing which characters should be removed. Or set
161
     *   to TRUE to use the default discard criteria: only letters, digits and few
162
     *   basic symbols (".", ",", "/", etc). Defaults to TRUE (custom filter
163
     *   regex). VALID ONLY when `wordsExtractor` is set to null.
164
     *
165
     * - bannedWords: Array list of banned words, or a callable that should decide
166
     *   if the given word is banned or not. Defaults to empty array (allow
167
     *   everything). VALID ONLY when `wordsExtractor` is set to null.
168
     *
169
     * - fulltext: Whether to use FULLTEXT search whenever it is possible. Defaults to
170
     *   TRUE. This feature is only supported for MySQL InnoDB database engines.
171
     *
172
     * - datasetTable: Name of the MySQL table where words dataset should be stored and
173
     *   read from. This allows you to split large sets into different tables.
174
     *
175
     * - wordsExtractor: Callable function used to extract words from each entity being
176
     *   indexed. Such functions will receive an Entity object as first argument, and
177
     *   should return a string of words. e.g. `lorem ipsum dolorem`. Defaults to internal
178
     *   method `extractEntityWords()`
179
     */
180
    protected $_defaultConfig = [
181
        'operators' => [],
182
        'strict' => true,
183
        'bannedWords' => [],
184
        'wordsExtractor' => null,
185
        'fulltext' => true,
186
        'datasetTable' => 'search_datasets',
187
    ];
188
189
    /**
190
     * {@inheritDoc}
191
     *
192
     * @throws \Search\Engine\Generic\Exception\CompoundPrimaryKeyException When using
193
     *   compound primary keys
194
     */
195
    public function __construct(Table $table, array $config = [])
    {
        // Both values are derived from the table and always override whatever
        // the caller passed under these keys.
        $config['tableAlias'] = (string)Inflector::underscore($table->table());
        $config['pk'] = $table->primaryKey();

        // Only single-column primary keys are supported.
        if (is_array($config['pk'])) {
            throw new CompoundPrimaryKeyException($config['tableAlias']);
        }

        // Default extractor simply delegates to extractEntityWords(); it is
        // registered before the parent constructor receives the user config.
        $this->_defaultConfig['wordsExtractor'] = function (EntityInterface $entity) {
            return $this->extractEntityWords($entity);
        };

        parent::__construct($table, $config);

        $datasetTable = $this->config('datasetTable');
        $association = [
            'foreignKey' => 'entity_id',
            'joinType' => 'INNER',
            'conditions' => [
                'SearchDatasets.table_alias' => $config['tableAlias'],
            ],
            'dependent' => true
        ];

        // A non-default dataset table gets its own (cloned) table object so the
        // registry instance is not re-pointed to another physical table.
        if ($datasetTable != $this->_defaultConfig['datasetTable']) {
            $target = clone TableRegistry::get('Search.SearchDatasets');
            $target->table($datasetTable);
            $association['targetTable'] = $target;
        }

        $this->_table->hasOne('Search.SearchDatasets', $association);
    }
226
227
    /**
228
     * {@inheritDoc}
229
     */
230
    public function index(EntityInterface $entity)
    {
        $datasets = $this->_table->SearchDatasets;

        // Identifies the dataset row belonging to this entity. The primary key
        // is read directly instead of through the deprecated _entityId().
        $scope = [
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ];

        $set = $datasets->find()
            ->where($scope)
            ->limit(1)
            ->first();

        if (!$set) {
            $set = $datasets->newEntity($scope + ['words' => '']);
        }

        // The option is named `wordsExtractor` (see $_defaultConfig); reading
        // `wordExtractor` yields null and makes the call below fail.
        $extractor = $this->config('wordsExtractor');

        // We add starting and trailing space to allow LIKE %something-to-match%
        $set = $datasets->patchEntity($set, [
            'words' => ' ' . $extractor($entity) . ' '
        ]);

        return (bool)$datasets->save($set);
    }
255
256
    /**
257
     * {@inheritDoc}
258
     */
259
    public function delete(EntityInterface $entity)
    {
        // Removes every dataset row registered for this entity. The primary key
        // is read directly instead of through the deprecated _entityId().
        $this->_table->SearchDatasets->deleteAll([
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ]);

        // The result of deleteAll() is intentionally not inspected: this method
        // always reports success.
        return true;
    }
268
269
    /**
270
     * {@inheritDoc}
271
     */
272
    public function get(EntityInterface $entity)
    {
        // Looks up the single dataset row stored for this entity. The primary
        // key is read directly instead of through the deprecated _entityId().
        $conditions = [
            'entity_id' => $entity->get($this->config('pk')),
            'table_alias' => $this->config('tableAlias'),
        ];

        return $this->_table->SearchDatasets->find()
            ->where($conditions)
            ->limit(1)
            ->first();
    }
282
283
    /**
284
     * {@inheritDoc}
285
     *
286
     * It looks for search-criteria and applies them over the query object. For
287
     * example, given the criteria below:
288
     *
289
     *     "this phrase" -"and not this one"
290
     *
291
     * Alters the query object as follows:
292
     *
293
     * ```php
294
     * $query->where([
295
     *    'indexed_words LIKE' => '%this phrase%',
296
     *    'indexed_words NOT LIKE' => '%and not this one%'
297
     * ]);
298
     * ```
299
     *
300
     * The `AND` & `OR` keywords are allowed to create complex conditions. For
301
     * example:
302
     *
303
     *     "this phrase" OR -"and not this one" AND "this"
304
     *
305
     * Will produce something like:
306
     *
307
     * ```php
308
     * $query->where(['indexed_words LIKE' => '%this phrase%'])
309
     *     ->orWhere(['indexed_words NOT LIKE' => '%and not this one%'])
310
     *     ->andWhere(['indexed_words LIKE' => '%this%']);
311
     * ```
312
     */
313
    public function search($criteria, Query $query)
    {
        $parser = new MiniLanguageParser($criteria);
        $tokens = (array)$parser->parse();

        // Nothing to scope: hand the query back untouched (no join is added).
        if (empty($tokens)) {
            return $query;
        }

        $query->innerJoinWith('SearchDatasets');

        // Each token either names an operator or carries plain words.
        foreach ($tokens as $token) {
            $query = $token->isOperator()
                ? $this->_scopeOperator($query, $token)
                : $this->_scopeWords($query, $token);
        }

        return $query;
    }
331
332
    /**
333
     * Scopes the given query using the given operator token.
334
     *
335
     * @param \Cake\ORM\Query $query The query to scope
336
     * @param \Search\Parser\TokenInterface $token Token describing an operator. e.g `-op_name:op_value`
337
     * @return \Cake\ORM\Query Scoped query
338
     */
339
    protected function _scopeOperator(Query $query, TokenInterface $token)
    {
        // Operator handling is delegated entirely to the table object.
        $scoped = $this->_table->applySearchOperator($query, $token);

        return $scoped;
    }
343
344
    /**
345
     * Scopes the given query using the given words token.
346
     *
347
     * @param \Cake\ORM\Query $query The query to scope
348
     * @param \Search\Parser\TokenInterface $token Token describing a words sequence. e.g `this is a phrase`
349
     * @return \Cake\ORM\Query Scoped query
350
     */
351
    protected function _scopeWords(Query $query, TokenInterface $token)
    {
        // Prefer the FULLTEXT implementation whenever it is available.
        if ($this->_isFullTextEnabled()) {
            return $this->_scopeWordsInFulltext($query, $token);
        }

        $operator = $token->negated() ? 'NOT LIKE' : 'LIKE';

        // Translate the criteria wildcards into SQL LIKE wildcards:
        // * Matches any one or more characters.
        // ! Matches any single character.
        $pattern = '%' . str_replace(['*', '!'], ['%', '_'], $token->value()) . '%';
        $conditions = ['SearchDatasets.words ' . $operator => $pattern];

        // The token tells how this condition is glued to the previous ones.
        $glue = $token->where();
        if ($glue === 'or') {
            $query->orWhere($conditions);
        } elseif ($glue === 'and') {
            $query->andWhere($conditions);
        } else {
            $query->where($conditions);
        }

        return $query;
    }
376
377
    /**
378
     * Similar to "_scopeWords" but using MySQL's fulltext indexes.
379
     *
380
     * @param \Cake\ORM\Query $query The query to scope
381
     * @param \Search\Parser\TokenInterface $token Token describing a words sequence. e.g `this is a phrase`
382
     * @return \Cake\ORM\Query Scoped query
383
     */
384
    protected function _scopeWordsInFulltext(Query $query, TokenInterface $token)
    {
        // Both criteria wildcards map to the boolean-mode `*` operator.
        $value = str_replace(['*', '!'], ['*', '*'], $token->value());

        // A leading "+" is dropped before matching.
        $value = mb_strpos($value, '+') === 0 ? mb_substr($value, 1) : $value;

        // Empty values and stopwords leave the query untouched.
        if (empty($value) || in_array($value, $this->_stopWords())) {
            return $query;
        }

        $not = $token->negated() ? 'NOT' : '';

        // Kept from the previous implementation so existing searches keep
        // matching the same way.
        $value = str_replace("'", '"', $value);

        // The value comes from the user-supplied search criteria, so it is
        // bound as a parameter rather than interpolated into the SQL string
        // (a backslash could otherwise break out of the quoted literal). The
        // value binder hands out a unique placeholder per token.
        $placeholder = $query->valueBinder()->placeholder('fulltext');
        $query->bind($placeholder, $value, 'string');
        $conditions = ["{$not} MATCH(SearchDatasets.words) AGAINST({$placeholder} IN BOOLEAN MODE) > 0"];

        $glue = $token->where();
        if ($glue === 'or') {
            $query->orWhere($conditions);
        } elseif ($glue === 'and') {
            $query->andWhere($conditions);
        } else {
            $query->where($conditions);
        }

        return $query;
    }
407
408
    /**
409
     * Whether FullText index is available or not and should be used.
410
     *
411
     * @return bool True if enabled and should be used, false otherwise
412
     */
413
    protected function _isFullTextEnabled()
    {
        // Explicitly disabled through configuration: never cached, always false.
        if (!$this->config('fulltext')) {
            return false;
        }

        // A function-level static is shared by every engine instance, so the
        // memo is keyed by connection name and dataset table; a single flag
        // would reuse the first table's answer for all the others.
        static $memo = [];
        $connection = $this->_table->connection();
        $memoKey = $connection->configName() . ':' . $this->config('datasetTable');
        if (isset($memo[$memoKey])) {
            return $memo[$memoKey];
        }

        $enabled = false;
        list(, $driverClass) = namespaceSplit(strtolower(get_class($connection->driver())));

        // Only the MySQL driver is considered; then look for a FULLTEXT index
        // covering the `words` column of the datasets table.
        if ($driverClass === 'mysql') {
            $schema = $this->_table->SearchDatasets->schema();
            foreach ($schema->indexes() as $index) {
                $info = $schema->index($index);
                if (in_array('words', $info['columns']) &&
                    strtolower($info['type']) === 'fulltext'
                ) {
                    $enabled = true;
                    break;
                }
            }
        }

        $memo[$memoKey] = $enabled;

        return $enabled;
    }
447
448
    /**
449
     * Gets a list of the storage engine's stopwords, that is, words that are considered
450
     * common or trivial enough that they are omitted from the search index and ignored
451
     * in search queries.
452
     *
453
     * @return array List of words
454
     */
455
    protected function _stopWords()
    {
        $conn = $this->_table->find()->connection();
        $cacheKey = $conn->configName() . '_generic_engine_stopwords_list';

        // Any cached array counts as a hit, including an empty one; a plain
        // truthiness check would re-run the query below on every call whenever
        // the stopword list is empty.
        $cached = Cache::read($cacheKey, '_cake_model_');
        if (is_array($cached)) {
            return $cached;
        }

        $rows = $conn
            ->execute('SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_DEFAULT_STOPWORD')
            ->fetchAll('assoc');

        // Keep only non-empty `value` entries.
        $words = [];
        foreach ((array)$rows as $row) {
            if (!empty($row['value'])) {
                $words[] = $row['value'];
            }
        }

        Cache::write($cacheKey, $words, '_cake_model_');

        return $words;
    }
478
479
    /**
480
     * Calculates entity's primary key.
481
     *
482
     * @param \Cake\Datasource\EntityInterface $entity The entity
483
     * @return string
484
     * @deprecated Use direct access as `$entity->get($this->config('pk'))`
485
     */
486
    protected function _entityId(EntityInterface $entity)
    {
        // The primary key column name is stored under the `pk` config key.
        $primaryKey = $this->config('pk');

        return $entity->get($primaryKey);
    }
490
491
    /**
492
     * Extracts a list of words to be indexed for given entity.
493
     *
494
     * NOTE: Words can be repeated, this allows to search phrases.
495
     *
496
     * @param \Cake\Datasource\EntityInterface $entity The entity for which generate
497
     *  the list of words
498
     * @return string Space-separated list of words. e.g. `cat dog this that`
499
     */
500
    public function extractEntityWords(EntityInterface $entity)
    {
        // Concatenate every scalar text/number found anywhere in the entity.
        $text = '';
        foreach (Hash::flatten($entity->toArray()) as $value) {
            if (is_string($value) || is_numeric($value)) {
                $text .= " {$value}";
            }
        }

        $text = str_replace(["\n", "\r"], '', trim((string)$text)); // remove new lines
        $text = strip_tags($text); // remove HTML tags, but keep their content
        $strict = $this->config('strict');

        if (!empty($strict)) {
            // only: space, digits (0-9), letters (any language), "@", ".", ",", "-", "_", "/", "\"
            //
            // The backslash must reach PCRE as `\\` (four backslashes in a
            // single-quoted PHP string). Written as `\\0-9` it becomes `\0-9`,
            // i.e. the range NUL..."9", which keeps most ASCII punctuation and
            // still strips the backslash.
            $pattern = is_string($strict) ? $strict : '[^\p{L}\p{N}\s\@\.\,\-\_\/\\\\0-9]';
            $text = preg_replace('/' . $pattern . '/ui', ' ', $text);
        }

        $text = trim(preg_replace('/\s{2,}/i', ' ', $text)); // remove double spaces
        $text = mb_strtolower($text); // all to lowercase
        $text = $this->_filterText($text); // filter
        $text = iconv('UTF-8', 'UTF-8//IGNORE', mb_convert_encoding($text, 'UTF-8')); // remove any invalid character

        return trim($text);
    }
528
529
    /**
530
     * Removes any invalid word from the given text.
531
     *
532
     * @param string $text The text to filter
533
     * @return string Filtered text
534
     */
535
    protected function _filterText($text)
    {
        $bannedWords = $this->config('bannedWords');

        // $isBanned returns true for words that must be dropped.
        if (is_callable($bannedWords)) {
            // A user callable answers "is this word safe to index?": TRUE keeps
            // the word, FALSE bans it. Its result is therefore negated here.
            $isBanned = function ($word) use ($bannedWords) {
                return !$bannedWords($word);
            };
        } else {
            $list = (array)$bannedWords;
            $isBanned = function ($word) use ($list) {
                return in_array($word, $list) || empty($word);
            };
        }

        $words = explode(' ', $text);
        foreach ($words as $i => $w) {
            if ($isBanned($w)) {
                unset($words[$i]);
            }
        }

        return implode(' ', $words);
    }
559
}
560