Failed Conditions
Pull Request — master (#2943)
by Andreas
03:19
created

FulltextSearch::resultCombine()   B

Complexity

Conditions 6
Paths 3

Size

Total Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
nc 3
nop 1
dl 0
loc 22
rs 8.9457
c 0
b 0
f 0
1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Extension\Event;
6
use dokuwiki\Search\FulltextIndex;
7
use dokuwiki\Search\QueryParser;
8
use dokuwiki\Utf8;
9
10
// create snippets for the first few results only
11
const FT_SNIPPET_NUMBER = 15;
12
13
/**
14
 * Class DokuWiki Fulltext Search
15
 *
16
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
17
 * @author     Andreas Gohr <[email protected]>
18
 */
19
class FulltextSearch
20
{
21
    /** @var FulltextSearch $instance */
22
    protected static $instance = null;
23
24
    /**
     * Return the shared FulltextSearch object, creating it on first use
     *
     * @return FulltextSearch
     */
    public static function getInstance()
    {
        // reuse the instance created by an earlier call
        if (static::$instance !== null) {
            return static::$instance;
        }

        static::$instance = new static();
        return static::$instance;
    }
36
37
    /**
     * Fulltext Search constructor
     *
     * Declared protected to prevent direct object creation from outside the class.
     */
    protected function __construct()
    {
    }
41
42
    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query.
     *
     * The search parameters are bundled into an event data array and the
     * SEARCH_QUERY_FULLPAGE event is triggered with pageSearchCallBack()
     * as its action.
     *
     * @param string     $query
     * @param array      $highlight handed to the event data by reference
     * @param string     $sort      'hits' is used when null is given
     * @param int|string $after  only show results with mtime after this date,
     *                           accepts timestamp or strtotime arguments
     * @param int|string $before only show results with mtime before this date,
     *                           accepts timestamp or strtotime arguments
     *
     * @return array
     */
    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        $data = [
            'query' => $query,
            'sort' => ($sort === null) ? 'hits' : $sort,
            'after' => $after,
            'before' => $before,
            // bound by reference so changes to the event data reach the caller
            'highlight' => &$highlight,
        ];

        return Event::createAndTrigger(
            'SEARCH_QUERY_FULLPAGE',
            $data,
            [$this, 'pageSearchCallBack']
        );
    }
74
75
    /**
     * Returns a list of matching documents for the given query
     *
     * Parses $data['query'], evaluates the parsed tokens on a stack of page
     * sets (page id => hit count), drops hidden, unreadable and nonexistent
     * pages, applies the $data['after'] / $data['before'] time limits and
     * sorts the rest by mtime ($data['sort'] === 'mtime') or by hit count.
     * The highlight terms of the parsed query are written to $data['highlight'].
     *
     * @author Andreas Gohr
     * @author Kazutaka Miyasaka
     * @triggers FULLTEXT_PHRASE_MATCH
     *
     * @param array $data  event data
     * @return array       matching documents
     */
    public function pageSearchCallBack(&$data)
    {
        // parse the given query
        $q = (new QueryParser)->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        // static analysis types parsed_ary as string|array: anything but a
        // non-empty array cannot be evaluated and yields no results
        if (empty($q['parsed_ary']) || !is_array($q['parsed_ary'])) return array();

        // lookup all words found in the query
        $FulltextIndex = FulltextIndex::getInstance();
        $lookup = $FulltextIndex->lookupWords($q['words']);

        // get all pages in this dokuwiki site (!: includes nonexistent pages)
        $pages_all = array();
        foreach ($FulltextIndex->getPages() as $id) {
            $pages_all[$id] = 0; // base: 0 hit
        }

        // process the query
        $stack = array();
        foreach ($q['parsed_ary'] as $token) {
            switch (substr($token, 0, 3)) {
                case 'W+:':
                case 'W-:':
                case 'W_:': // word
                    $word    = substr($token, 3);
                    // a word without lookup result matches no page
                    $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                    break;
                case 'P+:':
                case 'P-:': // phrase
                    $phrase = substr($token, 3);
                    // since phrases are always parsed as ((W1)(W2)...(P)),
                    // the end($stack) always points the pages that contain
                    // all words in this phrase
                    $pages  = end($stack);
                    if (!is_array($pages)) {
                        // empty stack: there are no candidate pages to check
                        $pages = array();
                    }
                    $pages_matched = array();
                    foreach (array_keys($pages) as $id) {
                        $evdata = array(
                            'id' => $id,
                            'phrase' => $phrase,
                            'text' => rawWiki($id)
                        );
                        $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                        // default matching, unless a handler prevented it or
                        // already reported a match
                        if ($event->advise_before() && $event->result !== true) {
                            $text = Utf8\PhpString::strtolower($evdata['text']);
                            if (strpos($text, $phrase) !== false) {
                                $event->result = true;
                            }
                        }
                        $event->advise_after();
                        if ($event->result === true) {
                            $pages_matched[$id] = 0; // phrase: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'N+:':
                case 'N-:': // namespace
                    $ns = cleanID(substr($token, 3)) . ':';
                    $pages_matched = array();
                    foreach (array_keys($pages_all) as $id) {
                        if (strpos($id, $ns) === 0) {
                            $pages_matched[$id] = 0; // namespace: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'AND': // and operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = $this->resultCombine(array($pages1, $pages2));
                    break;
                case 'OR':  // or operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = $this->resultUnite(array($pages1, $pages2));
                    break;
                case 'NOT': // not operation (unary)
                    $pages   = array_pop($stack);
                    $stack[] = $this->resultComplement(array($pages_all, $pages));
                    break;
            }
        }
        $docs = array_pop($stack);

        if (empty($docs)) return array();

        // check: settings, acls, existence
        foreach (array_keys($docs) as $id) {
            if (isHiddenPage($id)
                || auth_quickaclcheck($id) < AUTH_READ
                || !page_exists($id, '', false)
            ) {
                unset($docs[$id]);
            }
        }

        $docs = $this->filterResultsByTime($docs, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            uksort($docs, [$this, 'pagemtimesorter']);
        } else {
            // sort docs by count
            arsort($docs);
        }

        return $docs;
    }
190
191
    /**
     * Remove results whose page was modified outside the given time range
     *
     * @param array      $results search results in the form pageid => value
     * @param int|string $after   only returns results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string $before  only returns results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array the results that passed both limits; unchanged when
     *               neither limit is given
     */
    protected function filterResultsByTime(array $results, $after, $before)
    {
        if (!$after && !$before) {
            return $results;
        }

        // Integers are taken as timestamps, everything else goes through
        // strtotime(). A missing limit is never handed to strtotime(), because
        // passing null is deprecated since PHP 8.1; it becomes false, which is
        // also what strtotime() returns for input it cannot parse.
        if (!is_int($after)) {
            $after = ($after === null || $after === '') ? false : strtotime($after);
        }
        if (!is_int($before)) {
            $before = ($before === null || $before === '') ? false : strtotime($before);
        }

        foreach (array_keys($results) as $id) {
            $mTime = filemtime(wikiFN($id));
            $tooOld = $after && $after > $mTime;
            $tooNew = $before && $before < $mTime;
            if ($tooOld || $tooNew) {
                unset($results[$id]);
            }
        }
        return $results;
    }
219
220
    /**
     * Comparison callback ordering pages by modification time, newest first
     *
     * @param string $a page id
     * @param string $b page id
     *
     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
     *             and 0 if they are of the same age
     */
    protected function pagemtimesorter($a, $b)
    {
        $modified = [
            filemtime(wikiFN($a)),
            filemtime(wikiFN($b)),
        ];
        return $modified[1] - $modified[0];
    }
235
236
    /**
     * Creates a snippet extract
     *
     * Looks for up to four matches of the highlight terms in the raw page text,
     * cuts out the surrounding context of each and wraps the matched terms in
     * <strong class="search_hit"> tags. The default processing is skipped when
     * a FULLTEXT_SNIPPET_CREATE handler prevents it; the 'snippet' entry of the
     * event data is returned either way.
     *
     * @author Andreas Gohr
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight
     * @return mixed
     */
    public function snippet($id, $highlight)
    {
        // remove soft-hyphens
        $text = str_replace("\xC2\xAD", '', rawWiki($id));
        $evdata = [
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $match = [];
            $snippets = [];
            $utf8_offset = 0;
            $offset = 0;
            $end = 0;
            $len = Utf8\PhpString::strlen($text);

            // build a regexp from the phrases to highlight
            $terms = array_filter((array) $highlight);
            $terms = array_map('preg_quote_cb', $terms);
            $terms = array_map([$this, 'snippetRePreprocess'], $terms);
            $re1 = '(' . join('|', $terms) . ')';
            $re2 = $re1 . '.{0,75}(?!\\1)' . $re1;
            $re3 = $re1 . '.{0,45}(?!\\1)' . $re1 . '.{0,45}(?!\\1)(?!\\2)' . $re1;

            // tried in this order: three terms close together, two terms, one term
            $patterns = [
                '/' . $re3 . '/iu',
                '/' . $re2 . '/iu',
                '/' . $re1 . '/iu',
            ];

            for ($round = 0; $round < 4; $round++) {
                $found = false;
                foreach ($patterns as $pattern) {
                    if (preg_match($pattern, $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                        $found = true;
                        break;
                    }
                }
                if (!$found) {
                    break;
                }

                list($str, $idx) = $match[0];

                // convert $idx (a byte offset) into a utf8 character offset
                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
                $utf8_len = Utf8\PhpString::strlen($str);

                // establish context, 100 characters surrounding the match string
                // first look to see if we can go 100 either side,
                // then drop to 50 adding any excess if the other side can't go to 50,
                $pre = min($utf8_idx - $utf8_offset, 100);
                $post = min($len - $utf8_idx - $utf8_len, 100);

                if ($pre > 50 && $post > 50) {
                    $pre = $post = 50;
                } elseif ($pre > 50) {
                    $pre = min($pre, 100 - $post);
                } elseif ($post > 50) {
                    $post = min($post, 100 - $pre);
                } elseif ($offset == 0) {
                    // both are less than 50, means the context is the whole string
                    // make it so and break out of this loop - there is no need for the
                    // complex snippet calculations
                    $snippets = [$text];
                    break;
                }

                // establish context start and end points, try to append to previous
                // context if possible
                $start = $utf8_idx - $pre;
                // end of the previous context when the new context overlaps it,
                // false otherwise
                $append = ($start < $end) ? $end : false;
                // now set it to the end of this context
                $end = $utf8_idx + $utf8_len + $post;

                // loose test: an $append of 0 also counts as "nothing to append to"
                if ($append) {
                    $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $end - $append);
                } else {
                    $snippets[] = Utf8\PhpString::substr($text, $start, $end - $start);
                }

                // set $offset for next match attempt
                // continue matching after the current match
                // if the current match is not the longest possible match starting at the current offset
                // this prevents further matching of this snippet but for possible matches of length
                // smaller than match length + context (at least 50 characters) this match is part of the context
                $utf8_offset = $utf8_idx + $utf8_len;
                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
                $offset = Utf8\Clean::correctIdx($text, $offset);
            }

            // mark the hits with a control character first, so that the markers
            // pass through hsc() before they are turned into HTML tags
            $m = "\1";
            $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
            $joined = hsc(join('... ', $snippets));
            $evdata['snippet'] = preg_replace(
                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                '<strong class="search_hit">$1</strong>',
                $joined
            );
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }
352
353
    /**
     * Wraps a search term in regex boundary checks.
     *
     * A side of the term that carries an escaped asterisk ("\*") loses that
     * asterisk and gets no boundary check. A term consisting of nothing but
     * boundary checks is reduced to the empty string.
     *
     * @param string $term
     * @return string
     */
    public function snippetRePreprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) {
            return $term;
        }

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $leftEdge = '(?<!\pL)';
            $rightEdge = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $leftEdge = '\b';
            $rightEdge = '\b';
        }

        // left side first, the right side is then checked on the updated term
        $term = (substr($term, 0, 2) == '\\*')
            ? substr($term, 2)
            : $leftEdge . $term;

        $term = (substr($term, -2, 2) == '\\*')
            ? substr($term, 0, -2)
            : $term . $rightEdge;

        // only boundary checks are left
        $isBare = ($term == $leftEdge)
            || ($term == $rightEdge)
            || ($term == $leftEdge . $rightEdge);

        return $isBare ? '' : $term;
    }
392
393
    /**
     * Combine found documents and sum up their scores
     *
     * Implements the logical AND of searched words: a document is kept only
     * if it is present in every given array, and its score is the sum of its
     * scores in all of them. A single array is returned as is.
     *
     * based upon PEAR's PHP_Compat function for array_intersect_key()
     *
     * @param array $args An array of page arrays
     * @return array
     */
    protected function resultCombine($args)
    {
        $total = count($args);
        if ($total === 1) {
            return $args[0];
        }

        $combined = array();
        if ($total < 2) {
            // nothing to combine
            return $combined;
        }

        foreach ($args[0] as $id => $score) {
            $inAll = true;
            for ($n = 1; $n < $total; $n++) {
                if (!isset($args[$n][$id])) {
                    $inAll = false;
                    break;
                }
                $score += $args[$n][$id];
            }
            if ($inAll) {
                $combined[$id] = $score;
            }
        }
        return $combined;
    }
426
427
    /**
     * Unites found documents and sum up their scores
     * based upon resultCombine() method
     *
     * Implements the logical OR: every document present in any of the given
     * arrays is returned; a document found in several arrays gets the sum of
     * its scores. An empty argument list yields an empty array.
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka
     */
    protected function resultUnite($args)
    {
        $array_count = count($args);
        if ($array_count === 0) {
            return array();
        }

        $result = $args[0];
        // "<" instead of "!==" so the loop always terminates
        for ($i = 1; $i < $array_count; $i++) {
            foreach ($args[$i] as $id => $score) {
                if (isset($result[$id])) {
                    $result[$id] += $score;
                } else {
                    // first occurrence of this document: adding to a missing
                    // key would raise an "undefined index" notice/warning
                    $result[$id] = $score;
                }
            }
        }
        return $result;
    }
451
452
    /**
     * Computes the difference of documents using page id for comparison
     * nearly identical to PHP5's array_diff_key()
     *
     * Returns the documents of the first array that are not set in any of the
     * following arrays. A single array is returned as is.
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka
     */
    protected function resultComplement($args)
    {
        $total = count($args);
        if ($total === 1) {
            return $args[0];
        }

        $remaining = $args[0];
        foreach (array_keys($remaining) as $id) {
            // check the document against every other page array
            $n = 1;
            while ($n !== $total) {
                if (isset($args[$n][$id])) {
                    unset($remaining[$id]);
                }
                $n++;
            }
        }
        return $remaining;
    }
476
}
477