Completed
Push — betterCoreSearch ( 940f24...01c23b )
by Michael
06:20
created

fulltext.php ➔ ft_pagemtimesorter()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 4
nc 1
nop 2
dl 0
loc 5
rs 9.4285
c 0
b 0
f 0
1
<?php
2
/**
3
 * DokuWiki fulltextsearch functions using the index
4
 *
5
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6
 * @author     Andreas Gohr <[email protected]>
7
 */
8
9
if(!defined('DOKU_INC')) die('meh.');
10
11
/**
12
 * create snippets for the first few results only
13
 */
14
if(!defined('FT_SNIPPET_NUMBER')) define('FT_SNIPPET_NUMBER',15);
15
16
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * This is only the public entry point: the arguments are packed into an
 * event data array and handed to trigger_event(), with _ft_pageSearch()
 * registered as the default action of the event.
 *
 * @triggers SEARCH_QUERY_FULLPAGE
 *
 * @param string $query     the search query
 * @param array  $highlight passed by reference and bound by reference into the event data
 * @param string $sort      stored in the event data as 'sort' (default: 'hits')
 * @return array
 */
function ft_pageSearch($query,&$highlight, $sort = 'hits'){
    $data = array(
        'query'     => $query,
        'highlight' => &$highlight, // keep the caller's variable linked to the event data
        'sort'      => $sort,
    );

    return trigger_event('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
}
36
37
/**
 * Returns a list of matching documents for the given query
 *
 * The query is parsed into a postfix (RPN) token list by ft_queryParser() and
 * then evaluated on an operand stack whose entries are arrays of the form
 * page id => hit count. The surviving pages are filtered (hidden pages, ACL,
 * existence, time window) and finally sorted.
 *
 * @author Andreas Gohr
 * @author Kazutaka Miyasaka
 *
 * @param array $data event data; reads 'query' and 'sort', overwrites 'highlight'
 * @return array matching documents (page id => hit count)
 */
function _ft_pageSearch(&$data) {
    $Indexer = idx_get_indexer();

    // parse the given query and publish the terms to highlight
    $q = ft_queryParser($Indexer, $data['query']);
    $data['highlight'] = $q['highlight'];

    if (empty($q['parsed_ary'])) return array();

    // lookup all words found in the query
    $lookup = $Indexer->lookup($q['words']);

    // all pages known to the index, each with 0 hits (!: includes nonexistent pages)
    $pages_all = array();
    foreach ($Indexer->getPages() as $id) {
        $pages_all[$id] = 0;
    }

    // evaluate the postfix expression
    $stack = array();
    foreach ($q['parsed_ary'] as $token) {
        $type = substr($token, 0, 3);

        if ($type === 'W+:' || $type === 'W-:' || $type === 'W_:') {
            // word: push the pages found for it in the index
            $word    = substr($token, 3);
            $stack[] = (array) $lookup[$word];

        } elseif ($type === 'P+:' || $type === 'P-:') {
            // phrase: since phrases are always parsed as ((W1)(W2)...(P)),
            // the top of the stack holds the pages containing all of its words
            $phrase = substr($token, 3);
            $pages  = end($stack);
            $pages_matched = array();
            foreach (array_keys($pages) as $id) {
                $evdata = array(
                    'id'     => $id,
                    'phrase' => $phrase,
                    'text'   => rawWiki($id)
                );
                $evt = new Doku_Event('FULLTEXT_PHRASE_MATCH', $evdata);
                if ($evt->advise_before() && $evt->result !== true) {
                    // default matching: plain substring search in the lowercased text
                    $text = utf8_strtolower($evdata['text']);
                    if (strpos($text, $phrase) !== false) {
                        $evt->result = true;
                    }
                }
                $evt->advise_after();
                if ($evt->result === true) {
                    $pages_matched[$id] = 0; // phrase: always 0 hit
                }
            }
            $stack[] = $pages_matched;

        } elseif ($type === 'N+:' || $type === 'N-:') {
            // namespace: every indexed page whose id starts with the namespace
            $ns = cleanID(substr($token, 3)) . ':';
            $pages_matched = array();
            foreach (array_keys($pages_all) as $id) {
                if (strpos($id, $ns) === 0) {
                    $pages_matched[$id] = 0; // namespace: always 0 hit
                }
            }
            $stack[] = $pages_matched;

        } elseif ($type === 'AND') {
            // and operation: replace the two topmost operands by their intersection
            list($pages1, $pages2) = array_splice($stack, -2);
            $stack[] = ft_resultCombine(array($pages1, $pages2));

        } elseif ($type === 'OR') {
            // or operation: replace the two topmost operands by their union
            list($pages1, $pages2) = array_splice($stack, -2);
            $stack[] = ft_resultUnite(array($pages1, $pages2));

        } elseif ($type === 'NOT') {
            // not operation (unary): complement against all indexed pages
            $pages   = array_pop($stack);
            $stack[] = ft_resultComplement(array($pages_all, $pages));
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) return array();

    // check: settings, acls, existence
    foreach (array_keys($docs) as $id) {
        if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
            unset($docs[$id]);
        }
    }

    $docs = _ft_filterResultsByTime($docs);

    if ($data['sort'] === 'mtime') {
        // newest pages first
        uksort($docs, 'ft_pagemtimesorter');
    } else {
        // highest hit count first
        arsort($docs);
    }

    return $docs;
}
149
150
/**
 * Returns the backlinks for a given page
 *
 * Looks up the 'relation_references' key in the metadata index and drops
 * every entry that does not exist or, unless $ignore_perms is exactly true,
 * is hidden or not readable.
 *
 * @param string $id           The id for which links shall be returned
 * @param bool   $ignore_perms Ignore the fact that pages are hidden or read-protected
 * @return array The pages that contain links to the given page, sorted
 */
function ft_backlinks($id, $ignore_perms = false){
    $linking = idx_get_indexer()->lookupKey('relation_references', $id);

    if (count($linking) === 0) return $linking;

    // permissions are only skipped for a strict boolean true
    $check_perms = ($ignore_perms !== true);

    foreach ($linking as $idx => $page) {
        $forbidden = $check_perms && (isHiddenPage($page) || auth_quickaclcheck($page) < AUTH_READ);
        if ($forbidden || !page_exists($page, '', false)) {
            unset($linking[$idx]);
        }
    }

    sort($linking);
    return $linking;
}
176
177
/**
 * Returns the pages that use a given media file
 *
 * Looks up the 'relation_media' key in the metadata index and drops every
 * entry that does not exist or, unless $ignore_perms is exactly true, is
 * hidden or not readable.
 *
 * Note that before 2013-07-31 the second parameter was the maximum number of results and
 * permissions were ignored. That's why the parameter is now checked to be explicitly set
 * to true (with type bool) in order to be compatible with older uses of the function.
 *
 * @param string $id           The media id to look for
 * @param bool   $ignore_perms Ignore hidden pages and acls (optional, default: false)
 * @return array A list of pages that use the given media file, sorted
 */
function ft_mediause($id, $ignore_perms = false){
    $users = idx_get_indexer()->lookupKey('relation_media', $id);

    if (count($users) === 0) return $users;

    // permissions are only skipped for a strict boolean true (see note above)
    $check_perms = ($ignore_perms !== true);

    foreach ($users as $idx => $page) {
        $forbidden = $check_perms && (isHiddenPage($page) || auth_quickaclcheck($page) < AUTH_READ);
        if ($forbidden || !page_exists($page, '', false)) {
            unset($users[$idx]);
        }
    }

    sort($users);
    return $users;
}
207
208
209
210
/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter.
 * The third parameter allows to search in titles as well.
 *
 * The function always returns titles as well
 *
 * The arguments are packed into an event data array and passed to
 * trigger_event() with _ft_pageLookup() as the default action.
 *
 * @triggers SEARCH_QUERY_PAGELOOKUP
 * @author Andreas Gohr
 * @author Adrian Lang
 *
 * @param string $id        page id
 * @param bool   $in_ns     match against namespace as well?
 * @param bool   $in_title  search in title?
 * @return string[]
 */
function ft_pageLookup($id, $in_ns=false, $in_title=false){
    $data = array(
        'id'         => $id,
        'in_ns'      => $in_ns,
        'in_title'   => $in_title,
        'has_titles' => true, // for plugin backward compatibility check
    );
    return trigger_event('SEARCH_QUERY_PAGELOOKUP', $data, '_ft_pageLookup');
}
233
234
/**
 * Returns list of pages as array(pageid => First Heading)
 *
 * The id is run through ft_queryParser() first: if it contains a namespace
 * expression, the first namespace restricts the result and the remaining
 * highlight terms become the search string.
 *
 * @param array &$data event data; reads 'id', 'in_ns' and 'in_title'
 * @return string[]
 */
function _ft_pageLookup(&$data){
    // split out original parameters
    $id = $data['id'];
    $Indexer = idx_get_indexer();
    $parsedQuery = ft_queryParser($Indexer, $id);

    $ns = null; // stays null when the query names no namespace
    if (count($parsedQuery['ns']) > 0) {
        $ns = cleanID($parsedQuery['ns'][0]) . ':';
        $id = implode(' ', $parsedQuery['highlight']);
    }

    $in_ns    = $data['in_ns'];
    $in_title = $data['in_title'];
    $cleaned  = cleanID($id);

    $Indexer  = idx_get_indexer();
    $page_idx = $Indexer->getPages();

    $pages = array();
    if ($id !== '' && $cleaned !== '') {
        // match against the page name (or the full id when $in_ns is set)
        foreach ($page_idx as $p_id) {
            $haystack = $in_ns ? $p_id : noNSorNS($p_id);
            if (strpos($haystack, $cleaned) === false) continue;
            if (isset($pages[$p_id])) continue;
            $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
        }

        // optionally match against the indexed titles as well
        if ($in_title) {
            foreach ($Indexer->lookupKey('title', $id, '_ft_pageLookupTitleCompare') as $p_id) {
                if (isset($pages[$p_id])) continue;
                $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
            }
        }
    }

    // keep only pages inside the requested namespace
    if ($ns !== null) {
        foreach (array_keys($pages) as $p_id) {
            if (strpos($p_id, $ns) !== 0) {
                unset($pages[$p_id]);
            }
        }
    }

    // discard hidden pages
    // discard nonexistent pages
    // check ACL permissions
    foreach (array_keys($pages) as $idx) {
        if (!isVisiblePage($idx) || !page_exists($idx) || auth_quickaclcheck($idx) < AUTH_READ) {
            unset($pages[$idx]);
        }
    }

    $pages = _ft_filterResultsByTime($pages);

    uksort($pages,'ft_pagesorter');
    return $pages;
}
296
297
298
/**
 * Restricts search results to pages modified inside a time window
 *
 * The window is taken from the request parameters 'after' and 'before'. When
 * neither is present the results are returned untouched. Each bound may be
 * anything strtotime() understands; a value that strtotime() cannot parse but
 * that consists of digits only is treated as a Unix timestamp. A bound that
 * cannot be resolved at all is ignored.
 *
 * @param array $results search results in the form pageid => value
 *
 * @return array the results whose page file mtime lies inside the window
 */
function _ft_filterResultsByTime(array $results) {
    global $INPUT;
    if (!$INPUT->has('after') && !$INPUT->has('before')) {
        return $results;
    }

    // resolve both bounds to timestamps (false when not given or not parsable)
    $bounds = array();
    foreach (array('after', 'before') as $name) {
        $raw = $INPUT->str($name);
        if (is_int($raw)) {
            $time = $raw;
        } else {
            $time = strtotime($raw);
            // strtotime() rejects bare Unix timestamps; accept them explicitly
            if ($time === false && ctype_digit((string) $raw)) {
                $time = (int) $raw;
            }
        }
        $bounds[$name] = $time;
    }
    $after  = $bounds['after'];
    $before = $bounds['before'];

    foreach (array_keys($results) as $id) {
        $mTime = filemtime(wikiFN($id));
        if ($after && $after > $mTime) {
            unset($results[$id]);
            continue;
        }
        if ($before && $before < $mTime) {
            unset($results[$id]);
        }
    }

    return $results;
}
326
327
/**
 * Comparison callback for title lookups: tells whether the searched text
 * occurs anywhere in an indexed title, ignoring case.
 *
 * Thin wrapper around stripos() with swapped argument order and a boolean
 * return value.
 *
 * @param string $search searched title
 * @param string $title  title from index
 * @return bool true if $search is contained in $title
 */
function _ft_pageLookupTitleCompare($search, $title) {
    $position = stripos($title, $search);
    return false !== $position;
}
339
340
/**
 * Comparison callback ordering page ids by namespace depth first and by
 * plain string comparison second, so that pages higher up in the hierarchy
 * rank before pages nested more deeply.
 *
 * @param string $a
 * @param string $b
 * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b, and 0 if they are equal.
 */
function ft_pagesorter($a, $b){
    // the number of colons is the nesting depth of the id
    $depthA = substr_count($a, ':');
    $depthB = substr_count($b, ':');

    if ($depthA !== $depthB) {
        return ($depthA < $depthB) ? -1 : 1;
    }
    return strcmp($a, $b);
}
359
360
/**
 * Comparison callback ordering page ids by the modification time of their
 * page files, newest first.
 *
 * @param string $a
 * @param string $b
 *
 * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a and 0 if they are of the same age
 */
function ft_pagemtimesorter($a, $b) {
    $modifiedA = filemtime(wikiFN($a));
    $modifiedB = filemtime(wikiFN($b));

    // a larger mtime means a newer page, which must sort first
    $difference = $modifiedB - $modifiedA;
    return $difference;
}
373
374
/**
 * Creates a snippet extract
 *
 * Reads the raw page text, looks for up to four places where the highlight
 * terms occur (preferring spots where three, then two terms are close
 * together), cuts some context around each and wraps the terms in
 * <strong class="search_hit"> tags.
 *
 * @author Andreas Gohr
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string $id page id
 * @param array $highlight
 * @return mixed the 'snippet' entry of the event data
 */
function ft_snippet($id,$highlight){
    $text = rawWiki($id);
    $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
    $evdata = array(
        'id'        => $id,
        'text'      => &$text,
        'highlight' => &$highlight,
        'snippet'   => '',
    );

    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE',$evdata);
    if ($evt->advise_before()) {
        $match       = array();
        $snippets    = array();
        $utf8_offset = 0; // character position where the next search starts
        $offset      = 0; // the same position as a byte offset
        $end         = 0; // character position where the previous context ended
        $len         = utf8_strlen($text);

        // build a regexp from the phrases to highlight
        $terms = array_filter((array) $highlight);
        $terms = array_map('preg_quote_cb', $terms);
        $terms = array_map('ft_snippet_re_preprocess', $terms);
        $re1 = '('.join('|',$terms).')';
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        // collect at most four snippets
        for ($round = 0; $round < 4; $round++) {
            // prefer three terms close together, then two, then a single one
            $found = preg_match('/'.$re3.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)
                  || preg_match('/'.$re2.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)
                  || preg_match('/'.$re1.'/iu',$text,$match,PREG_OFFSET_CAPTURE,$offset);
            if (!$found) {
                break;
            }

            list($str,$idx) = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text,0,$idx));
            $utf8_len = utf8_strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre  = min($utf8_idx-$utf8_offset,100);
            $post = min($len-$utf8_idx-$utf8_len,100);

            if ($pre>50 && $post>50) {
                $pre = $post = 50;
            } elseif ($pre>50) {
                $pre = min($pre,100-$post);
            } elseif ($post>50) {
                $post = min($post, 100-$pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = array($text);
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start  = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
            $end    = $utf8_idx + $utf8_len + $post;   // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
            } else {
                $snippets[] = utf8_substr($text,$start,$end-$start);
            }

            // set $offset for next match attempt
            // continue matching after the current match
            // if the current match is not the longest possible match starting at the current offset
            // this prevents further matching of this snippet but for possible matches of length
            // smaller than match length + context (at least 50 characters) this match is part of the context
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$utf8_len));
            $offset = utf8_correctIdx($text,$offset);
        }

        // mark the hits with a control character first, escape the text, then
        // turn the marked spans into HTML
        $m = "\1";
        $snippets = preg_replace('/'.$re1.'/iu',$m.'$1'.$m,$snippets);
        $snippet = preg_replace('/'.$m.'([^'.$m.']*?)'.$m.'/iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}
474
475
/**
 * Wraps a search term in regex boundary checks.
 *
 * A leading or trailing escaped asterisk (as found in an already quoted
 * term) is removed and no boundary is added on that side. Terms containing
 * characters matched by IDX_ASIAN are returned untouched.
 *
 * @param string $term
 * @return string the term with boundary assertions, or '' if nothing but boundaries is left
 */
function ft_snippet_re_preprocess($term) {
    // do not process asian terms where word boundaries are not explicit
    if (preg_match('/'.IDX_ASIAN.'/u',$term)) {
        return $term;
    }

    // unicode word boundaries where supported,
    // see http://stackoverflow.com/a/2449017/172068
    // otherwise \b: not as correct, but at least won't break
    $BL = UTF8_PROPERTYSUPPORT ? '(?<!\pL)' : '\b';
    $BR = UTF8_PROPERTYSUPPORT ? '(?!\pL)'  : '\b';

    // left side: strip the wildcard or anchor to a boundary
    $term = (substr($term,0,2) == '\\*') ? substr($term,2) : $BL.$term;

    // right side: strip the wildcard or anchor to a boundary
    $term = (substr($term,-2,2) == '\\*') ? substr($term,0,-2) : $term.$BR;

    // nothing left but boundary assertions: drop the term
    if ($term == $BL || $term == $BR || $term == $BL.$BR) {
        return '';
    }
    return $term;
}
513
514
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned, each with the
 * sum of its scores. The result keeps the key order of the first array.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array
 */
function ft_resultCombine($args){
    $total = count($args);
    if ($total == 1) {
        return $args[0];
    }

    $combined = array();
    if ($total < 1) {
        return $combined;
    }

    foreach ($args[0] as $page => $score) {
        $in_all = true;
        for ($n = 1; $n !== $total; $n++) {
            if (!isset($args[$n][$page])) {
                // missing in one of the other arrays: not part of the intersection
                $in_all = false;
                break;
            }
            $score += $args[$n][$page];
        }
        if ($in_all) {
            $combined[$page] = $score;
        }
    }
    return $combined;
}
546
547
/**
 * Unites found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical OR: every
 * document found in any of the arrays is returned. Scores of documents that
 * appear in several arrays are added up.
 *
 * based upon ft_resultCombine() function
 *
 * @param array $args An array of page arrays
 * @return array
 *
 * @author Kazutaka Miyasaka
 */
function ft_resultUnite($args) {
    $array_count = count($args);
    if ($array_count === 1) {
        return $args[0];
    }

    $result = $args[0];
    for ($i = 1; $i !== $array_count; $i++) {
        foreach (array_keys($args[$i]) as $id) {
            // a page first seen in a later array starts from a score of 0;
            // adding to a missing key would read an undefined index
            if (!isset($result[$id])) {
                $result[$id] = 0;
            }
            $result[$id] += $args[$i][$id];
        }
    }
    return $result;
}
571
572
/**
 * Computes the difference of documents using page id for comparison
 *
 * Returns the entries of the first array whose page id is not set in any of
 * the following arrays.
 *
 * nearly identical to PHP5's array_diff_key()
 *
 * @param array $args An array of page arrays
 * @return array
 *
 * @author Kazutaka Miyasaka
 */
function ft_resultComplement($args) {
    $total = count($args);
    if ($total === 1) {
        return $args[0];
    }

    $remaining = $args[0];
    foreach (array_keys($remaining) as $page) {
        // drop the page as soon as any of the other arrays knows it
        $n = 1;
        while ($n !== $total) {
            if (isset($args[$n][$page])) {
                unset($remaining[$page]);
            }
            $n++;
        }
    }
    return $remaining;
}
596
597
/**
 * Parses a search query and builds an array of search formulas
 *
 * The query is first rewritten into an intermediate infix string, then
 * converted into a postfix (RPN) token array. The returned array contains
 * 'query' (the original string), 'parsed_str' (infix form), 'parsed_ary'
 * (postfix tokens) and the de-duplicated term lists 'words', 'phrases',
 * 'highlight', 'ns', 'notns', 'and' and 'not' (always arrays, possibly empty).
 *
 * @author Andreas Gohr
 * @author Kazutaka Miyasaka
 *
 * @param Doku_Indexer $Indexer
 * @param string $query search query
 * @return array of search formulas
 */
function ft_queryParser($Indexer, $query){
    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    $terms = preg_split('/(-?".*?")/u', utf8_strtolower($query), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);

            // fix parentheses
            $term = str_replace(')'  , ' ) ', $term);
            $term = str_replace('('  , ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);

            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);

            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
            $term = trim($term);
            if ($term === '') continue;

            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close (unbalanced closers are dropped)
                    if ($parens_level === 0) continue;
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:'.$matches[1].')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:'.$matches[1].')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
                } else {
                    // word-include
                    $parsed .= ft_termParser($Indexer, $token);
                }
            }
        }
        $parsed_query .= $parsed;
    }

    // cleanup (very sensitive)
    $parsed_query .= str_repeat(')', $parens_level); // close groups left open
    do {
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
    $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
    $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

    // adjustment: make highlightings right
    $parens_level     = 0;
    $notgrp_levels    = array();
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;

    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary     = array();
    $ope_stack      = array();
    $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
    $ope_regex      = '/([()]|OR|AND|NOT)/u';

    $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator
            $last_ope = end($ope_stack);
            while ($last_ope !== false && $ope_precedence[$token] <= $ope_precedence[$last_ope] && $last_ope != '(') {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand: restore the parentheses that ft_termParser() encoded in phrases
            $token_decoded = str_replace(array('OP', 'CP'), array('(', ')'), $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

    // cleanup: each double "NOT" in RPN array actually does nothing
    $parsed_ary_count = count($parsed_ary);
    for ($i = 1; $i < $parsed_ary_count; ++$i) {
        // the previous entry may already have been removed as part of an
        // earlier pair, so make sure it still exists before reading it
        if ($parsed_ary[$i] === 'NOT' && isset($parsed_ary[$i - 1]) && $parsed_ary[$i - 1] === 'NOT') {
            unset($parsed_ary[$i], $parsed_ary[$i - 1]);
        }
    }
    $parsed_ary = array_values($parsed_ary);

    // build return value
    $q = array();
    $q['query']      = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;

    foreach ($q['parsed_ary'] as $token) {
        // only operands carry a "X?:" prefix; the operator 'OR' is just two
        // characters long, so check the length before looking at offset 2
        if (strlen($token) < 3 || $token[2] !== ':') continue;
        $body = substr($token, 3);

        switch (substr($token, 0, 3)) {
            case 'N+:':
                     $q['ns'][]        = $body; // for backward compatibility
                     break;
            case 'N-:':
                     $q['notns'][]     = $body; // for backward compatibility
                     break;
            case 'W_:':
                     $q['words'][]     = $body;
                     break;
            case 'W-:':
                     $q['words'][]     = $body;
                     $q['not'][]       = $body; // for backward compatibility
                     break;
            case 'W+:':
                     $q['words'][]     = $body;
                     $q['highlight'][] = $body;
                     $q['and'][]       = $body; // for backward compatibility
                     break;
            case 'P-:':
                     $q['phrases'][]   = $body;
                     break;
            case 'P+:':
                     $q['phrases'][]   = $body;
                     $q['highlight'][] = $body;
                     break;
        }
    }
    // make sure every list exists and holds each term only once
    foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
        $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
    }

    return $q;
}
829
830
/**
 * Transforms given search term into intermediate representation
 *
 * This function is used in ft_queryParser() and not for general purpose use.
 *
 * The result is a string of parenthesised tokens:
 *  - `(W+:word)`  a word that is searched for and highlighted
 *  - `(W_:word)`  a word that is searched for but not highlighted on its own
 *  - `(P+:text)`  a phrase; literal parentheses in the phrase are encoded
 *                 as `OP` / `CP` so they do not clash with the grouping syntax
 *  - `()`         placeholder emitted when the term yields no words at all
 *
 * With $consider_asian enabled the term is first split into runs of Asian
 * and non-Asian characters and each run is parsed by a recursive call with
 * $consider_asian disabled. Phrase mode is switched on by the first Asian
 * run and then stays on for all remaining runs of the same term.
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param Doku_Indexer $Indexer        object providing tokenizer($text, $wildcards)
 * @param string       $term           a single search term
 * @param bool         $consider_asian split the term on Asian character runs first
 * @param bool         $phrase_mode    treat a multi-word term as a phrase
 * @return string the term in intermediate representation
 */
function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
    if ($consider_asian) {
        $parsed = '';

        // successive asian characters need to be searched as a phrase
        $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        foreach ($words as $word) {
            // preg_match() returns int|false; normalise so that the recursive
            // call always receives the boolean its signature documents
            $phrase_mode = $phrase_mode || (preg_match('/'.IDX_ASIAN.'/u', $word) === 1);
            $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
        }

        return $parsed;
    }

    // parentheses carry meaning in the intermediate representation,
    // so they are blanked out before the term is tokenized
    $term_noparen = str_replace(array('(', ')'), ' ', $term);
    $words = $Indexer->tokenizer($term_noparen, true);

    if (empty($words)) {
        return '()'; // important: do not remove
    }

    if ($words[0] === $term) {
        // the tokenizer left the term untouched: plain single word
        return '(W+:'.$words[0].')';
    }

    if ($phrase_mode) {
        // W_: no need to highlight, the phrase token takes care of that
        $term_encoded = str_replace(array('(', ')'), array('OP', 'CP'), $term);
        return '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
    }

    return '((W+:'.implode(')(W+:', $words).'))';
}
870
871
/**
 * Recreate a search query string based on parsed parts, doesn't support negated phrases and `OR` searches
 *
 * The parts are emitted in a fixed order, separated by single spaces:
 * required words as-is, excluded words prefixed with `-`, phrases wrapped
 * in double quotes, namespaces prefixed with `@` and excluded namespaces
 * prefixed with `^`. Empty groups contribute nothing, so the result never
 * starts or ends with a separator.
 *
 * @param array $and     words that have to match
 * @param array $not     words that must not match
 * @param array $phrases phrases that have to match
 * @param array $ns      namespaces to search in
 * @param array $notns   namespaces to exclude
 *
 * @return string the rebuilt query, empty string if all parts are empty
 */
function ft_queryUnparser_simple(array $and, array $not, array $phrases, array $ns, array $notns) {
    // each entry: list of values, text put before every value, text put after it
    $groups = array(
        array($and,     '',  ''),
        array($not,     '-', ''),
        array($phrases, '"', '"'),
        array($ns,      '@', ''),
        array($notns,   '^', ''),
    );

    $tokens = array();
    foreach ($groups as $group) {
        list($values, $prefix, $suffix) = $group;
        foreach ($values as $value) {
            $tokens[] = $prefix . $value . $suffix;
        }
    }

    // joining once avoids the stray leading space the query used to get
    // whenever $and was empty but a later group was not
    return implode(' ', $tokens);
}
902
903
//Setup VIM: ex: et ts=4 :
904