Failed Conditions
Pull Request — master (#2943)
by
unknown
03:19
created

FulltextIndex::getIndexWords()   F

Complexity

Conditions 24
Paths 580

Size

Total Lines 79

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 24
nc 580
nop 2
dl 0
loc 79
rs 0.5833
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Search\Tokenizer;
6
use dokuwiki\Utf8;
7
8
/**
9
 * Class DokuWiki Fulltext Index (Singleton)
10
 *
11
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
12
 * @author     Andreas Gohr <[email protected]>
13
 * @author Tom N Harris <[email protected]>
14
 */
15
class FulltextIndex extends AbstractIndex
16
{
17
    /** @var FulltextIndex $instance */
18
    protected static $instance = null;
19
20
    /**
21
     * Get new or existing singleton instance of the FulltextIndex
22
     *
23
     * @return FulltextIndex
24
     */
25
    public static function getInstance()
    {
        // Reuse the instance created by an earlier call, if there is one.
        if (static::$instance !== null) {
            return static::$instance;
        }

        // First call: `new static()` instantiates the class the method was
        // called on, and the result is remembered for all later calls.
        static::$instance = new static();

        return static::$instance;
    }
32
33
    /**
34
     * Measure the length of a string
35
     * Differs from strlen in handling of asian characters.
36
     *
37
     * @author Tom N Harris <[email protected]>
38
     *
39
     * @param string $w
40
     * @return int
41
     */
42
    public function wordlen($w)
    {
        // Begin with the plain byte count of the word.
        $length = strlen($w);

        // Collect every byte in the range 0xE2-0xEF; nothing to adjust if none occur.
        if (!preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
            return $length;
        }

        // Without this adjustment all chinese "words" would be put into w3.idx,
        // so the "length" is faked: each matched byte adds its distance from 0xE1.
        foreach ($leadbytes[0] as $byte) {
            $length += ord($byte) - 0xE1;
        }

        return $length;
    }
54
55
    /**
56
     * Adds the contents of a page to the fulltext index
57
     *
58
     * The added text replaces previous words for the same page.
59
     * An empty value erases the page.
60
     *
61
     * @param string $page   a page name
62
     * @param string $text   the body of the page
63
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
64
     * @return bool  if the function completed successfully
65
     *
66
     * @author Tom N Harris <[email protected]>
67
     * @author Andreas Gohr <[email protected]>
68
     */
69
    public function addPageWords($page, $text, $requireLock = true)
    {
        // Replaces the indexed words of $page with the words found in $text.
        // Returns true on success and false if the page id, the lock or any
        // index write fails.

        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock && !$this->lock()) return false;

        // get word usage in page: array(wordlen => array(wid => frequency))
        $words = $this->getPageWords($text);
        if ($words === false) {
            // only release the lock when it was taken here; with
            // $requireLock = false the lock belongs to the caller
            if ($requireLock) $this->unlock();
            return false;
        }

        // "length*wid" keys of every word that is on the page now
        $pagewords = array();
        foreach ($words as $wlen => $frequencies) {
            $index = $this->getIndex('i', $wlen);
            foreach ($frequencies as $wid => $freq) {
                $idx = ($wid < count($index)) ? $index[$wid] : '';
                $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                $pagewords[] = "{$wlen}*{$wid}";
            }
            if (!$this->saveIndex('i', $wlen, $index)) {
                if ($requireLock) $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries: words recorded for this page in the
        // reverse index that are no longer part of $pagewords
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':', $pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word !== '') {
                    list($wlen, $wid) = explode('*', $word);
                    $upwords[$wlen][] = (int)$wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a missing entry is treated as an empty tuple line, the
                    // same way new entries are handled above
                    $old = isset($index[$wid]) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($old, $pid, 0);
                }
                // bail out before touching the reverse index, so the stale
                // words stay listed there and are retried on the next run
                if (!$this->saveIndex('i', $wlen, $index)) {
                    if ($requireLock) $this->unlock();
                    return false;
                }
            }
        }

        // Save the reverse index
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, implode(':', $pagewords));

        if ($requireLock) $this->unlock();
        return $result;
    }
134
135
    /**
136
     * Split the words in a page and add them to the index
137
     *
138
     * @param string    $text   content of the page
139
     * @return array|false      list of word IDs and number of times used, false if a word index could not be saved
140
     *
141
     * @author Andreas Gohr <[email protected]>
142
     * @author Christopher Smith <[email protected]>
143
     * @author Tom N Harris <[email protected]>
144
     */
145
    protected function getPageWords($text)
    {
        // Tokenizes $text, registers unknown words in the per-length word
        // indexes and returns array(wordlen => array(wid => frequency)),
        // or false when a modified word index could not be saved.
        $Tokenizer = Tokenizer::getInstance();
        $tokens = $Tokenizer->getWords($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group the tokens by their (faked) word length; array_count_values()
        // already made every token unique, so a plain assignment is enough
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[$this->wordlen((string)$w)][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach ($words as $wlen => $frequencies) {
            $word_idx = $this->getIndex('w', $wlen);
            // track changes per word length, so that only the word indexes
            // that actually gained a word are written back
            $word_idx_modified = false;
            $index[$wlen] = array();
            foreach ($frequencies as $word => $freq) {
                // numeric tokens come back from the array key as integers
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: append it, its position becomes the word id
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }
187
188
    /**
189
     * Delete the contents of a page from the fulltext index
190
     *
191
     * @param string $page   a page name
192
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
193
     * @return bool  true if the words of the page have been removed successfully, false on error
194
     *
195
     * @author Tom N Harris <[email protected]>
196
     * @author Satoshi Sahara <[email protected]>
197
     */
198
    public function deletePageWords($page, $requireLock = true)
    {
        // Removes every word recorded for $page from the word frequency
        // indexes and empties the page's reverse index entry. Returns true on
        // success and false if the page id, the lock or any index write fails.

        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock && !$this->lock()) return false;

        // remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            // the reverse index holds "length*wid" keys joined by ':'
            $upwords = array();
            foreach (explode(':', $pageword_idx) as $word) {
                if ($word !== '') {
                    list($wlen, $wid) = explode('*', $word);
                    $upwords[$wlen][] = (int)$wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a missing entry is treated as an empty tuple line
                    $old = isset($index[$wid]) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($old, $pid, 0);
                }
                // keep the reverse index untouched on failure, so the
                // remaining words can be removed by a later call
                if (!$this->saveIndex('i', $wlen, $index)) {
                    if ($requireLock) $this->unlock();
                    return false;
                }
            }
        }

        // save the reverse index
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, '');

        // release the lock on the failure path as well
        if ($requireLock) $this->unlock();
        return $result;
    }
236
237
    /**
238
     * Find pages in the fulltext index containing the words
239
     *
240
     * The search words must be pre-tokenized, meaning only letters and
241
     * numbers with an optional wildcard
242
     *
243
     * The returned array will have the original tokens as key. The values
244
     * in the returned list is an array with the page names as keys and the
245
     * number of times that token appears on the page as value.
246
     *
247
     * @param array  $tokens list of words to search for
248
     * @return array         list of page names with usage counts
249
     *
250
     * @author Tom N Harris <[email protected]>
251
     * @author Andreas Gohr <[email protected]>
252
     */
253
    public function lookupWords(&$tokens)
    {
        // Resolve the tokens to word ids; $matches receives
        // token => array("length*wid", ...) along the way.
        $matches = array();
        $wids = $this->getIndexWords($tokens, $matches);
        if (empty($wids)) {
            return array();
        }

        // Load the page names and, for every word id found, the pages using it.
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $ids) {
            $index = $this->getIndex('i', $wlen);
            $known = count($index);
            foreach (array_unique($ids) as $ixid) {
                // ids beyond the end of the index have no entry to parse
                if ($ixid >= $known) {
                    continue;
                }
                $docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // Merge the found pages into the final result, keyed by original token.
        $final = array();
        foreach ($matches as $token => $keys) {
            $pages = array();
            foreach ($keys as $key) {
                // skipped above because the id was not part of the index
                if (!isset($docs[$key])) {
                    continue;
                }
                foreach ($docs[$key] as $pageName => $count) {
                    // make sure the document still exists
                    if (!page_exists($pageName, '', false)) {
                        continue;
                    }
                    if (isset($pages[$pageName])) {
                        $pages[$pageName] += $count;
                    } else {
                        $pages[$pageName] = $count;
                    }
                }
            }
            $final[$token] = $pages;
        }

        return $final;
    }
292
293
    /**
294
     * Find the index ID of each search term
295
     *
296
     * The query terms should only contain valid characters, with a '*' at
297
     * either the beginning or end of the word (or both).
298
     * The $result parameter can be used to merge the index locations with
299
     * the appropriate query term.
300
     *
301
     * @param array  $words  The query terms.
302
     * @param array  $result Set to word => array("length*id" ...)
303
     * @return array         Set to length => array(id ...)
304
     *
305
     * @author Tom N Harris <[email protected]>
306
     */
307
    protected function getIndexWords(&$words, &$result)
    {
        // Maps every query term to the word ids it matches. $result is filled
        // with term => array("length*id", ...), the return value is
        // array(length => array(id, ...)).
        $Tokenizer = Tokenizer::getInstance();

        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            // regexp anchors; an anchor is dropped when the term carries a
            // wildcard on that side
            $caret = '^';
            $dollar = '$';
            $xword = $word;
            $wlen = $this->wordlen($word);

            // check for wildcards, the '*' itself does not count as length
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = '';
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = '';
                $wlen -= 1;
            }
            // skip too short terms, unless they are numeric or use a wildcard
            if ($wlen < $Tokenizer->getMinWordLength()
                && $caret && $dollar && !is_numeric($xword)
            ) {
                continue;
            }
            // register each base word only once for the exact lookup
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }
            if (!$caret || !$dollar) {
                $re = $caret.preg_quote($xword, '/').$dollar;
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword])) {
                    $tokenwild[$xword] = $wlen;
                }
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        // shortest wildcard base words first, the wildcard loop below relies on it
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => array( base word ... ) ... )
        // $tokenwild = array( base word => base word length ... )

        // without wildcards only the exact lengths are needed, otherwise
        // every index from the shortest base word length upwards
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        // the method is named getIndexLengths(); indexLengths() does not exist
        $indexes_known = $this->getIndexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w) {
                            $result[$w[0]][] = "{$ixlen}*{$wid}";
                        }
                    }
                }
            }
            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // only words longer than the base word are matched here (equal
                // length is the exact case above); $tokenwild is sorted by
                // length, so all remaining base words are too long as well
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    // exact terms carry no regexp
                    if (is_null($w[1])) continue;
                    foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }
        return $wids;
    }
386
387
    /**
388
     * Get the word lengths that have been indexed
389
     *
390
     * Reads the index directory and returns an array of lengths
391
     * that there are indices for.
392
     *
393
     * @author YoBoY <[email protected]>
394
     *
395
     * @param array|int $filter
396
     * @return array
397
     */
398
    public function getIndexLengths($filter)
    {
        global $conf;

        // Scalar filter: keep every indexed length that is equal or superior.
        if (!is_array($filter)) {
            $selected = array();
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length >= (int)$filter) {
                    $selected[] = $length;
                }
            }
            return $selected;
        }

        // Array filter: its keys are the candidate lengths; keep those for
        // which an index file exists.
        $prefix = $conf['indexdir'].'/i';
        $existing = array();
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $existing[] = $length;
            }
        }
        return $existing;
    }
421
422
    /**
423
     * Get the list of lengths indexed in the wiki
424
     *
425
     * Read the index directory or a cache file and returns
426
     * a sorted array of lengths of the words used in the wiki.
427
     *
428
     * @author YoBoY <[email protected]>
429
     *
430
     * @return array
431
     */
432
    public function listIndexLengths()
    {
        // Returns the sorted list of word lengths that have an i<length>.idx
        // file in the index directory. With $conf['readdircache'] set, the
        // list is cached in lengths.idx for that many seconds.
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // a value of 0 disables the cache file altogether
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                // the cache is still fresh, use it if it can be read
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    return array_map('intval', $lengths);
                }
            }
        }

        // no usable cache: scan the index directory
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            // only plain i<digits>.idx names count; is_numeric() would also
            // accept names like i1e2.idx or i-1.idx
            if (preg_match('/^i(\d+)\.idx$/', $f, $match)) {
                $idx[] = (int)$match[1];
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file; writing the cache is best effort.
        // file_put_contents() simply returns false when the file cannot be
        // opened, whereas fwrite()/fclose() on a failed fopen() handle throw
        // a TypeError on PHP 8
        if ($docache) {
            @file_put_contents($lengthsFile, implode("\n", $idx));
        }
        return $idx;
    }
479
480
    /**
481
     * Return a list of words sorted by number of times used
482
     *
483
     * @param int       $min    bottom frequency threshold
484
     * @param int       $max    upper frequency limit. No limit if $max<$min
485
     * @param int       $minlen minimum length of words to count
486
     * @return array            list of words as the keys and frequency as value
487
     *
488
     * @author Tom N Harris <[email protected]>
489
     */
490
    public function histogram($min=1, $max=0, $minlen=3)
    {
        // Delegates to the metadata index. The class is referenced relative to
        // the current namespace (dokuwiki\Search); the former
        // "Search\MetadataIndex" resolved to dokuwiki\Search\Search\MetadataIndex.
        // NOTE(review): assumes MetadataIndex lives in dokuwiki\Search - confirm.
        return MetadataIndex::getInstance()->histogram($min, $max, $minlen);
    }
494
495
    /**
496
     * Clear the Fulltext Index
497
     *
498
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
499
     * @return bool  If the index has been cleared successfully
500
     */
501
    public function clear($requireLock = true)
    {
        global $conf;

        // Take the index lock unless the caller already holds it.
        if ($requireLock && !$this->lock()) {
            return false;
        }

        // Collect the files to remove: the i/w pair of every known word
        // length, then the lengths cache and the reverse index.
        $files = array();
        foreach ($this->listIndexLengths() as $length) {
            $files[] = $conf['indexdir'].'/i'.$length.'.idx';
            $files[] = $conf['indexdir'].'/w'.$length.'.idx';
        }
        $files[] = $conf['indexdir'].'/lengths.idx';
        $files[] = $conf['indexdir'].'/pageword.idx';

        // Failed deletions are suppressed and do not affect the result.
        foreach ($files as $file) {
            @unlink($file);
        }

        if ($requireLock) {
            $this->unlock();
        }
        return true;
    }
518
}
519