Failed Conditions
Pull Request — master (#2943)
by Andreas
03:32
created

FulltextIndex::getPageWords()   B

Complexity

Conditions 10
Paths 44

Size

Total Lines 42

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 10
nc 44
nop 1
dl 0
loc 42
rs 7.6666
c 0
b 0
f 0

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Search\Tokenizer;
6
use dokuwiki\Utf8;
7
8
/**
9
 * Class DokuWiki Fulltext Index (Singleton)
10
 *
11
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
12
 * @author     Andreas Gohr <[email protected]>
13
 * @author Tom N Harris <[email protected]>
14
 */
15
class FulltextIndex extends AbstractIndex
16
{
17
    /** @var FulltextIndex $instance */
18
    protected static $instance = null;
19
20
    /**
     * Return the shared FulltextIndex object, creating it on first use
     *
     * The object is stored in the static $instance property; because it is
     * built with "new static()", a subclass calling this method gets an
     * object of its own class.
     *
     * @return FulltextIndex
     */
    public static function getInstance()
    {
        if (static::$instance === null) {
            static::$instance = new static();
        }

        return static::$instance;
    }
32
33
    /**
     * Measure the length of a string
     *
     * Starts from the byte length (strlen) and then inflates it for every
     * byte in the range 0xE2-0xEF found in the string: each such byte adds
     * (byte value - 0xE1) to the result. This differs from strlen in the
     * handling of asian characters.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $w
     * @return int
     */
    public function wordlen($w)
    {
        $length = strlen($w);

        // If left alone, all chinese "words" will get put into w3.idx
        // So the "length" of a "word" is faked
        $found = preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes);
        if (!$found) {
            return $length;
        }

        foreach ($leadbytes[0] as $byte) {
            $length += ord($byte) - 0xE1;
        }

        return $length;
    }
54
55
    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * The lock is only acquired and released by this method when
     * $requireLock is true; with $requireLock set to false the lock is
     * left untouched on every exit path, including the error paths.
     *
     * @param string $page a page name
     * @param string $text the body of the page
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  if the function completed successfully
     *
     * @throws Exception\IndexLockException
     * @author Andreas Gohr <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function addPageWords($page, $text, $requireLock = true)
    {
        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock) $this->lock();

        // get word usage in page: array(wordlen => array(word ID => frequency))
        $words = $this->getPageWords($text);
        if ($words === false) {
            // only release the lock when this call acquired it
            if ($requireLock) $this->unlock();
            return false;
        }

        // record this page's frequency for every word it uses and remember
        // the "length*id" keys for the reverse index
        $pagewords = array();
        if (!empty($words)) {
            foreach ($words as $wlen => $freqs) {
                $index = $this->getIndex('i', $wlen);
                foreach ($freqs as $wid => $freq) {
                    // a word ID beyond the current index has no tuple list yet
                    $idx = ($wid < count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                    $pagewords[] = "{$wlen}*{$wid}";
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    // only release the lock when this call acquired it
                    if ($requireLock) $this->unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries: words listed for the page in the
        // reverse index that are no longer part of the new text
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':', $pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // frequency 0 is passed for words the page no longer uses
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // Save the reverse index
        $pageword_idx = implode(':', $pagewords);
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, $pageword_idx);

        if ($requireLock) $this->unlock();
        return $result;
    }
135
136
    /**
     * Split the words in a page and add them to the word lists
     *
     * The text is tokenized, the tokens are counted and grouped by their
     * length as reported by wordlen(). Every token is then looked up in the
     * 'w' index of its length; unknown tokens are appended to that index and
     * the index is saved back. Only word lists that actually gained a new
     * word are written.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word ID => frequency)), false on errors
     *
     * @author Andreas Gohr <[email protected]>
     * @author Christopher Smith <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    protected function getPageWords($text)
    {
        $Tokenizer = Tokenizer::getInstance();
        $tokens = $Tokenizer->getWords($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group the tokens by length; the keys of $tokens are unique, so each
        // length/word slot is written exactly once
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[$this->wordlen($w)][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach ($words as $wlen => $freqs) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per length, so an unchanged word list is not rewritten
            // just because a list of another length gained a word
            $word_idx_modified = false;
            foreach ($freqs as $word => $freq) {
                // numeric tokens come back from array_count_values as int keys
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: append it, its ID is its position in the list
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }
188
189
    /**
     * Delete the contents of a page from the fulltext index
     *
     * Every word listed for the page in the 'pageword' reverse index gets
     * its tuple for this page updated with a frequency of 0, then the
     * reverse index entry of the page is emptied.
     *
     * The lock is only acquired and released by this method when
     * $requireLock is true, and it is released on the failure path as well.
     *
     * @param string $page a page name
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  true if the emptied reverse index entry was saved, false on error
     *
     * @throws Exception\IndexLockException
     * @author Satoshi Sahara <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function deletePageWords($page, $requireLock = true)
    {
        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock) $this->lock();

        // remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $delwords = explode(':', $pageword_idx);
            // group the "length*id" entries by word length
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // save the reverse index; remember the outcome so that the lock is
        // released before returning, whether or not the save succeeded
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, '');

        if ($requireLock) $this->unlock();
        return $result;
    }
238
239
    /**
     * Find pages in the fulltext index containing the words
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard.
     *
     * The returned array is keyed by the original tokens. Each value is an
     * array with page names as keys and the number of times that token
     * appears on the page as value. Pages for which page_exists() reports
     * false are left out.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function lookupWords(&$tokens)
    {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) {
            return array();
        }

        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $idList) {
            $index = $this->getIndex('i', $wlen);
            $entries = count($index);
            foreach (array_unique($idList) as $ixid) {
                if ($ixid >= $entries) {
                    continue;
                }
                $docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $locations) {
            $final[$word] = array();
            foreach ($locations as $location) {
                // a location is missing from $docs when its word ID was
                // outside the index above
                if (!isset($docs[$location])) {
                    continue;
                }
                foreach ($docs[$location] as $pageName => $count) {
                    // make sure the document still exists
                    if (!page_exists($pageName, '', false)) {
                        continue;
                    }
                    if (isset($final[$word][$pageName])) {
                        $final[$word][$pageName] += $count;
                    } else {
                        $final[$word][$pageName] = $count;
                    }
                }
            }
        }

        return $final;
    }
294
295
    /**
     * Find the index ID of each search term
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * Terms without a wildcard that are shorter than the tokenizer's minimum
     * word length and not numeric are skipped; they keep an empty entry in
     * $result.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexWords(&$words, &$result)
    {
        $Tokenizer = Tokenizer::getInstance();

        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            $xword = $word;
            $wlen = $this->wordlen($word);

            // check for wildcards; a stripped '*' does not count towards the length
            $openStart = (substr($xword, 0, 1) == '*');
            if ($openStart) {
                $xword = substr($xword, 1);
                $wlen--;
            }
            $openEnd = (substr($xword, -1, 1) == '*');
            if ($openEnd) {
                $xword = substr($xword, 0, -1);
                $wlen--;
            }
            $isExact = !$openStart && !$openEnd;

            // too short exact terms are ignored unless they are numeric
            if ($wlen < $Tokenizer->getMinWordLength()
                && $isExact && !is_numeric($xword)
            ) {
                continue;
            }

            // register each base word once under its length
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }

            if ($isExact) {
                $tokens[$xword][] = array($word, null);
                continue;
            }

            // wildcard term: anchor only the side that has no '*'
            $pattern = '/'
                . ($openStart ? '' : '^')
                . preg_quote($xword, '/')
                . ($openEnd ? '' : '$')
                . '/';
            $tokens[$xword][] = array($word, $pattern);
            if (!isset($tokenwild[$xword])) {
                $tokenwild[$xword] = $wlen;
            }
        }
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )

        $hasWildcards = !empty($tokenwild);
        if ($hasWildcards) {
            // wildcards may match longer words: ask for every length from the shortest up
            $length_filter = min(array_keys($tokenlength));
        } else {
            $length_filter = $tokenlength;
        }
        $indexes_known = $this->getIndexLengths($length_filter);
        if ($hasWildcards) {
            sort($indexes_known);
        }

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid === false) {
                        continue;
                    }
                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$xword] as $entry) {
                        $result[$entry[0]][] = $ixlen . '*' . $wid;
                    }
                }
            }

            // handle wildcard search; $tokenwild is sorted by length, so the
            // first base word that is not shorter than this index ends the scan
            foreach ($tokenwild as $xword => $wlen) {
                if ($wlen >= $ixlen) {
                    break;
                }
                foreach ($tokens[$xword] as $entry) {
                    list($term, $regex) = $entry;
                    if ($regex === null) {
                        continue;
                    }
                    foreach (array_keys(preg_grep($regex, $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$term][] = $ixlen . '*' . $wid;
                    }
                }
            }
        }

        return $wids;
    }
388
389
    /**
     * Get the word lengths that have been indexed
     *
     * With an array filter, the array keys are taken as candidate lengths
     * and only those for which an "i<length>.idx" file exists in the index
     * directory are returned. With any other filter, the lengths reported
     * by listIndexLengths() that are equal to or greater than the filter
     * (both compared as integers) are returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @param array|int $filter
     * @return array
     */
    public function getIndexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            // keep all the values equal or superior
            $minimum = (int)$filter;
            $matching = array();
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length >= $minimum) {
                    $matching[] = $length;
                }
            }
            return $matching;
        }

        // testing if index files exist only
        $prefix = $conf['indexdir']."/i";
        $existing = array();
        foreach ($filter as $length => $unused) {
            if (file_exists($prefix.$length.'.idx')) {
                $existing[] = $length;
            }
        }
        return $existing;
    }
423
424
    /**
     * Get the list of lengths indexed in the wiki
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * Caching is controlled by $conf['readdircache']: a value of 0 disables
     * it. Otherwise the file "lengths.idx" in the index directory is used
     * as long as it is younger than that many seconds, and is rewritten
     * after each directory scan. Failing to write the cache file is not an
     * error; the scanned lengths are returned regardless.
     *
     * @author YoBoY <[email protected]>
     *
     * @return array  list of integer lengths; empty if the index directory cannot be opened
     */
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // testing what we have to do, create a cache file or not.
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                // cache file is still fresh, use it if it is readable
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    $idx = array();
                    foreach ($lengths as $length) {
                        $idx[] = (int)$length;
                    }
                    return $idx;
                }
            }
        }

        // scan the index directory for files named i<number>.idx
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file
        if ($docache) {
            $handle = @fopen($lengthsFile, 'w');
            // fopen() returns false on failure; passing that on to fwrite()
            // or fclose() raises a TypeError on PHP 8, which @ cannot suppress
            if ($handle !== false) {
                @fwrite($handle, implode("\n", $idx));
                @fclose($handle);
            }
        }

        return $idx;
    }
481
482
    /**
     * Return a list of words sorted by number of times used
     *
     * This method does no work of its own: the three arguments are handed
     * unchanged to MetadataIndex::histogram() and its result is returned.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @return array            list of words as the keys and frequency as value
     *
     * @author Tom N Harris <[email protected]>
     */
    public function histogram($min = 1, $max = 0, $minlen = 3)
    {
        $MetadataIndex = MetadataIndex::getInstance();

        return $MetadataIndex->histogram($min, $max, $minlen);
    }
496
497
    /**
     * Clear the Fulltext Index
     *
     * Deletes the "i<length>.idx" and "w<length>.idx" files for every length
     * reported by listIndexLengths(), followed by "lengths.idx" and
     * "pageword.idx". Errors while deleting are suppressed, so the method
     * reports success even when a file could not be removed.
     *
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  always true
     * @throws Exception\IndexLockException
     */
    public function clear($requireLock = true)
    {
        global $conf;

        if ($requireLock) {
            $this->lock();
        }

        $base = $conf['indexdir'];

        // per-length files: the 'i' file first, then the 'w' file
        foreach ($this->listIndexLengths() as $length) {
            foreach (array('i', 'w') as $prefix) {
                @unlink($base.'/'.$prefix.$length.'.idx');
            }
        }

        // the lengths cache and the reverse index
        foreach (array('lengths', 'pageword') as $name) {
            @unlink($base.'/'.$name.'.idx');
        }

        if ($requireLock) {
            $this->unlock();
        }

        return true;
    }
521
}
522