Failed Conditions
Pull Request — master (#2943)
by Andreas
03:19
created

FulltextIndex::deletePageWords()   B

Complexity

Conditions 11
Paths 8

Size

Total Lines 38

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 11
nc 8
nop 2
dl 0
loc 38
rs 7.3166
c 0
b 0
f 0

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Search\Tokenizer;
6
use dokuwiki\Utf8;
7
8
/**
9
 * Class DokuWiki Fulltext Index (Singleton)
10
 *
11
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
12
 * @author     Andreas Gohr <[email protected]>
13
 * @author Tom N Harris <[email protected]>
14
 */
15
class FulltextIndex extends AbstractIndex
16
{
17
    /** @var FulltextIndex $instance */
18
    protected static $instance = null;
19
20
    /**
21
     * Get new or existing singleton instance of the FulltextIndex
22
     *
23
     * @return FulltextIndex
24
     */
25
    public static function getInstance()
    {
        // reuse the instance created by an earlier call, if there is one
        if (static::$instance !== null) {
            return static::$instance;
        }

        // first call: create the instance and remember it for later calls
        static::$instance = new static();
        return static::$instance;
    }
32
33
    /**
34
     * Measure the length of a string
35
     * Differs from strlen in handling of asian characters.
36
     *
37
     * @author Tom N Harris <[email protected]>
38
     *
39
     * @param string $w
40
     * @return int
41
     */
42
    public function wordlen($w)
    {
        $length = strlen($w);

        // Bytes in the range 0xE2-0xEF get extra weight so that words made of
        // such multi-byte characters (e.g. chinese) do not all end up in the
        // same length bucket (w3.idx). The "length" of such a word is faked.
        $found = preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes);
        if (!$found) {
            return $length;
        }

        // each matched byte adds (byte value - 0xE1) to the length
        return $length + array_sum(array_map('ord', $leadbytes[0])) - 0xE1 * $found;
    }
54
55
    /**
56
     * Adds the contents of a page to the fulltext index
57
     *
58
     * The added text replaces previous words for the same page.
59
     * An empty value erases the page.
60
     *
61
     * @param string $page   a page name
62
     * @param string $text   the body of the page
63
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
64
     * @return bool  if the function completed successfully
65
     *
66
     * @author Tom N Harris <[email protected]>
67
     * @author Andreas Gohr <[email protected]>
68
     */
69
    public function addPageWords($page, $text, $requireLock = true)
    {
        // look up the page id; without one nothing can be indexed
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock && !$this->lock()) return false;

        // get word usage in page: array(wordlen => array(word id => frequency))
        $words = $this->getPageWords($text);
        if ($words === false) {
            // only release the lock when it was acquired here; with
            // $requireLock = false the lock belongs to the caller
            if ($requireLock) $this->unlock();
            return false;
        }

        // write the new frequencies and remember which words the page uses
        $pagewords = array();
        foreach ($words as $wlen => $wordfreqs) {
            $index = $this->getIndex('i', $wlen);
            foreach ($wordfreqs as $wid => $freq) {
                // a word id without an index line yet starts from an empty tuple list
                $idx = isset($index[$wid]) ? $index[$wid] : '';
                $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                $pagewords[] = "{$wlen}*{$wid}";
            }
            if (!$this->saveIndex('i', $wlen, $index)) {
                if ($requireLock) $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries: words listed in the stored reverse
        // index of this page that are no longer part of the page
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':', $pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word == '') continue;
                $parts = explode('*', $word, 2);
                if (count($parts) !== 2) continue; // skip malformed "length*id" entries
                $upwords[$parts[0]][] = (int)$parts[1];
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // nothing to remove when the index has no line for this word id
                    if (!isset($index[$wid])) continue;
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    // keep the old reverse index so the stale entries stay known
                    if ($requireLock) $this->unlock();
                    return false;
                }
            }
        }

        // Save the reverse index
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, implode(':', $pagewords));

        if ($requireLock) $this->unlock();
        return $result;
    }
134
135
    /**
136
     * Split the words in a page and add them to the index
137
     *
138
     * @param string    $text   content of the page
139
     * @return array|false      list of word IDs and number of times used, false if a word index could not be saved
140
     *
141
     * @author Andreas Gohr <[email protected]>
142
     * @author Christopher Smith <[email protected]>
143
     * @author Tom N Harris <[email protected]>
144
     */
145
    protected function getPageWords($text)
    {
        $Tokenizer = Tokenizer::getInstance();
        $tokens = $Tokenizer->getWords($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group the tokens by length: array(wordlen => array(word => frequency))
        // (token keys are unique after array_count_values, so no merging is needed)
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[$this->wordlen($w)][$w] = $c;
        }

        $index = array();   // resulting index: array(wordlen => array(word id => frequency))
        foreach ($words as $wlen => $wordfreqs) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length, so that a word list which gained no
            // new word is not written back just because an earlier one did
            $word_idx_modified = false;
            foreach ($wordfreqs as $word => $freq) {
                // numeric tokens come back from the array key as integers
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: append it, its id is its position in the list
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }
187
188
    /**
189
     * Delete the contents of a page from the fulltext index
190
     *
191
     * @param string $page   a page name
192
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
193
     * @return bool  true if the words of the page have been removed from the index, false on error
194
     *
195
     * @author Tom N Harris <[email protected]>
196
     * @author Satoshi Sahara <[email protected]>
197
     */
198
    public function deletePageWords($page, $requireLock = true)
    {
        // look up the page id; without one there is nothing to delete
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock && !$this->lock()) return false;

        // single exit point below, so the lock is released on every path
        $result = true;

        // remove obsolete index entries: every word listed in the stored
        // reverse index of this page gets the page removed from its tuple list
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $upwords = array();
            foreach (explode(':', $pageword_idx) as $word) {
                if ($word == '') continue;
                $parts = explode('*', $word, 2);
                if (count($parts) !== 2) continue; // skip malformed "length*id" entries
                $upwords[$parts[0]][] = (int)$parts[1];
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // nothing to remove when the index has no line for this word id
                    if (!isset($index[$wid])) continue;
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    $result = false;
                    break;
                }
            }
        }

        // save the reverse index; it is only emptied once all word entries
        // were updated, otherwise the remaining stale entries would be lost
        if ($result && !$this->saveIndexKey('pageword', '', $pid, '')) {
            $result = false;
        }

        if ($requireLock) $this->unlock();
        return $result;
    }
236
237
    /**
238
     * Find pages in the fulltext index containing the given words
239
     *
240
     * The search words must be pre-tokenized, meaning only letters and
241
     * numbers with an optional wildcard
242
     *
243
     * The returned array will have the original tokens as key. The values
244
     * in the returned list is an array with the page names as keys and the
245
     * number of times that token appears on the page as value.
246
     *
247
     * @param array  $tokens list of words to search for
248
     * @return array         list of page names with usage counts
249
     *
250
     * @author Tom N Harris <[email protected]>
251
     * @author Andreas Gohr <[email protected]>
252
     */
253
    public function lookupWords(&$tokens)
    {
        // $result is filled by getIndexWords(): token => array("length*id" ...)
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();

        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $ids) {
            $index = $this->getIndex('i', $wlen);
            $entries = count($index);
            foreach (array_unique($ids) as $ixid) {
                // ids beyond the end of the index have no page list
                if ($ixid >= $entries) continue;
                $docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $pages = array();
            foreach ($res as $wid) {
                // word ids skipped above have no entry in $docs
                if (!isset($docs[$wid])) continue;
                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    if (isset($pages[$hitkey])) {
                        $pages[$hitkey] += $hitcnt;
                    } else {
                        $pages[$hitkey] = $hitcnt;
                    }
                }
            }
            $final[$word] = $pages;
        }
        return $final;
    }
292
293
    /**
294
     * Find the index ID of each search term
295
     *
296
     * The query terms should only contain valid characters, with a '*' at
297
     * either the beginning or end of the word (or both).
298
     * The $result parameter can be used to merge the index locations with
299
     * the appropriate query term.
300
     *
301
     * @param array  $words  The query terms.
302
     * @param array  $result Set to word => array("length*id" ...)
303
     * @return array         Set to length => array(id ...)
304
     *
305
     * @author Tom N Harris <[email protected]>
306
     */
307
    protected function getIndexWords(&$words, &$result)
    {
        $Tokenizer = Tokenizer::getInstance();

        // $tokens      = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => array( base word ... ) ... )
        // $tokenwild   = array( base word => base word length ... )
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();

        foreach ($words as $word) {
            $result[$word] = array();
            $xword = $word;
            $wlen = $this->wordlen($word);

            // a '*' at the start and/or end marks a wildcard; it is stripped
            // from the base word and not counted in its length
            $openStart = (substr($xword, 0, 1) == '*');
            if ($openStart) {
                $xword = substr($xword, 1);
                $wlen -= 1;
            }
            $openEnd = (substr($xword, -1, 1) == '*');
            if ($openEnd) {
                $xword = substr($xword, 0, -1);
                $wlen -= 1;
            }
            $isWild = $openStart || $openEnd;

            // too short exact terms are ignored, unless they are numeric
            if ($wlen < $Tokenizer->getMinWordLength()
                && !$isWild && !is_numeric($xword)
            ) {
                continue;
            }

            // register each base word only once per length
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }

            if ($isWild) {
                // anchor only the side that carries no wildcard
                $re = ($openStart ? '' : '^').preg_quote($xword, '/').($openEnd ? '' : '$');
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword])) {
                    $tokenwild[$xword] = $wlen;
                }
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        asort($tokenwild);

        // without wildcards only the exact lengths are of interest, with
        // wildcards every index from the shortest base word length upwards
        if (empty($tokenwild)) {
            $indexes_known = $this->getIndexLengths($tokenlength);
        } else {
            $indexes_known = $this->getIndexLengths(min(array_keys($tokenlength)));
            sort($indexes_known);
        }

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // handle exact search
            $exact = isset($tokenlength[$ixlen]) ? $tokenlength[$ixlen] : array();
            foreach ($exact as $xword) {
                $wid = array_search($xword, $word_idx, true);
                if ($wid === false) continue;
                $wids[$ixlen][] = $wid;
                foreach ($tokens[$xword] as $w) {
                    $result[$w[0]][] = "{$ixlen}*{$wid}";
                }
            }

            // handle wildcard search; $tokenwild is sorted by length, so the
            // loop can stop at the first base word not shorter than this index
            foreach ($tokenwild as $xword => $wlen) {
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    list($term, $regexp) = $w;
                    if ($regexp === null) continue;
                    foreach (array_keys(preg_grep($regexp, $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$term][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }
        return $wids;
    }
386
387
    /**
388
     * Get the word lengths that have been indexed
389
     *
390
     * Reads the index directory and returns an array of lengths
391
     * that there are indices for.
392
     *
393
     * @author YoBoY <[email protected]>
394
     *
395
     * @param array|int $filter
396
     * @return array
397
     */
398
    public function getIndexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            // numeric filter: keep all the indexed lengths equal or superior
            $minimum = (int)$filter;
            $matching = array();
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length < $minimum) continue;
                $matching[] = $length;
            }
            return $matching;
        }

        // array filter: its keys are the lengths to test, only checking
        // whether the matching index file exists
        $prefix = $conf['indexdir']."/i";
        $existing = array();
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $existing[] = $length;
            }
        }
        return $existing;
    }
421
422
    /**
423
     * Get the list of lengths indexed in the wiki
424
     *
425
     * Read the index directory or a cache file and returns
426
     * a sorted array of lengths of the words used in the wiki.
427
     *
428
     * @author YoBoY <[email protected]>
429
     *
430
     * @return array
431
     */
432
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // a readdircache setting of 0 disables the cache file completely
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            // use the cache file while it is younger than readdircache seconds
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    return array_map('intval', $lengths);
                }
            }
        }

        // no usable cache: collect the lengths from the i<length>.idx file names
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file; writing the cache is best effort, but the
        // handle must be checked because fwrite()/fclose() throw a TypeError
        // on PHP 8 when given false, which "@" does not suppress
        if ($docache) {
            $handle = @fopen($lengthsFile, 'w');
            if ($handle !== false) {
                @fwrite($handle, implode("\n", $idx));
                @fclose($handle);
            }
        }
        return $idx;
    }
479
480
    /**
481
     * Return a list of words sorted by number of times used
482
     *
483
     * @param int       $min    bottom frequency threshold
484
     * @param int       $max    upper frequency limit. No limit if $max<$min
485
     * @param int       $minlen minimum length of words to count
486
     * @return array            list of words as the keys and frequency as value
487
     *
488
     * @author Tom N Harris <[email protected]>
489
     */
490
    public function histogram($min=1, $max=0, $minlen=3)
    {
        // This file is in namespace dokuwiki\Search, so the former relative
        // name "Search\MetadataIndex" resolved to the class
        // dokuwiki\Search\Search\MetadataIndex. The fully qualified name is
        // used instead.
        // NOTE(review): assumes MetadataIndex is declared in dokuwiki\Search
        // and provides histogram() - confirm against that class.
        return \dokuwiki\Search\MetadataIndex::getInstance()->histogram($min, $max, $minlen);
    }
494
495
    /**
496
     * Clear the Fulltext Index
497
     *
498
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
499
     * @return bool  If the index has been cleared successfully
500
     */
501
    public function clear($requireLock = true)
    {
        global $conf;

        if ($requireLock && !$this->lock()) return false;

        // collect every file of the fulltext index: the i/w pair of each
        // known word length, the lengths cache and the reverse index
        $indexdir = $conf['indexdir'];
        $files = array();
        foreach ($this->listIndexLengths() as $length) {
            $files[] = $indexdir.'/i'.$length.'.idx';
            $files[] = $indexdir.'/w'.$length.'.idx';
        }
        $files[] = $indexdir.'/lengths.idx';
        $files[] = $indexdir.'/pageword.idx';

        // failures to delete (e.g. a file that does not exist) are ignored
        foreach ($files as $file) {
            @unlink($file);
        }

        if ($requireLock) $this->unlock();
        return true;
    }
518
}
519