Failed Conditions
Pull Request — master (#2943)
by
unknown
03:21
created

FulltextIndex::clear()   A

Complexity

Conditions 4
Paths 8

Size

Total Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
nc 8
nop 1
dl 0
loc 17
rs 9.7
c 0
b 0
f 0
1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Search\Exception\IndexLockException;
6
use dokuwiki\Search\Exception\IndexWriteException;
7
use dokuwiki\Search\Tokenizer;
8
use dokuwiki\Utf8;
9
10
/**
11
 * Class DokuWiki Fulltext Index (Singleton)
12
 *
13
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
14
 * @author     Andreas Gohr <[email protected]>
15
 * @author Tom N Harris <[email protected]>
16
 */
17
class FulltextIndex extends AbstractIndex
18
{
19
    /** @var FulltextIndex $instance */
20
    protected static $instance = null;
21
22
    /**
     * Return the shared FulltextIndex object, creating it on the first call
     *
     * The instance is stored in the static $instance property and reused
     * by every later call.
     *
     * @return FulltextIndex
     */
    public static function getInstance()
    {
        if (static::$instance !== null) {
            return static::$instance;
        }

        static::$instance = new static();
        return static::$instance;
    }
34
35
    /**
     * Measure the length of a string
     *
     * Starts from the byte length (strlen) and adds an extra amount for
     * every byte in the range 0xE2-0xEF found in the string, so the result
     * differs from strlen for asian characters.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $w
     * @return int
     */
    public function wordlen($w)
    {
        $length = strlen($w);

        // If left alone, all chinese "words" would end up in w3.idx,
        // so the "length" of such a "word" is faked
        $found = preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes);
        if (!$found) {
            return $length;
        }

        foreach ($leadbytes[0] as $leadbyte) {
            // 0xE2 adds 1, 0xE3 adds 2, ... 0xEF adds 14
            $length += ord($leadbyte) - 0xE1;
        }
        return $length;
    }
56
57
    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page: words that
     * were recorded for the page before but no longer occur in the text get
     * their entry for this page reset. If the text yields no words, all
     * previous entries of the page are removed.
     *
     * When this method acquires the lock itself, the lock is released again
     * even if writing an index fails with an exception.
     *
     * @param string $page a page name
     * @param string $text the body of the page
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  true when the function completed successfully
     *
     * @throws IndexLockException
     * @throws IndexWriteException
     * @author Andreas Gohr <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function addPageWords($page, $text, $requireLock = true)
    {
        // page ID used in all index tuples and as key of the reverse index
        $pid = $this->getPID($page);

        // acquire the lock outside of try: if locking fails there is nothing to release
        if ($requireLock) $this->lock();

        try {
            // list of "length*wordID" entries for the reverse index
            $pagewords = array();
            // get word usage in page: wordlen => array(word ID => frequency)
            $words = $this->getPageWords($text);

            foreach ($words as $wlen => $frequencies) {
                $index = $this->getIndex('i', $wlen);
                foreach ($frequencies as $wid => $freq) {
                    $idx = ($wid < count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                    $pagewords[] = "{$wlen}*{$wid}";
                }
                $this->saveIndex('i', $wlen, $index);
            }

            // Remove obsolete index entries
            $pageword_idx = $this->getIndexKey('pageword', '', $pid);
            if ($pageword_idx !== '') {
                $oldwords = explode(':', $pageword_idx);
                $delwords = array_diff($oldwords, $pagewords);
                $upwords = array();
                foreach ($delwords as $word) {
                    if ($word != '') {
                        list($wlen, $wid) = explode('*', $word);
                        $upwords[$wlen][] = (int)$wid;
                    }
                }
                foreach ($upwords as $wlen => $widx) {
                    $index = $this->getIndex('i', $wlen);
                    foreach ($widx as $wid) {
                        // a missing line is treated as an empty tuple list,
                        // the same way new entries are handled above
                        $idx = isset($index[$wid]) ? $index[$wid] : '';
                        $index[$wid] = $this->updateTuple($idx, $pid, 0);
                    }
                    $this->saveIndex('i', $wlen, $index);
                }
            }

            // Save the reverse index
            $this->saveIndexKey('pageword', '', $pid, implode(':', $pagewords));
        } finally {
            // never leave the index locked behind, even when a write failed
            if ($requireLock) $this->unlock();
        }
        return true;
    }
122
123
    /**
     * Split the words in a page and add them to the word index
     *
     * The text is tokenized, the tokens are counted and grouped by their
     * length (as measured by wordlen()). Words not yet present in the
     * matching 'w' index are appended to it; a 'w' index is only written
     * back when at least one word was added to that particular index.
     *
     * @param string $text content of the page
     * @return array  word length => array(word ID => number of times used)
     *
     * @throws IndexWriteException
     * @author Andreas Gohr <[email protected]>
     * @author Christopher Smith <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    protected function getPageWords($text)
    {
        $Tokenizer = Tokenizer::getInstance();
        $tokens = $Tokenizer->getWords($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group by length: $words = array(wordlen => array(word => frequency))
        // the keys of $tokens are unique, so no frequencies need to be merged
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[$this->wordlen($w)][$w] = $c;
        }

        $index = array();   // resulting index
        foreach ($words as $wlen => $frequencies) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per length, so unchanged word indexes are not rewritten
            $word_idx_modified = false;
            foreach ($frequencies as $word => $freq) {
                // numeric words are turned into integer keys by PHP, undo that
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: append it, its ID is its position in the index
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified) $this->saveIndex('w', $wlen, $word_idx);
        }

        return $index;
    }
174
175
    /**
     * Delete the contents of a page from the fulltext index
     *
     * Every word recorded for the page in the 'pageword' reverse index gets
     * its entry for this page reset, then the reverse index entry itself is
     * emptied.
     *
     * When this method acquires the lock itself, the lock is released again
     * even if writing an index fails with an exception.
     *
     * @param string $page a page name
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  true when the function completed successfully
     *
     * @throws IndexLockException
     * @throws IndexWriteException
     * @author Satoshi Sahara <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function deletePageWords($page, $requireLock = true)
    {
        // page ID used in all index tuples and as key of the reverse index
        $pid = $this->getPID($page);

        // acquire the lock outside of try: if locking fails there is nothing to release
        if ($requireLock) $this->lock();

        try {
            // remove obsolete index entries
            $pageword_idx = $this->getIndexKey('pageword', '', $pid);
            if ($pageword_idx !== '') {
                // group the "length*wordID" entries by word length
                $upwords = array();
                foreach (explode(':', $pageword_idx) as $word) {
                    if ($word != '') {
                        list($wlen, $wid) = explode('*', $word);
                        $upwords[$wlen][] = (int)$wid;
                    }
                }
                foreach ($upwords as $wlen => $widx) {
                    $index = $this->getIndex('i', $wlen);
                    foreach ($widx as $wid) {
                        // a missing line is treated as an empty tuple list
                        $idx = isset($index[$wid]) ? $index[$wid] : '';
                        $index[$wid] = $this->updateTuple($idx, $pid, 0);
                    }
                    $this->saveIndex('i', $wlen, $index);
                }
            }

            // save the (now empty) reverse index entry
            $this->saveIndexKey('pageword', '', $pid, '');
        } finally {
            // never leave the index locked behind, even when a write failed
            if ($requireLock) $this->unlock();
        }
        return true;
    }
220
221
    /**
     * Find pages in the fulltext index containing the words
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard.
     *
     * The returned array has the original tokens as keys. Each value is an
     * array with page names as keys and the number of times that token
     * appears on the page as value. Pages that no longer exist are left out.
     *
     * @param array  $tokens list of words to search for
     * @return array         token => array(page name => usage count)
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function lookupWords(&$tokens)
    {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();

        // load known documents
        $page_idx = $this->getIndex('page', '');

        // collect the page hits of every matched word, keyed by "length*wordID"
        $docs = array();
        foreach ($wids as $wlen => $lengthWids) {
            $index = $this->getIndex('i', $wlen);
            $indexSize = count($index);
            foreach (array_unique($lengthWids) as $ixid) {
                // word IDs beyond the end of the index have no entry
                if ($ixid >= $indexSize) continue;
                $docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $pages = array();
            foreach ($res as $wid) {
                // skipped above because the ID was outside of the index
                if (!isset($docs[$wid])) continue;

                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;

                    if (isset($pages[$hitkey])) {
                        $pages[$hitkey] += $hitcnt;
                    } else {
                        $pages[$hitkey] = $hitcnt;
                    }
                }
            }
            $final[$word] = $pages;
        }
        return $final;
    }
276
277
    /**
     * Find the index ID of each search term
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * Terms without a wildcard that are shorter than the tokenizer's minimum
     * word length and not numeric are ignored (their $result entry stays
     * an empty array).
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexWords(&$words, &$result)
    {
        $Tokenizer = Tokenizer::getInstance();

        $tokens = array();       // base word => array( [ query term , regexp or null ] ... )
        $tokenlength = array();  // base word length => array( base word ... )
        $tokenwild = array();    // base word => base word length (wildcard terms only)

        foreach ($words as $word) {
            $result[$word] = array();
            $xword = $word;
            $wlen = $this->wordlen($word);

            // check for wildcards, each stripped '*' shortens the word by one
            $openStart = (substr($xword, 0, 1) == '*');
            if ($openStart) {
                $xword = substr($xword, 1);
                $wlen -= 1;
            }
            $openEnd = (substr($xword, -1, 1) == '*');
            if ($openEnd) {
                $xword = substr($xword, 0, -1);
                $wlen -= 1;
            }
            $isWild = $openStart || $openEnd;

            // too short exact terms are skipped unless they are numeric
            if ($wlen < $Tokenizer->getMinWordLength() && !$isWild && !is_numeric($xword)) {
                continue;
            }

            // register the length only the first time a base word is seen
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }

            if (!$isWild) {
                $tokens[$xword][] = array($word, null);
                continue;
            }

            // anchor the expression on the side that carries no wildcard
            $re = ($openStart ? '' : '^') . preg_quote($xword, '/') . ($openEnd ? '' : '$');
            $tokens[$xword][] = array($word, '/'.$re.'/');
            if (!isset($tokenwild[$xword])) {
                $tokenwild[$xword] = $wlen;
            }
        }

        // shortest wildcard base words first, the lookup loop below relies on it
        asort($tokenwild);
        $hasWild = !empty($tokenwild);

        // exact terms only need the indexes of their own lengths, wildcard
        // terms need every index from the shortest base word length upwards
        if ($hasWild) {
            $length_filter = min(array_keys($tokenlength));
        } else {
            $length_filter = $tokenlength;
        }
        $indexes_known = $this->getIndexLengths($length_filter);
        if ($hasWild) sort($indexes_known);

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // handle exact search
            $exactWords = isset($tokenlength[$ixlen]) ? $tokenlength[$ixlen] : array();
            foreach ($exactWords as $xword) {
                $wid = array_search($xword, $word_idx, true);
                if ($wid === false) continue;

                $wids[$ixlen][] = $wid;
                foreach ($tokens[$xword] as $w) {
                    $result[$w[0]][] = "{$ixlen}*{$wid}";
                }
            }

            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // only indexes of longer words are searched; $tokenwild is
                // sorted by length so all following base words fail as well
                if ($wlen >= $ixlen) break;

                foreach ($tokens[$xword] as $w) {
                    list($term, $regexp) = $w;
                    if ($regexp === null) continue;

                    foreach (array_keys(preg_grep($regexp, $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$term][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }
        return $wids;
    }
370
371
    /**
     * Get the word lengths that have been indexed
     *
     * With an array as filter, the array keys are taken as lengths and only
     * those are returned for which an i<length>.idx file exists in the index
     * directory. With any other filter value, all lengths reported by
     * listIndexLengths() that are equal or greater than the filter are
     * returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @param array|int $filter
     * @return array
     */
    public function getIndexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            // keep all the lengths equal or superior to the given minimum
            $minimum = (int)$filter;
            $idx = array();
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length >= $minimum) {
                    $idx[] = $length;
                }
            }
            return $idx;
        }

        // testing if index files exist only
        $prefix = $conf['indexdir'].'/i';
        $idx = array();
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $idx[] = $length;
            }
        }
        return $idx;
    }
405
406
    /**
     * Get the list of lengths indexed in the wiki
     *
     * Reads the index directory and returns the sorted lengths of all
     * i<length>.idx files found there. If $conf['readdircache'] is not 0,
     * the result is cached in lengths.idx and that cache is used as long as
     * it is younger than $conf['readdircache'] seconds.
     *
     * Writing the cache file is best effort: if it can not be written the
     * lengths are still returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @return array  list of lengths (int), empty if the index directory can not be read
     */
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // a readdircache of 0 disables the cache file completely
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    return array_map('intval', array_values($lengths));
                }
            }
            // no usable cache: fall through and rebuild it from the directory
        }

        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();

        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file
        if ($docache) {
            // file_put_contents() simply returns false when the file can not
            // be opened; writing to a failed fopen() handle would raise a
            // TypeError on PHP 8 that the @ operator does not suppress
            @file_put_contents($lengthsFile, implode("\n", $idx));
        }
        return $idx;
    }
463
464
    /**
     * Return a list of words sorted by number of times used
     *
     * This is a plain delegation: the arguments are handed unchanged to
     * histogram() of the MetadataIndex singleton and its result is returned.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @return array            list of words as the keys and frequency as value
     *
     * @author Tom N Harris <[email protected]>
     */
    public function histogram($min=1, $max=0, $minlen=3)
    {
        $metadataIndex = MetadataIndex::getInstance();
        return $metadataIndex->histogram($min, $max, $minlen);
    }
478
479
    /**
     * Clear the Fulltext Index
     *
     * Removes the i<length>.idx and w<length>.idx files of every indexed
     * word length as well as lengths.idx and pageword.idx from the index
     * directory.
     *
     * The lengths cache is deleted before the lengths are listed, because
     * listIndexLengths() may otherwise answer from an outdated lengths.idx
     * and index files of lengths missing in that cache would survive.
     *
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  always true; failures to delete single files are ignored
     * @throws IndexLockException
     */
    public function clear($requireLock = true)
    {
        global $conf;

        // acquire the lock outside of try: if locking fails there is nothing to release
        if ($requireLock) $this->lock();

        try {
            // force listIndexLengths() to scan the directory instead of the cache
            @unlink($conf['indexdir'].'/lengths.idx');

            foreach ($this->listIndexLengths() as $length) {
                @unlink($conf['indexdir'].'/i'.$length.'.idx');
                @unlink($conf['indexdir'].'/w'.$length.'.idx');
            }

            // listIndexLengths() may have written a fresh cache file, drop it too
            @unlink($conf['indexdir'].'/lengths.idx');
            @unlink($conf['indexdir'].'/pageword.idx');
        } finally {
            if ($requireLock) $this->unlock();
        }
        return true;
    }
503
}
504