Failed Conditions
Pull Request — master (#2943)
by
unknown
03:31
created

FulltextIndex::__construct()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
nc 3
nop 1
dl 0
loc 6
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Search\Exception\IndexAccessException;
6
use dokuwiki\Search\Exception\IndexLockException;
7
use dokuwiki\Search\Exception\IndexWriteException;
8
use dokuwiki\Search\Tokenizer;
9
use dokuwiki\Utf8;
10
11
/**
12
 * Class DokuWiki Fulltext Index
13
 *
14
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
15
 * @author     Andreas Gohr <[email protected]>
16
 * @author Tom N Harris <[email protected]>
17
 */
18
class FulltextIndex extends AbstractIndex
19
{
20
    // numeric page id to be added to or deleted from the Fulltext index
21
    protected $pageID;
22
23
    /**
     * Optionally bind this index to a page
     *
     * An integer argument is stored as the page id as-is; any other non-null
     * value is handed to getPID() and the result is stored. Without an
     * argument the page id is left unset.
     *
     * @param string|int|null $page a page name or numeric page id
     */
    public function __construct($page = null)
    {
        if ($page === null) {
            return;
        }

        if (is_int($page)) {
            $this->pageID = $page;
        } else {
            $this->pageID = $this->getPID($page);
        }
    }
34
35
    /**
     * Compute the index "length" of a word
     *
     * The result is the byte length of the word plus, for every byte in the
     * range 0xE2-0xEF it contains, that byte's value minus 0xE1. Words
     * without such bytes therefore simply report their strlen().
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $w
     * @return int
     */
    public function wordlen($w)
    {
        $length = strlen($w);

        // If left alone, all chinese "words" would end up in w3.idx,
        // so the "length" of such a "word" is inflated artificially
        if (!preg_match_all('/[\xE2-\xEF]/', $w, $matches)) {
            return $length;
        }

        $leadBytes = $matches[0];
        $length += array_sum(array_map('ord', $leadBytes)) - 0xE1 * count($leadBytes);

        return $length;
    }
56
57
    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * When $requireLock is true the index lock is taken before any index is
     * touched and is released again even if the update is aborted by an
     * exception.
     *
     * @param string $text the body of the page
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  always true; failures are reported through exceptions
     *
     * @throws IndexAccessException if no page id has been set on this instance
     * @throws IndexLockException
     * @throws IndexWriteException
     * @author Andreas Gohr <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function addWords($text, $requireLock = true)
    {
        // the page to index must be known
        if (!isset($this->pageID)) {
            throw new IndexAccessException('Indexer: page unknown to addWords');
        }
        $pid = $this->pageID;

        // taken outside the try block: a failed lock() must not trigger unlock()
        if ($requireLock) $this->lock();

        try {
            $pagewords = array();
            // get word usage in page: array(wordlen => array(word id => frequency))
            $words = $this->getWords($text);

            foreach ($words as $wlen => $wordfreqs) {
                $index = $this->getIndex('i', $wlen);
                foreach ($wordfreqs as $wid => $freq) {
                    // no line for this word id yet: start from an empty entry
                    $idx = ($wid < count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                    $pagewords[] = "{$wlen}*{$wid}";
                }
                $this->saveIndex('i', $wlen, $index);
            }

            // Remove obsolete index entries: words recorded for this page in
            // the reverse index that are no longer part of the new text
            $pageword_idx = $this->getIndexKey('pageword', '', $pid);
            if ($pageword_idx !== '') {
                $oldwords = explode(':', $pageword_idx);
                $delwords = array_diff($oldwords, $pagewords);
                $upwords = array();
                foreach ($delwords as $word) {
                    if ($word != '') {
                        list($wlen, $wid) = explode('*', $word);
                        $wid = (int)$wid;
                        $upwords[$wlen][] = $wid;
                    }
                }
                foreach ($upwords as $wlen => $widx) {
                    $index = $this->getIndex('i', $wlen);
                    foreach ($widx as $wid) {
                        // the index has no line for this word id, nothing to remove
                        if (!isset($index[$wid])) continue;
                        $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                    }
                    $this->saveIndex('i', $wlen, $index);
                }
            }

            // Save the reverse index
            $this->saveIndexKey('pageword', '', $pid, implode(':', $pagewords));
        } finally {
            if ($requireLock) $this->unlock();
        }

        return true;
    }
126
127
    /**
     * Split the words in a page and add them to the index
     *
     * The text is tokenized, the tokens are counted and grouped by their
     * wordlen(). Every token is then looked up in the word index ('w') of its
     * length; tokens not found there are appended and the word index is saved
     * back. A word index is only written when a word was actually added to it.
     *
     * @param string $text content of the page
     * @return array  array(wordlen => array(word id => number of times used))
     *
     * @throws IndexWriteException
     * @author Andreas Gohr <[email protected]>
     * @author Christopher Smith <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    protected function getWords($text)
    {
        $Tokenizer = Tokenizer::getInstance();
        // count the frequency of each token: array(word => frequency)
        $tokens = array_count_values($Tokenizer->getWords($text));

        // group by length: array(wordlen => array(word => frequency))
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[$this->wordlen($w)][$w] = $c;
        }

        $index = array();   // resulting index
        foreach ($words as $wlen => $wordfreqs) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length so unchanged indexes are not rewritten
            $word_idx_modified = false;

            foreach ($wordfreqs as $word => $freq) {
                // array keys that look like integers were converted by PHP, undo that
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: it gets the next free id
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }

            // save back the word index
            if ($word_idx_modified) $this->saveIndex('w', $wlen, $word_idx);
        }

        return $index;
    }
178
179
    /**
     * Delete the contents of a page from the fulltext index
     *
     * Every word recorded for the page in the reverse index ('pageword') gets
     * its entry for this page removed from the matching 'i' index, then the
     * page's reverse index entry is emptied.
     *
     * When $requireLock is true the index lock is taken before any index is
     * touched and is released again even if the update is aborted by an
     * exception.
     *
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  always true; failures are reported through exceptions
     *
     * @throws IndexAccessException if no page id has been set on this instance
     * @throws IndexLockException
     * @throws IndexWriteException
     * @author Satoshi Sahara <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function deleteWords($requireLock = true)
    {
        // the page to remove must be known
        if (!isset($this->pageID)) {
            throw new IndexAccessException('Indexer: page unknown to deleteWords');
        }
        $pid = $this->pageID;

        // taken outside the try block: a failed lock() must not trigger unlock()
        if ($requireLock) $this->lock();

        try {
            // remove obsolete index entries
            $pageword_idx = $this->getIndexKey('pageword', '', $pid);
            if ($pageword_idx !== '') {
                $delwords = explode(':', $pageword_idx);
                $upwords = array();
                foreach ($delwords as $word) {
                    if ($word != '') {
                        list($wlen, $wid) = explode('*', $word);
                        $wid = (int)$wid;
                        $upwords[$wlen][] = $wid;
                    }
                }
                foreach ($upwords as $wlen => $widx) {
                    $index = $this->getIndex('i', $wlen);
                    foreach ($widx as $wid) {
                        // the index has no line for this word id, nothing to remove
                        if (!isset($index[$wid])) continue;
                        $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                    }
                    $this->saveIndex('i', $wlen, $index);
                }
            }

            // save the reverse index
            $this->saveIndexKey('pageword', '', $pid, '');
        } finally {
            if ($requireLock) $this->unlock();
        }

        return true;
    }
228
229
    /**
     * Find pages in the fulltext index containing the words
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard.
     *
     * The returned array is keyed by the original tokens. Each value is an
     * array with page names as keys and the number of times that token
     * appears on the page as value. Pages for which page_exists() reports
     * false are left out.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function lookupWords(&$tokens)
    {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();

        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $ids) {
            $index = $this->getIndex('i', $wlen);
            $known = count($index);
            foreach (array_unique($ids) as $ixid) {
                // ids beyond the end of the index have no entry to parse
                if ($ixid >= $known) continue;
                $docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // skipped above because the id was not inside the index
                if (!isset($docs[$wid])) continue;

                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;

                    if (isset($final[$word][$hitkey])) {
                        $final[$word][$hitkey] += $hitcnt;
                    } else {
                        $final[$word][$hitkey] = $hitcnt;
                    }
                }
            }
        }
        return $final;
    }
284
285
    /**
     * Find the index ID of each search term
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * Terms without wildcard that are shorter than the tokenizer's minimum
     * word length and not numeric are ignored (they still get an empty entry
     * in $result).
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexWords(&$words, &$result)
    {
        $Tokenizer = Tokenizer::getInstance();

        // $tokens      = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => array( base word ... ) ... )
        // $tokenwild   = array( base word => base word length ... )
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();

        foreach ($words as $word) {
            $result[$word] = array();
            $xword = $word;
            $wlen = $this->wordlen($word);

            // check for wildcards, they are not part of the base word
            $openStart = (substr($xword, 0, 1) == '*');
            if ($openStart) {
                $xword = substr($xword, 1);
                $wlen -= 1;
            }
            $openEnd = (substr($xword, -1, 1) == '*');
            if ($openEnd) {
                $xword = substr($xword, 0, -1);
                $wlen -= 1;
            }
            $isWild = $openStart || $openEnd;

            if ($wlen < $Tokenizer->getMinWordLength()
                && !$isWild && !is_numeric($xword)
            ) {
                continue;
            }

            // remember each base word only once per length
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }

            if ($isWild) {
                // anchor the pattern only on the side that has no wildcard
                $re = ($openStart ? '' : '^').preg_quote($xword, '/').($openEnd ? '' : '$');
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword])) {
                    $tokenwild[$xword] = $wlen;
                }
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        asort($tokenwild);

        // without wildcards only the exact lengths are needed, otherwise
        // every index from the shortest base word upwards
        $hasWildcards = !empty($tokenwild);
        if ($hasWildcards) {
            $length_filter = min(array_keys($tokenlength));
        } else {
            $length_filter = $tokenlength;
        }
        $indexes_known = $this->getIndexLengths($length_filter);
        if ($hasWildcards) sort($indexes_known);

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid === false) continue;

                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$xword] as $entry) {
                        $result[$entry[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }

            // handle wildcard search; $tokenwild is sorted by length, so the
            // first base word that is not shorter than this index ends the scan
            foreach ($tokenwild as $xword => $wlen) {
                if ($wlen >= $ixlen) break;

                foreach ($tokens[$xword] as $entry) {
                    list($term, $pattern) = $entry;
                    if ($pattern === null) continue;

                    foreach (array_keys(preg_grep($pattern, $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$term][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }
        return $wids;
    }
378
379
    /**
     * Get the word lengths that have been indexed
     *
     * With an array filter, its keys are taken as candidate lengths and only
     * those are returned for which an i<length>.idx file exists in the index
     * directory. With any other filter, all lengths reported by
     * listIndexLengths() that are greater than or equal to the filter (both
     * compared as integers) are returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @param array|int $filter
     * @return array
     */
    public function getIndexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            // keep all the lengths equal or superior to the given minimum
            $minimum = (int)$filter;
            $idx = array();
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length >= $minimum) {
                    $idx[] = $length;
                }
            }
            return $idx;
        }

        // only test whether the index files exist
        $idx = array();
        $prefix = $conf['indexdir'].'/i';
        foreach (array_keys($filter) as $wlen) {
            if (file_exists($prefix.$wlen.'.idx')) {
                $idx[] = $wlen;
            }
        }
        return $idx;
    }
413
414
    /**
     * Get the list of lengths indexed in the wiki
     *
     * Lengths are derived from the i<number>.idx files found in the index
     * directory and returned as a sorted array of integers.
     *
     * If $conf['readdircache'] is non-zero, the result is cached in
     * lengths.idx inside the index directory: a cache file younger than
     * $conf['readdircache'] seconds is returned as-is, otherwise the
     * directory is scanned and the cache file rewritten. Writing the cache is
     * best effort; a cache file that cannot be opened is silently skipped.
     *
     * @author YoBoY <[email protected]>
     *
     * @return array sorted list of lengths, empty if the directory cannot be read
     */
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // a value of 0 disables the cache file altogether
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    return array_map('intval', $lengths);
                }
            }
        }

        // no usable cache: scan the index directory
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file
        if ($docache) {
            $handle = @fopen($lengthsFile, 'w');
            // fwrite()/fclose() on a failed handle raise a TypeError on PHP 8,
            // which the @ operator does not suppress
            if ($handle !== false) {
                @fwrite($handle, implode("\n", $idx));
                @fclose($handle);
            }
        }

        return $idx;
    }
471
472
    /**
     * Return a list of words sorted by number of times used
     *
     * The work is done by MetadataIndex::histogram() on a fresh
     * MetadataIndex instance; all arguments are passed on unchanged.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @return array            list of words as the keys and frequency as value
     *
     * @author Tom N Harris <[email protected]>
     */
    public function histogram($min=1, $max=0, $minlen=3)
    {
        $metadataIndex = new MetadataIndex();

        return $metadataIndex->histogram($min, $max, $minlen);
    }
486
487
    /**
     * Clear the Fulltext Index
     *
     * Removes the i<length>.idx and w<length>.idx files for every length
     * reported by listIndexLengths(), followed by lengths.idx and
     * pageword.idx. Files that cannot be removed are ignored.
     *
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  always true
     * @throws IndexLockException
     */
    public function clear($requireLock = true)
    {
        global $conf;

        if ($requireLock) $this->lock();

        // collect everything that belongs to the fulltext index ...
        $dir = $conf['indexdir'];
        $files = array();
        foreach ($this->listIndexLengths() as $length) {
            $files[] = $dir.'/i'.$length.'.idx';
            $files[] = $dir.'/w'.$length.'.idx';
        }
        $files[] = $dir.'/lengths.idx';
        $files[] = $dir.'/pageword.idx';

        // ... and delete it, missing files are not an error
        foreach ($files as $file) {
            @unlink($file);
        }

        if ($requireLock) $this->unlock();
        return true;
    }
511
}
512