Completed
Push — onlyeditwarningonchange ( 3f18d0...5462ed )
by Gerrit
04:17
created

Doku_Indexer::getIndexWords()   F

Complexity

Conditions 24
Paths 580

Size

Total Lines 71
Code Lines 51

Duplication

Lines 0
Ratio 0 %
Metric Value
dl 0
loc 71
rs 2.9577
cc 24
eloc 51
nc 580
nop 2

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * Functions to create the fulltext search index
4
 *
5
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6
 * @author     Andreas Gohr <[email protected]>
7
 * @author     Tom N Harris <[email protected]>
8
 */
9
10
if(!defined('DOKU_INC')) die('meh.');
11
12
// Version tag used to force rebuild on upgrade
13
define('INDEXER_VERSION', 8);
14
15
// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
16
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
17
18
// Asian characters are handled as words. The following regexp defines the
19
// Unicode-Ranges for Asian characters
20
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
21
// I'm no language expert. If you think some ranges are wrongly chosen or
22
// a range is missing, please contact me
23
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
24
define('IDX_ASIAN2','['.
25
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
26
                   '\x{309D}-\x{30A0}'.
27
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
28
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
29
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
30
                   "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F". // CJK Extension B
31
                   "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF". // CJK Extension C
32
                   "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F". // CJK Extension D
33
                   "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF". // CJK Compatibility Supplement
34
                   ']');
35
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
36
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
37
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
38
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
39
                   '\x{308F}-\x{3094}'.
40
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
41
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
42
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
43
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
44
                   ']['.
45
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
46
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
47
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
48
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
49
                   '\x{31F0}-\x{31FF}'.
50
                   ']?');
51
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
52
53
/**
 * Build the version identifier of the indexer, including external tokenizers.
 *
 * Index data is only compatible with an indexer reporting the same identifier.
 * The identifier is computed on the first call and kept in a static variable
 * for all following calls.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <[email protected]>
 * @author Michael Hamann <[email protected]>
 *
 * @return int|string the bare INDEXER_VERSION, or that value followed by
 *                    "+plugin=version" for every entry added by plugins
 */
function idx_get_version(){
    static $indexer_version = null;

    // reuse the identifier computed by an earlier call
    if ($indexer_version != null) {
        return $indexer_version;
    }

    // the DokuWiki version is part of the event data for the convenience of plugins
    $collected = array('dokuwiki' => INDEXER_VERSION);
    trigger_event('INDEXER_VERSION_GET', $collected, null, false);

    // the core version always leads the identifier, so take it out
    // before the plugin entries are sorted by name
    unset($collected['dokuwiki']);
    ksort($collected);

    $tag = INDEXER_VERSION;
    foreach ($collected as $name => $number) {
        $tag .= '+'.$name.'='.$number;
    }

    $indexer_version = $tag;
    return $indexer_version;
}
83
84
/**
 * Measure the length of a string for indexing purposes.
 *
 * Starts from the byte length reported by strlen() and adds an extra amount
 * for every byte in the range 0xE2-0xEF (the byte value minus 0xE1). Without
 * this, all chinese "words" would end up in w3.idx, so their "length" is
 * deliberately faked.
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $length = strlen($w);

    // nothing to adjust when no byte in the 0xE2-0xEF range is present
    if (!preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        return $length;
    }

    foreach ($leadbytes[0] as $byte) {
        $length += ord($byte) - 0xE1;
    }
    return $length;
}
103
104
/**
105
 * Class that encapsulates operations on the indexer database.
106
 *
107
 * @author Tom N Harris <[email protected]>
108
 */
109
class Doku_Indexer {
110
    /**
111
     * @var array $pidCache Cache for getPID()
112
     */
113
    protected $pidCache = array();
114
115
    /**
116
     * Adds the contents of a page to the fulltext index
117
     *
118
     * The added text replaces previous words for the same page.
119
     * An empty value erases the page.
120
     *
121
     * @param string    $page   a page name
122
     * @param string    $text   the body of the page
123
     * @return string|boolean  the function completed successfully
124
     *
125
     * @author Tom N Harris <[email protected]>
126
     * @author Andreas Gohr <[email protected]>
127
     */
128
    public function addPageWords($page, $text) {
129
        if (!$this->lock())
130
            return "locked";
131
132
        // load known documents
133
        $pid = $this->getPIDNoLock($page);
134
        if ($pid === false) {
135
            $this->unlock();
136
            return false;
137
        }
138
139
        $pagewords = array();
140
        // get word usage in page
141
        $words = $this->getPageWords($text);
142
        if ($words === false) {
143
            $this->unlock();
144
            return false;
145
        }
146
147
        if (!empty($words)) {
148
            foreach (array_keys($words) as $wlen) {
149
                $index = $this->getIndex('i', $wlen);
150
                foreach ($words[$wlen] as $wid => $freq) {
151
                    $idx = ($wid<count($index)) ? $index[$wid] : '';
152
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
0 ignored issues
show
Bug introduced by
It seems that $pid, defined by $this->getPIDNoLock($page) on line 133, can also be of type boolean; however, Doku_Indexer::updateTuple() only seems to accept string|integer. Maybe add an additional type check?

If a method or function can return values of several different types, and you cannot be sure that only one of those types can occur in this context, we recommend adding an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
153
                    $pagewords[] = "$wlen*$wid";
154
                }
155
                if (!$this->saveIndex('i', $wlen, $index)) {
156
                    $this->unlock();
157
                    return false;
158
                }
159
            }
160
        }
161
162
        // Remove obsolete index entries
163
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
0 ignored issues
show
Bug introduced by
It seems that $pid, defined by $this->getPIDNoLock($page) on line 133, can also be of type boolean; however, Doku_Indexer::getIndexKey() only seems to accept integer. Maybe add an additional type check?

If a method or function can return values of several different types, and you cannot be sure that only one of those types can occur in this context, we recommend adding an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
164
        if ($pageword_idx !== '') {
165
            $oldwords = explode(':',$pageword_idx);
166
            $delwords = array_diff($oldwords, $pagewords);
167
            $upwords = array();
168
            foreach ($delwords as $word) {
169
                if ($word != '') {
170
                    list($wlen,$wid) = explode('*', $word);
171
                    $wid = (int)$wid;
172
                    $upwords[$wlen][] = $wid;
173
                }
174
            }
175
            foreach ($upwords as $wlen => $widx) {
176
                $index = $this->getIndex('i', $wlen);
177
                foreach ($widx as $wid) {
178
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 133 can also be of type boolean; however, Doku_Indexer::updateTuple() does only seem to accept string|integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
179
                }
180
                $this->saveIndex('i', $wlen, $index);
181
            }
182
        }
183
        // Save the reverse index
184
        $pageword_idx = join(':', $pagewords);
185
        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 133 can also be of type boolean; however, Doku_Indexer::saveIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
186
            $this->unlock();
187
            return false;
188
        }
189
190
        $this->unlock();
191
        return true;
192
    }
193
194
    /**
     * Split the words in a page and add them to the index.
     *
     * The text is tokenized, the tokens are grouped by their wordlen(), and
     * every token is looked up in the word index ('w') of its length. Unknown
     * words are appended to that word index, which is then written back.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word ID => frequency)),
     *                          or false when a word index could not be saved
     *
     * @author Andreas Gohr <[email protected]>
     * @author Christopher Smith <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    protected function getPageWords($text) {
        // count the frequency of each token
        $tokens = array_count_values($this->tokenizer($text));

        // group the tokens by length; the keys of $tokens are already unique,
        // so every word is assigned exactly once
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[wordlen($w)][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach ($words as $wlen => $frequencies) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length so that an unchanged word index
            // is not written back just because an earlier length changed
            $word_idx_modified = false;

            foreach ($frequencies as $word => $freq) {
                // numeric tokens come back as integer array keys
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: it gets the next free ID at the end of the list
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }

            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }
243
244
    /**
245
     * Add/update keys to/of the metadata index.
246
     *
247
     * Adding new keys does not remove other keys for the page.
248
     * An empty value will erase the key.
249
     * The $key parameter can be an array to add multiple keys. $value will
250
     * not be used if $key is an array.
251
     *
252
     * @param string    $page   a page name
253
     * @param mixed     $key    a key string or array of key=>value pairs
254
     * @param mixed     $value  the value or list of values
255
     * @return boolean|string     the function completed successfully
256
     *
257
     * @author Tom N Harris <[email protected]>
258
     * @author Michael Hamann <[email protected]>
259
     */
260
    public function addMetaKeys($page, $key, $value=null) {
261
        if (!is_array($key)) {
262
            $key = array($key => $value);
263
        } elseif (!is_null($value)) {
264
            // $key is array, but $value is not null
265
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
266
        }
267
268
        if (!$this->lock())
269
            return "locked";
270
271
        // load known documents
272
        $pid = $this->getPIDNoLock($page);
273
        if ($pid === false) {
274
            $this->unlock();
275
            return false;
276
        }
277
278
        // Special handling for titles so the index file is simpler
279
        if (array_key_exists('title', $key)) {
280
            $value = $key['title'];
281
            if (is_array($value)) {
282
                $value = $value[0];
283
            }
284
            $this->saveIndexKey('title', '', $pid, $value);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 272 can also be of type boolean; however, Doku_Indexer::saveIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
285
            unset($key['title']);
286
        }
287
288
        foreach ($key as $name => $values) {
289
            $metaname = idx_cleanName($name);
290
            $this->addIndexKey('metadata', '', $metaname);
291
            $metaidx = $this->getIndex($metaname.'_i', '');
292
            $metawords = $this->getIndex($metaname.'_w', '');
293
            $addwords = false;
294
295
            if (!is_array($values)) $values = array($values);
296
297
            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 272 can also be of type boolean; however, Doku_Indexer::getIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
298
            if ($val_idx != '') {
299
                $val_idx = explode(':', $val_idx);
300
                // -1 means remove, 0 keep, 1 add
301
                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
302
            } else {
303
                $val_idx = array();
304
            }
305
306
            foreach ($values as $val) {
307
                $val = (string)$val;
308
                if ($val !== "") {
309
                    $id = array_search($val, $metawords, true);
310
                    if ($id === false) {
311
                        // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
312
                        $id = count($metawords);
313
                        $metawords[$id] = $val;
314
                        $metaidx[$id] = '';
315
                        $addwords = true;
316
                    }
317
                    // test if value is already in the index
318
                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0){
319
                        $val_idx[$id] = 0;
320
                    } else { // else add it
321
                        $val_idx[$id] = 1;
322
                    }
323
                }
324
            }
325
326
            if ($addwords) {
327
                $this->saveIndex($metaname.'_w', '', $metawords);
328
            }
329
            $vals_changed = false;
330
            foreach ($val_idx as $id => $action) {
331
                if ($action == -1) {
332
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 272 can also be of type boolean; however, Doku_Indexer::updateTuple() does only seem to accept string|integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
333
                    $vals_changed = true;
334
                    unset($val_idx[$id]);
335
                } elseif ($action == 1) {
336
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 272 can also be of type boolean; however, Doku_Indexer::updateTuple() does only seem to accept string|integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
337
                    $vals_changed = true;
338
                }
339
            }
340
341
            if ($vals_changed) {
342
                $this->saveIndex($metaname.'_i', '', $metaidx);
343
                $val_idx = implode(':', array_keys($val_idx));
344
                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 272 can also be of type boolean; however, Doku_Indexer::saveIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
345
            }
346
347
            unset($metaidx);
348
            unset($metawords);
349
        }
350
351
        $this->unlock();
352
        return true;
353
    }
354
355
    /**
     * Rename a page in the search index without changing the indexed content.
     *
     * This function doesn't check if the old or new name exists in the filesystem. It returns an error
     * message if the old page isn't in the page list of the indexer, and it deletes all previously
     * indexed content of the new page. Once the lock has been acquired it is released again on every
     * exit path.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool true if the page was successfully renamed, false if deleting the new page's
     *                     entries or saving the page list failed, or a message ('locked',
     *                     'page is not in index') in the case of an error
     */
    public function renamePage($oldpage, $newpage) {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock before giving up, otherwise the index stays locked
                $this->unlock();
                return false;
            }

            // the entry is overwritten with a 'deleted:' marker (timestamp plus
            // random number) instead of being removed from the list
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }
399
400
    /**
     * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes
     * that all pages will be updated.
     *
     * When the new value is not yet known, the old entry in the value list is simply overwritten.
     * When the new value already exists, the page references of the old value are merged into those
     * of the new value and the old value's reference line is emptied.
     *
     * @param string $key       The metadata key of which a value shall be changed
     * @param string $oldvalue  The old value that shall be renamed
     * @param string $newvalue  The new value to which the old value shall be renamed, can exist (then values will be merged)
     * @return bool|string      If renaming the value has been successful, false or error message on error.
     */
    public function renameMetaValue($key, $oldvalue, $newvalue) {
        if (!$this->lock()) return 'locked';

        // change the relation references index
        $metavalues = $this->getIndex($key, '_w');
        $oldid = array_search($oldvalue, $metavalues, true);

        // the old value is unknown: there is nothing to rename
        if ($oldid === false) {
            $this->unlock();
            return true;
        }

        $newid = array_search($newvalue, $metavalues, true);

        // the new value is not in use yet: overwrite the old entry in place
        if ($newid === false) {
            $metavalues[$oldid] = $newvalue;
            $saved = $this->saveIndex($key.'_w', '', $metavalues);
            $this->unlock();
            return $saved ? true : false;
        }

        // free memory
        unset($metavalues);

        // okay, now we have two entries for the same value. we need to merge them.
        $indexline = $this->getIndexKey($key.'_i', '', $oldid);
        if ($indexline != '') {
            $newindexline = $this->getIndexKey($key.'_i', '', $newid);
            $pagekeys     = $this->getIndex($key.'_p', '');

            foreach (explode(':', $indexline) as $part) {
                list($id, $count) = explode('*', $part);
                $newindexline = $this->updateTuple($newindexline, $id, $count);

                // remove old meta value from the page's value list
                $keyline = array_diff(explode(':', $pagekeys[$id]), array($oldid));
                // add new meta value when not already present
                if (!in_array($newid, $keyline)) {
                    $keyline[] = $newid;
                }
                $pagekeys[$id] = implode(':', $keyline);
            }

            $this->saveIndex($key.'_p', '', $pagekeys);
            unset($pagekeys);
            $this->saveIndexKey($key.'_i', '', $oldid, '');
            $this->saveIndexKey($key.'_i', '', $newid, $newindexline);
        }

        $this->unlock();
        return true;
    }
457
458
    /**
     * Remove a page from the index
     *
     * Acquires the index lock, hands the actual work to deletePageNoLock()
     * and releases the lock again before returning that method's result.
     *
     * @param string    $page   a page name
     * @return string|boolean  "locked" when the lock could not be acquired,
     *                         otherwise the result of deletePageNoLock()
     *
     * @author Tom N Harris <[email protected]>
     */
    public function deletePage($page) {
        if (!$this->lock()) {
            return "locked";
        }

        $outcome = $this->deletePageNoLock($page);
        $this->unlock();

        return $outcome;
    }
478
479
    /**
480
     * Remove a page from the index without locking the index, only use this function if the index is already locked
481
     *
482
     * Erases entries in all known indexes.
483
     *
484
     * @param string    $page   a page name
485
     * @return boolean          the function completed successfully
486
     *
487
     * @author Tom N Harris <[email protected]>
488
     */
489
    protected function deletePageNoLock($page) {
490
        // load known documents
491
        $pid = $this->getPIDNoLock($page);
492
        if ($pid === false) {
493
            return false;
494
        }
495
496
        // Remove obsolete index entries
497
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 491 can also be of type boolean; however, Doku_Indexer::getIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
498
        if ($pageword_idx !== '') {
499
            $delwords = explode(':',$pageword_idx);
500
            $upwords = array();
501
            foreach ($delwords as $word) {
502
                if ($word != '') {
503
                    list($wlen,$wid) = explode('*', $word);
504
                    $wid = (int)$wid;
505
                    $upwords[$wlen][] = $wid;
506
                }
507
            }
508
            foreach ($upwords as $wlen => $widx) {
509
                $index = $this->getIndex('i', $wlen);
510
                foreach ($widx as $wid) {
511
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 491 can also be of type boolean; however, Doku_Indexer::updateTuple() does only seem to accept string|integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
512
                }
513
                $this->saveIndex('i', $wlen, $index);
514
            }
515
        }
516
        // Save the reverse index
517
        if (!$this->saveIndexKey('pageword', '', $pid, "")) {
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 491 can also be of type boolean; however, Doku_Indexer::saveIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
518
            return false;
519
        }
520
521
        $this->saveIndexKey('title', '', $pid, "");
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 491 can also be of type boolean; however, Doku_Indexer::saveIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
522
        $keyidx = $this->getIndex('metadata', '');
523
        foreach ($keyidx as $metaname) {
524
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 491 can also be of type boolean; however, Doku_Indexer::getIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
525
            $meta_idx = $this->getIndex($metaname.'_i', '');
526
            foreach ($val_idx as $id) {
527
                if ($id === '') continue;
528
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 491 can also be of type boolean; however, Doku_Indexer::updateTuple() does only seem to accept string|integer, maybe add an additional type check?

If a method or function can return several different types, and you cannot be sure that only one of them is possible in this context, we recommend adding an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
529
            }
530
            $this->saveIndex($metaname.'_i', '', $meta_idx);
531
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
0 ignored issues
show
Bug introduced by
It seems like $pid defined by $this->getPIDNoLock($page) on line 491 can also be of type boolean; however, Doku_Indexer::saveIndexKey() does only seem to accept integer, maybe add an additional type check?

If a method or function can return several different types, and you cannot be sure that only one of them is possible in this context, we recommend adding an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
532
        }
533
534
        return true;
535
    }
536
537
    /**
     * Clear the whole index
     *
     * Removes the page, title, pageword, metadata and lengths index files and
     * every *.idx file in the index directory whose name starts with 'i' or
     * 'w' or ends in '_w.idx', '_i.idx' or '_p.idx'. Afterwards the in-memory
     * PID cache is emptied.
     *
     * Deleting is best effort: a file that is missing or cannot be removed is
     * silently skipped and does not change the return value.
     *
     * @return bool false if the indexer lock could not be acquired, true otherwise
     */
    public function clear() {
        global $conf;

        if (!$this->lock()) return false;

        @unlink($conf['indexdir'].'/page.idx');
        @unlink($conf['indexdir'].'/title.idx');
        @unlink($conf['indexdir'].'/pageword.idx');
        @unlink($conf['indexdir'].'/metadata.idx');
        $dir = @opendir($conf['indexdir']);
        if($dir!==false){
            while(($f = readdir($dir)) !== false){
                if(substr($f,-4)=='.idx' &&
                    (substr($f,0,1)=='i' || substr($f,0,1)=='w'
                        || substr($f,-6)=='_w.idx' || substr($f,-6)=='_i.idx' || substr($f,-6)=='_p.idx'))
                    @unlink($conf['indexdir']."/$f");
            }
            // release the directory handle instead of leaking it until the request ends
            closedir($dir);
        }
        @unlink($conf['indexdir'].'/lengths.idx');

        // clear the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }
568
569
    /**
     * Split the text into words for fulltext search
     *
     * The text is passed through the INDEXER_TEXT_PREPARE event, whitespace
     * control characters are turned into blanks, soft hyphens are dropped and
     * special characters are stripped. The remaining words are lower-cased;
     * non-numeric words shorter than IDX_MINWORDLENGTH and stopwords are
     * discarded.
     *
     * TODO: does this also need &$stopwords ?
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function tokenizer($text, $wc=false) {
        // unless wildcards are allowed, '*' is added to the characters that get stripped
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        // prepare the text to be tokenized
        $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                // handle asian chars as single words (may fail on older PHP version)
                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                if (!is_null($asia)) $text = $asia; // recover from regexp failure
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
                       array(
                           "\r" => ' ',
                           "\n" => ' ',
                           "\t" => ' ',
                           "\xC2\xAD" => '', //soft-hyphen
                       )
                     );
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);

        // flip the stopword list once so each word is checked with a hash
        // lookup instead of a linear scan over all stopwords
        $stopLookup = array_flip($stopwords);

        $wordlist = array();
        foreach (explode(' ', $text) as $word) {
            // the multibyte aware lower-casing is only needed for non-ASCII words
            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                utf8_strtolower($word) : strtolower($word);

            // the minimum length does not apply to numeric tokens
            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
            if (isset($stopLookup[$word])) continue;

            $wordlist[] = $word;
        }
        return $wordlist;
    }
625
626
    /**
     * Get the numeric PID of a page
     *
     * A cached PID is returned right away without taking the indexer lock;
     * otherwise the lookup is done while holding the lock.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    public function getPID($page) {
        // cache hit: no locking needed
        if (isset($this->pidCache[$page])) {
            return $this->pidCache[$page];
        }

        if (!$this->lock()) {
            return false;
        }

        // the lock is released regardless of whether the lookup succeeded;
        // a failed lookup yields false, which is handed back unchanged
        $pid = $this->getPIDNoLock($page);
        $this->unlock();

        return $pid;
    }
649
650
    /**
     * Get the numeric PID of a page without locking the index.
     * Only use this function when the index is already locked.
     *
     * Successful lookups are remembered in a small cache keyed by page name.
     * Failed lookups are not cached so that a later call can try again.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    protected function getPIDNoLock($page) {
        // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];

        $pid = $this->addIndexKey('page', '', $page);
        if ($pid === false) return false;

        // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently
        // added item will be requested again.
        // The oldest entry is removed with unset() rather than array_shift() because array_shift() renumbers
        // integer keys, which would re-map the cached PIDs of pages with purely numeric names to wrong keys.
        if (count($this->pidCache) > 10) {
            reset($this->pidCache);
            unset($this->pidCache[key($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }
667
668
    /**
     * Get the page id of a numeric PID
     *
     * Reads the entry for the given PID from the 'page' index.
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid) {
        $page = $this->getIndexKey('page', '', $pid);
        return $page;
    }
677
678
    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function lookup(&$tokens) {
        // $result is filled with word => array("length*id", ...)
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();

        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $ids) {
            $index = $this->getIndex('i', $wlen);
            $lineCount = count($index);
            foreach (array_unique($ids) as $ixid) {
                // ignore word ids that have no line in the index
                if ($ixid >= $lineCount) continue;
                $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // word ids skipped above have no entry in $docs
                if (!isset($docs[$wid])) continue;

                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;

                    if (isset($final[$word][$hitkey])) {
                        $final[$word][$hitkey] += $hitcnt;
                    } else {
                        $final[$word][$hitkey] = $hitcnt;
                    }
                }
            }
        }
        return $final;
    }
730
731
    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
     *
     * Without a callback a leading and/or trailing '*' in a value acts as a
     * wildcard.
     *
     * @param string        $key    name of the metadata key to look for
     * @param string|array  $value  search term to look for, must be a string or array of strings
     * @param callback      $func   comparison function
     * @return array                lists with page names, keys are query values if $value is array
     *
     * @author Tom N Harris <[email protected]>
     * @author Michael Hamann <[email protected]>
     */
    public function lookupKey($key, &$value, $func=null) {
        if (!is_array($value))
            $value_array = array($value);
        else
            $value_array =& $value;

        // the matching ids for the provided value(s)
        $value_ids = array();

        $metaname = idx_cleanName($key);

        // get all words in order to search the matching ids
        if ($key == 'title') {
            $words = $this->getIndex('title', '');
        } else {
            $words = $this->getIndex($metaname.'_w', '');
        }

        if (!is_null($func)) {
            // custom comparison: test every value against every indexed word
            foreach ($value_array as $val) {
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, array($val, $word)))
                        $value_ids[$i][] = $val;
                }
            }
        } else {
            foreach ($value_array as $val) {
                $xval = $val;
                $caret = '^';
                $dollar = '$';
                // check for wildcards
                if (substr($xval, 0, 1) == '*') {
                    $xval = substr($xval, 1);
                    $caret = '';
                }
                if (substr($xval, -1, 1) == '*') {
                    $xval = substr($xval, 0, -1);
                    $dollar = '';
                }
                if (!$caret || !$dollar) {
                    // wildcard search: anchor only the side without a '*'
                    $re = $caret.preg_quote($xval, '/').$dollar;
                    foreach(array_keys(preg_grep('/'.$re.'/', $words)) as $i)
                        $value_ids[$i][] = $val;
                } else {
                    // exact search
                    if (($i = array_search($val, $words, true)) !== false)
                        $value_ids[$i][] = $val;
                }
            }
        }

        unset($words); // free the used memory

        // initialize the result so it won't be null
        $result = array();
        foreach ($value_array as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        // Special handling for titles
        if ($key == 'title') {
            // the line number in the title index is used as the PID
            foreach ($value_ids as $pid => $val_list) {
                // skip titles without a matching page entry instead of
                // reporting a null page name
                if (!isset($page_idx[$pid])) continue;
                $page = $page_idx[$pid];
                foreach ($val_list as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // load all lines and pages so the used lines can be taken and matched with the pages
            $lines = $this->getIndex($metaname.'_i', '');

            foreach ($value_ids as $value_id => $val_list) {
                // a value without a line in the _i index has no pages
                if (!isset($lines[$value_id])) continue;
                // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
                // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($val_list as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }
        // a single search value returns its page list directly
        if (!is_array($value)) $result = $result[$value];
        return $result;
    }
833
834
    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexWords(&$words, &$result) {
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();

        foreach ($words as $word) {
            $result[$word] = array();
            $xword = $word;
            $wlen = wordlen($word);

            // check for wildcards and strip them from the base word
            $openStart = (substr($xword, 0, 1) == '*');
            if ($openStart) {
                $xword = substr($xword, 1);
                $wlen -= 1;
            }
            $openEnd = (substr($xword, -1, 1) == '*');
            if ($openEnd) {
                $xword = substr($xword, 0, -1);
                $wlen -= 1;
            }
            $isWild = ($openStart || $openEnd);

            // too short non-numeric words are only searched when they carry a wildcard
            if (!$isWild && $wlen < IDX_MINWORDLENGTH && !is_numeric($xword)) {
                continue;
            }

            // register each base word once per length
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }

            if ($isWild) {
                // anchor the pattern only on the side that has no wildcard
                $re = ($openStart ? '' : '^').preg_quote($xword, '/').($openEnd ? '' : '$');
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword])) {
                    $tokenwild[$xword] = $wlen;
                }
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        asort($tokenwild);

        // exact searches are filtered by their lengths; with wildcards only the
        // shortest base word length is passed on
        if (empty($tokenwild)) {
            $length_filter = $tokenlength;
        } else {
            $length_filter = min(array_keys($tokenlength));
        }
        $indexes_known = $this->indexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid === false) continue;

                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$xword] as $w) {
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }

            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // $tokenwild is sorted by length, so no later base word can fit either
                if ($wlen >= $ixlen) break;

                foreach ($tokens[$xword] as $w) {
                    // entries without a pattern are exact terms
                    if (is_null($w[1])) continue;

                    foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }
919
920
    /**
     * Return a list of all pages
     * Warning: pages may not exist!
     *
     * Without a key the complete page index is returned. With the key
     * 'title' only pages with a non-empty title are listed. With any other
     * key the pages referenced in that key's metadata index are listed.
     *
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     *
     * @author Tom N Harris <[email protected]>
     */
    public function getPages($key=null) {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) return $page_idx;

        $metaname = idx_cleanName($key);

        // Special handling for titles
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // pages beyond the end of the title index have no title
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title)
                if ($title === "") unset($page_idx[$i]);
            return array_values($page_idx);
        }

        // Collect the page names as array keys to drop duplicates. They are
        // assigned one by one instead of using array_merge(), because
        // array_merge() renumbers integer keys and would therefore lose pages
        // with purely numeric names.
        $pages = array();
        $lines = $this->getIndex($metaname.'_i', '');
        foreach ($lines as $line) {
            foreach (array_keys($this->parseTuples($page_idx, $line)) as $page) {
                $pages[$page] = true;
            }
        }
        // numeric names were turned into integer keys, hand them back as strings
        return array_map('strval', array_keys($pages));
    }
951
952
    /**
     * Return a list of words sorted by number of times used
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     *
     * @author Tom N Harris <[email protected]>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null) {
        // normalize the thresholds; a $max of 0 means "no upper limit"
        if ($min < 1) {
            $min = 1;
        }
        if ($max < $min) {
            $max = 0;
        }

        $result = array();

        if ($key == 'title') {
            // titles: count how often each title occurs in the title index
            $counts = array_count_values($this->getIndex('title', ''));
            foreach ($counts as $val => $cnt) {
                if ($cnt < $min) continue;
                if ($max && $cnt > $max) continue;
                if (strlen($val) < $minlen) continue;
                $result[$val] = $cnt;
            }
        } elseif (!is_null($key)) {
            // metadata: first pick the value ids with a fitting frequency ...
            $metaname = idx_cleanName($key);
            $val_idx = array();
            foreach ($this->getIndex($metaname.'_i', '') as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq < $min || ($max && $freq > $max)) continue;
                $val_idx[$wid] = $freq;
            }
            // ... then load the values themselves only when something was found
            if (!empty($val_idx)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($val_idx as $wid => $freq) {
                    $word = $words[$wid];
                    if (strlen($word) >= $minlen) {
                        $result[$word] = $freq;
                    }
                }
            }
        } else {
            // fulltext: walk the per-length indexes of sufficiently long words
            foreach (idx_listIndexLengths() as $length) {
                if ($length < $minlen) continue;

                // the word list is loaded lazily on the first hit for this length
                $words = null;
                foreach ($this->getIndex('i', $length) as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq < $min || ($max && $freq > $max)) continue;

                    if ($words === null) {
                        $words = $this->getIndex('w', $length);
                    }
                    $result[$words[$wid]] = $freq;
                }
            }
        }

        // most frequent first
        arsort($result);
        return $result;
    }
1016
1017
    /**
     * Lock the indexer.
     *
     * The lock is a directory named _indexer.lock inside the lock directory.
     * While it cannot be created the attempt is repeated after a short
     * sleep. A lock directory older than five minutes is considered stale
     * and removed.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @return bool|string true when the lock was acquired, the string
     *                     "stale lock removed" when a stale lock had to be
     *                     removed first, false when the lock could not be
     *                     acquired
     */
    protected function lock() {
        global $conf;
        $status = true;
        $run = 0;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock, $conf['dmode'])) {
            usleep(50);
            if(is_dir($lock) && time()-@filemtime($lock) > 60*5){
                // looks like a stale lock - remove it
                if (!@rmdir($lock)) {
                    // removing the stale lock failed
                    return false;
                }
                $status = "stale lock removed";
            }elseif($run++ == 1000){
                // give up once the retry counter reaches 1000
                // NOTE(review): the previous comment claimed a wait of 5 seconds, but
                // 1000 sleeps of 50 microseconds add up to roughly 50ms - confirm the
                // intended timeout
                return false;
            }
        }
        if (!empty($conf['dperm'])) {
            chmod($lock, $conf['dperm']);
        }
        return $status;
    }
1049
1050
    /**
     * Release the indexer lock.
     *
     * Removes the lock directory; a failure to remove it is ignored.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @return bool always true
     */
    protected function unlock() {
        global $conf;

        $lock = $conf['lockdir'].'/_indexer.lock';
        @rmdir($lock);

        return true;
    }
1062
1063
    /**
     * Retrieve the entire index.
     *
     * The index is read from "<indexdir>/<idx><suffix>.idx". The $suffix
     * argument is for an index that is split into multiple parts. Different
     * index files should use different base names.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines without CR or LF; an empty array
     *                          when the index file does not exist
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndex($idx, $suffix) {
        global $conf;
        $path = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (file_exists($path)) {
            return file($path, FILE_IGNORE_NEW_LINES);
        }
        return array();
    }
1082
1083
    /**
     * Replace the contents of the index with an array.
     *
     * The lines are written to "<indexdir>/<idx><suffix>.tmp" first and the
     * temporary file is then moved over the ".idx" file with io_rename().
     * Every line, including the last one, is terminated with a LF. An empty
     * list produces an empty file.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param array     $lines  list of lines without LF (taken by reference, not modified)
     * @return bool             If saving succeeded
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $tmp = $fn.'.tmp';
        $fh = @fopen($tmp, 'w');
        if (!$fh) return false;

        $data = empty($lines) ? '' : join("\n", $lines)."\n";
        $written = fwrite($fh, $data);
        fclose($fh);
        if ($written === false || $written < strlen($data)) {
            // a failed or short write (e.g. disk full) must never replace the
            // existing index with a truncated one
            @unlink($tmp);
            return false;
        }

        if (isset($conf['fperm']))
            chmod($tmp, $conf['fperm']);
        // NOTE(review): io_rename() is defined elsewhere; this assumes it
        // signals failure by returning false - confirm against its definition
        if (io_rename($tmp, $fn.'.idx') === false) return false;
        return true;
    }
1107
1108
    /**
     * Retrieve a line from the index.
     *
     * Reads "<indexdir>/<idx><suffix>.idx" line by line until line number $id
     * (counting from 0) is reached.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @return string           a line with trailing whitespace removed; an empty
     *                          string when the file is missing, cannot be opened
     *                          or has no such line
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexKey($idx, $suffix, $id) {
        global $conf;
        $path = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($path)) return '';
        $handle = @fopen($path, 'r');
        if (!$handle) return '';
        // when the file ends before line $id, $line is left as false
        for ($current = 0; ($line = fgets($handle)) !== false; $current++) {
            if ($current == $id) break;
        }
        fclose($handle);
        return rtrim((string)$line);
    }
1131
1132
    /**
     * Write a line into the index.
     *
     * The current ".idx" file is copied into a ".tmp" file with line $id
     * (counting from 0) replaced by $line. When the index is shorter than
     * $id, or does not exist yet, it is padded with empty lines first. The
     * temporary file is finally moved over the ".idx" file with io_rename().
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @param string    $line   line to write (a trailing LF is added if missing)
     * @return bool             false when the temporary file cannot be opened, true otherwise
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;
        if (substr($line, -1) != "\n") {
            $line = $line."\n";
        }
        $base = $conf['indexdir'].'/'.$idx.$suffix;
        $out = @fopen($base.'.tmp', 'w');
        if (!$out) return false;
        $in = @fopen($base.'.idx', 'r');
        if ($in) {
            // copy the old index, swapping in the new line at position $id
            $last = -1;
            while (($old = fgets($in)) !== false) {
                $last++;
                fwrite($out, ($last == $id) ? $line : $old);
            }
            if ($id > $last) {
                // the index was too short: pad with empty lines, then append
                for ($pos = $last + 1; $id > $pos; $pos++) {
                    fwrite($out, "\n");
                }
                fwrite($out, $line);
            }
            fclose($in);
        } else {
            // no readable index yet: emit $id empty lines followed by the line
            for ($pos = 0; $id > $pos; $pos++) {
                fwrite($out, "\n");
            }
            fwrite($out, $line);
        }
        fclose($out);
        if (isset($conf['fperm'])) {
            chmod($base.'.tmp', $conf['fperm']);
        }
        io_rename($base.'.tmp', $base.'.idx');
        return true;
    }
1174
1175
    /**
     * Retrieve or insert a value in the index.
     *
     * Looks $value up with a strict comparison. When it is not present yet it
     * is appended as a new last line and the whole index is saved again.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param string    $value  line to find in the index
     * @return int|bool          line number of the value in the index or false if writing the index failed
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function addIndexKey($idx, $suffix, $value) {
        $index = $this->getIndex($idx, $suffix);
        $found = array_search($value, $index, true);
        if ($found !== false) {
            return $found;
        }
        // unknown value: it becomes the new last line
        $id = count($index);
        $index[$id] = $value;
        if ($this->saveIndex($idx, $suffix, $index)) {
            return $id;
        }
        trigger_error("Failed to write $idx index", E_USER_ERROR);
        return false;
    }
1198
1199
    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Thin wrapper that returns whatever the global function
     * idx_listIndexLengths() returns.
     *
     * @author YoBoY <[email protected]>
     *
     * @return array
     */
    protected function listIndexLengths() {
        $lengths = idx_listIndexLengths();
        return $lengths;
    }
1212
1213
    /**
     * Get the word lengths that have been indexed.
     *
     * Delegates to the global function idx_indexLengths(), which implements
     * exactly the logic this method used to duplicate: given an array, the
     * keys for which an "i<key>.idx" file exists in the index directory are
     * returned; given a number, all indexed lengths greater than or equal to
     * it are returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @param array|int $filter
     * @return array
     */
    protected function indexLengths($filter) {
        return idx_indexLengths($filter);
    }
1244
1245
    /**
     * Insert or replace a tuple in a line.
     *
     * A line is a ':' separated list of "id*count" tuples. Any existing tuple
     * for $id is removed. When $count is truthy a fresh "$id*$count" tuple is
     * put in front of the remaining tuples, otherwise the id is simply gone.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $line
     * @param string|int $id
     * @param int    $count
     * @return string
     */
    protected function updateTuple($line, $id, $count) {
        if ($line != '') {
            // the id must start the line or follow a ':' so that e.g. id 3
            // does not match inside the tuple of id 13
            $pattern = '/(^|:)'.preg_quote($id,'/').'\*\d*/';
            $line = preg_replace($pattern, '', $line);
        }
        $line = trim($line, ':');
        if (!$count) {
            return $line;
        }
        return $line ? "$id*$count:".$line : "$id*$count";
    }
1269
1270
    /**
     * Split a line into an array of tuples.
     *
     * A line is a ':' separated list of "key*count" tuples. Each key is
     * translated through $keys and the result maps the translated key to its
     * count. Tuples are skipped when they are empty, have no or a falsy count,
     * refer to a key missing from $keys, or translate to a falsy value.
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     *
     * @param array $keys   lookup table from tuple key to result key (taken by reference, not modified)
     * @param string $line
     * @return array        translated key => count (counts stay strings)
     */
    protected function parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') continue;
            // pad so that a malformed tuple without '*' yields an empty count
            // instead of an undefined offset warning
            list($key, $cnt) = array_pad(explode('*', $tuple), 2, '');
            if (!$cnt) continue;
            // ids without an entry in $keys are ignored without a warning
            if (!isset($keys[$key])) continue;
            $name = $keys[$key];
            if (!$name) continue;
            $result[$name] = $cnt;
        }
        return $result;
    }
1294
1295
    /**
     * Sum the counts in a list of tuples.
     *
     * A line is a ':' separated list of "id*count" tuples. Empty tuples and
     * tuples without a '*' separator contribute nothing to the sum.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $line
     * @return int
     */
    protected function countTuples($line) {
        $freq = 0;
        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') continue;
            $fields = explode('*', $tuple);
            // only the count is of interest here; a tuple lacking one is
            // skipped rather than triggering an undefined offset warning
            if (isset($fields[1])) {
                $freq += (int)$fields[1];
            }
        }
        return $freq;
    }
1313
}
1314
1315
/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and kept in a static variable,
 * so every later call returns the same object.
 *
 * @return Doku_Indexer    a Doku_Indexer
 *
 * @author Tom N Harris <[email protected]>
 */
function idx_get_indexer() {
    static $Indexer = null;
    if ($Indexer === null) {
        $Indexer = new Doku_Indexer();
    }
    return $Indexer;
}
1329
1330
/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<lang>/stopwords.txt below DOKU_INC,
 * using the language configured in $conf['lang'], and is then kept in a
 * static variable. A missing file results in an empty list. The list is
 * returned by reference.
 *
 * @return array                list of stop words
 *
 * @author Tom N Harris <[email protected]>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if ($stopwords !== null) {
        // already loaded during an earlier call
        return $stopwords;
    }
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = file_exists($swfile)
        ? file($swfile, FILE_IGNORE_NEW_LINES)
        : array();
    return $stopwords;
}
1350
1351
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * The page is removed from the index when it no longer exists or when its
 * 'internal index' metadata is false. It is skipped when the '.indexed' tag
 * file holds the current index version and is newer than the page, unless
 * $force is set. Otherwise the INDEXER_PAGE_ADD event is triggered and the
 * page text and metadata are handed to the indexer.
 *
 * Note that with $verbose enabled a completed run returns true even when
 * the indexer reported a failure.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <[email protected]>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');

    // the page was deleted: drop whatever is still in the index for it
    if (!page_exists($page)) {
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
            return $result;
        }
        if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
        return false;
    }

    // nothing to do when the tag file carries the current index version and
    // is newer than the page itself
    if (!$force
        && file_exists($idxtag)
        && trim(io_readFile($idxtag)) == idx_get_version()
        && @filemtime($idxtag) > @filemtime(wikiFN($page))
    ) {
        if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
        return false;
    }

    // indexing has been switched off for this page
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }

    // collect the data offered to the INDEXER_PAGE_ADD event
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = ($references !== null) ? array_keys($references) : array();

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = ($media !== null) ? array_keys($media) : array();

    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) {
        // default action: append the raw wiki text of the page
        $data['body'] = $data['body'] . " " . rawWiki($page);
    }
    $evt->advise_after();
    unset($evt);
    // copy the entries of $data back into the local variables they came from
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // remember the index version the page was indexed with
    if ($result) {
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    }
    if (!$verbose) {
        return $result;
    }
    print("Indexer: finished".DOKU_LF);
    return true;
}
1456
1457
/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one. The work is done by the lookup() method of the
 * shared indexer instance.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed on by reference)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    return idx_get_indexer()->lookup($words);
}
1473
1474
/**
 * Split a string into tokens
 *
 * Convenience wrapper around the tokenizer() method of the shared indexer
 * instance; both arguments are passed through unchanged.
 *
 * @param string $string text to split
 * @param bool   $wc     handed to the tokenizer as is (presumably toggles
 *                       wildcard handling - confirm in Doku_Indexer::tokenizer)
 * @return mixed         whatever Doku_Indexer::tokenizer() returns
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}
1482
1483
/* For compatibility */
1484
1485
/**
 * Read the list of words in an index (if it exists).
 *
 * Reads "<indexdir>/<idx><suffix>.idx". The returned lines still carry their
 * line endings. A missing file yields an empty array.
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $path = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (file_exists($path)) {
        return file($path);
    }
    return array();
}
1500
1501
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * With $conf['readdircache'] set to a non-zero number of seconds the result
 * is cached in "<indexdir>/lengths.idx" and served from there while the
 * file is younger than that. Otherwise, or when the cache is stale or
 * unreadable, the index directory is scanned for "i<number>.idx" files.
 * Failing to write the cache file is not an error; the scanned list is
 * returned regardless.
 *
 * @author YoBoY <[email protected]>
 *
 * @return array sorted list of integer lengths; empty when the index
 *               directory cannot be opened
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    // caching is only active for a non-zero lifetime
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
        && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: scan the index directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // only touch the handle when opening worked: passing false to
        // fwrite()/fclose() is a TypeError on PHP 8 that @ cannot silence
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
1556
1557
/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * When $filter is an array, its keys are taken as lengths and only those
 * with an existing "i<length>.idx" file are returned. Otherwise $filter is
 * treated as a minimum and every length reported by idx_listIndexLengths()
 * that is greater than or equal to it is returned.
 *
 * @author YoBoY <[email protected]>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $found = array();

    if (!is_array($filter)) {
        // keep all the values equal or superior
        $minimum = (int)$filter;
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length >= $minimum) {
                $found[] = $length;
            }
        }
        return $found;
    }

    // testing if index files exist only
    $prefix = $conf['indexdir']."/i";
    foreach (array_keys($filter) as $wlen) {
        if (file_exists($prefix.$wlen.'.idx')) {
            $found[] = $wlen;
        }
    }
    return $found;
}
1588
1589
/**
 * Clean a name of a key for use as a file name.
 *
 * The trimmed name is passed through utf8_romanize(), runs of separator
 * characters (space, dot, slash, colon, dash) are collapsed into a single
 * underscore, everything that is not an ASCII letter, digit or underscore
 * is removed and the result is lower-cased.
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $clean = utf8_romanize(trim((string)$name));
    // The patterns are applied one after the other.
    // NOTE(review): inside the single-quoted PHP string "\\:" reaches PCRE as
    // an escaped colon, so a backslash is not part of the separator class; it
    // is removed by the second pattern instead of becoming an underscore.
    $clean = preg_replace(
        array('#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'),
        array('_', ''),
        $clean
    );
    return strtolower($clean);
}
1606
1607
//Setup VIM: ex: et ts=4 :
1608