Failed Conditions
Push — psr2 ( 64159a )
by Andreas
07:54 queued 04:15
created

inc/indexer.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
/**
3
 * Functions to create the fulltext search index
4
 *
5
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6
 * @author     Andreas Gohr <[email protected]>
7
 * @author     Tom N Harris <[email protected]>
8
 */
9
10
// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regexps define the
// Unicode ranges for Asian characters.
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
// The last four ranges are written as raw UTF-8 byte sequences in double-quoted strings
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F". // CJK Extension B
                   "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF". // CJK Extension C
                   "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F". // CJK Extension D
                   "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF". // CJK Compatibility Supplement
                   ']');
// One character from the first class, optionally followed by one from the second class
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
// Non-capturing alternation of the three patterns above
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
50
51
/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and cached in a static variable.
 * It starts with INDEXER_VERSION; for every entry that event handlers add to
 * the event data, "+name=version" is appended (entries sorted by name).
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris
 * @author Michael Hamann
 *
 * @return int|string
 */
function idx_get_version(){
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = array('dokuwiki'=>$version);
        trigger_event('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin=>$vers) {
            $version .= '+'.$plugin.'='.$vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}
81
82
/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * Starts from the byte length and, for every byte in the range 0xE2-0xEF
 * (a lead byte of a three-byte UTF-8 sequence), adds that byte's value
 * minus 0xE1 on top.
 *
 * @author Tom N Harris
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}
101
102
/**
103
 * Class that encapsulates operations on the indexer database.
104
 *
105
 * @author Tom N Harris <[email protected]>
106
 */
107
class Doku_Indexer {
108
    /**
109
     * @var array $pidCache Cache for getPID()
110
     */
111
    protected $pidCache = array();
112
113
    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * The index is locked for the duration of the call and unlocked on every
     * return path after the lock was acquired.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return string|boolean  true on success, false on failure, the string "locked" if the lock failed
     *
     * @author Tom N Harris
     * @author Andreas Gohr
     */
    public function addPageWords($page, $text) {
        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $pagewords = array();
        // get word usage in page
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        if (!empty($words)) {
            foreach (array_keys($words) as $wlen) {
                $index = $this->getIndex('i', $wlen);
                foreach ($words[$wlen] as $wid => $freq) {
                    // words beyond the end of the current index have no tuple line yet
                    $idx = ($wid<count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                    $pagewords[] = "$wlen*$wid";
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    $this->unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':',$pageword_idx);
            // words recorded for the page before but no longer present in the new text
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a count of 0 is passed for this page for each obsolete word
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return true;
    }
191
192
    /**
     * Split the words in a page and add them to the index.
     *
     * New words are appended to the word index ('w') of their length; a word
     * index is written back only when words were actually added to it.
     *
     * @param string    $text   content of the page
     * @return array|false      list of word IDs and number of times used
     *                          (array(wordlen => array(word id => frequency))),
     *                          false if a word index could not be saved
     *
     * @author Andreas Gohr
     * @author Christopher Smith
     * @author Tom N Harris
     */
    protected function getPageWords($text) {

        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = array();
        foreach ($tokens as $w=>$c) {
            $l = wordlen($w);
            if (isset($words[$l])){
                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            }else{
                $words[$l] = array($w => $c);
            }
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach (array_keys($words) as $wlen) {
            $word_idx = $this->getIndex('w', $wlen);
            // track modifications per word length so unchanged indexes are not rewritten
            $word_idx_modified = false;
            foreach ($words[$wlen] as $word => $freq) {
                // array keys that look like integers were converted to int, restore the string
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen]))
                    $index[$wlen] = array();
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }
241
242
    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array (a warning is triggered if it is given anyway).
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string     true on success, false on failure, the string "locked" if the lock failed
     *
     * @author Tom N Harris
     * @author Michael Hamann
     */
    public function addMetaKeys($page, $key, $value=null) {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (array_key_exists('title', $key)) {
            $value = $key['title'];
            if (is_array($value)) {
                // only the first title is stored
                $value = $value[0];
            }
            $this->saveIndexKey('title', '', $pid, $value);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $addwords = false;

            if (!is_array($values)) $values = array($values);

            // value ids currently recorded for this page, all marked for removal first
            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($val_idx != '') {
                $val_idx = explode(':', $val_idx);
                // -1 means remove, 0 keep, 1 add
                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
            } else {
                $val_idx = array();
            }

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val !== "") {
                    $id = array_search($val, $metawords, true);
                    if ($id === false) {
                        // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $metaidx[$id] = '';
                        $addwords = true;
                    }
                    // test if value is already in the index
                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0){
                        $val_idx[$id] = 0;
                    } else { // else add it
                        $val_idx[$id] = 1;
                    }
                }
            }

            if ($addwords) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }
            $vals_changed = false;
            foreach ($val_idx as $id => $action) {
                if ($action == -1) {
                    // value is no longer set for the page
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                    $vals_changed = true;
                    unset($val_idx[$id]);
                } elseif ($action == 1) {
                    // value is new for the page
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                    $vals_changed = true;
                }
            }

            if ($vals_changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                // the remaining keys are the value ids now set for the page
                $val_idx = implode(':', array_keys($val_idx));
                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
            }

            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }
352
353
    /**
     * Rename a page in the search index without changing the indexed content. This function doesn't check if the
     * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
     * indexer and it deletes all previously indexed content of the new page.
     *
     * The index lock is released on every return path after it was acquired.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool true if the page was successfully renamed, false on failure,
     *                     or a message ('locked', 'page is not in index') in the case of an error
     */
    public function renamePage($oldpage, $newpage) {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock, otherwise the index stays locked after this failure
                $this->unlock();
                return false;
            }

            // replace the old entry of the target name with a unique placeholder
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }
397
398
    /**
     * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages
     * will be updated.
     *
     * If the new value already exists in the index, the page references of the old value are merged
     * into those of the new value; otherwise the old value is replaced in the value list.
     * Nothing is changed when the old value is not in the index.
     *
     * @param string $key       The metadata key of which a value shall be changed
     * @param string $oldvalue  The old value that shall be renamed
     * @param string $newvalue  The new value to which the old value shall be renamed, if exists values will be merged
     * @return bool|string      If renaming the value has been successful, false or error message on error.
     */
    public function renameMetaValue($key, $oldvalue, $newvalue) {
        if (!$this->lock()) return 'locked';

        // change the relation references index
        $metavalues = $this->getIndex($key, '_w');
        $oldid = array_search($oldvalue, $metavalues, true);
        if ($oldid !== false) {
            $newid = array_search($newvalue, $metavalues, true);
            if ($newid !== false) {
                // free memory
                unset ($metavalues);

                // okay, now we have two entries for the same value. we need to merge them.
                $indexline = $this->getIndexKey($key.'_i', '', $oldid);
                if ($indexline != '') {
                    $newindexline = $this->getIndexKey($key.'_i', '', $newid);
                    $pagekeys     = $this->getIndex($key.'_p', '');
                    $parts = explode(':', $indexline);
                    foreach ($parts as $part) {
                        // each part has the form "page id*count"
                        list($id, $count) = explode('*', $part);
                        $newindexline =  $this->updateTuple($newindexline, $id, $count);

                        $keyline = explode(':', $pagekeys[$id]);
                        // remove old meta value
                        $keyline = array_diff($keyline, array($oldid));
                        // add new meta value when not already present
                        // (loose comparison on purpose: $newid is an int, $keyline holds strings)
                        if (!in_array($newid, $keyline)) {
                            array_push($keyline, $newid);
                        }
                        $pagekeys[$id] = implode(':', $keyline);
                    }
                    $this->saveIndex($key.'_p', '', $pagekeys);
                    unset($pagekeys);
                    $this->saveIndexKey($key.'_i', '', $oldid, '');
                    $this->saveIndexKey($key.'_i', '', $newid, $newindexline);
                }
            } else {
                $metavalues[$oldid] = $newvalue;
                if (!$this->saveIndex($key.'_w', '', $metavalues)) {
                    $this->unlock();
                    return false;
                }
            }
        }

        $this->unlock();
        return true;
    }
455
456
    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes. Locks the index, delegates to
     * deletePageNoLock() and unlocks again.
     *
     * @param string    $page   a page name
     * @return string|boolean  result of deletePageNoLock(), or the string "locked" if the lock failed
     *
     * @author Tom N Harris
     */
    public function deletePage($page) {
        if (!$this->lock())
            return "locked";

        $result = $this->deletePageNoLock($page);

        $this->unlock();

        return $result;
    }
476
477
    /**
     * Remove a page from the index without locking the index, only use this function if the index is already locked
     *
     * Erases entries in all known indexes: the fulltext word indexes, the
     * reverse (pageword) index, the title and every metadata index.
     *
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     *
     * @author Tom N Harris
     */
    protected function deletePageNoLock($page) {
        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            return false;
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $delwords = explode(':',$pageword_idx);
            // group the word ids by word length so each index is loaded only once
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        if (!$this->saveIndexKey('pageword', '', $pid, "")) {
            return false;
        }

        $this->saveIndexKey('title', '', $pid, "");
        $keyidx = $this->getIndex('metadata', '');
        foreach ($keyidx as $metaname) {
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
            $meta_idx = $this->getIndex($metaname.'_i', '');
            foreach ($val_idx as $id) {
                // exploding an empty line yields a single empty string
                if ($id === '') continue;
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
            }
            $this->saveIndex($metaname.'_i', '', $meta_idx);
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
        }

        return true;
    }
534
535
    /**
     * Clear the whole index
     *
     * Deletes the page, title, pageword, metadata and lengths index files as
     * well as every *.idx file in the index directory whose name starts with
     * 'i' or 'w' or ends in '_w.idx', '_i.idx' or '_p.idx'.
     *
     * @return bool true once the files were processed, false if the index could not be locked
     */
    public function clear() {
        global $conf;

        if (!$this->lock()) return false;

        // NOTE(review): all unlink() calls here suppress errors with @ and their
        // results are not checked (flagged by static analysis as an unhandled error
        // condition), so true is returned even if a file could not be removed.
        @unlink($conf['indexdir'].'/page.idx');
        @unlink($conf['indexdir'].'/title.idx');
        @unlink($conf['indexdir'].'/pageword.idx');
        @unlink($conf['indexdir'].'/metadata.idx');
        $dir = @opendir($conf['indexdir']);
        if($dir!==false){
            while(($f = readdir($dir)) !== false){
                if(substr($f,-4)=='.idx' &&
                    (substr($f,0,1)=='i' || substr($f,0,1)=='w'
                        || substr($f,-6)=='_w.idx' || substr($f,-6)=='_i.idx' || substr($f,-6)=='_p.idx'))
                    @unlink($conf['indexdir']."/$f");
            }
            // release the directory handle
            closedir($dir);
        }
        @unlink($conf['indexdir'].'/lengths.idx');

        // clear the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }
566
567
    /**
     * Split the text into words for fulltext search
     *
     * The text is split on spaces, lowercased, and tokens that are too short
     * (non-numeric and shorter than IDX_MINWORDLENGTH) or that are stopwords
     * are dropped.
     *
     * TODO: does this also need &$stopwords ?
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEXER_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris
     * @author Andreas Gohr
     */
    public function tokenizer($text, $wc=false) {
        // when wildcards are allowed, '*' is left out of the extra characters given to utf8_stripspecials()
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        // prepare the text to be tokenized
        $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                // handle asian chars as single words (may fail on older PHP version)
                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                if (!is_null($asia)) $text = $asia; // recover from regexp failure
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
                       array(
                           "\r" => ' ',
                           "\n" => ' ',
                           "\t" => ' ',
                           "\xC2\xAD" => '', //soft-hyphen
                       )
                     );
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);

        $wordlist = explode(' ', $text);
        foreach ($wordlist as $i => $word) {
            // plain ASCII words can use the cheaper strtolower()
            $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                utf8_strtolower($word) : strtolower($word);
        }

        foreach ($wordlist as $i => $word) {
            if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
              || array_search($word, $stopwords, true) !== false)
                unset($wordlist[$i]);
        }
        // reindex so the result is a plain list
        return array_values($wordlist);
    }
623
624
    /**
     * Get the numeric PID of a page
     *
     * Cached PIDs are returned without locking; otherwise the index is locked
     * while the PID is looked up via getPIDNoLock().
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error (including a failed lock)
     */
    public function getPID($page) {
        // return PID without locking when it is in the cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];

        if (!$this->lock())
            return false;

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return $pid;
    }
647
648
    /**
     * Get the numeric PID of a page without locking the index.
     * Only use this function when the index is already locked.
     *
     * The result of addIndexKey() is remembered in a small cache keyed by page name.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    protected function getPIDNoLock($page) {
        // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
        $pid = $this->addIndexKey('page', '', $page);
        // keep the cache small: once it holds more than 10 entries the oldest one is discarded, as in
        // DokuWiki usually only the most recently added item will be requested again.
        // array_shift() is avoided here because it renumbers integer-like keys, which would
        // associate purely numeric page names with the wrong PID.
        if (count($this->pidCache) > 10) {
            reset($this->pidCache);
            unset($this->pidCache[key($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }
665
666
    /**
     * Get the page id of a numeric PID
     *
     * Reads the entry for the given PID from the 'page' index.
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid) {
        return $this->getIndexKey('page', '', $pid);
    }
675
676
    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     * Pages for which page_exists() reports false are left out.
     *
     * @param array  $tokens list of words to search for (passed by reference)
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris
     * @author Andreas Gohr
     */
    public function lookup(&$tokens) {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();
        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->getIndex('i', $wlen);
            foreach($wids[$wlen] as $ixid) {
                if ($ixid < count($index))
                    $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }
        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // handle the case when ($ixid < count($index)) has been false
                // and thus $docs[$wid] hasn't been set.
                if (!isset($docs[$wid])) continue;
                $hits = &$docs[$wid];
                foreach ($hits as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    // sum up the counts when several index words matched the same token
                    if (!isset($final[$word][$hitkey]))
                        $final[$word][$hitkey] = $hitcnt;
                    else
                        $final[$word][$hitkey] += $hitcnt;
                }
            }
        }
        return $final;
    }
728
729
    /**
     * Look up the pages that carry the given value(s) under a metadata key.
     *
     * Without a callback the values are matched as case-sensitive strings; a
     * leading and/or trailing '*' in a value acts as a wildcard. With a
     * callback, every indexed word is tested by calling $func($value, $word)
     * and a truthy return counts as a match. preg_match can be passed
     * directly when the values are regexes.
     *
     * @param string       $key    name of the metadata key to look for
     * @param string|array $value  search term, or an array of search terms
     * @param callback     $func   comparison function (optional)
     * @return array               list of page names; if $value is an array,
     *                             a map of query value => list of page names
     *
     * @author Tom N Harris <[email protected]>
     * @author Michael Hamann <[email protected]>
     */
    public function lookupKey($key, &$value, $func=null) {
        $single = !is_array($value);
        if ($single) {
            $value_array = array($value);
        } else {
            $value_array =& $value;
        }

        $metaname = idx_cleanName($key);
        $is_title = ($key == 'title');

        // all indexed values for the key; the position in this list is the
        // value id (for titles the position is the page id)
        if ($is_title) {
            $words = $this->getIndex('title', '');
        } else {
            $words = $this->getIndex($metaname.'_w', '');
        }

        // index position => list of query values that matched at that position
        $value_ids = array();

        foreach ($value_array as $val) {
            if ($func !== null) {
                // custom comparison: test every indexed word
                foreach ($words as $i => $word) {
                    if (call_user_func($func, $val, $word)) {
                        $value_ids[$i][] = $val;
                    }
                }
                continue;
            }

            // strip wildcards and remember which end stays anchored
            $term = $val;
            $anchor_start = '^';
            $anchor_end = '$';
            if (substr($term, 0, 1) == '*') {
                $term = substr($term, 1);
                $anchor_start = '';
            }
            if (substr($term, -1, 1) == '*') {
                $term = substr($term, 0, -1);
                $anchor_end = '';
            }

            if ($anchor_start && $anchor_end) {
                // no wildcard at all: strict lookup of the value as given
                $i = array_search($val, $words, true);
                if ($i !== false) {
                    $value_ids[$i][] = $val;
                }
                continue;
            }

            $pattern = '/'.$anchor_start.preg_quote($term, '/').$anchor_end.'/';
            foreach (array_keys(preg_grep($pattern, $words)) as $i) {
                $value_ids[$i][] = $val;
            }
        }

        unset($words); // free the used memory

        // every query value gets an entry, even when nothing matched
        $result = array();
        foreach ($value_array as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        if ($is_title) {
            // the title index is keyed by page id, so resolve names directly
            foreach ($value_ids as $pid => $matched) {
                $page = $page_idx[$pid];
                foreach ($matched as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // each line of the _i index holds tuples page_id*1:page2_id*1...
            // for one value; parseTuples() maps them to page name => count,
            // of which only the page names are needed
            $lines = $this->getIndex($metaname.'_i', '');
            foreach ($value_ids as $value_id => $matched) {
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($matched as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }

        return $single ? $result[$value] : $result;
    }
831
832
    /**
     * Resolve each search term to its word ID(s) in the word indexes.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both) acting as wildcard.
     * Exact terms shorter than IDX_MINWORDLENGTH are skipped unless numeric.
     * The $result parameter receives, per query term, the index locations
     * where it was found.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexWords(&$words, &$result) {
        $tokens = array();      // base word => array( [ query term , regexp|null ] ... )
        $tokenlength = array(); // base word length => array( base word ... )
        $tokenwild = array();   // base word => base word length (wildcard terms only)

        foreach ($words as $word) {
            $result[$word] = array();

            $base = $word;
            $len = wordlen($word);
            $prefix = '^';
            $suffix = '$';

            // a '*' on either end is a wildcard and does not count as length
            if (substr($base, 0, 1) == '*') {
                $base = substr($base, 1);
                $prefix = '';
                $len = $len - 1;
            }
            if (substr($base, -1, 1) == '*') {
                $base = substr($base, 0, -1);
                $suffix = '';
                $len = $len - 1;
            }
            $is_exact = ($prefix && $suffix);

            // too short exact terms are dropped, numbers are always kept
            if ($is_exact && $len < IDX_MINWORDLENGTH && !is_numeric($base)) {
                continue;
            }

            if (!isset($tokens[$base])) {
                $tokenlength[$len][] = $base;
            }
            if ($is_exact) {
                $tokens[$base][] = array($word, null);
            } else {
                $regex = '/'.$prefix.preg_quote($base, '/').$suffix.'/';
                $tokens[$base][] = array($word, $regex);
                if (!isset($tokenwild[$base])) {
                    $tokenwild[$base] = $len;
                }
            }
        }
        asort($tokenwild);

        // with wildcards every index from the shortest base word upwards is
        // of interest, otherwise only the indexes of the exact lengths
        $any_wild = !empty($tokenwild);
        if ($any_wild) {
            $length_filter = min(array_keys($tokenlength));
        } else {
            $length_filter = $tokenlength;
        }
        $indexes_known = $this->indexLengths($length_filter);
        if ($any_wild) {
            sort($indexes_known);
        }

        // collect the word IDs per index length
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);

            // exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $base) {
                    $wid = array_search($base, $word_idx, true);
                    if ($wid === false) {
                        continue;
                    }
                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$base] as $entry) {
                        $result[$entry[0]][] = "$ixlen*$wid";
                    }
                }
            }

            // wildcard search; $tokenwild is sorted by length, so stop at the
            // first base word that is not shorter than this index length
            foreach ($tokenwild as $base => $len) {
                if ($len >= $ixlen) {
                    break;
                }
                foreach ($tokens[$base] as $entry) {
                    if ($entry[1] === null) {
                        continue;
                    }
                    foreach (array_keys(preg_grep($entry[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$entry[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }
917
918
    /**
     * Return a list of all pages
     * Warning: pages may not exist!
     *
     * Without a key the raw page index is returned. For the key 'title' only
     * pages with a non-empty title entry are listed. For any other key the
     * pages referenced in that key's index are listed.
     *
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     *
     * @author Tom N Harris <[email protected]>
     */
    public function getPages($key=null) {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) return $page_idx;

        $metaname = idx_cleanName($key);

        // Special handling for titles
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // pages beyond the end of the title index have no title
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title)
                if ($title === "") unset($page_idx[$i]);
            return array_values($page_idx);
        }

        $pages = array();
        $lines = $this->getIndex($metaname.'_i', '');
        foreach ($lines as $line) {
            // Array union instead of array_merge(): array_merge() renumbers
            // integer-like keys, which would turn a page named "123" into
            // key 0. Only the keys (page names) matter here.
            $pages += $this->parseTuples($page_idx, $line);
        }
        // numeric page names are stored as integer keys; hand back strings
        return array_map('strval', array_keys($pages));
    }
949
950
    /**
     * Return a list of words sorted by number of times used
     *
     * Depending on $key the frequencies come from the title index, from the
     * index of a metadata key, or from the fulltext word indexes.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     *
     * @author Tom N Harris <[email protected]>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null) {
        // normalise the thresholds; a $max of 0 means "no upper limit"
        $min = ($min < 1) ? 1 : $min;
        $max = ($max < $min) ? 0 : $max;

        $result = array();

        if ($key == 'title') {
            // each distinct title counts once per page that uses it
            $counts = array_count_values($this->getIndex('title', ''));
            foreach ($counts as $title => $count) {
                $wanted = ($count >= $min) && (!$max || $count <= $max);
                if ($wanted && strlen($title) >= $minlen) {
                    $result[$title] = $count;
                }
            }
        }
        elseif (!is_null($key)) {
            $metaname = idx_cleanName($key);

            // first pass: value id => frequency for values within the limits
            $frequencies = array();
            foreach ($this->getIndex($metaname.'_i', '') as $wid => $line) {
                $freq = $this->countTuples($line);
                $wanted = ($freq >= $min) && (!$max || $freq <= $max);
                if ($wanted) {
                    $frequencies[$wid] = $freq;
                }
            }

            // second pass: translate ids to values, loading the value list
            // only when something qualified
            if (!empty($frequencies)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($frequencies as $wid => $freq) {
                    if (strlen($words[$wid]) >= $minlen) {
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        }
        else {
            foreach (idx_listIndexLengths() as $length) {
                if ($length < $minlen) continue;

                $words = null; // word list of this length, loaded on first use
                foreach ($this->getIndex('i', $length) as $wid => $line) {
                    $freq = $this->countTuples($line);
                    $wanted = ($freq >= $min) && (!$max || $freq <= $max);
                    if (!$wanted) continue;
                    if ($words === null) {
                        $words = $this->getIndex('w', $length);
                    }
                    $result[$words[$wid]] = $freq;
                }
            }
        }

        arsort($result);
        return $result;
    }
1014
1015
    /**
     * Lock the indexer.
     *
     * The lock is a directory named _indexer.lock inside $conf['lockdir'];
     * creating it succeeds only while it does not exist yet. A lock
     * directory older than five minutes is treated as stale and removed.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @return bool|string true when the lock was taken, "stale lock removed"
     *                     when it was taken after clearing a stale lock,
     *                     false when the lock could not be obtained
     */
    protected function lock() {
        global $conf;
        $status = true;
        $run = 0;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock, $conf['dmode'])) {
            usleep(50);
            if (is_dir($lock) && time()-@filemtime($lock) > 60*5) {
                // looks like a stale lock - remove it
                if (!@rmdir($lock)) {
                    // removing the stale lock failed
                    return false;
                }
                $status = "stale lock removed";
            } elseif ($run++ == 1000) {
                // give up after about 1000 polls of 50 microseconds each
                // (roughly 50ms of sleeping, not counting the mkdir attempts)
                return false;
            }
        }
        if (!empty($conf['dperm'])) {
            chmod($lock, $conf['dperm']);
        }
        return $status;
    }
1047
1048
    /**
     * Release the indexer lock.
     *
     * Removes the _indexer.lock directory from $conf['lockdir']. A failing
     * removal is silenced and not reported.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @return bool always true
     */
    protected function unlock() {
        global $conf;
        $lockdir = $conf['lockdir'].'/_indexer.lock';
        @rmdir($lockdir);
        return true;
    }
1060
1061
    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into
     * multiple parts. Different index files should use different
     * base names.
     *
     * A missing or unreadable index file yields an empty list.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines without CR or LF
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndex($idx, $suffix) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return array();
        $lines = file($fn, FILE_IGNORE_NEW_LINES);
        // file() returns false when reading fails; callers iterate over and
        // count the result, so always hand back an array
        return ($lines === false) ? array() : $lines;
    }
1080
1081
    /**
     * Replace the contents of the index with an array.
     *
     * The lines are written to a temporary file which is then renamed over
     * the index file. The index is left untouched when writing fails.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param array     $lines  list of lines without LF
     * @return bool             If saving succeeded
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;

        // every line is terminated by LF; an empty index is an empty file
        $data = join("\n", $lines);
        if (!empty($lines))
            $data .= "\n";

        $written = fwrite($fh, $data);
        fclose($fh);
        // never replace the index with a truncated temporary file
        if ($written === false || $written < strlen($data)) return false;

        if (isset($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        // only an explicit false from io_rename() is treated as a failure
        if (io_rename($fn.'.tmp', $fn.'.idx') === false) return false;
        return true;
    }
1105
1106
    /**
     * Retrieve a line from the index.
     *
     * Lines are counted from 0. An empty string is returned when the index
     * file is missing, cannot be opened, or has no line with that number.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @return string           a line with trailing whitespace removed
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexKey($idx, $suffix, $id) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';

        // read forward until the wanted line; $line ends up false when the
        // file is exhausted before reaching it
        for ($ln = 0; ($line = fgets($fh)) !== false; $ln++) {
            if ($ln == $id) break;
        }
        fclose($fh);

        return rtrim((string)$line);
    }
1129
1130
    /**
     * Write a line into the index.
     *
     * The index is copied line by line into a temporary file, with line
     * number $id (counted from 0) replaced by $line. If the index is shorter
     * than $id, empty lines are added up to that position. The temporary
     * file is then renamed over the index file.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @param string    $line   line to write
     * @return bool             If saving succeeded
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;

        // the stored line always ends in LF
        if (substr($line, -1) != "\n") {
            $line .= "\n";
        }

        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $out = @fopen($fn.'.tmp', 'w');
        if (!$out) return false;

        $in = @fopen($fn.'.idx', 'r');
        if ($in) {
            // copy the existing index, swapping in the new line at $id
            $ln = -1;
            while (($curline = fgets($in)) !== false) {
                $ln++;
                fwrite($out, ($ln == $id) ? $line : $curline);
            }
            fclose($in);

            // $id lies past the last existing line: pad, then append
            if ($id > $ln) {
                for ($ln++; $id > $ln; $ln++) {
                    fwrite($out, "\n");
                }
                fwrite($out, $line);
            }
        } else {
            // no index yet: pad with empty lines up to $id, then the line
            for ($ln = 0; $id > $ln; $ln++) {
                fwrite($out, "\n");
            }
            fwrite($out, $line);
        }
        fclose($out);

        if (isset($conf['fperm'])) {
            chmod($fn.'.tmp', $conf['fperm']);
        }
        io_rename($fn.'.tmp', $fn.'.idx');
        return true;
    }
1172
1173
    /**
     * Retrieve or insert a value in the index.
     *
     * An existing value is found by strict comparison; a new value is
     * appended to the end of the index, which is then saved.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param string    $value  line to find in the index
     * @return int|bool          line number of the value in the index or false if writing the index failed
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function addIndexKey($idx, $suffix, $value) {
        $index = $this->getIndex($idx, $suffix);

        $id = array_search($value, $index, true);
        if ($id !== false) {
            return $id;
        }

        // unknown value: it becomes the new last line
        $id = count($index);
        $index[$id] = $value;
        if ($this->saveIndex($idx, $suffix, $index)) {
            return $id;
        }

        trigger_error("Failed to write $idx index", E_USER_ERROR);
        return false;
    }
1196
1197
    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Thin wrapper that delegates to the global idx_listIndexLengths().
     *
     * @author YoBoY <[email protected]>
     *
     * @return array
     */
    protected function listIndexLengths() {
        $lengths = idx_listIndexLengths();
        return $lengths;
    }
1210
1211
    /**
     * Get the word lengths that have been indexed.
     *
     * With an array filter, its keys are taken as candidate lengths and
     * those with an existing i<length>.idx file are returned. With a scalar
     * filter, all indexed lengths greater than or equal to it are returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @param array|int $filter
     * @return array
     */
    protected function indexLengths($filter) {
        global $conf;
        $idx = array();

        if (!is_array($filter)) {
            // keep all the lengths equal or superior to the filter
            $minimum = (int)$filter;
            foreach (idx_listIndexLengths() as $length) {
                if ((int)$length >= $minimum) {
                    $idx[] = $length;
                }
            }
            return $idx;
        }

        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach (array_keys($filter) as $length) {
            if (file_exists($path.$length.'.idx')) {
                $idx[] = $length;
            }
        }
        return $idx;
    }
1242
1243
    /**
     * Insert or replace a tuple in a line.
     *
     * Any existing "id*count" tuple for $id is removed from the
     * colon-separated line. With a non-zero $count the new tuple is put at
     * the front of the line; otherwise the id is simply dropped.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $line
     * @param string|int $id
     * @param int    $count
     * @return string
     */
    protected function updateTuple($line, $id, $count) {
        if ($line != '') {
            $pattern = '/(^|:)'.preg_quote($id,'/').'\*\d*/';
            $line = preg_replace($pattern, '', $line);
        }
        // removing the first tuple can leave a leading separator behind
        $line = trim($line, ':');

        if (!$count) {
            return $line;
        }

        $tuple = "$id*$count";
        return $line ? $tuple.':'.$line : $tuple;
    }
1267
1268
    /**
     * Split a line into an array of tuples.
     *
     * The line holds colon-separated "id*count" tuples. Each id is
     * translated through $keys. Tuples with a missing or zero count, with an
     * id that is not in $keys, or whose entry in $keys is empty are skipped.
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     *
     * @param array $keys   list to translate the ids with (id => name)
     * @param string $line
     * @return array        name => count
     */
    protected function parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') continue;

            $fields = explode('*', $tuple);
            // malformed tuple without a count, or a count of zero
            if (!isset($fields[1]) || !$fields[1]) continue;
            list($id, $cnt) = $fields;

            // id without an entry in the key list
            if (!isset($keys[$id])) continue;
            $name = $keys[$id];
            // Skip blank entries. This is compared strictly so that a name
            // of "0" is kept; a plain truthiness test would drop it.
            if ($name === '' || $name === false) continue;

            $result[$name] = $cnt;
        }
        return $result;
    }
1292
1293
    /**
     * Sum the counts in a list of tuples.
     *
     * The line holds colon-separated "id*count" tuples; the ids are ignored
     * and the counts are added up as integers.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $line
     * @return int
     */
    protected function countTuples($line) {
        $freq = 0;
        foreach (explode(':', $line) as $tuple) {
            if ($tuple === '') continue;
            $fields = explode('*', $tuple);
            // first field is the id, second the count
            $freq += (int)$fields[1];
        }
        return $freq;
    }
1311
}
1312
1313
/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and kept in a static variable,
 * so later calls return the same object.
 *
 * @return Doku_Indexer    a Doku_Indexer
 *
 * @author Tom N Harris <[email protected]>
 */
function idx_get_indexer() {
    static $Indexer;
    if (isset($Indexer)) {
        return $Indexer;
    }
    $Indexer = new Doku_Indexer();
    return $Indexer;
}
1327
1328
/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<lang>/stopwords.txt below DOKU_INC,
 * using the language set in $conf['lang'], and kept in a static variable.
 * If that file does not exist the list is empty. The list is returned by
 * reference.
 *
 * @return array                list of stop words
 *
 * @author Tom N Harris <[email protected]>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if ($stopwords === null) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        $stopwords = file_exists($swfile)
            ? file($swfile, FILE_IGNORE_NEW_LINES)
            : array();
    }
    return $stopwords;
}
1348
1349
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * A page that no longer exists, or whose 'internal index' metadata is
 * false, is removed from the index instead. Indexing is skipped when the
 * page's .indexed tag file carries the current index version and is newer
 * than the page, unless $force is set. The INDEXER_PAGE_ADD event lets
 * handlers change what gets indexed.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <[email protected]>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    // NOTE: the local variable names in this function are deliberately left
    // alone, because compact() and extract() below work on them by name.
    $idxtag = metaFN($page,'.indexed');

    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed: the tag must carry the current index
    // version and be newer than the page itself
    if (!$force && file_exists($idxtag) && trim(io_readFile($idxtag)) == idx_get_version()) {
        $last = @filemtime($idxtag);
        if ($last > @filemtime(wikiFN($page))) {
            if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
            return false;
        }
    }

    // indexing switched off for this page: drop it from the index
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }

    // collect what is to be indexed
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = ($references !== null) ? array_keys($references) : array();

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = ($media !== null) ? array_keys($media) : array();

    // let event handlers adjust the data; the raw page text is appended to
    // the body unless a handler prevented the default action
    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] .= " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // remember the index version this page was indexed with
    if ($result) {
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    }

    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}
1454
1455
/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one. Delegates to the lookup() method of the shared
 * indexer instance.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    return idx_get_indexer()->lookup($words);
}
1471
1472
/**
 * Split a string into tokens
 *
 * Delegates to the tokenizer() method of the shared indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}
1484
1485
/* For compatibility */
1486
1487
/**
 * Read the list of words in an index (if it exists).
 *
 * Unlike Doku_Indexer::getIndex() the returned lines keep their line
 * endings. A missing or unreadable index file yields an empty list.
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();
    $lines = file($fn);
    // file() returns false when reading fails; always hand back an array
    return ($lines === false) ? array() : $lines;
}
1502
1503
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The lengths are taken from the i<number>.idx file names in
 * $conf['indexdir']. When $conf['readdircache'] is non-zero the list is
 * cached in lengths.idx and reused for that many seconds.
 *
 * @author YoBoY <[email protected]>
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // a readdircache of 0 switches the cache file off
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
        && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            // the cache is still fresh, use it if it can be read
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // scan the index directory for i<number>.idx files
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file; caching is best effort, but a handle that could
    // not be opened must not be passed on to fwrite()/fclose()
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            fclose($handle);
        }
    }
    return $idx;
}
1561
1562
/**
 * Get the word lengths that have been indexed.
 *
 * With an array filter, its keys are taken as candidate lengths and those
 * with an existing i<length>.idx file are returned. With a scalar filter,
 * all indexed lengths greater than or equal to it are returned.
 *
 * @author YoBoY <[email protected]>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();

    if (!is_array($filter)) {
        // keep all the lengths equal or superior to the filter
        $minimum = (int)$filter;
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length >= $minimum) {
                $idx[] = $length;
            }
        }
        return $idx;
    }

    // testing if index files exist only
    $path = $conf['indexdir']."/i";
    foreach (array_keys($filter) as $length) {
        if (file_exists($path.$length.'.idx')) {
            $idx[] = $length;
        }
    }
    return $idx;
}
1593
1594
/**
 * Clean a name of a key for use as a file name.
 *
 * The trimmed name is passed through utf8_romanize(), runs of space, '.',
 * '/', ':' and '-' are replaced by a single underscore, everything that is
 * not an ASCII letter, digit or underscore is removed, and the result is
 * lowercased.
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $romanized = utf8_romanize(trim((string)$name));
    // NOTE(review): the "\\:" in this pattern reaches PCRE as an escaped
    // colon, so a backslash is not part of the class; backslashes are
    // removed by the second pattern instead of becoming '_' - confirm
    // whether that is intended.
    $separated = preg_replace('#[ \./\\:-]+#', '_', $romanized);
    $stripped = preg_replace('/[^A-Za-z0-9_]/', '', $separated);
    return strtolower($stripped);
}
1611
1612
//Setup VIM: ex: et ts=4 :
1613