FulltextIndex::deletePageWords() - Code Metrics - Inspection of "Refactor fulltext search functions and class Doku_..." - splitbrain/dokuwiki - Measure and Improve Code Quality continuously with Scrutinizer

Failed Conditions

Pull Request — master (#2943)

by Andreas

created 2020-09-10 07:56 UTC

FulltextIndex::deletePageWords() B

↳ Parent: FulltextIndex

Complexity

Conditions	11
Paths	8

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	11
nc	8
nop	2
dl	0
loc	38
rs	7.3166
c	0
b	0
f	0

How to fix Complexity

<?php

namespace dokuwiki\Search;

use dokuwiki\Search\Tokenizer;
use dokuwiki\Utf8;

/**
 * Class DokuWiki Fulltext Index (Singleton)
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <[email protected]>
 * @author Tom N Harris <[email protected]>
 */
class FulltextIndex extends AbstractIndex
{
    /** @var FulltextIndex $instance */
    protected static $instance = null;

    /**
     * Return the shared FulltextIndex object, creating it on first use
     *
     * The instance is created with late static binding, i.e. from the class
     * the method was called on, and kept in the static $instance property.
     *
     * @return FulltextIndex
     */
    public static function getInstance()
    {
        if (static::$instance !== null) {
            return static::$instance;
        }

        static::$instance = new static();
        return static::$instance;
    }

    /**
     * Compute the "length" of a word as used to pick its index file
     *
     * The result is the byte length of the string plus an extra amount for
     * every byte in the range 0xE2-0xEF. Without that correction all chinese
     * "words" would end up in w3.idx, so their "length" is faked.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $w
     * @return int
     */
    public function wordlen($w)
    {
        $length = strlen($w);

        $found = preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes);
        if (!$found) {
            // no matching byte (or the match failed): plain byte length
            return $length;
        }

        // every matched byte adds between 1 (0xE2) and 14 (0xEF)
        foreach ($leadbytes[0] as $byte) {
            $length += ord($byte) - 0xE1;
        }
        return $length;
    }

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * The index lock is only released by this method when it has been
     * acquired here, i.e. when $requireLock is true. This also holds for the
     * error paths, so a lock owned by the caller is never released.
     *
     * @param string $page   a page name
     * @param string $text   the body of the page
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
     * @return bool  if the function completed successfully
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function addPageWords($page, $text, $requireLock = true)
    {
        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock && !$this->lock()) return false;

        // get word usage in page: array(wordlen => array(word id => frequency))
        $words = $this->getPageWords($text);
        if ($words === false) {
            if ($requireLock) $this->unlock();
            return false;
        }

        // store the frequency of every word for this page and remember
        // which "length*id" entries the page uses now
        $pagewords = array();
        foreach ($words as $wlen => $wordfreqs) {
            $index = $this->getIndex('i', $wlen);
            foreach ($wordfreqs as $wid => $freq) {
                $idx = ($wid < count($index)) ? $index[$wid] : '';
                $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                $pagewords[] = "{$wlen}*{$wid}";
            }
            if (!$this->saveIndex('i', $wlen, $index)) {
                if ($requireLock) $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries: words that were recorded for the
        // page before but are no longer part of it
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':', $pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $upwords[$wlen][] = (int)$wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a missing line is treated as empty, like in the loop above
                    $idx = isset($index[$wid]) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, 0);
                }
                // NOTE(review): the result of this save is not checked,
                // unlike the save in the loop above
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // Save the reverse index
        $pageword_idx = implode(':', $pagewords);
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, $pageword_idx);

        if ($requireLock) $this->unlock();
        return $result;
    }

    /**
     * Split the words in a page and add them to the index
     *
     * The text is tokenized, the tokens are grouped by their word length
     * (see wordlen()) and every token is looked up in the word index of its
     * length. Unknown words are appended to that word index, which is saved
     * back only when it has actually been changed.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word ID => number of times used)),
     *                          false if a word index could not be saved
     *
     * @author Andreas Gohr <[email protected]>
     * @author Christopher Smith <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    protected function getPageWords($text)
    {
        $Tokenizer = Tokenizer::getInstance();
        $tokens = $Tokenizer->getWords($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group by word length; the keys of $tokens are unique, so every
        // (length, word) pair is assigned exactly once
        $words = array();
        foreach ($tokens as $w => $c) {
            // numeric tokens come back from array_count_values() as integer keys
            $words[$this->wordlen((string)$w)][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach ($words as $wlen => $wordfreqs) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length, so unchanged word indexes are not rewritten
            $word_idx_modified = false;
            foreach ($wordfreqs as $word => $freq) {
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: append it, its ID is its position in the index
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }

    /**
     * Delete the contents of a page from the fulltext index
     *
     * Removes the page from the index line of every word recorded for it in
     * the 'pageword' reverse index and then empties the reverse index entry.
     * A lock acquired here is released on every path, including failure.
     *
     * @param string $page   a page name
     * @param bool   $requireLock  should be false only if the caller is responsible for index lock
     * @return bool  true if the reverse index entry has been cleared, false on error
     *
     * @author Tom N Harris <[email protected]>
     * @author Satoshi Sahara <[email protected]>
     */
    public function deletePageWords($page, $requireLock = true)
    {
        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock && !$this->lock()) return false;

        // remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            // entries have the form "wordlen*wordid", collect the IDs per length
            $upwords = array();
            foreach (explode(':', $pageword_idx) as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $upwords[$wlen][] = (int)$wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a missing line is treated as empty instead of reading an undefined offset
                    $idx = isset($index[$wid]) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, 0);
                }
                // NOTE(review): the result of this save is not checked
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // save the (now empty) reverse index; keep the outcome instead of
        // returning early so that the lock below is always released
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, '');

        if ($requireLock) $this->unlock();
        return $result;
    }

    /**
     * Find pages in the fulltext index containing the words
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard.
     *
     * The returned array is keyed by the original tokens. Each value is an
     * array with page names as keys and the number of times that token
     * appears on the page as value. Pages that do not exist any more are
     * left out.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function lookupWords(&$tokens)
    {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) {
            return array();
        }

        // load known words and documents
        $pageIndex = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $idList) {
            $index = $this->getIndex('i', $wlen);
            $lineCount = count($index);
            foreach (array_unique($idList) as $ixid) {
                if ($ixid >= $lineCount) {
                    continue;
                }
                $docs["{$wlen}*{$ixid}"] = $this->parseTuples($pageIndex, $index[$ixid]);
            }
        }

        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $locations) {
            $pages = array();
            foreach ($locations as $location) {
                // locations beyond the end of their index have no entry in $docs
                if (!isset($docs[$location])) {
                    continue;
                }
                foreach ($docs[$location] as $pageName => $count) {
                    // make sure the document still exists
                    if (!page_exists($pageName, '', false)) {
                        continue;
                    }
                    if (isset($pages[$pageName])) {
                        $pages[$pageName] += $count;
                    } else {
                        $pages[$pageName] = $count;
                    }
                }
            }
            $final[$word] = $pages;
        }
        return $final;
    }

    /**
     * Find the index ID of each search term
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * Every query term gets an (initially empty) entry in $result. Terms
     * without wildcard are looked up by exact match in the word index of
     * their own length. Wildcard terms are additionally matched by regular
     * expression against the word indexes of all greater lengths.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexWords(&$words, &$result)
    {
        $Tokenizer = Tokenizer::getInstance();

        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            // regexp anchors; an anchor is dropped when there is a wildcard on that side
            $caret = '^';
            $dollar = '$';
            $xword = $word;
            $wlen = $this->wordlen($word);

            // check for wildcards
            // the '*' is stripped from the base word and not counted in its length
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = '';
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = '';
                $wlen -= 1;
            }
            // skip terms that are too short, unless they have a wildcard or are numeric
            if ($wlen < $Tokenizer->getMinWordLength()
                && $caret && $dollar && !is_numeric($xword)
            ) {
                continue;
            }
            // register each base word only once per length
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }
            if (!$caret || !$dollar) {
                // wildcard term: remember the regexp to match index words against
                $re = $caret.preg_quote($xword, '/').$dollar;
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword])) {
                    $tokenwild[$xword] = $wlen;
                }
            } else {
                // exact term: no regexp needed
                $tokens[$xword][] = array($word, null);
            }
        }
        // shortest base words first, the wildcard loop below relies on this order
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => array( base word ... ) ... )
        // $tokenwild = array( base word => base word length ... )
        // without wildcards only the indexes of the used lengths are needed (array filter),
        // with wildcards all indexes from the shortest base word length upwards (int filter)
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->getIndexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);
        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        // the location counts for every query term sharing this base word
                        foreach ($tokens[$xword] as $w)
                            $result[$w[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // only indexes of words longer than the base word are searched;
                // $tokenwild is sorted by length, so all remaining entries can be skipped too
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    // entries without regexp are exact terms, handled above
                    if (is_null($w[1])) continue;
                    foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Get the word lengths that have been indexed
     *
     * With an array as filter, the array keys are taken as word lengths and
     * only those are returned for which an index file i<length>.idx exists
     * in the index directory. With any other filter value, all indexed
     * lengths that are greater than or equal to the filter are returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @param array|int $filter
     * @return array
     */
    public function getIndexLengths($filter)
    {
        global $conf;
        $found = array();

        if (!is_array($filter)) {
            // keep all the values equal or superior
            foreach ($this->listIndexLengths() as $length) {
                if ((int)$length < (int)$filter) {
                    continue;
                }
                $found[] = $length;
            }
            return $found;
        }

        // testing if index files exist only
        $prefix = $conf['indexdir'].'/i';
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $found[] = $length;
            }
        }
        return $found;
    }

    /**
     * Get the list of lengths indexed in the wiki
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * The cache file lengths.idx is only used when $conf['readdircache'] is
     * not 0; it is considered fresh for that many seconds after its last
     * modification. Writing the cache file is best effort: a failure to
     * write it does not affect the returned list.
     *
     * @author YoBoY <[email protected]>
     *
     * @return array
     */
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // testing what we have to do, create a cache file or not.
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    $idx = array();
                    foreach ($lengths as $length) {
                        $idx[] = (int)$length;
                    }
                    return $idx;
                }
            }
        }

        // no usable cache: scan the index directory for i<length>.idx files
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file
        if ($docache) {
            // one call that simply returns false when the file cannot be
            // written, instead of passing a failed fopen() handle to fwrite()
            @file_put_contents($lengthsFile, implode("\n", $idx));
        }
        return $idx;
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * Delegates to the histogram() method of the MetadataIndex singleton.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @return array            list of words as the keys and frequency as value
     *
     * @author Tom N Harris <[email protected]>
     */
    public function histogram($min=1, $max=0, $minlen=3)
    {
        // This file is in namespace dokuwiki\Search, so the unqualified name
        // refers to dokuwiki\Search\MetadataIndex. The former relative name
        // "Search\MetadataIndex" resolved to dokuwiki\Search\Search\MetadataIndex.
        return MetadataIndex::getInstance()->histogram($min, $max, $minlen);
    }

    /**
     * Clear the Fulltext Index
     *
     * Deletes the i<length>.idx and w<length>.idx files of every length
     * reported by listIndexLengths(), followed by lengths.idx and
     * pageword.idx. Errors while deleting are suppressed.
     *
     * @param bool   $requireLock  should be false only if the caller is resposible for index lock
     * @return bool  false if the lock could not be acquired, true otherwise
     */
    public function clear($requireLock = true)
    {
        global $conf;

        if ($requireLock) {
            if (!$this->lock()) {
                return false;
            }
        }

        $indexDir = $conf['indexdir'];

        // per-length index files
        foreach ($this->listIndexLengths() as $length) {
            foreach (array('/i', '/w') as $prefix) {
                @unlink($indexDir.$prefix.$length.'.idx');
            }
        }
        // length cache and reverse index
        foreach (array('/lengths.idx', '/pageword.idx') as $file) {
            @unlink($indexDir.$file);
        }

        if ($requireLock) {
            $this->unlock();
        }
        return true;
    }
}


1			<?php
2
3			namespace dokuwiki\Search;
4
5			use dokuwiki\Search\Tokenizer;
6			use dokuwiki\Utf8;
7
8			/**
9			* Class DokuWiki Fulltext Index (Singleton)
10			*
11			* @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
12			* @author Andreas Gohr <[email protected]>
13			* @author Tom N Harris <[email protected]>
14			*/
15			class FulltextIndex extends AbstractIndex
16			{
17			/** @var FulltextIndex $instance */
18			protected static $instance = null;
19
20			/**
21			* Get new or existing singleton instance of the FulltextIndex
22			*
23			* @return FulltextIndex
24			*/
25			public static function getInstance()
26			{
27			if (is_null(static::$instance)) {
28			static::$instance = new static();
29			}
30			return static::$instance;
31			}
32
33			/**
34			* Measure the length of a string
35			* Differs from strlen in handling of asian characters.
36			*
37			* @author Tom N Harris <[email protected]>
38			*
39			* @param string $w
40			* @return int
41			*/
42			public function wordlen($w)
43			{
44			$l = strlen($w);
45			// If left alone, all chinese "words" will get put into w3.idx
46			// So the "length" of a "word" is faked
47			if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
48			foreach ($leadbytes[0] as $b) {
49			$l += ord($b) - 0xE1;
50			}
51			}
52			return $l;
53			}
54
55			/**
56			* Adds the contents of a page to the fulltext index
57			*
58			* The added text replaces previous words for the same page.
59			* An empty value erases the page.
60			*
61			* @param string $page a page name
62			* @param string $text the body of the page
63			* @param bool $requireLock should be false only if the caller is resposible for index lock
64			* @return bool if the function completed successfully
65			*
66			* @author Tom N Harris <[email protected]>
67			* @author Andreas Gohr <[email protected]>
68			*/
69			public function addPageWords($page, $text, $requireLock = true)
70			{
71			// load known documents
72			$pid = $this->getPID($page);
73			if ($pid === false) {
74			return false;
75			}
76
77			if ($requireLock && !$this->lock()) return false;
78
79			$pagewords = array();
80			// get word usage in page
81			$words = $this->getPageWords($text);
82			if ($words === false) {
83			$this->unlock();
84			return false;
85			}
86
87			if (!empty($words)) {
88			foreach (array_keys($words) as $wlen) {
89			$index = $this->getIndex('i', $wlen);
90			foreach ($words[$wlen] as $wid => $freq) {
91			$idx = ($wid < count($index)) ? $index[$wid] : '';
92			$index[$wid] = $this->updateTuple($idx, $pid, $freq);
93			$pagewords[] = "{$wlen}*{$wid}";
94			}
95			if (!$this->saveIndex('i', $wlen, $index)) {
96			$this->unlock();
97			return false;
98			}
99			}
100			}
101
102			// Remove obsolete index entries
103			$pageword_idx = $this->getIndexKey('pageword', '', $pid);
104			if ($pageword_idx !== '') {
105			$oldwords = explode(':',$pageword_idx);
106			$delwords = array_diff($oldwords, $pagewords);
107			$upwords = array();
108			foreach ($delwords as $word) {
109			if ($word != '') {
110			list($wlen, $wid) = explode('*', $word);
111			$wid = (int)$wid;
112			$upwords[$wlen][] = $wid;
113			}
114			}
115			foreach ($upwords as $wlen => $widx) {
116			$index = $this->getIndex('i', $wlen);
117			foreach ($widx as $wid) {
118			$index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
119			}
120			$this->saveIndex('i', $wlen, $index);
121			}
122			}
123			// Save the reverse index
124			$pageword_idx = implode(':', $pagewords);
125			if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
126			$result = false;
127			} else {
128			$result = true;
129			}
130
131			if ($requireLock) $this->unlock();
132			return $result;
133			}
134
135			/**
136			* Split the words in a page and add them to the index
137			*
138			* @param string $text content of the page
139			* @return array list of word IDs and number of times used
140			*
141			* @author Andreas Gohr <[email protected]>
142			* @author Christopher Smith <[email protected]>
143			* @author Tom N Harris <[email protected]>
144			*/
145			protected function getPageWords($text)
146			{
147			$Tokenizer = Tokenizer::getInstance();
148			$tokens = $Tokenizer->getWords($text);
149			$tokens = array_count_values($tokens); // count the frequency of each token
150
151			$words = array();
152			foreach ($tokens as $w => $c) {
153			$l = $this->wordlen($w);
154			if (isset($words[$l])) {
155			$words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
156			} else {
157			$words[$l] = array($w => $c);
158			}
159			}
160
161			// arrive here with $words = array(wordlen => array(word => frequency))
162			$word_idx_modified = false;
163			$index = array(); //resulting index
164			foreach (array_keys($words) as $wlen) {
165			$word_idx = $this->getIndex('w', $wlen);
166			foreach ($words[$wlen] as $word => $freq) {
167			$word = (string)$word;
168			$wid = array_search($word, $word_idx, true);
169			if ($wid === false) {
170			$wid = count($word_idx);
171			$word_idx[] = $word;
172			$word_idx_modified = true;
173			}
174			if (!isset($index[$wlen])) {
175			$index[$wlen] = array();
176			}
177			$index[$wlen][$wid] = $freq;
178			}
179			// save back the word index
180			if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
181			return false;
182			}
183			}
184
185			return $index;
186			}
187
188			/**
189			* Delete the contents of a page to the fulltext index
190			*
191			* @param string $page a page name
192			* @param bool $requireLock should be false only if the caller is resposible for index lock
193			* @return bool If renaming the value has been successful, false on error
194			*
195			* @author Tom N Harris <[email protected]>
196			* @author Satoshi Sahara <[email protected]>
197			*/
198			public function deletePageWords($page, $requireLock = true)
199			{
200			// load known documents
201			$pid = $this->getPID($page);
202			if ($pid === false) {
203			return false;
204			}
205
206			if ($requireLock && !$this->lock()) return false;
207
208			// remove obsolete index entries
209			$pageword_idx = $this->getIndexKey('pageword', '', $pid);
210			if ($pageword_idx !== '') {
211			$delwords = explode(':', $pageword_idx);
212			$upwords = array();
213			foreach ($delwords as $word) {
214			if ($word != '') {
215			list($wlen, $wid) = explode('*', $word);
216			$wid = (int)$wid;
217			$upwords[$wlen][] = $wid;
218			}
219			}
220			foreach ($upwords as $wlen => $widx) {
221			$index = $this->getIndex('i', $wlen);
222			foreach ($widx as $wid) {
223			$index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
224			}
225			$this->saveIndex('i', $wlen, $index);
226			}
227			}
228			// save the reverse index
229			if (!$this->saveIndexKey('pageword', '', $pid, '')) {
230			return false;
231			}
232
233			if ($requireLock) $this->unlock();
234			return true;
235			}
236
237			/**
238			* Find pages in the fulltext index containing the words,
239			*
240			* The search words must be pre-tokenized, meaning only letters and
241			* numbers with an optional wildcard
242			*
243			* The returned array will have the original tokens as key. The values
244			* in the returned list is an array with the page names as keys and the
245			* number of times that token appears on the page as value.
246			*
247			* @param array $tokens list of words to search for
248			* @return array list of page names with usage counts
249			*
250			* @author Tom N Harris <[email protected]>
251			* @author Andreas Gohr <[email protected]>
252			*/
253			public function lookupWords(&$tokens)
254			{
255			$result = array();
256			$wids = $this->getIndexWords($tokens, $result);
257			if (empty($wids)) return array();
258			// load known words and documents
259			$page_idx = $this->getIndex('page', '');
260			$docs = array();
261			foreach (array_keys($wids) as $wlen) {
262			$wids[$wlen] = array_unique($wids[$wlen]);
263			$index = $this->getIndex('i', $wlen);
264			foreach ($wids[$wlen] as $ixid) {
265			if ($ixid < count($index)) {
266			$docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
267			}
268			}
269			}
270			// merge found pages into final result array
271			$final = array();
272			foreach ($result as $word => $res) {
273			$final[$word] = array();
274			foreach ($res as $wid) {
275			// handle the case when ($ixid < count($index)) has been false
276			// and thus $docs[$wid] hasn't been set.
277			if (!isset($docs[$wid])) continue;
278			$hits =& $docs[$wid];
279			foreach ($hits as $hitkey => $hitcnt) {
280			// make sure the document still exists
281			if (!page_exists($hitkey, '', false)) continue;
282			if (!isset($final[$word][$hitkey])) {
283			$final[$word][$hitkey] = $hitcnt;
284			} else {
285			$final[$word][$hitkey] += $hitcnt;
286			}
287			}
288			}
289			}
290			return $final;
291			}
292
293			/**
294			* Find the index ID of each search term
295			*
296			* The query terms should only contain valid characters, with a '*' at
297			* either the beginning or end of the word (or both).
298			* The $result parameter can be used to merge the index locations with
299			* the appropriate query term.
300			*
301			* @param array $words The query terms.
302			* @param array $result Set to word => array("length*id" ...)
303			* @return array Set to length => array(id ...)
304			*
305			* @author Tom N Harris <[email protected]>
306			*/
307			protected function getIndexWords(&$words, &$result)
308			{
309			$Tokenizer = Tokenizer::getInstance();
310
311			$tokens = array();
312			$tokenlength = array();
313			$tokenwild = array();
314			foreach ($words as $word) {
315			$result[$word] = array();
316			$caret = '^';
317			$dollar = '$';
318			$xword = $word;
319			$wlen = $this->wordlen($word);
320
321			// check for wildcards
322			if (substr($xword, 0, 1) == '*') {
323			$xword = substr($xword, 1);
324			$caret = '';
325			$wlen -= 1;
326			}
327			if (substr($xword, -1, 1) == '*') {
328			$xword = substr($xword, 0, -1);
329			$dollar = '';
330			$wlen -= 1;
331			}
332			if ($wlen < $Tokenizer->getMinWordLength()
333			&& $caret && $dollar && !is_numeric($xword)
334			) {
335			continue;
336			}
337			if (!isset($tokens[$xword])) {
338			$tokenlength[$wlen][] = $xword;
339			}
340			if (!$caret || !$dollar) {
341			$re = $caret.preg_quote($xword, '/').$dollar;
342			$tokens[$xword][] = array($word, '/'.$re.'/');
343			if (!isset($tokenwild[$xword])) {
344			$tokenwild[$xword] = $wlen;
345			}
346			} else {
347			$tokens[$xword][] = array($word, null);
348			}
349			}
350			asort($tokenwild);
351			// $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
352			// $tokenlength = array( base word length => base word ... )
353			// $tokenwild = array( base word => base word length ... )
354			$length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
355			$indexes_known = $this->getIndexLengths($length_filter);
356			if (!empty($tokenwild)) sort($indexes_known);
357			// get word IDs
358			$wids = array();
359			foreach ($indexes_known as $ixlen) {
360			$word_idx = $this->getIndex('w', $ixlen);
361			// handle exact search
362			if (isset($tokenlength[$ixlen])) {
363			foreach ($tokenlength[$ixlen] as $xword) {
364			$wid = array_search($xword, $word_idx, true);
365			if ($wid !== false) {
366			$wids[$ixlen][] = $wid;
367			foreach ($tokens[$xword] as $w)
368			$result[$w[0]][] = "{$ixlen}*{$wid}";
369			}
370			}
371			}
372			// handle wildcard search
373			foreach ($tokenwild as $xword => $wlen) {
374			if ($wlen >= $ixlen) break;
375			foreach ($tokens[$xword] as $w) {
376			if (is_null($w[1])) continue;
377			foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
378			$wids[$ixlen][] = $wid;
379			$result[$w[0]][] = "{$ixlen}*{$wid}";
380			}
381			}
382			}
383			}
384			return $wids;
385			}
386
387			/**
388			* Get the word lengths that have been indexed
389			*
390			* Reads the index directory and returns an array of lengths
391			* that there are indices for.
392			*
393			* @author YoBoY <[email protected]>
394			*
395			* @param array|int $filter
396			* @return array
397			*/
398			public function getIndexLengths($filter)
399			{
400			global $conf;
401			$idx = array();
402			if (is_array($filter)) {
403			// testing if index files exist only
404			$path = $conf['indexdir']."/i";
405			foreach ($filter as $key => $value) {
406			if (file_exists($path.$key.'.idx')) {
407			$idx[] = $key;
408			}
409			}
410			} else {
411			$lengths = $this->listIndexLengths();
412			foreach ($lengths as $key => $length) {
413			// keep all the values equal or superior
414			if ((int)$length >= (int)$filter) {
415			$idx[] = $length;
416			}
417			}
418			}
419			return $idx;
420			}
421
422			/**
423			* Get the list of lengths indexed in the wiki
424			*
425			* Read the index directory or a cache file and returns
426			* a sorted array of lengths of the words used in the wiki.
427			*
428			* @author YoBoY <[email protected]>
429			*
430			* @return array
431			*/
432			public function listIndexLengths()
433			{
434			global $conf;
435			$lengthsFile = $conf['indexdir'].'/lengths.idx';
436
437			// testing what we have to do, create a cache file or not.
438			if ($conf['readdircache'] == 0) {
439			$docache = false;
440			} else {
441			clearstatcache();
442			if (file_exists($lengthsFile)
443			&& (time() < @filemtime($lengthsFile) + $conf['readdircache'])
444			) {
445			$lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
446			if ($lengths !== false) {
447			$idx = array();
448			foreach ($lengths as $length) {
449			$idx[] = (int)$length;
450			}
451			return $idx;
452			}
453			}
454			$docache = true;
455			}
456
457			if ($conf['readdircache'] == 0 || $docache) {
458			$dir = @opendir($conf['indexdir']);
459			if ($dir === false) return array();
460			$idx = array();
461			while (($f = readdir($dir)) !== false) {
462			if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
463			$i = substr($f, 1, -4);
464			if (is_numeric($i)) $idx[] = (int)$i;
465			}
466			}
467			closedir($dir);
468			sort($idx);
469			// save this in a file
470			if ($docache) {
471			$handle = @fopen($lengthsFile, 'w');
472			@fwrite($handle, implode("\n", $idx));
473			@fclose($handle);
474			}
475			return $idx;
476			}
477			return array();
478			}
479
480			/**
481			* Return a list of words sorted by number of times used
482			*
483			* @param int $min bottom frequency threshold
484			* @param int $max upper frequency limit. No limit if $max<$min
485			* @param int $minlen minimum length of words to count
486			* @return array list of words as the keys and frequency as value
487			*
488			* @author Tom N Harris <[email protected]>
489			*/
490			public function histogram($min=1, $max=0, $minlen=3)
491			{
492			return Search\MetadataIndex::getInstance()->histogram($min, $max, $minlen);
493			}
494
495			/**
496			* Clear the Fulltext Index
497			*
498			* @param bool $requireLock should be false only if the caller is resposible for index lock
499			* @return bool If the index has been cleared successfully
500			*/
501			public function clear($requireLock = true)
502			{
503			global $conf;
504
505			if ($requireLock && !$this->lock()) return false;
506
507			$lengths = $this->listIndexLengths();
508			foreach ($lengths as $length) {
509			@unlink($conf['indexdir'].'/i'.$length.'.idx');
510			@unlink($conf['indexdir'].'/w'.$length.'.idx');
511			}
512			@unlink($conf['indexdir'].'/lengths.idx');
513			@unlink($conf['indexdir'].'/pageword.idx');
514
515			if ($requireLock) $this->unlock();
516			return true;
517			}
518			}
519

splitbrain / dokuwiki

Pull Request — master (#2943)

FulltextIndex::deletePageWords() B

Complexity

Size

Duplication

Importance

How to fix Complexity

Long Method

Duplication Side-by-Side

Filter issues like