FulltextIndex::getIndexWords() - Code Metrics - Inspection of "Refactor fulltext search functions and class Doku_..." - splitbrain/dokuwiki - Measure and Improve Code Quality continuously with Scrutinizer

Failed Conditions

Pull Request — master (#2943)

by Andreas

created 2020-09-10 15:40 UTC

FulltextIndex::getIndexWords() F

↳ Parent: FulltextIndex

Complexity

Conditions	24
Paths	580

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	24
nc	580
nop	2
dl	0
loc	79
rs	0.5833
c	0
b	0
f	0

How to fix Long Method Complexity

<?php

namespace dokuwiki\Search;

use dokuwiki\Search\Tokenizer;
use dokuwiki\Utf8;

/**
 * Class DokuWiki Fulltext Index (Singleton)
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <[email protected]>
 * @author Tom N Harris <[email protected]>
 */
class FulltextIndex extends AbstractIndex
{
    /** @var FulltextIndex $instance */
    protected static $instance = null;

    /**
     * Return the shared FulltextIndex object, creating it on first use
     *
     * The object is stored in the static $instance property and instantiated
     * through late static binding (new static()).
     *
     * @return FulltextIndex
     */
    public static function getInstance()
    {
        if (static::$instance !== null) {
            return static::$instance;
        }

        static::$instance = new static();
        return static::$instance;
    }

    /**
     * Compute the (faked) length of a word used to pick its index file
     *
     * Starts from the byte length and, for every byte in the range 0xE2-0xEF,
     * adds (byte value - 0xE1). Without this, all chinese "words" would end up
     * in w3.idx, so their "length" is spread out artificially.
     *
     * @author Tom N Harris <[email protected]>
     *
     * @param string $w the word to measure
     * @return int the byte length plus the lead byte bonus
     */
    public function wordlen($w)
    {
        $length = strlen($w);

        // nothing to adjust when no such lead byte is present (or matching failed)
        if (!preg_match_all('/[\xE2-\xEF]/', $w, $found)) {
            return $length;
        }

        foreach ($found[0] as $leadbyte) {
            $length += ord($leadbyte) - 0xE1;
        }
        return $length;
    }

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * The index lock is acquired and released by this method only when
     * $requireLock is true; a lock held by the caller is never released here,
     * not even on the error paths.
     *
     * @param string $page a page name
     * @param string $text the body of the page
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  if the function completed successfully
     *
     * @throws Exception\IndexLockException
     * @author Andreas Gohr <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function addPageWords($page, $text, $requireLock = true)
    {
        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock) $this->lock();

        // get word usage in page: array(wordlen => array(word id => frequency))
        $words = $this->getPageWords($text);
        if ($words === false) {
            // only release a lock that was acquired above
            if ($requireLock) $this->unlock();
            return false;
        }

        // write the new frequencies and remember every "length*id" used by the page
        $pagewords = array();
        foreach ($words as $wlen => $frequencies) {
            $index = $this->getIndex('i', $wlen);
            foreach ($frequencies as $wid => $freq) {
                $idx = ($wid < count($index)) ? $index[$wid] : '';
                $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                $pagewords[] = "{$wlen}*{$wid}";
            }
            if (!$this->saveIndex('i', $wlen, $index)) {
                if ($requireLock) $this->unlock();
                return false;
            }
        }

        // Remove obsolete index entries: words listed in the old reverse index
        // of the page that are no longer part of the page
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':', $pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a frequency of 0 is passed for words that left the page
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // Save the reverse index
        $pageword_idx = implode(':', $pagewords);
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, $pageword_idx);

        if ($requireLock) $this->unlock();
        return $result;
    }

    /**
     * Split the words in a page and add them to the index
     *
     * The text is tokenized, the tokens are counted and grouped by their
     * (faked) word length. Every word that is not yet part of the matching
     * 'w' index is appended to it; a 'w' index is written back only when
     * words were added to that particular index.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word ID => frequency)), false on errors
     *
     * @author Andreas Gohr <[email protected]>
     * @author Christopher Smith <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    protected function getPageWords($text)
    {
        $Tokenizer = Tokenizer::getInstance();
        $tokens = $Tokenizer->getWords($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group by word length; the keys of $tokens are unique already,
        // so a word can never be seen twice here
        $words = array();
        foreach ($tokens as $w => $c) {
            $words[$this->wordlen($w)][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach ($words as $wlen => $frequencies) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length, otherwise one new word would cause
            // all following word indexes to be rewritten as well
            $word_idx_modified = false;
            $index[$wlen] = array();
            foreach ($frequencies as $word => $freq) {
                // numeric words become integer keys, the index holds strings
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }

    /**
     * Delete the contents of a page from the fulltext index
     *
     * Every word listed in the reverse index ('pageword') of the page gets
     * its entry for the page updated with a frequency of 0, then the reverse
     * index entry of the page is emptied.
     *
     * The index lock is acquired and released by this method only when
     * $requireLock is true, on success as well as on failure.
     *
     * @param string $page a page name
     * @param bool $requireLock should be false only if the caller is responsible for index lock
     * @return bool  true if the reverse index has been emptied, false on error
     *
     * @throws Exception\IndexLockException
     * @author Satoshi Sahara <[email protected]>
     * @author Tom N Harris <[email protected]>
     */
    public function deletePageWords($page, $requireLock = true)
    {
        // load known documents
        $pid = $this->getPID($page);
        if ($pid === false) {
            return false;
        }

        if ($requireLock) $this->lock();

        // remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $delwords = explode(':', $pageword_idx);
            // group the word IDs by word length: array(wordlen => array(word ID ...))
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }

        // save the reverse index
        $result = (bool)$this->saveIndexKey('pageword', '', $pid, '');

        // release the lock on failure too, it would stay behind otherwise
        if ($requireLock) $this->unlock();
        return $result;
    }

    /**
     * Look up the pages that contain the given search words
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard.
     *
     * The returned array is keyed by the original tokens. Each value is an
     * array with page names as keys and the number of times the token
     * appears on that page as value. Pages for which page_exists() reports
     * false are left out.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function lookupWords(&$tokens)
    {
        // $locations: query term => array("length*id" ...)
        $locations = array();
        $wids = $this->getIndexWords($tokens, $locations);
        if (empty($wids)) {
            return array();
        }

        // resolve every distinct word ID to the pages using that word
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach ($wids as $wlen => $ids) {
            $index = $this->getIndex('i', $wlen);
            $entries = count($index);
            foreach (array_unique($ids) as $ixid) {
                if ($ixid >= $entries) {
                    continue;
                }
                $docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }

        // sum up the hits of all index locations belonging to a query term
        $final = array();
        foreach ($locations as $word => $refs) {
            $pages = array();
            foreach ($refs as $ref) {
                // IDs beyond the end of their index were skipped above
                if (!isset($docs[$ref])) {
                    continue;
                }
                foreach ($docs[$ref] as $pagename => $count) {
                    // make sure the document still exists
                    if (!page_exists($pagename, '', false)) {
                        continue;
                    }
                    if (isset($pages[$pagename])) {
                        $pages[$pagename] += $count;
                    } else {
                        $pages[$pagename] = $count;
                    }
                }
            }
            $final[$word] = $pages;
        }
        return $final;
    }

    /**
     * Find the index ID of each search term
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * Terms without wildcard that are shorter than the minimum word length
     * of the Tokenizer and not numeric are ignored; they still get an empty
     * entry in $result.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <[email protected]>
     */
    protected function getIndexWords(&$words, &$result)
    {
        $minlength = Tokenizer::getInstance()->getMinWordLength();

        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            list($xword, $wlen, $regexp) = $this->parseQueryTerm($word);
            $wildcard = !is_null($regexp);

            // too short to be indexed
            if (!$wildcard && $wlen < $minlength && !is_numeric($xword)) {
                continue;
            }
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }
            $tokens[$xword][] = array($word, $regexp);
            if ($wildcard && !isset($tokenwild[$xword])) {
                $tokenwild[$xword] = $wlen;
            }
        }
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )

        // without wildcards only the indexes of the exact lengths are needed,
        // otherwise all indexes from the shortest base word length upwards
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->getIndexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);

        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid === false) {
                        continue;
                    }
                    $wids[$ixlen][] = $wid;
                    foreach ($tokens[$xword] as $w) {
                        $result[$w[0]][] = "{$ixlen}*{$wid}";
                    }
                }
            }
            // handle wildcard search
            foreach ($this->findWildcardMatches($ixlen, $word_idx, $tokenwild, $tokens) as $match) {
                list($term, $wid) = $match;
                $wids[$ixlen][] = $wid;
                $result[$term][] = "{$ixlen}*{$wid}";
            }
        }
        return $wids;
    }

    /**
     * Split a query term into base word, base word length and wildcard regexp
     *
     * @param string $word query term, optionally with a leading and/or trailing '*'
     * @return array       array(base word, base word length, regexp) where regexp
     *                     is null for terms without wildcard
     */
    private function parseQueryTerm($word)
    {
        $caret = '^';
        $dollar = '$';
        $xword = $word;
        $wlen = $this->wordlen($word);

        // strip the wildcards, they do not count for the length
        if (substr($xword, 0, 1) == '*') {
            $xword = substr($xword, 1);
            $caret = '';
            $wlen -= 1;
        }
        if (substr($xword, -1, 1) == '*') {
            $xword = substr($xword, 0, -1);
            $dollar = '';
            $wlen -= 1;
        }

        $regexp = null;
        if ($caret === '' || $dollar === '') {
            // the side without wildcard stays anchored
            $regexp = '/'.$caret.preg_quote($xword, '/').$dollar.'/';
        }
        return array($xword, $wlen, $regexp);
    }

    /**
     * Match the wildcard terms against the words of one word index
     *
     * @param int   $ixlen     word length of the index
     * @param array $word_idx  words of the index, keyed by word ID
     * @param array $tokenwild base word => base word length, sorted by length ascending
     * @param array $tokens    base word => array( [ query term , regexp ] ... )
     * @return array           list of array(query term, word ID)
     */
    private function findWildcardMatches($ixlen, $word_idx, $tokenwild, $tokens)
    {
        $matches = array();
        foreach ($tokenwild as $xword => $wlen) {
            // only indexes of longer words are searched; as $tokenwild is
            // sorted, all remaining base words are too long as well
            if ($wlen >= $ixlen) break;
            foreach ($tokens[$xword] as $w) {
                // exact term sharing the base word
                if (is_null($w[1])) continue;
                foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                    $matches[] = array($w[0], $wid);
                }
            }
        }
        return $matches;
    }

    /**
     * Get the word lengths that have been indexed
     *
     * With an array as filter, its keys are taken as word lengths and those
     * are returned for which an i<length>.idx file exists in the index
     * directory. With any other filter, all lengths reported by
     * listIndexLengths() that are equal to or greater than the filter
     * (compared as integers) are returned.
     *
     * @author YoBoY <[email protected]>
     *
     * @param array|int $filter
     * @return array
     */
    public function getIndexLengths($filter)
    {
        global $conf;

        if (!is_array($filter)) {
            $matching = array();
            foreach ($this->listIndexLengths() as $length) {
                // keep all the values equal or superior
                if ((int)$length >= (int)$filter) {
                    $matching[] = $length;
                }
            }
            return $matching;
        }

        // testing if index files exist only
        $prefix = $conf['indexdir'].'/i';
        $existing = array();
        foreach (array_keys($filter) as $length) {
            if (file_exists($prefix.$length.'.idx')) {
                $existing[] = $length;
            }
        }
        return $existing;
    }

    /**
     * Get the list of lengths indexed in the wiki
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * The cache file (lengths.idx in the index directory) is only used when
     * $conf['readdircache'] is not 0. It is trusted while it is younger than
     * $conf['readdircache'] seconds and rewritten after the directory had to
     * be scanned. Failing to write the cache file is not treated as an error.
     *
     * @author YoBoY <[email protected]>
     *
     * @return array list of integers, empty if the index directory can not be opened
     */
    public function listIndexLengths()
    {
        global $conf;
        $lengthsFile = $conf['indexdir'].'/lengths.idx';

        // testing what we have to do, create a cache file or not.
        $docache = ($conf['readdircache'] != 0);

        if ($docache) {
            clearstatcache();
            if (file_exists($lengthsFile)
                && (time() < @filemtime($lengthsFile) + $conf['readdircache'])
            ) {
                $lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
                if ($lengths !== false) {
                    $idx = array();
                    foreach ($lengths as $length) {
                        $idx[] = (int)$length;
                    }
                    return $idx;
                }
            }
        }

        // no usable cache, collect the lengths from the i<length>.idx file names
        $dir = @opendir($conf['indexdir']);
        if ($dir === false) return array();
        $idx = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i)) $idx[] = (int)$i;
            }
        }
        closedir($dir);
        sort($idx);

        // save this in a file
        if ($docache) {
            $handle = @fopen($lengthsFile, 'w');
            // fwrite()/fclose() must not be called with false,
            // the @ operator does not silence the resulting TypeError
            if ($handle !== false) {
                @fwrite($handle, implode("\n", $idx));
                @fclose($handle);
            }
        }
        return $idx;
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * The call is passed on unchanged to the histogram() method of the
     * MetadataIndex singleton.
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @return array            list of words as the keys and frequency as value
     *
     * @author Tom N Harris <[email protected]>
     */
    public function histogram($min=1, $max=0, $minlen=3)
    {
        $MetadataIndex = MetadataIndex::getInstance();
        return $MetadataIndex->histogram($min, $max, $minlen);
    }

    /**
     * Clear the Fulltext Index
     *
     * Deletes the i<length>.idx and w<length>.idx files of every length
     * reported by listIndexLengths(), followed by lengths.idx and
     * pageword.idx. Files that can not be deleted are silently ignored.
     *
     * @param bool $requireLock should be false only if the caller is resposible for index lock
     * @return bool  always true
     * @throws Exception\IndexLockException
     */
    public function clear($requireLock = true)
    {
        global $conf;

        if ($requireLock) {
            $this->lock();
        }

        // collect the index files first, keeping the order of deletion
        $indexdir = $conf['indexdir'];
        $files = array();
        foreach ($this->listIndexLengths() as $length) {
            $files[] = $indexdir.'/i'.$length.'.idx';
            $files[] = $indexdir.'/w'.$length.'.idx';
        }
        $files[] = $indexdir.'/lengths.idx';
        $files[] = $indexdir.'/pageword.idx';

        foreach ($files as $file) {
            @unlink($file);
        }

        if ($requireLock) {
            $this->unlock();
        }
        return true;
    }
}


1			<?php
2
3			namespace dokuwiki\Search;
4
5			use dokuwiki\Search\Tokenizer;
6			use dokuwiki\Utf8;
7
8			/**
9			* Class DokuWiki Fulltext Index (Singleton)
10			*
11			* @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
12			* @author Andreas Gohr <[email protected]>
13			* @author Tom N Harris <[email protected]>
14			*/
15			class FulltextIndex extends AbstractIndex
16			{
17			/** @var FulltextIndex $instance */
18			protected static $instance = null;
19
20			/**
21			* Get new or existing singleton instance of the FulltextIndex
22			*
23			* @return FulltextIndex
24			*/
25			public static function getInstance()
26			{
27			if (is_null(static::$instance)) {
28			static::$instance = new static();
29			}
30			return static::$instance;
31			}
32
33			/**
34			* Measure the length of a string
35			* Differs from strlen in handling of asian characters.
36			*
37			* @author Tom N Harris <[email protected]>
38			*
39			* @param string $w
40			* @return int
41			*/
42			public function wordlen($w)
43			{
44			$l = strlen($w);
45			// If left alone, all chinese "words" will get put into w3.idx
46			// So the "length" of a "word" is faked
47			if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
48			foreach ($leadbytes[0] as $b) {
49			$l += ord($b) - 0xE1;
50			}
51			}
52			return $l;
53			}
54
55			/**
56			* Adds the contents of a page to the fulltext index
57			*
58			* The added text replaces previous words for the same page.
59			* An empty value erases the page.
60			*
61			* @param string $page a page name
62			* @param string $text the body of the page
63			* @param bool $requireLock should be false only if the caller is resposible for index lock
64			* @return bool if the function completed successfully
65			*
66			* @throws Exception\IndexLockException
67			* @author Andreas Gohr <[email protected]>
68			* @author Tom N Harris <[email protected]>
69			*/
70			public function addPageWords($page, $text, $requireLock = true)
71			{
72			// load known documents
73			$pid = $this->getPID($page);
74			if ($pid === false) {
75			return false;
76			}
77
78			if ($requireLock) $this->lock();
79
80			$pagewords = array();
81			// get word usage in page
82			$words = $this->getPageWords($text);
83			if ($words === false) {
84			$this->unlock();
85			return false;
86			}
87
88			if (!empty($words)) {
89			foreach (array_keys($words) as $wlen) {
90			$index = $this->getIndex('i', $wlen);
91			foreach ($words[$wlen] as $wid => $freq) {
92			$idx = ($wid < count($index)) ? $index[$wid] : '';
93			$index[$wid] = $this->updateTuple($idx, $pid, $freq);
94			$pagewords[] = "{$wlen}*{$wid}";
95			}
96			if (!$this->saveIndex('i', $wlen, $index)) {
97			$this->unlock();
98			return false;
99			}
100			}
101			}
102
103			// Remove obsolete index entries
104			$pageword_idx = $this->getIndexKey('pageword', '', $pid);
105			if ($pageword_idx !== '') {
106			$oldwords = explode(':',$pageword_idx);
107			$delwords = array_diff($oldwords, $pagewords);
108			$upwords = array();
109			foreach ($delwords as $word) {
110			if ($word != '') {
111			list($wlen, $wid) = explode('*', $word);
112			$wid = (int)$wid;
113			$upwords[$wlen][] = $wid;
114			}
115			}
116			foreach ($upwords as $wlen => $widx) {
117			$index = $this->getIndex('i', $wlen);
118			foreach ($widx as $wid) {
119			$index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
120			}
121			$this->saveIndex('i', $wlen, $index);
122			}
123			}
124			// Save the reverse index
125			$pageword_idx = implode(':', $pagewords);
126			if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
127			$result = false;
128			} else {
129			$result = true;
130			}
131
132			if ($requireLock) $this->unlock();
133			return $result;
134			}
135
136			/**
137			* Split the words in a page and add them to the index
138			*
139			* @param string $text content of the page
140			* @return array\|false list of word IDs and number of times used, false on errors
141			*
142			* @author Andreas Gohr <[email protected]>
143			* @author Christopher Smith <[email protected]>
144			* @author Tom N Harris <[email protected]>
145			*/
146			protected function getPageWords($text)
147			{
148			$Tokenizer = Tokenizer::getInstance();
149			$tokens = $Tokenizer->getWords($text);
150			$tokens = array_count_values($tokens); // count the frequency of each token
151
152			$words = array();
153			foreach ($tokens as $w => $c) {
154			$l = $this->wordlen($w);
155			if (isset($words[$l])) {
156			$words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
157			} else {
158			$words[$l] = array($w => $c);
159			}
160			}
161
162			// arrive here with $words = array(wordlen => array(word => frequency))
163			$word_idx_modified = false;
164			$index = array(); //resulting index
165			foreach (array_keys($words) as $wlen) {
166			$word_idx = $this->getIndex('w', $wlen);
167			foreach ($words[$wlen] as $word => $freq) {
168			$word = (string)$word;
169			$wid = array_search($word, $word_idx, true);
170			if ($wid === false) {
171			$wid = count($word_idx);
172			$word_idx[] = $word;
173			$word_idx_modified = true;
174			}
175			if (!isset($index[$wlen])) {
176			$index[$wlen] = array();
177			}
178			$index[$wlen][$wid] = $freq;
179			}
180			// save back the word index
181			if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
182			return false;
183			}
184			}
185
186			return $index;
187			}
188
189			/**
190			* Delete the contents of a page to the fulltext index
191			*
192			* @param string $page a page name
193			* @param bool $requireLock should be false only if the caller is resposible for index lock
194			* @return bool If renaming the value has been successful, false on error
195			*
196			* @throws Exception\IndexLockException
197			* @author Satoshi Sahara <[email protected]>
198			* @author Tom N Harris <[email protected]>
199			*/
200			public function deletePageWords($page, $requireLock = true)
201			{
202			// load known documents
203			$pid = $this->getPID($page);
204			if ($pid === false) {
205			return false;
206			}
207
208			if ($requireLock) $this->lock();
209
210			// remove obsolete index entries
211			$pageword_idx = $this->getIndexKey('pageword', '', $pid);
212			if ($pageword_idx !== '') {
213			$delwords = explode(':', $pageword_idx);
214			$upwords = array();
215			foreach ($delwords as $word) {
216			if ($word != '') {
217			list($wlen, $wid) = explode('*', $word);
218			$wid = (int)$wid;
219			$upwords[$wlen][] = $wid;
220			}
221			}
222			foreach ($upwords as $wlen => $widx) {
223			$index = $this->getIndex('i', $wlen);
224			foreach ($widx as $wid) {
225			$index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
226			}
227			$this->saveIndex('i', $wlen, $index);
228			}
229			}
230			// save the reverse index
231			if (!$this->saveIndexKey('pageword', '', $pid, '')) {
232			return false;
233			}
234
235			if ($requireLock) $this->unlock();
236			return true;
237			}
238
239			/**
240			* Find pages in the fulltext index containing the words,
241			*
242			* The search words must be pre-tokenized, meaning only letters and
243			* numbers with an optional wildcard
244			*
245			* The returned array will have the original tokens as key. The values
246			* in the returned list is an array with the page names as keys and the
247			* number of times that token appears on the page as value.
248			*
249			* @param array $tokens list of words to search for
250			* @return array list of page names with usage counts
251			*
252			* @author Tom N Harris <[email protected]>
253			* @author Andreas Gohr <[email protected]>
254			*/
255			public function lookupWords(&$tokens)
256			{
257			$result = array();
258			$wids = $this->getIndexWords($tokens, $result);
259			if (empty($wids)) return array();
260			// load known words and documents
261			$page_idx = $this->getIndex('page', '');
262			$docs = array();
263			foreach (array_keys($wids) as $wlen) {
264			$wids[$wlen] = array_unique($wids[$wlen]);
265			$index = $this->getIndex('i', $wlen);
266			foreach ($wids[$wlen] as $ixid) {
267			if ($ixid < count($index)) {
268			$docs["{$wlen}*{$ixid}"] = $this->parseTuples($page_idx, $index[$ixid]);
269			}
270			}
271			}
272			// merge found pages into final result array
273			$final = array();
274			foreach ($result as $word => $res) {
275			$final[$word] = array();
276			foreach ($res as $wid) {
277			// handle the case when ($ixid < count($index)) has been false
278			// and thus $docs[$wid] hasn't been set.
279			if (!isset($docs[$wid])) continue;
280			$hits =& $docs[$wid];
281			foreach ($hits as $hitkey => $hitcnt) {
282			// make sure the document still exists
283			if (!page_exists($hitkey, '', false)) continue;
284			if (!isset($final[$word][$hitkey])) {
285			$final[$word][$hitkey] = $hitcnt;
286			} else {
287			$final[$word][$hitkey] += $hitcnt;
288			}
289			}
290			}
291			}
292			return $final;
293			}
294
295			/**
296			* Find the index ID of each search term
297			*
298			* The query terms should only contain valid characters, with a '*' at
299			* either the beginning or end of the word (or both).
300			* The $result parameter can be used to merge the index locations with
301			* the appropriate query term.
302			*
303			* @param array $words The query terms.
304			* @param array $result Set to word => array("length*id" ...)
305			* @return array Set to length => array(id ...)
306			*
307			* @author Tom N Harris <[email protected]>
308			*/
309			protected function getIndexWords(&$words, &$result)
310			{
311			$Tokenizer = Tokenizer::getInstance();
312
313			$tokens = array();
314			$tokenlength = array();
315			$tokenwild = array();
316			foreach ($words as $word) {
317			$result[$word] = array();
318			$caret = '^';
319			$dollar = '$';
320			$xword = $word;
321			$wlen = $this->wordlen($word);
322
323			// check for wildcards
324			if (substr($xword, 0, 1) == '*') {
325			$xword = substr($xword, 1);
326			$caret = '';
327			$wlen -= 1;
328			}
329			if (substr($xword, -1, 1) == '*') {
330			$xword = substr($xword, 0, -1);
331			$dollar = '';
332			$wlen -= 1;
333			}
334			if ($wlen < $Tokenizer->getMinWordLength()
335			&& $caret && $dollar && !is_numeric($xword)
336			) {
337			continue;
338			}
339			if (!isset($tokens[$xword])) {
340			$tokenlength[$wlen][] = $xword;
341			}
342			if (!$caret \|\| !$dollar) {
343			$re = $caret.preg_quote($xword, '/').$dollar;
344			$tokens[$xword][] = array($word, '/'.$re.'/');
345			if (!isset($tokenwild[$xword])) {
346			$tokenwild[$xword] = $wlen;
347			}
348			} else {
349			$tokens[$xword][] = array($word, null);
350			}
351			}
352			asort($tokenwild);
353			// $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
354			// $tokenlength = array( base word length => base word ... )
355			// $tokenwild = array( base word => base word length ... )
356			$length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
357			$indexes_known = $this->getIndexLengths($length_filter);
358			if (!empty($tokenwild)) sort($indexes_known);
359			// get word IDs
360			$wids = array();
361			foreach ($indexes_known as $ixlen) {
362			$word_idx = $this->getIndex('w', $ixlen);
363			// handle exact search
364			if (isset($tokenlength[$ixlen])) {
365			foreach ($tokenlength[$ixlen] as $xword) {
366			$wid = array_search($xword, $word_idx, true);
367			if ($wid !== false) {
368			$wids[$ixlen][] = $wid;
369			foreach ($tokens[$xword] as $w)
370			$result[$w[0]][] = "{$ixlen}*{$wid}";
371			}
372			}
373			}
374			// handle wildcard search
375			foreach ($tokenwild as $xword => $wlen) {
376			if ($wlen >= $ixlen) break;
377			foreach ($tokens[$xword] as $w) {
378			if (is_null($w[1])) continue;
379			foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
380			$wids[$ixlen][] = $wid;
381			$result[$w[0]][] = "{$ixlen}*{$wid}";
382			}
383			}
384			}
385			}
386			return $wids;
387			}
388
389			/**
390			* Get the word lengths that have been indexed
391			*
392			* Reads the index directory and returns an array of lengths
393			* that there are indices for.
394			*
395			* @author YoBoY <[email protected]>
396			*
397			* @param array\|int $filter
398			* @return array
399			*/
400			public function getIndexLengths($filter)
401			{
402			global $conf;
403			$idx = array();
404			if (is_array($filter)) {
405			// testing if index files exist only
406			$path = $conf['indexdir']."/i";
407			foreach ($filter as $key => $value) {
408			if (file_exists($path.$key.'.idx')) {
409			$idx[] = $key;
410			}
411			}
412			} else {
413			$lengths = $this->listIndexLengths();
414			foreach ($lengths as $key => $length) {
415			// keep all the values equal or superior
416			if ((int)$length >= (int)$filter) {
417			$idx[] = $length;
418			}
419			}
420			}
421			return $idx;
422			}
423
424			/**
425			* Get the list of lengths indexed in the wiki
426			*
427			* Read the index directory or a cache file and returns
428			* a sorted array of lengths of the words used in the wiki.
429			*
430			* @author YoBoY <[email protected]>
431			*
432			* @return array
433			*/
434			public function listIndexLengths()
435			{
436			global $conf;
437			$lengthsFile = $conf['indexdir'].'/lengths.idx';
438
439			// testing what we have to do, create a cache file or not.
440			if ($conf['readdircache'] == 0) {
441			$docache = false;
442			} else {
443			clearstatcache();
444			if (file_exists($lengthsFile)
445			&& (time() < @filemtime($lengthsFile) + $conf['readdircache'])
446			) {
447			$lengths = @file($lengthsFile, FILE_IGNORE_NEW_LINES \| FILE_SKIP_EMPTY_LINES);
448			if ($lengths !== false) {
449			$idx = array();
450			foreach ($lengths as $length) {
451			$idx[] = (int)$length;
452			}
453			return $idx;
454			}
455			}
456			$docache = true;
457			}
458
459			if ($conf['readdircache'] == 0 \|\| $docache) {
460			$dir = @opendir($conf['indexdir']);
461			if ($dir === false) return array();
462			$idx = array();
463			while (($f = readdir($dir)) !== false) {
464			if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
465			$i = substr($f, 1, -4);
466			if (is_numeric($i)) $idx[] = (int)$i;
467			}
468			}
469			closedir($dir);
470			sort($idx);
471			// save this in a file
472			if ($docache) {
473			$handle = @fopen($lengthsFile, 'w');
474			@fwrite($handle, implode("\n", $idx));
475			@fclose($handle);
476			}
477			return $idx;
478			}
479			return array();
480			}
481
482			/**
483			* Return a list of words sorted by number of times used
484			*
485			* @param int $min bottom frequency threshold
486			* @param int $max upper frequency limit. No limit if $max<$min
487			* @param int $minlen minimum length of words to count
488			* @return array list of words as the keys and frequency as value
489			*
490			* @author Tom N Harris <[email protected]>
491			*/
492			public function histogram($min=1, $max=0, $minlen=3)
493			{
494			return MetadataIndex::getInstance()->histogram($min, $max, $minlen);
495			}
496
497			/**
498			* Clear the Fulltext Index
499			*
500			* @param bool $requireLock should be false only if the caller is resposible for index lock
501			* @return bool If the index has been cleared successfully
502			* @throws Exception\IndexLockException
503			*/
504			public function clear($requireLock = true)
505			{
506			global $conf;
507
508			if ($requireLock) $this->lock();
509
510			$lengths = $this->listIndexLengths();
511			foreach ($lengths as $length) {
512			@unlink($conf['indexdir'].'/i'.$length.'.idx');
513			@unlink($conf['indexdir'].'/w'.$length.'.idx');
514			}
515			@unlink($conf['indexdir'].'/lengths.idx');
516			@unlink($conf['indexdir'].'/pageword.idx');
517
518			if ($requireLock) $this->unlock();
519			return true;
520			}
521			}
522

splitbrain / dokuwiki

Pull Request — master (#2943)

FulltextIndex::getIndexWords() F

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like