indexer.php ➔ idx_tokenizer() - Code Metrics - Inspection of "add PHP 7.4 to Travis CI" - splitbrain/dokuwiki - Measure and Improve Code Quality continuously with Scrutinizer

Failed Conditions

Push — travis-php74 ( 84311c...10ffd5 )

by Henry

created 2020-01-20 00:09 UTC

indexer.php ➔ idx_tokenizer() A

↳ Parent: Project

Complexity

Conditions	1
Paths	1

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
nc	1
nop	2
dl	0
loc	4
rs	10
c	0
b	0
f	0

<?php
/**
 * Functions to create the fulltext search index
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <[email protected]>
 * @author     Tom N Harris <[email protected]>
 */

use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Index format version; changing this value forces the index to be rebuilt after an upgrade
define('INDEXER_VERSION', 8);

// Minimum length of a token kept in the index (numeric tokens are exempt).
// Only set here when nothing earlier has defined the constant.
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <[email protected]>
 * @author Michael Hamann <[email protected]>
 *
 * @return int|string
 */
function idx_get_version(){
    // computed once per request and then served from this static cache
    static $cached = null;
    if ($cached != null) {
        return $cached;
    }

    $tag = INDEXER_VERSION;

    // the core version is passed along in the event data for the
    // convenience of plugins, but is not repeated in the suffix below
    $info = array('dokuwiki' => $tag);
    Event::createAndTrigger('INDEXER_VERSION_GET', $info, null, false);
    unset($info['dokuwiki']); // the core version already leads the tag

    // sort by plugin name so the resulting tag is stable
    ksort($info);
    foreach ($info as $name => $pluginVersion) {
        $tag = $tag . '+' . $name . '=' . $pluginVersion;
    }

    $cached = $tag;
    return $cached;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $length = strlen($w);

    // Bytes in the range 0xE2-0xEF inflate the reported length: each such
    // byte adds (byte value - 0xE1). Without this, all chinese "words"
    // would end up in w3.idx.
    $found = preg_match_all('/[\xE2-\xEF]/', $w, $matches);
    if (!$found) {
        return $length;
    }

    foreach ($matches[0] as $byte) {
        $length += ord($byte) - 0xE1;
    }
    return $length;
}

/**
 * Create an instance of the indexer.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <[email protected]>
 */
function idx_get_indexer() {
    // a single Indexer is created on first use and reused afterwards
    static $instance;
    if (isset($instance)) {
        return $instance;
    }
    $instance = new Indexer();
    return $instance;
}

/**
 * Returns words that will be ignored.
 *
 * @return array                list of stop words
 *
 * @author Tom N Harris <[email protected]>
 */
function & idx_get_stopwords() {
    // loaded once, then returned (by reference) from this static cache
    static $list = null;
    if ($list === null) {
        global $conf;
        $path = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        // a language without a stopwords file simply has no stop words
        $list = file_exists($path) ? file($path, FILE_IGNORE_NEW_LINES) : array();
    }
    return $list;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <[email protected]>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    // marker file for this page; its content is compared against
    // idx_get_version() and its mtime against the page file below
    $idxtag = metaFN($page,'.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        // the marker is only removed once the delete call did not report a lock
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    // (skipped entirely when $force is set or no marker file exists)
    if(!$force && file_exists($idxtag)){
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            // marker newer than the page file: nothing to do
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // a strict false for the 'internal index' metadata disables indexing of
    // this page; a previously indexed page is then removed from the index
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }
    // collect the data handed to the INDEXER_PAGE_ADD event: an (initially
    // empty) body plus title, referenced pages and referenced media
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
    if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_references'] = array_keys($references);
    else
        $metadata['relation_references'] = array();

    if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_media'] = array_keys($media);
    else
        $metadata['relation_media'] = array();

    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    // the raw wiki text is appended to the body unless advise_before() returns false
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    // extract() rebinds $page, $body, $metadata and $pid (and any other key
    // present in $data) as local variables; presumably event handlers may
    // have altered $data by then -- TODO confirm against the Event class
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    // metadata keys are only written when the page words were stored
    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // record the indexer version in the marker file on success
    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    // NOTE(review): in verbose mode true is returned even when $result is
    // falsy -- confirm whether callers rely on this
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    // thin compatibility wrapper: delegate to the shared Indexer instance
    return idx_get_indexer()->lookup($words);
}

/**
 * Split a string into tokens
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    // thin compatibility wrapper: delegate to the shared Indexer instance
    return idx_get_indexer()->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    // index files live in the index directory as <idx><suffix>.idx
    $path = "{$conf['indexdir']}/{$idx}{$suffix}.idx";
    // a missing index file is treated as an empty index
    return file_exists($path) ? file($path) : array();
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * @author YoBoY <[email protected]>
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // a readdircache of 0 disables the cache file entirely
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        // use the cache file while it is younger than readdircache seconds
        if (file_exists($cachefile)
        && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: scan the index directory for i<number>.idx files
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file; writing the cache is best effort, so a cache
    // file that cannot be opened is skipped instead of handing false to
    // fwrite()/fclose() (which throws a TypeError on PHP 8)
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @author YoBoY <[email protected]>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $found = array();

    if (!is_array($filter)) {
        // numeric filter: keep every known length that is at least $filter
        foreach (idx_listIndexLengths() as $len) {
            if ((int)$len >= (int)$filter) {
                $found[] = $len;
            }
        }
        return $found;
    }

    // array filter: its keys are the candidate lengths; keep those
    // for which an i<length>.idx file exists
    $prefix = $conf['indexdir'].'/i';
    foreach (array_keys($filter) as $len) {
        if (file_exists($prefix.$len.'.idx')) {
            $found[] = $len;
        }
    }
    return $found;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @author Tom N Harris <[email protected]>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $romanized = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));

    // The patterns are applied in order: first runs of separator characters
    // collapse into a single underscore, then everything that is not an
    // ASCII letter, digit or underscore is dropped.
    // NOTE(review): inside the single-quoted string the doubled backslash
    // becomes "\:", an escaped colon, so a literal backslash is not one of
    // the separators and is removed by the second pattern instead.
    $cleaned = preg_replace(
        array('#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'),
        array('_', ''),
        $romanized
    );

    return strtolower($cleaned);
}

//Setup VIM: ex: et ts=4 :


1			<?php
2			/**
3			* Functions to create the fulltext search index
4			*
5			* @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
6			* @author Andreas Gohr <[email protected]>
7			* @author Tom N Harris <[email protected]>
8			*/
9
10			use dokuwiki\Extension\Event;
11			use dokuwiki\Search\Indexer;
12
13			// Version tag used to force rebuild on upgrade
14			define('INDEXER_VERSION', 8);
15
16			// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
17			if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
18
19			/**
20			* Version of the indexer taking into consideration the external tokenizer.
21			* The indexer is only compatible with data written by the same version.
22			*
23			* @triggers INDEXER_VERSION_GET
24			* Plugins that modify what gets indexed should hook this event and
25			* add their version info to the event data like so:
26			* $data[$plugin_name] = $plugin_version;
27			*
28			* @author Tom N Harris <[email protected]>
29			* @author Michael Hamann <[email protected]>
30			*
31			* @return int|string
32			*/
33			function idx_get_version(){
34			static $indexer_version = null;
35			if ($indexer_version == null) {
36			$version = INDEXER_VERSION;
37
38			// DokuWiki version is included for the convenience of plugins
39			$data = array('dokuwiki'=>$version);
40			Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
41			unset($data['dokuwiki']); // this needs to be first
42			ksort($data);
43			foreach ($data as $plugin=>$vers)
44			$version .= '+'.$plugin.'='.$vers;
45			$indexer_version = $version;
46			}
47			return $indexer_version;
48			}
49
50			/**
51			* Measure the length of a string.
52			* Differs from strlen in handling of asian characters.
53			*
54			* @author Tom N Harris <[email protected]>
55			*
56			* @param string $w
57			* @return int
58			*/
59			function wordlen($w){
60			$l = strlen($w);
61			// If left alone, all chinese "words" will get put into w3.idx
62			// So the "length" of a "word" is faked
63			if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
64			foreach($leadbytes[0] as $b)
65			$l += ord($b) - 0xE1;
66			}
67			return $l;
68			}
69
70			/**
71			* Create an instance of the indexer.
72			*
73			* @return Indexer an Indexer
74			*
75			* @author Tom N Harris <[email protected]>
76			*/
77			function idx_get_indexer() {
78			static $Indexer;
79			if (!isset($Indexer)) {
80			$Indexer = new Indexer();
81			}
82			return $Indexer;
83			}
84
85			/**
86			* Returns words that will be ignored.
87			*
88			* @return array list of stop words
89			*
90			* @author Tom N Harris <[email protected]>
91			*/
92			function & idx_get_stopwords() {
93			static $stopwords = null;
94			if (is_null($stopwords)) {
95			global $conf;
96			$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
97			if(file_exists($swfile)){
98			$stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
99			}else{
100			$stopwords = array();
101			}
102			}
103			return $stopwords;
104			}
105
106			/**
107			* Adds/updates the search index for the given page
108			*
109			* Locking is handled internally.
110			*
111			* @param string $page name of the page to index
112			* @param boolean $verbose print status messages
113			* @param boolean $force force reindexing even when the index is up to date
114			* @return string|boolean the function completed successfully
115			*
116			* @author Tom N Harris <[email protected]>
117			*/
118			function idx_addPage($page, $verbose=false, $force=false) {
119			$idxtag = metaFN($page,'.indexed');
120			// check if page was deleted but is still in the index
121			if (!page_exists($page)) {
122			if (!file_exists($idxtag)) {
123			if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
124			return false;
125			}
126			$Indexer = idx_get_indexer();
127			$result = $Indexer->deletePage($page);
128			if ($result === "locked") {
129			if ($verbose) print("Indexer: locked".DOKU_LF);
130			return false;
131			}
132			@unlink($idxtag);
133			return $result;
134			}
135
136			// check if indexing needed
137			if(!$force && file_exists($idxtag)){
138			if(trim(io_readFile($idxtag)) == idx_get_version()){
139			$last = @filemtime($idxtag);
140			if($last > @filemtime(wikiFN($page))){
141			if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
142			return false;
143			}
144			}
145			}
146
147			$indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
148			if ($indexenabled === false) {
149			$result = false;
150			if (file_exists($idxtag)) {
151			$Indexer = idx_get_indexer();
152			$result = $Indexer->deletePage($page);
153			if ($result === "locked") {
154			if ($verbose) print("Indexer: locked".DOKU_LF);
155			return false;
156			}
157			@unlink($idxtag);
158			}
159			if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
160			return $result;
161			}
162
163			$Indexer = idx_get_indexer();
164			$pid = $Indexer->getPID($page);
165			if ($pid === false) {
166			if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
167			return false;
168			}
169			$body = '';
170			$metadata = array();
171			$metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
172			if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
173			$metadata['relation_references'] = array_keys($references);
174			else
175			$metadata['relation_references'] = array();
176
177			if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
178			$metadata['relation_media'] = array_keys($media);
179			else
180			$metadata['relation_media'] = array();
181
182			$data = compact('page', 'body', 'metadata', 'pid');
183			$evt = new Event('INDEXER_PAGE_ADD', $data);
184			if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
185			$evt->advise_after();
186			unset($evt);
187			extract($data);
188
189			$result = $Indexer->addPageWords($page, $body);
190			if ($result === "locked") {
191			if ($verbose) print("Indexer: locked".DOKU_LF);
192			return false;
193			}
194
195			if ($result) {
196			$result = $Indexer->addMetaKeys($page, $metadata);
197			if ($result === "locked") {
198			if ($verbose) print("Indexer: locked".DOKU_LF);
199			return false;
200			}
201			}
202
203			if ($result)
204			io_saveFile(metaFN($page,'.indexed'), idx_get_version());
205			if ($verbose) {
206			print("Indexer: finished".DOKU_LF);
207			return true;
208			}
209			return $result;
210			}
211
212			/**
213			* Find tokens in the fulltext index
214			*
215			* Takes an array of words and will return a list of matching
216			* pages for each one.
217			*
218			* Important: No ACL checking is done here! All results are
219			* returned, regardless of permissions
220			*
221			* @param array $words list of words to search for
222			* @return array list of pages found, associated with the search terms
223			*/
224			function idx_lookup(&$words) {
225			$Indexer = idx_get_indexer();
226			return $Indexer->lookup($words);
227			}
228
229			/**
230			* Split a string into tokens
231			*
232			* @param string $string
233			* @param bool $wc
234			*
235			* @return array
236			*/
237			function idx_tokenizer($string, $wc=false) {
238			$Indexer = idx_get_indexer();
239			return $Indexer->tokenizer($string, $wc);
240			}
241
242			/* For compatibility */
243
244			/**
245			* Read the list of words in an index (if it exists).
246			*
247			* @author Tom N Harris <[email protected]>
248			*
249			* @param string $idx
250			* @param string $suffix
251			* @return array
252			*/
253			function idx_getIndex($idx, $suffix) {
254			global $conf;
255			$fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
256			if (!file_exists($fn)) return array();
257			return file($fn);
258			}
259
260			/**
261			* Get the list of lengths indexed in the wiki.
262			*
263			* Read the index directory or a cache file and returns
264			* a sorted array of lengths of the words used in the wiki.
265			*
266			* @author YoBoY <[email protected]>
267			*
268			* @return array
269			*/
270			function idx_listIndexLengths() {
271			global $conf;
272			// testing what we have to do, create a cache file or not.
273			if ($conf['readdircache'] == 0) {
274			$docache = false;
275			} else {
276			clearstatcache();
277			if (file_exists($conf['indexdir'].'/lengths.idx')
278			&& (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
279			if (
280			($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
281			!== false
282			) {
283			$idx = array();
284			foreach ($lengths as $length) {
285			$idx[] = (int)$length;
286			}
287			return $idx;
288			}
289			}
290			$docache = true;
291			}
292
293			if ($conf['readdircache'] == 0 || $docache) {
294			$dir = @opendir($conf['indexdir']);
295			if ($dir === false)
296			return array();
297			$idx = array();
298			while (($f = readdir($dir)) !== false) {
299			if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
300			$i = substr($f, 1, -4);
301			if (is_numeric($i))
302			$idx[] = (int)$i;
303			}
304			}
305			closedir($dir);
306			sort($idx);
307			// save this in a file
308			if ($docache) {
309			$handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
310			@fwrite($handle, implode("\n", $idx));
311			@fclose($handle);
312			}
313			return $idx;
314			}
315
316			return array();
317			}
318
319			/**
320			* Get the word lengths that have been indexed.
321			*
322			* Reads the index directory and returns an array of lengths
323			* that there are indices for.
324			*
325			* @author YoBoY <[email protected]>
326			*
327			* @param array|int $filter
328			* @return array
329			*/
330			function idx_indexLengths($filter) {
331			global $conf;
332			$idx = array();
333			if (is_array($filter)) {
334			// testing if index files exist only
335			$path = $conf['indexdir']."/i";
336			foreach ($filter as $key => $value) {
337			if (file_exists($path.$key.'.idx'))
338			$idx[] = $key;
339			}
340			} else {
341			$lengths = idx_listIndexLengths();
342			foreach ($lengths as $key => $length) {
343			// keep all the values equal or superior
344			if ((int)$length >= (int)$filter)
345			$idx[] = $length;
346			}
347			}
348			return $idx;
349			}
350
351			/**
352			* Clean a name of a key for use as a file name.
353			*
354			* Romanizes non-latin characters, then strips away anything that's
355			* not a letter, number, or underscore.
356			*
357			* @author Tom N Harris <[email protected]>
358			*
359			* @param string $name
360			* @return string
361			*/
362			function idx_cleanName($name) {
363			$name = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
364			$name = preg_replace('#[ \./\\:-]+#', '_', $name);
365			$name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
366			return strtolower($name);
367			}
368
369			//Setup VIM: ex: et ts=4 :
370

splitbrain / dokuwiki

Push — travis-php74 ( 84311c...10ffd5 )

indexer.php ➔ idx_tokenizer() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like