Failed Conditions
Push — stable ( 017e16...b83837 )
by
unknown
07:54 queued 02:55
created

Doku_Indexer::getIndexWords()   F

Complexity

Conditions 24
Paths 580

Size

Total Lines 71

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 24
nc 580
nop 2
dl 0
loc 71
rs 0.5833
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * Functions to create the fulltext search index
4
 *
5
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6
 * @author     Andreas Gohr <[email protected]>
7
 * @author     Tom N Harris <[email protected]>
8
 */
9
10
use dokuwiki\Extension\Event;
11
use dokuwiki\Search\Indexer;
12
13
// Version tag of the index format; changing it forces a rebuild on upgrade
define('INDEXER_VERSION', 8);

// Minimum token length to keep in the index (numeric tokens are exempt from this limit)
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}
18
19
/**
 * Build the version identifier of the indexer, including external tokenizers.
 *
 * The identifier starts with INDEXER_VERSION. Every entry that handlers of the
 * INDEXER_VERSION_GET event add to the event data is appended as
 * "+name=version", ordered by name. The value is kept in a static variable so
 * the event is only triggered on the first call.
 *
 * The indexer is only compatible with data written by the same version.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris
 * @author Michael Hamann
 *
 * @return int|string the bare INDEXER_VERSION when no plugin contributed, otherwise the combined string
 */
function idx_get_version(){
    static $indexer_version = null;

    // already computed earlier: reuse it
    if ($indexer_version != null) {
        return $indexer_version;
    }

    $version = INDEXER_VERSION;

    // DokuWiki's own version is passed along for the convenience of plugins
    $data = array('dokuwiki' => $version);
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // the core version already leads the identifier, so it must not be sorted in with the plugins
    unset($data['dokuwiki']);
    ksort($data);
    foreach ($data as $plugin => $vers) {
        $version .= '+'.$plugin.'='.$vers;
    }

    $indexer_version = $version;
    return $indexer_version;
}
49
50
/**
 * Measure the length of a string for indexing purposes.
 *
 * Starts from the byte length and adds a bonus for every byte in the range
 * 0xE2-0xEF found in the string. Without this, all chinese "words" would end
 * up in w3.idx, so their "length" is faked to spread them out.
 *
 * @author Tom N Harris
 *
 * @param string $w the word to measure
 * @return int byte length plus (byte value - 0xE1) for each matching byte
 */
function wordlen($w){
    $length = strlen($w);

    // nothing to adjust when no byte in the 0xE2-0xEF range is present
    if (!preg_match_all('/[\xE2-\xEF]/', $w, $matches)) {
        return $length;
    }

    foreach ($matches[0] as $lead) {
        $length += ord($lead) - 0xE1;
    }
    return $length;
}
69
70
/**
 * Get the shared indexer object, creating it on first use.
 *
 * The object is stored in a static variable, so every call returns the
 * same instance.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris
 */
function idx_get_indexer() {
    static $instance;
    if ($instance === null) {
        $instance = new Indexer();
    }
    return $instance;
}
84
85
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<lang>/stopwords.txt below DOKU_INC, where
 * <lang> is $conf['lang'], one word per line. A missing file gives an empty
 * list. The result is kept in a static variable and returned by reference.
 *
 * @return array                list of stop words
 *
 * @author Tom N Harris
 */
function & idx_get_stopwords() {
    static $stopwords = null;

    // loaded on an earlier call
    if ($stopwords !== null) {
        return $stopwords;
    }

    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = file_exists($swfile)
        ? file($swfile, FILE_IGNORE_NEW_LINES)
        : array();

    return $stopwords;
}
105
106
/**
 * Adds/updates the search index for the given page
 *
 * Steps, in order:
 *  - a page that no longer exists is removed from the index (if it was indexed)
 *  - unless $force is set, a page whose ".indexed" tag holds the current
 *    indexer version and is newer than the page file is skipped
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise title, references and media are collected, the
 *    INDEXER_PAGE_ADD event is triggered, and words and metadata are stored
 *
 * Locking is handled internally.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when the page was skipped, the index was locked
 *                         or the PID lookup failed; otherwise the result of the
 *                         last indexer call. Note that in verbose mode a run
 *                         that reaches the end always returns true.
 *
 * @author Tom N Harris
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');

    // the page was deleted: drop it from the index if it is still in there
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) {
                print("Indexer: $page does not exist, ignoring".DOKU_LF);
            }
            return false;
        }

        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) {
                print("Indexer: locked".DOKU_LF);
            }
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // skip the work when the tag matches the current version and is newer than the page
    // (loose comparison on purpose: the tag is a string, the version may be an int)
    $upToDate = !$force
        && file_exists($idxtag)
        && trim(io_readFile($idxtag)) == idx_get_version()
        && @filemtime($idxtag) > @filemtime(wikiFN($page));
    if ($upToDate) {
        if ($verbose) {
            print("Indexer: index for $page up to date".DOKU_LF);
        }
        return false;
    }

    // indexing was switched off for this page: make sure it is not in the index
    if (p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) {
                    print("Indexer: locked".DOKU_LF);
                }
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) {
            print("Indexer: index disabled for $page".DOKU_LF);
        }
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) {
            print("Indexer: getting the PID failed for $page".DOKU_LF);
        }
        return false;
    }

    // collect what goes into the index; missing relations become empty lists
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = ($references !== null) ? array_keys($references) : array();

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = ($media !== null) ? array_keys($media) : array();

    // let event handlers adjust the data; the raw page text is appended unless a handler prevents it
    $data = array(
        'page'     => $page,
        'body'     => $body,
        'metadata' => $metadata,
        'pid'      => $pid,
    );
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) {
        $data['body'] = $data['body'] . " " . rawWiki($page);
    }
    $evt->advise_after();
    unset($evt);
    // writes the (possibly modified) event data back into the local variables of the same name
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) {
            print("Indexer: locked".DOKU_LF);
        }
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) {
                print("Indexer: locked".DOKU_LF);
            }
            return false;
        }

        // only tag the page as indexed when words and metadata were both stored
        if ($result) {
            io_saveFile(metaFN($page,'.indexed'), idx_get_version());
        }
    }

    if (!$verbose) {
        return $result;
    }

    print("Indexer: finished".DOKU_LF);
    return true;
}
211
212
/**
 * Find tokens in the fulltext index
 *
 * Hands the given words to the lookup() method of the shared indexer
 * and returns its result: a list of matching pages for each word.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed by reference)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    return idx_get_indexer()->lookup($words);
}
228
229
/**
 * Split a string into tokens
 *
 * Hands both arguments unchanged to the tokenizer() method of the
 * shared indexer and returns its result.
 *
 * @param string $string the text to split
 * @param bool $wc passed through to the indexer's tokenizer
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}
241
242
/* For compatibility */
243
244
/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx, where <indexdir> is
 * $conf['indexdir']. Lines are returned as read, including their line endings.
 *
 * @author Tom N Harris
 *
 * @param string $idx    base name of the index
 * @param string $suffix appended to the base name before the ".idx" extension
 * @return array the lines of the index file, or an empty array when the file does not exist
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $path = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    return file_exists($path) ? file($path) : array();
}
259
260
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is not 0, the result is cached in
 * <indexdir>/lengths.idx and that file is used as long as it is younger than
 * $conf['readdircache'] seconds. Otherwise (or when the cache is stale or
 * unreadable) the index directory is scanned for files named i<number>.idx.
 *
 * Writing the cache file is best effort: if it cannot be opened for writing
 * the scan result is still returned.
 *
 * @author YoBoY
 *
 * @return array sorted list of integer lengths; empty when the index directory cannot be opened
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // a readdircache of 0 switches the cache file off entirely
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
        && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                // still fresh and readable: use the cached values
                return array_map('intval', $lengths);
            }
        }
    }

    // no usable cache: scan the index directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return array();
    }
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // only touch the handle when opening worked; passing false to fwrite()/fclose()
        // is a TypeError on PHP 8 which the @ operator does not suppress
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }

    return $idx;
}
318
319
/**
 * Get the word lengths that have been indexed.
 *
 * Depending on the type of $filter:
 *  - array: its keys are taken as lengths and only those for which the file
 *    <indexdir>/i<length>.idx exists are returned
 *  - otherwise: all indexed lengths (see idx_listIndexLengths()) that are
 *    equal to or greater than $filter are returned
 *
 * @author YoBoY
 *
 * @param array|int $filter lengths to check as array keys, or a minimum length
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $found = array();

    if (!is_array($filter)) {
        // keep all the values equal or superior
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length >= (int)$filter) {
                $found[] = $length;
            }
        }
        return $found;
    }

    // testing if index files exist only
    $prefix = $conf['indexdir']."/i";
    foreach (array_keys($filter) as $length) {
        if (file_exists($prefix.$length.'.idx')) {
            $found[] = $length;
        }
    }
    return $found;
}
350
351
/**
 * Clean a name of a key for use as a file name.
 *
 * The name is cast to string, trimmed and romanized. Runs of spaces, dots,
 * slashes, colons and dashes are then collapsed into a single underscore,
 * anything that's not a letter, number, or underscore is stripped away, and
 * the result is lowercased.
 *
 * @author Tom N Harris
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $romanized = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));

    // NOTE(review): in this single-quoted pattern "\\:" reaches PCRE as an escaped colon,
    // so a backslash is not turned into "_" here; it is removed by the next step instead.
    // Confirm whether that is intended.
    $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);

    return strtolower(preg_replace('/[^A-Za-z0-9_]/', '', $underscored));
}
368
369
//Setup VIM: ex: et ts=4 :
370