Failed Conditions
Pull Request — master (#2943)
by
unknown
03:12
created

Tokenizer::getWords()   B

Complexity

Conditions 11
Paths 108

Size

Total Lines 38

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 11
nc 108
nop 2
dl 0
loc 38
rs 7.25
c 0
b 0
f 0

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Extension\Event;
6
use dokuwiki\Utf8;
7
8
// set the minimum token length to use in the index
9
// (note, this doesn't apply to numeric tokens)
10
// Tokenizer::getMinWordLength() falls back to this value when IDX_MINWORDLENGTH is not defined
const MINWORDLENGTH = 2;
11
12
/**
13
 * DokuWiki Tokenizer class
14
 */
15
class Tokenizer
{
    /** @var array|null $Stopwords Words that tokenizer ignores (loaded on first use) */
    protected static $Stopwords;

    /** @var int|null $MinWordLength minimum token length (resolved on first use) */
    protected static $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from the stopwords.txt of the configured language
     * and cached in a static property. A missing or unreadable file results in
     * an empty list.
     *
     * @return array  list of stop words
     *
     * @author Tom N Harris
     */
    public static function getStopwords()
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

            $words = file_exists($swFile) ? file($swFile, FILE_IGNORE_NEW_LINES) : false;

            // file() returns false on failure; never cache that, callers expect an array
            static::$Stopwords = is_array($words) ? $words : array();
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined, otherwise the
     * MINWORDLENGTH default. The length limit is not applied to numeric tokens
     * (see getWords()).
     *
     * @return int
     */
    public static function getMinWordLength()
    {
        if (!isset(static::$MinWordLength)) {
            if (defined('IDX_MINWORDLENGTH')) {
                static::$MinWordLength = IDX_MINWORDLENGTH;
            } else {
                static::$MinWordLength = MINWORDLENGTH;
            }
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The returned words are lower-cased. Non-numeric words shorter than
     * getMinWordLength() bytes and words listed in getStopwords() are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed? When false, '\*' is added to the
     *                 extra character list handed to Utf8\Clean::stripspecials()
     * @return array  list of words in the text
     *
     * @author Tom N Harris
     * @author Andreas Gohr
     */
    public static function getWords($text, $wc = false)
    {
        $extraSpecials = '\._\-:' . ($wc ? '' : '\*');

        $text = self::prepareText($text);

        $text = strtr($text, array(
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        ));
        // plain ASCII alphanumerics and spaces need no further cleaning
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', $extraSpecials);
        }

        // resolve loop invariants once; the flipped list gives a constant time lookup
        $minLength = static::getMinWordLength();
        $stopwords = array_flip(static::getStopwords());

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // the multibyte aware lowercasing is only needed for non-ASCII words
            $word = preg_match('/[^0-9A-Za-z]/u', $word)
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // numeric tokens are exempt from the minimum length
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Run the INDEXER_TEXT_PREPARE event around the default text preparation
     *
     * Unless the default action is prevented, text containing anything else
     * than ASCII alphanumerics and spaces is passed through
     * Utf8\Asian::separateAsianWords().
     *
     * @param string $text plain text
     * @return string the prepared text
     */
    private static function prepareText($text)
    {
        // $text is handed to the event so that handlers can modify it
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true) && preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Asian::separateAsianWords($text);
        }
        $event->advise_after();

        return $text;
    }
}
114