Failed Conditions
Pull Request — master (#2943)
by Andreas
03:32
created

Tokenizer::__construct()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 0
dl 0
loc 10
rs 9.9332
c 0
b 0
f 0
1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Extension\Event;
6
use dokuwiki\Utf8;
7
8
// Fallback minimum token length for the index. The Tokenizer uses this value
9
// only when IDX_MINWORDLENGTH is not defined. Length is measured with strlen();
// numeric tokens are exempt from the limit.
10
const MINWORDLENGTH = 2;
11
12
/**
 * DokuWiki Tokenizer class (Singleton)
 *
 * Splits plain text into lower-cased tokens for the fulltext index and drops
 * stop words as well as non-numeric tokens that are shorter than the minimum
 * word length.
 */
class Tokenizer
{
    /** @var Tokenizer|null shared instance, created lazily by getInstance() */
    protected static $instance = null;

    /** @var array $Stopwords Words that tokenizer ignores */
    protected $Stopwords;

    /** @var int $MinWordLength  minimum token length */
    protected $MinWordLength;

    /**
     * Tokenizer constructor. Singleton, thus protected!
     *
     * Picks the minimum token length (IDX_MINWORDLENGTH when defined, otherwise
     * MINWORDLENGTH) and loads the stop word list.
     */
    protected function __construct()
    {
        // note: the minimum length does not apply to numeric tokens
        $this->MinWordLength = (defined('IDX_MINWORDLENGTH'))
            ? IDX_MINWORDLENGTH
            : MINWORDLENGTH;

        $this->Stopwords = $this->getStopwords();
    }

    /**
     * Get new or existing singleton instance of the Tokenizer
     *
     * @return Tokenizer
     */
    public static function getInstance()
    {
        if (static::$instance === null) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one word per
     * line) and cached on the instance. A missing or unreadable file results
     * in an empty list.
     *
     * @return array                list of stop words
     *
     * @author Tom N Harris <[email protected]>
     */
    public function getStopwords()
    {
        if (!isset($this->Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

            $words = file_exists($swFile)
                ? file($swFile, FILE_IGNORE_NEW_LINES)
                : false;

            // file() returns false when reading fails; never cache that value,
            // getWords() relies on the list being an array
            $this->Stopwords = ($words === false) ? array() : $words;
        }
        return $this->Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * @return int
     */
    public function getMinWordLength()
    {
        return $this->MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string    $text   plain text
     * @param bool      $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <[email protected]>
     * @author Andreas Gohr <[email protected]>
     */
    public function getWords($text, $wc=false)
    {
        // the asterisk is appended to the extra character list handed to
        // Clean::stripspecials() only when wildcards are not allowed
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr(
            $text,
            array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
            )
        );
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        // build the stop word lookup once per call instead of searching the
        // whole list for every single token
        $stopwords = array_fill_keys($this->getStopwords(), true);

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // plain ASCII tokens take the cheap native strtolower()
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // too short (byte length); numeric tokens are always kept
            if (!is_numeric($word) && strlen($word) < $this->MinWordLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }
}
137