NaiveBayesian   A
last analyzed

Complexity

Total Complexity 28

Size/Duplication

Total Lines 238
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 85
c 0
b 0
f 0
dl 0
loc 238
rs 10
wmc 28

9 Methods

Rating   Name   Duplication   Size   Complexity  
A untrain() 0 11 2
A getIgnoreList() 0 7 1
A __construct() 0 3 1
A rescale() 0 28 5
A train() 0 10 2
A updateProbabilities() 0 5 1
B getTokens() 0 25 9
B categorize() 0 37 6
A cleanString() 0 17 1
1
<?php declare(strict_types=1);
2
3
namespace XoopsModules\Xhelp;
4
5
/*
6
 ***** BEGIN LICENSE BLOCK *****
7
 This file is part of PHP Naive Bayesian Filter.
8
9
 The Initial Developer of the Original Code is
10
 Loic d'Anterroches [loic_at_xhtml.net].
11
 Portions created by the Initial Developer are Copyright (C) 2003
12
 the Initial Developer. All Rights Reserved.
13
14
 Contributor(s):
15
 See the source
16
17
 PHP Naive Bayesian Filter is free software; you can redistribute it
18
 and/or modify it under the terms of the GNU General Public License as
19
 published by the Free Software Foundation; either version 2 of
20
 the License, or (at your option) any later version.
21
22
 PHP Naive Bayesian Filter is distributed in the hope that it will
23
 be useful, but WITHOUT ANY WARRANTY; without even the implied
24
 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
25
 See the GNU General Public License for more details.
26
27
 You should have received a copy of the GNU General Public License
28
 along with Foobar; if not, write to the Free Software
29
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30
31
 Alternatively, the contents of this file may be used under the terms of
32
 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
33
 in which case the provisions of the LGPL are applicable instead
34
 of those above.
35
36
 ***** END LICENSE BLOCK *****
37
 */
38
39
/**
 * Naive Bayesian text classifier.
 *
 * Splits documents into tokens, keeps the per-category word statistics in the
 * injected storage object and scores a document against every category the
 * storage returns.
 */
class NaiveBayesian
{
    /** @var int minimum token length for a token to be taken into consideration */
    public $min_token_length = 3;
    /** @var int maximum token length for a token to be taken into consideration */
    public $max_token_length = 15;
    /**
     * @var array list of tokens to ignore; filled from getIgnoreList() on
     *            tokenization while it is still empty
     * @see getIgnoreList()
     */
    public $ignore_list = [];
    /**
     * @var NaiveBayesianStorage storage object
     * @see class NaiveBayesianStorage
     */
    public $nbs;

    /**
     * Xhelp\NaiveBayesian constructor.
     *
     * @param NaiveBayesianStorage $nbs storage object used for every read and write
     */
    public function __construct(NaiveBayesianStorage $nbs)
    {
        $this->nbs = $nbs;
    }

    /**
     * Categorize a document.
     *
     * Returns every category known to the storage together with a score for
     * the document in that category. Only tokens for which the storage reports
     * wordExists() take part in the computation.
     *
     * @param mixed $document document text; it is handed to getTokens(), which requires a string
     * @return array keys = category ids, values = scores as returned by rescale()
     */
    public function categorize($document): array
    {
        $scores     = [];
        $categories = $this->nbs->getCategories();
        $tokens     = $this->getTokens($document);

        // Overall number of words and number of categories.
        $total_words = 0;
        $ncat        = 0;
        foreach ($categories as $data) {
            $total_words += $data['word_count'];
            ++$ncat;
        }
        // Average words per category; each token factor is multiplied by it to avoid underflow.
        // With no categories the scoring loop below does not run, so the value is unused.
        $scale = $ncat > 0 ? $total_words / $ncat : 0;

        foreach ($categories as $category => $data) {
            $wordCount         = $data['word_count'];
            $scores[$category] = $data['probability'];
            // Small probability for a known word that is absent from this category.
            // A category without any words would divide by zero here, so it gets 0.0.
            $small_proba = $wordCount > 0 ? 1.0 / ($wordCount * 2) : 0.0;

            foreach ($tokens as $token => $count) {
                if (!$this->nbs->wordExists($token)) {
                    continue;
                }
                $word = $this->nbs->getWord($token, $category);
                // empty() also covers a missing 'count' entry without raising a warning.
                if (!empty($word['count']) && $wordCount > 0) {
                    $proba = $word['count'] / $wordCount;
                } else {
                    $proba = $small_proba;
                }
                $scores[$category] *= ($proba ** $count) * ($scale ** $count);
            }
        }

        return $this->rescale($scores);
    }

    /**
     * Train against a document.
     *
     * Registers every token of the content for the given category and saves
     * the document as a reference. After a set of trainings is done,
     * updateProbabilities() must be run.
     *
     * @param mixed $doc_id      identifier under which the reference is saved
     * @param mixed $category_id category the document belongs to
     * @param mixed $content     document text; it is handed to getTokens(), which requires a string
     * @return bool always true
     * @see updateProbabilities()
     * @see untrain()
     */
    public function train($doc_id, $category_id, $content): bool
    {
        foreach ($this->getTokens($content) as $token => $count) {
            $this->nbs->updateWord($token, $count, $category_id);
        }
        $this->nbs->saveReference($doc_id, $category_id, $content);

        return true;
    }

    /**
     * Untrain a document.
     *
     * Removes the token counts of one reference document from its category
     * and then removes the reference itself.
     *
     * @param mixed $doc_id identifier of the reference to remove
     * @return bool true when the reference was removed, false when the storage
     *              returned no usable reference (no content / category_id) for $doc_id
     * @see updateProbabilities()
     * @see train()
     */
    public function untrain($doc_id): bool
    {
        $ref = $this->nbs->getReference($doc_id);
        // Without content and category there is nothing to untrain; bail out
        // instead of passing null on to getTokens().
        if (!isset($ref['content'], $ref['category_id'])) {
            return false;
        }

        foreach ($this->getTokens($ref['content']) as $token => $count) {
            $this->nbs->removeWord($token, $count, $ref['category_id']);
        }
        $this->nbs->removeReference($doc_id);

        return true;
    }

    /**
     * Rescale the results.
     *
     * Every score is shifted by the largest score, passed through exp() and
     * the resulting vector is divided by its Euclidean norm. The largest score
     * therefore maps to exp(0) = 1 before normalization, which keeps the norm
     * at 1 or above for finite input.
     *
     * @param mixed $scores keys => category, values => raw scores
     * @return array normalized scores (keys => category, values => scores)
     * @author Ken Williams
     * @see    categorize()
     */
    public function rescale($scores): array
    {
        if (empty($scores)) {
            return [];
        }

        // Shift by the real maximum. Seeding the maximum with 0.0 left all-negative
        // input unshifted, so exp() could underflow to 0 and the norm became zero.
        $max   = \max($scores);
        $total = 0.0;
        foreach ($scores as $cat => $score) {
            $scores[$cat] = \exp($score - $max);
            $total        += $scores[$cat] ** 2;
        }
        $total = \sqrt($total);

        foreach ($scores as $cat => $score) {
            $scores[$cat] = $score / $total;
        }

        return $scores;
    }

    /**
     * Update the probabilities of the categories and word count.
     *
     * This function must be run after a set of training. The work is done
     * entirely by the storage object.
     *
     * @return bool result reported by the storage object
     * @see untrain()
     * @see train()
     */
    public function updateProbabilities(): bool
    {
        return $this->nbs->updateProbabilities();
    }

    /**
     * Get the list of tokens to ignore.
     *
     * Asks the module helper to load the 'noise_words' language file and
     * returns the global $xhelp_noise_words afterwards.
     *
     * @return array ignore list; empty when the global is not an array after loading
     */
    public function getIgnoreList(): array
    {
        global $xhelp_noise_words;

        $helper = Helper::getInstance();
        // Errors raised while loading are suppressed; an unavailable list simply
        // means that no token is filtered by the ignore list.
        @$helper->loadLanguage('noise_words');

        return \is_array($xhelp_noise_words) ? $xhelp_noise_words : [];
    }

    /**
     * Get the tokens from a string.
     *
     * The string is cleaned with cleanString() and split on every run of
     * characters other than letters, digits, '-' and '_'. Tokens that are
     * shorter than $min_token_length, longer than $max_token_length, purely
     * numeric or part of the ignore list are dropped.
     *
     * @author   James Seng. [https://james.seng.cc/] (based on his perl version)
     *
     * @param string $string the string to get the tokens from
     * @return array keys = tokens, values = number of occurrences
     */
    private function getTokens(string $string): array
    {
        $tokens = [];
        $string = $this->cleanString($string);
        if (0 === \count($this->ignore_list)) {
            $this->ignore_list = $this->getIgnoreList();
        }

        // The pattern needs real delimiters: without them PCRE treats "[" and "]" as
        // the delimiters and "+" as an unknown modifier, and the split fails.
        $rawtokens = \preg_split('/[^-_A-Za-z0-9]+/', $string, -1, \PREG_SPLIT_NO_EMPTY);
        if (false === $rawtokens) {
            return $tokens;
        }

        foreach ($rawtokens as $token) {
            $length = \mb_strlen($token);
            if ($length < $this->min_token_length
                || $length > $this->max_token_length
                || \preg_match('/^[0-9]+$/', $token)
                || \in_array($token, $this->ignore_list, true)) {
                continue;
            }
            // Only accepted tokens are recorded, so callers never see zero counts.
            $tokens[$token] = ($tokens[$token] ?? 0) + 1;
        }

        return $tokens;
    }

    /**
     * Clean a string from the diacritics and lower-case it.
     *
     * The accented letters of the range 192-255 listed below are replaced by
     * their unaccented ASCII letter. Valid UTF-8 input is matched against the
     * two-byte UTF-8 form of these letters; any other input is treated as a
     * single-byte string and matched byte by byte.
     *
     * NOTE(review): a single-byte string that happens to be valid UTF-8 is
     * handled as UTF-8; the two cannot be told apart from the bytes alone.
     *
     * @param mixed $string string to clean
     * @return string clean string
     * @author Antoine Bajolet [phpdig_at_toiletoine.net]
     * @author SPIP [https://uzine.net/spip/]
     */
    private function cleanString($string): string
    {
        // Code points of the accented letters ...
        $codes = [
            192, 193, 194, 195, 196, 197, /* A */
            224, 225, 226, 227, 228, 229, /* a */
            210, 211, 212, 213, 214, 216, /* O */
            242, 243, 244, 245, 246, 248, /* o */
            200, 201, 202, 203, /* E */
            232, 233, 234, 235, /* e */
            199, 231, /* Cc */
            204, 205, 206, 207, /* I */
            236, 237, 238, 239, /* i */
            217, 218, 219, 220, /* U */
            249, 250, 251, 252, /* u */
            255, 209, 241, /* yNn */
        ];
        // ... and, position by position, their replacements.
        $ascii = 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn';

        $isUtf8 = \mb_check_encoding($string, 'UTF-8');

        $map = [];
        foreach ($codes as $index => $code) {
            // U+00C0..U+00FF is encoded in UTF-8 as 0xC3 followed by (code point - 0x40).
            // Replacing the single bytes in UTF-8 text would tear these sequences apart.
            $from       = $isUtf8 ? "\xC3" . \chr($code - 0x40) : \chr($code);
            $map[$from] = $ascii[$index];
        }
        $clean = \strtr($string, $map);

        return $isUtf8 ? \mb_strtolower($clean, 'UTF-8') : \mb_strtolower($clean);
    }
}
282