XoopsModules25x /
xhelp
This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include, or for example
via PHP's auto-loading mechanism.
| 1 | <?php declare(strict_types=1); |
||
| 2 | |||
| 3 | namespace XoopsModules\Xhelp; |
||
| 4 | |||
| 5 | /* |
||
| 6 | ***** BEGIN LICENSE BLOCK ***** |
||
| 7 | This file is part of PHP Naive Bayesian Filter. |
||
| 8 | |||
| 9 | The Initial Developer of the Original Code is |
||
| 10 | Loic d'Anterroches [loic_at_xhtml.net]. |
||
| 11 | Portions created by the Initial Developer are Copyright (C) 2003 |
||
| 12 | the Initial Developer. All Rights Reserved. |
||
| 13 | |||
| 14 | Contributor(s): |
||
| 15 | See the source |
||
| 16 | |||
| 17 | PHP Naive Bayesian Filter is free software; you can redistribute it |
||
| 18 | and/or modify it under the terms of the GNU General Public License as |
||
| 19 | published by the Free Software Foundation; either version 2 of |
||
| 20 | the License, or (at your option) any later version. |
||
| 21 | |||
| 22 | PHP Naive Bayesian Filter is distributed in the hope that it will |
||
| 23 | be useful, but WITHOUT ANY WARRANTY; without even the implied |
||
| 24 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
||
| 25 | See the GNU General Public License for more details. |
||
| 26 | |||
| 27 | You should have received a copy of the GNU General Public License |
||
| 28 | along with Foobar; if not, write to the Free Software |
||
| 29 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
||
| 30 | |||
| 31 | Alternatively, the contents of this file may be used under the terms of |
||
| 32 | the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
||
| 33 | in which case the provisions of the LGPL are applicable instead |
||
| 34 | of those above. |
||
| 35 | |||
| 36 | ***** END LICENSE BLOCK ***** |
||
| 37 | */ |
||
| 38 | |||
/**
 * Naive Bayesian text classifier.
 *
 * Splits a document into word tokens and scores it against every category
 * known to the storage backend. All persistence (categories, per-category
 * word counts, reference documents) is delegated to the storage object
 * passed to the constructor.
 */
class NaiveBayesian
{
    /** @var int minimum token length (in characters) for a token to be considered */
    public $min_token_length = 3;
    /** @var int maximum token length (in characters) for a token to be considered */
    public $max_token_length = 15;
    /**
     * @var array list of tokens to ignore; filled from getIgnoreList() the
     *            first time a text is tokenized while the list is still empty
     * @see getIgnoreList()
     */
    public $ignore_list = [];
    /**
     * @var NaiveBayesianStorage storage object
     * @see NaiveBayesianStorage
     */
    public $nbs;

    /**
     * Xhelp\NaiveBayesian constructor.
     *
     * @param NaiveBayesianStorage $nbs storage backend used for every lookup and update
     */
    public function __construct(NaiveBayesianStorage $nbs)
    {
        $this->nbs = $nbs;
    }

    /**
     * Categorize a document.
     *
     * Computes, for every category returned by the storage, the category
     * probability multiplied by one factor per known token of the document,
     * then passes the raw scores through rescale().
     *
     * @param mixed $document text to classify (handed to the tokenizer, which expects a string)
     * @return array keys = category ids, values = scores; empty when the storage has no category
     * @see rescale()
     */
    public function categorize($document): array
    {
        $categories = $this->nbs->getCategories();
        if (empty($categories)) {
            return [];
        }

        $tokens = $this->getTokens($document);

        // Whether the storage knows a word does not depend on the category,
        // so ask once per token instead of once per (category, token) pair.
        $knownTokens = [];
        foreach ($tokens as $token => $count) {
            // Array keys such as "-12" are turned into integers by PHP;
            // cast back so the storage always receives the word as a string.
            if ($this->nbs->wordExists((string)$token)) {
                $knownTokens[$token] = $count;
            }
        }

        $totalWords = 0;
        foreach ($categories as $data) {
            $totalWords += $data['word_count'];
        }
        // Each token factor is multiplied by the average category size to
        // keep the running product from underflowing.
        $averageWords = $totalWords / \count($categories);

        $scores = [];
        foreach ($categories as $category => $data) {
            $scores[$category] = $data['probability'];
            if ([] === $knownTokens) {
                // Nothing to weigh in: the score stays at the category probability.
                continue;
            }

            $wordCount = $data['word_count'];
            if ($wordCount <= 0) {
                // A category without any trained word cannot account for a
                // known token; dividing by its word count would raise a
                // DivisionByZeroError on PHP 8.
                $scores[$category] = 0.0;
                continue;
            }

            // Small probability for a word that is not in the category.
            $smallProba = 1.0 / ($wordCount * 2);
            foreach ($knownTokens as $token => $count) {
                $word  = $this->nbs->getWord((string)$token, $category);
                $proba = empty($word['count']) ? $smallProba : $word['count'] / $wordCount;

                $scores[$category] *= ($proba ** $count) * ($averageWords ** $count);
            }
        }

        return $this->rescale($scores);
    }

    /**
     * Train against a document.
     *
     * Adds the token counts of the document to the given category and saves
     * the document as a reference. After a set of trainings is done,
     * updateProbabilities() must be run.
     *
     * @param mixed $doc_id      identifier under which the reference is saved
     * @param mixed $category_id category the document belongs to
     * @param mixed $content     document text (handed to the tokenizer, which expects a string)
     * @return bool success (always true)
     * @see updateProbabilities()
     * @see untrain()
     */
    public function train($doc_id, $category_id, $content): bool
    {
        foreach ($this->getTokens($content) as $token => $count) {
            // Cast: PHP converts integer-like array keys to int.
            $this->nbs->updateWord((string)$token, $count, $category_id);
        }
        $this->nbs->saveReference($doc_id, $category_id, $content);

        return true;
    }

    /**
     * Untrain a document.
     *
     * Removes the token counts of one reference document from its category
     * and deletes the reference itself.
     *
     * @param mixed $doc_id identifier of the reference to remove
     * @return bool true on success, false when the storage returns no usable reference for $doc_id
     * @see updateProbabilities()
     * @see train()
     */
    public function untrain($doc_id): bool
    {
        $ref = $this->nbs->getReference($doc_id);
        if (!isset($ref['content'], $ref['category_id'])) {
            // Unknown document: nothing to remove, and the tokenizer would
            // reject a null content with a TypeError.
            return false;
        }

        foreach ($this->getTokens((string)$ref['content']) as $token => $count) {
            // Cast: PHP converts integer-like array keys to int.
            $this->nbs->removeWord((string)$token, $count, $ref['category_id']);
        }
        $this->nbs->removeReference($doc_id);

        return true;
    }

    /**
     * Rescale the results.
     *
     * Subtracts the largest score (or 0.0 when every score is below zero)
     * from each score, exponentiates the difference and divides every value
     * by the Euclidean norm of the exponentiated values.
     *
     * NOTE(review): this routine was written for scores in log space, while
     * categorize() hands in plain products; the original behaviour is kept
     * as is - confirm whether that is intended.
     *
     * @param mixed $scores array of raw scores (keys => category, values => scores)
     * @return array normalized scores (keys => category, values => scores)
     * @author Ken Williams, [email protected]
     * @see    categorize()
     */
    public function rescale($scores): array
    {
        $max = 0.0;
        foreach ($scores as $score) {
            if ($score >= $max) {
                $max = $score;
            }
        }

        $total = 0.0;
        foreach ($scores as $cat => $score) {
            $scores[$cat] = \exp($score - $max);
            $total        += $scores[$cat] ** 2;
        }
        $total = \sqrt($total);

        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float)$score / $total;
        }

        return $scores;
    }

    /**
     * Update the probabilities of the categories and the word counts.
     * This function must be run after a set of trainings.
     *
     * @return bool success, as reported by the storage
     * @see untrain()
     * @see train()
     */
    public function updateProbabilities(): bool
    {
        // This is pure data manipulation, which is why it is done entirely
        // in the storage object.
        return $this->nbs->updateProbabilities();
    }

    /**
     * Get the list of tokens to ignore.
     *
     * Loads the 'noise_words' language file through the module helper and
     * returns the global $xhelp_noise_words.
     *
     * @return array ignore list; empty when the global is not an array after loading
     */
    public function getIgnoreList(): array
    {
        global $xhelp_noise_words;

        $helper = Helper::getInstance();
        // Loading stays best effort (errors suppressed as before); the
        // outcome is checked explicitly below instead of being trusted.
        @$helper->loadLanguage('noise_words');

        return \is_array($xhelp_noise_words) ? $xhelp_noise_words : [];
    }

    /**
     * Get the tokens from a string.
     *
     * The string is cleaned with cleanString() and split on every run of
     * characters other than ASCII letters, digits, '-' and '_'. Tokens that
     * are shorter than $min_token_length, longer than $max_token_length,
     * made of digits only, or present in the ignore list are dropped.
     *
     * @author James Seng. [https://james.seng.cc/] (based on his perl version)
     *
     * @param string $string the string to get the tokens from
     * @return array keys = tokens, values = number of occurrences (always >= 1)
     */
    private function getTokens(string $string): array
    {
        $string = $this->cleanString($string);
        if (0 === \count($this->ignore_list)) {
            $this->ignore_list = $this->getIgnoreList();
        }

        // The pattern needs real delimiters: without them PCRE reads '[' and
        // ']' as the delimiter pair and rejects '+' as an unknown modifier.
        $rawTokens = \preg_split('/[^-_A-Za-z0-9]+/', $string, -1, \PREG_SPLIT_NO_EMPTY);
        if (false === $rawTokens) {
            return [];
        }

        $tokens = [];
        foreach ($rawTokens as $token) {
            $length = \mb_strlen($token);
            if ($length < $this->min_token_length
                || $length > $this->max_token_length
                || \preg_match('/^[0-9]+$/', $token)
                || \in_array($token, $this->ignore_list, true)) {
                continue;
            }
            // Only accepted tokens are recorded, so no zero counts are returned.
            $tokens[$token] = ($tokens[$token] ?? 0) + 1;
        }

        return $tokens;
    }

    /**
     * Clean a string from the diacritics and lower-case it.
     *
     * Accented Latin-1 letters are replaced by their unaccented ASCII
     * letter. Valid UTF-8 input is folded character by character; any other
     * input is treated as single-byte text and folded byte by byte, as the
     * original implementation did.
     *
     * @param mixed $string text to clean (cast to string)
     * @return string clean string
     * @author Antoine Bajolet [phpdig_at_toiletoine.net]
     * @author SPIP [https://uzine.net/spip/]
     */
    private function cleanString($string): string
    {
        $string = (string)$string;

        // Latin-1 code points of the accented letters, in the same order as
        // their replacements in $plain.
        $codes = [
            192, 193, 194, 195, 196, 197, /* A */
            224, 225, 226, 227, 228, 229, /* a */
            210, 211, 212, 213, 214, 216, /* O */
            242, 243, 244, 245, 246, 248, /* o */
            200, 201, 202, 203, /* E */
            232, 233, 234, 235, /* e */
            199, 231, /* Cc */
            204, 205, 206, 207, /* I */
            236, 237, 238, 239, /* i */
            217, 218, 219, 220, /* U */
            249, 250, 251, 252, /* u */
            255, 209, 241, /* yNn */
        ];
        $plain = 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn';

        if (\mb_check_encoding($string, 'UTF-8')) {
            // Byte-wise replacement would tear multi-byte sequences apart
            // (0xC3 is both Latin-1 'A tilde' and a UTF-8 lead byte), so map
            // whole characters instead.
            $map = [];
            foreach ($codes as $index => $code) {
                $map[\mb_convert_encoding(\chr($code), 'UTF-8', 'ISO-8859-1')] = $plain[$index];
            }

            return \mb_strtolower(\strtr($string, $map), 'UTF-8');
        }

        // Not valid UTF-8: keep the legacy single-byte behaviour.
        $diac = \implode('', \array_map('chr', $codes));

        return \mb_strtolower(\strtr($string, $diac, $plain));
    }
}
||
| 282 |
If you suppress an error, we recommend checking for the error condition explicitly: