|
1
|
|
|
<?php |
|
2
|
|
|
/*
 You may not change or alter any portion of this comment or credits
 of supporting developers from this source code or any supporting source code
 which is considered copyrighted (c) material of the original comment or credit authors.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */
|
11
|
|
|
|
|
12
|
|
|
namespace Xmf; |
|
13
|
|
|
|
|
14
|
|
|
/**
 * StopWords - facilitate filtering of common or purely connective words for natural language processing
 *
 * The stop word list is read from the _XMF_STOPWORDS constant, a whitespace
 * separated string of words. Words are stored lowercased as array keys so a
 * lookup is a single isset() call.
 *
 * @category  Xmf\StopWords
 * @package   Xmf
 * @author    Richard Griffith
 * @author    trabis
 * @copyright 2011-2016 XOOPS Project (http://xoops.org)
 * @license   GNU GPL 2 or later (http://www.gnu.org/licenses/gpl-2.0.html)
 * @link      http://xoops.org
 * @see       https://en.wikipedia.org/wiki/Stop_words
 */
class StopWords
{
    /**
     * mbstring encoding
     */
    const ENCODING = 'UTF-8';

    /** @var array stop words as keys (lowercased), each mapped to true */
    protected $stopwordList = array();

    /**
     * StopWords constructor - load stop words for current locale
     *
     * If _XMF_STOPWORDS is not yet defined, the 'stopwords' language resource
     * is requested through Language::load() (presumably that resource defines
     * the constant - confirm against the language files). If the constant is
     * still undefined afterwards the list stays empty, and check() then reports
     * every word as significant.
     *
     * @todo specify locale to constructor, will require shift away from defined constant
     */
    public function __construct()
    {
        if (!defined('_XMF_STOPWORDS')) {
            Language::load('stopwords');
        }
        if (!defined('_XMF_STOPWORDS')) {
            return;
        }

        // Lowercase the list with the same routine check() applies to its
        // argument, so entries written with capitals can still be matched.
        $normalized = $this->normalize((string) _XMF_STOPWORDS);

        // Split on runs of ASCII whitespace and drop empty tokens. An explicit
        // byte class is used instead of \s so that no byte of a multi-byte
        // UTF-8 character can be mistaken for a separator.
        $words = preg_split('/[ \t\r\n]+/', $normalized, -1, PREG_SPLIT_NO_EMPTY);
        if (is_array($words) && !empty($words)) {
            $this->stopwordList = array_fill_keys($words, true);
        }
    }

    /**
     * check - look up a word in a list of stop words and
     * classify it as a significant word or a stop word.
     *
     * The comparison is case-insensitive: the word is lowercased before lookup.
     *
     * @param string $key the word to check
     *
     * @return bool True if word is significant, false if it is a stop word
     */
    public function check($key)
    {
        $key = $this->normalize($key);

        return !isset($this->stopwordList[$key]);
    }

    /**
     * Lowercase text, using mbstring with the class ENCODING when the
     * extension is available and falling back to strtolower() otherwise.
     *
     * @param string $text text to lowercase
     *
     * @return string lowercased text
     */
    private function normalize($text)
    {
        return function_exists('mb_strtolower')
            ? mb_strtolower($text, static::ENCODING)
            : strtolower($text);
    }
}
|
69
|
|
|
|