1 | <?php |
||
23 | class Sanitizer { |
||
24 | |||
/**
 * Version of the data files. Bump it whenever the content of those files
 * changes; the number does not necessarily follow the library version.
 */
const VERSION = '0.1.1';

/**
 * Text this instance operates on.
 *
 * @var string
 */
private $string = '';

/**
 * Encoding handed to the mb_* string functions.
 *
 * @var null|string
 */
private $encoding;

/**
 * Option values keyed by option name.
 *
 * @var array
 */
private $options = array();
||
46 | |||
/**
 * Keeps the given text, records the result of detectEncoding() for it and
 * registers the default options: a minimum character length of 3 and an
 * empty word whitelist.
 *
 * @since 0.1
 *
 * @param string $string
 */
public function __construct( $string ) {
    $this->string = $string;
    $this->encoding = $this->detectEncoding( $string );

    // Name/value pairs, applied in order through setOption()
    $defaults = array(
        array( ONOI_TESA_CHARACTER_MIN_LENGTH, 3 ),
        array( ONOI_TESA_WORD_WHITELIST, array() ),
    );

    foreach ( $defaults as $default ) {
        $this->setOption( $default[0], $default[1] );
    }
}
|
58 | |||
/**
 * Stores an option value under the given name.
 *
 * A non-empty word whitelist is converted into a map of word => true
 * before it is stored; every other value is stored as passed in.
 *
 * @since 1.0
 *
 * @param string $name
 * @param mixed $value
 */
public function setOption( $name, $value ) {

    // Anything other than a non-empty whitelist is kept verbatim
    if ( $name !== ONOI_TESA_WORD_WHITELIST || $value === array() ) {
        $this->options[$name] = $value;
        return;
    }

    $this->options[$name] = array_fill_keys( $value, true );
}
|
73 | |||
74 | /** |
||
75 | * @since 0.1 |
||
76 | * |
||
77 | * @param integer $flag |
||
78 | */ |
||
79 | 1 | public function applyTransliteration( $flag = Transliterator::DIACRITICS ) { |
|
82 | |||
83 | /** |
||
84 | * @since 0.1 |
||
85 | * |
||
86 | * @param integer $flag |
||
87 | * |
||
88 | * @return array |
||
89 | */ |
||
90 | 11 | public function getTokens( $flag = Tokenizer::STRICT ) { |
|
93 | |||
/**
 * Builds a space-separated string from the tokens returned by getTokens(),
 * leaving out tokens that are shorter than the configured minimum length or
 * that the analyzer reports as stop words, and collapsing a token that
 * directly repeats the previously kept one.
 *
 * Whitelisted words bypass the length and stop word checks. When
 * getTokens() returns an empty or non-array value, the current text is
 * returned as is.
 *
 * @since 0.1
 *
 * @param StopwordAnalyzer $stopwordAnalyzer
 *
 * @return string
 */
public function sanitizeBy( StopwordAnalyzer $stopwordAnalyzer ) {

    $words = $this->getTokens();

    if ( !$words || !is_array( $words ) ) {
        return $this->string;
    }

    // Read the options through the same constants the constructor and
    // setOption() store them under, rather than look-alike string literals
    // that only match when each constant's value equals its own name
    $wordWhitelist = $this->options[ONOI_TESA_WORD_WHITELIST];
    $minLength = (int)$this->options[ONOI_TESA_CHARACTER_MIN_LENGTH];

    $index = array();
    $previous = null;

    foreach ( $words as $word ) {

        // If it is not an exemption and less than the required minimum length
        // or identified as stop word it is removed
        if ( !isset( $wordWhitelist[$word] ) && ( mb_strlen( $word ) < $minLength || $stopwordAnalyzer->isStopWord( $word ) ) ) {
            continue;
        }

        // Simple proximity checker for same words appearing next to each other
        if ( $previous !== null && $previous === $word ) {
            continue;
        }

        $index[] = $word;
        $previous = $word;
    }

    return implode( ' ', $index );
}
||
133 | |||
/**
 * Replaces the current text with its lowercase form, as produced by
 * mb_strtolower() using the stored encoding.
 *
 * @since 0.1
 */
public function toLowercase() {
    $lowercased = mb_strtolower( $this->string, $this->encoding );

    $this->string = $lowercased;
}
|
140 | |||
/**
 * Shortens the current text to at most $length characters. When the
 * leading $length characters contain a space, the cut is moved back to the
 * last such space so that no word is split.
 *
 * Text that already fits is left untouched. If there is no space inside
 * the leading $length characters (or only one at the very start), the text
 * is cut at exactly $length characters instead of being emptied.
 *
 * @since 0.1
 *
 * @param integer $length
 */
public function reduceLengthTo( $length ) {

    if ( mb_strlen( $this->string, $this->encoding ) <= $length ) {
        return;
    }

    $head = mb_substr( $this->string, 0, $length, $this->encoding );

    // mb_strrpos() reports a character offset, which is the unit mb_substr()
    // expects; the byte offset from strrpos() is wrong for multibyte text
    $lastSpace = mb_strrpos( $head, ' ', 0, $this->encoding );

    // false (no space) and 0 (leading space) would both cut the text down
    // to nothing, so in those cases the hard cut at $length is kept
    if ( $lastSpace !== false && $lastSpace > 0 ) {
        $head = mb_substr( $head, 0, $lastSpace, $this->encoding ); // last whole word
    }

    $this->string = $head;
}
|
158 | |||
159 | /** |
||
160 | * @see http://www.phpwact.org/php/i18n/utf-8#str_replace |
||
161 | * @since 0.1 |
||
162 | * |
||
163 | * @param string $search |
||
164 | * @param string $replace |
||
165 | */ |
||
166 | 1 | public function replace( $search, $replace ) { |
|
169 | |||
170 | /** |
||
171 | * @since 0.1 |
||
172 | * |
||
173 | * @return string |
||
174 | */ |
||
175 | 6 | public function __toString() { |
|
178 | |||
179 | 17 | private function detectEncoding( $string) { |
|
182 | |||
183 | } |
||
184 |