Completed
Push — master ( 83b1b2...0f3bb4 )
by mw
02:04
created

Sanitizer::setOption()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 3

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 8
ccs 6
cts 6
cp 1
rs 9.4286
cc 3
eloc 4
nc 2
nop 2
crap 3
1
<?php
2
3
namespace Onoi\Tesa;
4
5
/**
6
 * @since 0.1
7
 *
8
 * @{
9
 */
10
// @codeCoverageIgnoreStart
11
// Option key for Sanitizer::setOption(): minimum character length a token
// must have to be kept by Sanitizer::sanitizeBy(). The constant's value is
// identical to its name.
define( 'ONOI_TESA_CHARACTER_MIN_LENGTH', 'ONOI_TESA_CHARACTER_MIN_LENGTH' );
12
// Option key for Sanitizer::setOption(): list of words that
// Sanitizer::sanitizeBy() keeps regardless of their length or stop word
// status. The constant's value is identical to its name.
define( 'ONOI_TESA_WORD_WHITELIST', 'ONOI_TESA_WORD_WHITELIST' );
13
// @codeCoverageIgnoreEnd
14
/**@}
15
 */
16
17
/**
18
 * @license GNU GPL v2+
19
 * @since 0.1
20
 *
21
 * @author mwjames
22
 */
23
class Sanitizer {

	/**
	 * Any change to the content of its data files should be reflected in a
	 * version change (the version number does not necessarily correlate with
	 * the library version)
	 */
	const VERSION = '0.1.1';

	/**
	 * The text being sanitized; mutated in place by the transforming methods.
	 *
	 * @var string
	 */
	private $string = '';

	/**
	 * Encoding detected for the current content of $string. It is refreshed
	 * by the methods that rely on it because other methods may change the
	 * content (and thereby its encoding) in the meantime.
	 *
	 * @var string|null
	 */
	private $encoding = null;

	/**
	 * Options keyed by the ONOI_TESA_* constants.
	 *
	 * @var array
	 */
	private $options = array();

	/**
	 * Stores the text and registers the default options: a minimum token
	 * length of 3 characters and an empty word whitelist.
	 *
	 * @since 0.1
	 *
	 * @param string $string
	 */
	public function __construct( $string ) {
		$this->string = $string;
		$this->encoding = $this->detectEncoding( $string );
		$this->setOption( ONOI_TESA_CHARACTER_MIN_LENGTH, 3 );
		$this->setOption( ONOI_TESA_WORD_WHITELIST, array() );
	}

	/**
	 * Sets an option. A non-empty ONOI_TESA_WORD_WHITELIST value is expected
	 * to be a list of words and is stored flipped (word => true) so that
	 * membership can be tested with isset().
	 *
	 * @since 1.0
	 *
	 * @param string $name
	 * @param mixed $value
	 */
	public function setOption( $name, $value ) {

		if ( $name === ONOI_TESA_WORD_WHITELIST && $value !== array() ) {
			$value = array_fill_keys( $value, true );
		}

		$this->options[$name] = $value;
	}

	/**
	 * Replaces the text with its transliterated form as produced by
	 * Transliterator::transliterate().
	 *
	 * @since 0.1
	 *
	 * @param integer $flag
	 */
	public function applyTransliteration( $flag = Transliterator::DIACRITICS ) {
		$this->string = Transliterator::transliterate( $this->string, $flag );
	}

	/**
	 * Returns the result of Tokenizer::tokenize() for the current text.
	 *
	 * @since 0.1
	 *
	 * @param integer $flag
	 *
	 * @return array
	 */
	public function getTokens( $flag = Tokenizer::STRICT ) {
		return Tokenizer::tokenize( $this->string, $flag );
	}

	/**
	 * Builds a space separated string from the tokens of the current text,
	 * dropping tokens that are too short or stop words (unless whitelisted)
	 * as well as direct repetitions of the previously kept token.
	 *
	 * The stored text itself is not modified. When tokenization yields
	 * nothing usable the stored text is returned unchanged.
	 *
	 * @since 0.1
	 *
	 * @param StopwordAnalyzer $stopwordAnalyzer
	 *
	 * @return string
	 */
	public function sanitizeBy( StopwordAnalyzer $stopwordAnalyzer ) {

		$words = $this->getTokens();

		if ( !$words || !is_array( $words ) ) {
			return $this->string;
		}

		// Use the same constants as setOption() so that reading and writing
		// an option can never disagree on the key
		$wordWhitelist = $this->options[ONOI_TESA_WORD_WHITELIST];
		$minLength = (int)$this->options[ONOI_TESA_CHARACTER_MIN_LENGTH];

		$index = array();
		$previous = null;

		foreach ( $words as $word ) {

			// If it is not an exemption and less than the required minimum length
			// or identified as stop word it is removed
			if ( !isset( $wordWhitelist[$word] ) && ( mb_strlen( $word ) < $minLength || $stopwordAnalyzer->isStopWord( $word ) ) ) {
				continue;
			}

			// Simple proximity checker for same words appearing next to each other
			if ( $previous === $word ) {
				continue;
			}

			$index[] = $word;
			$previous = $word;
		}

		return implode( ' ', $index );
	}

	/**
	 * Converts the text to lowercase using the encoding detected for its
	 * current content.
	 *
	 * @since 0.1
	 */
	public function toLowercase() {
		// The content may have changed since construction (replace(),
		// applyTransliteration()), therefore detect the encoding anew
		$this->encoding = $this->detectEncoding( $this->string );
		$this->string = mb_strtolower( $this->string, $this->encoding );
	}

	/**
	 * Shortens the text to at most $length characters. When the shortened
	 * text contains a space (other than as first character) it is cut at the
	 * last space so that it ends on a whole word; otherwise it is cut hard at
	 * $length characters.
	 *
	 * @since 0.1
	 *
	 * @param integer $length
	 */
	public function reduceLengthTo( $length ) {

		$this->encoding = $this->detectEncoding( $this->string );

		if ( mb_strlen( $this->string, $this->encoding ) <= $length ) {
			return;
		}

		$head = mb_substr( $this->string, 0, $length, $this->encoding );

		// Character (not byte) offset of the last space, it is fed back into
		// mb_substr which counts characters as well
		$lastSpace = mb_strrpos( $head, ' ', 0, $this->encoding );

		// No space (false) or only a leading one (0) means there is no whole
		// word to fall back to, keep the hard cut instead of emptying the text
		if ( $lastSpace !== false && $lastSpace > 0 ) {
			$head = mb_substr( $head, 0, $lastSpace, $this->encoding );
		}

		$this->string = $head;
	}

	/**
	 * Replaces occurrences of $search with $replace using str_replace().
	 *
	 * @see http://www.phpwact.org/php/i18n/utf-8#str_replace
	 * @since 0.1
	 *
	 * @param string $search
	 * @param string $replace
	 */
	public function replace( $search, $replace ) {
		$this->string = str_replace( $search, $replace, $this->string );
	}

	/**
	 * Whether the text contains a character from U+3040-U+309F (Hiragana)
	 * or U+30A0-U+30FF (Katakana).
	 *
	 * @since 0.1
	 *
	 * @return boolean
	 */
	public function containsJapaneseCharacters() {
		return preg_match( '/[\x{3040}-\x{309F}\x{30A0}-\x{30FF}]/u', $this->string ) > 0;
	}

	/**
	 * Whether the text contains a character from U+3130-U+318F or
	 * U+AC00-U+D7AF.
	 *
	 * @since 0.1
	 *
	 * @return boolean
	 */
	public function containsKoreanCharacters() {
		return preg_match( '/[\x{3130}-\x{318F}\x{AC00}-\x{D7AF}]/u', $this->string ) > 0;
	}

	/**
	 * Whether the text contains a character from U+4E00-U+9FA5.
	 *
	 * @since 0.1
	 *
	 * @return boolean
	 */
	public function containsChineseCharacters() {
		return preg_match( '/[\x{4e00}-\x{9fa5}]/u', $this->string ) > 0;
	}

	/**
	 * @since 0.1
	 *
	 * @return string
	 */
	public function __toString() {
		return $this->string;
	}

	/**
	 * Detects the encoding of the given string and falls back to the
	 * internal mbstring encoding when detection fails, so that callers always
	 * receive a value usable as encoding argument.
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	private function detectEncoding( $string ) {
		$encoding = mb_detect_encoding( $string );

		return $encoding !== false ? $encoding : mb_internal_encoding();
	}

}
211