Completed
Push — master ( 0f3bb4...97a511 )
by mw
02:24
created

Sanitizer::containsChineseCharacters()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 3
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
namespace Onoi\Tesa;
4
5
/**
 * Option identifiers passed as the `$name` argument of Sanitizer::setOption.
 * Each constant evaluates to its own name, so the constant and the equally
 * named string literal address the same option.
 *
 * @since 0.1
 *
 * @{
 */
// @codeCoverageIgnoreStart
define( 'ONOI_TESA_CHARACTER_MIN_LENGTH', 'ONOI_TESA_CHARACTER_MIN_LENGTH' );
define( 'ONOI_TESA_WORD_WHITELIST', 'ONOI_TESA_WORD_WHITELIST' );
// @codeCoverageIgnoreEnd
/**@}
 */
16
17
/**
 * Holds a string and applies clean-up operations to it in place
 * (transliteration, lower-casing, length reduction, search/replace). The
 * current state is available through `__toString`, `getTokens` and
 * `sanitizeBy`.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class Sanitizer {

	/**
	 * Any change to the content of its data files should be reflected in a
	 * version change (the version number does not necessarily correlate with
	 * the library version)
	 */
	const VERSION = '0.1.1';

	/**
	 * The string being worked on; modified in place by the mutating methods.
	 *
	 * @var string
	 */
	private $string = '';

	/**
	 * Encoding detected for the string handed to the constructor; it is not
	 * re-detected after the string has been modified.
	 *
	 * @var null|string
	 */
	private $encoding = null;

	/**
	 * Option values keyed by option name.
	 *
	 * @var array
	 */
	private $options = array();

	/**
	 * Stores the string, detects its encoding and registers the default
	 * options (minimum word length of 3, empty word whitelist).
	 *
	 * @since 0.1
	 *
	 * @param string $string
	 */
	public function __construct( $string ) {
		$this->string = $string;
		$this->encoding = $this->detectEncoding( $string );
		$this->setOption( ONOI_TESA_CHARACTER_MIN_LENGTH, 3 );
		$this->setOption( ONOI_TESA_WORD_WHITELIST, array() );
	}

	/**
	 * Sets an option. A non-empty list given for ONOI_TESA_WORD_WHITELIST is
	 * converted into a `word => true` map so that membership can be tested
	 * with `isset`.
	 *
	 * @since 1.0
	 *
	 * @param string $name
	 * @param mixed $value
	 */
	public function setOption( $name, $value ) {

		if ( $name === ONOI_TESA_WORD_WHITELIST && $value !== array() ) {
			$value = array_fill_keys( $value, true );
		}

		$this->options[$name] = $value;
	}

	/**
	 * Replaces the string with the result of Transliterator::transliterate.
	 *
	 * @since 0.1
	 *
	 * @param integer $flag
	 */
	public function applyTransliteration( $flag = Transliterator::DIACRITICS ) {
		$this->string = Transliterator::transliterate( $this->string, $flag );
	}

	/**
	 * Returns the result of Tokenizer::tokenize for the current string; the
	 * string itself is left unchanged.
	 *
	 * @since 0.1
	 *
	 * @param integer $flag
	 *
	 * @return array
	 */
	public function getTokens( $flag = Tokenizer::STRICT ) {
		return Tokenizer::tokenize( $this->string, $flag );
	}

	/**
	 * Builds a space separated string from the tokens of the current string,
	 * leaving out words that are shorter than the configured minimum length
	 * or reported as stop word (unless whitelisted), as well as a word that
	 * directly repeats the previously kept word.
	 *
	 * When no tokens are available the current string is returned as-is. The
	 * stored string is not modified by this method.
	 *
	 * @since 0.1
	 *
	 * @param StopwordAnalyzer $stopwordAnalyzer
	 *
	 * @return string
	 */
	public function sanitizeBy( StopwordAnalyzer $stopwordAnalyzer ) {

		$words = $this->getTokens();

		if ( !$words || !is_array( $words ) ) {
			return $this->string;
		}

		// Read the options through the same constants that are used to set them
		$wordWhitelist = $this->options[ONOI_TESA_WORD_WHITELIST];
		$minLength = (int)$this->options[ONOI_TESA_CHARACTER_MIN_LENGTH];

		$index = array();
		$pos = 0;

		foreach ( $words as $word ) {

			// If it is not an exemption and less than the required minimum length
			// or identified as stop word it is removed
			if ( !isset( $wordWhitelist[$word] ) && ( mb_strlen( $word ) < $minLength || $stopwordAnalyzer->isStopWord( $word ) ) ) {
				continue;
			}

			// Simple proximity checker for same words appearing next to each other
			if ( isset( $index[$pos - 1] ) && $index[$pos - 1] === $word ) {
				continue;
			}

			$index[] = $word;
			$pos++;
		}

		return implode( ' ', $index );
	}

	/**
	 * Lower-cases the string using the encoding detected at construction.
	 *
	 * @since 0.1
	 */
	public function toLowercase() {
		$this->string = mb_strtolower( $this->string, $this->encoding );
	}

	/**
	 * Shortens the string to at most `$length` characters. If the shortened
	 * part contains a space after its first character, the string is cut at
	 * the last such space so that it ends on a whole word; otherwise it is
	 * cut at exactly `$length` characters.
	 *
	 * A string that is not longer than `$length` is left untouched.
	 *
	 * @since 0.1
	 *
	 * @param integer $length
	 */
	public function reduceLengthTo( $length ) {

		if ( mb_strlen( $this->string, $this->encoding ) <= $length ) {
			return;
		}

		$reduced = mb_substr( $this->string, 0, $length, $this->encoding );

		// The position has to be a character offset (not a byte offset) since
		// it is used as character count by mb_substr
		$lastSpace = mb_strrpos( $reduced, ' ', 0, $this->encoding );

		// Without a usable space (none at all, or only at position 0) keep the
		// hard cut instead of ending up with an empty string
		if ( $lastSpace !== false && $lastSpace > 0 ) {
			$reduced = mb_substr( $reduced, 0, $lastSpace, $this->encoding );
		}

		$this->string = $reduced;
	}

	/**
	 * Replaces all occurrences of `$search` with `$replace` using
	 * `str_replace`.
	 *
	 * @see http://www.phpwact.org/php/i18n/utf-8#str_replace
	 * @since 0.1
	 *
	 * @param string $search
	 * @param string $replace
	 */
	public function replace( $search, $replace ) {
		$this->string = str_replace( $search, $replace, $this->string );
	}

	/**
	 * Returns the current state of the string.
	 *
	 * @since 0.1
	 *
	 * @return string
	 */
	public function __toString() {
		// __toString is required to return a string, the constructor however
		// does not enforce the type of the value it was given
		return (string)$this->string;
	}

	/**
	 * Detects the encoding of the given string, falling back to the internal
	 * encoding when the detection fails (mb_detect_encoding returns false in
	 * that case, which is not a valid encoding argument for the mb_*
	 * functions used by this class).
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	private function detectEncoding( $string ) {
		$encoding = mb_detect_encoding( $string );

		return $encoding !== false ? $encoding : mb_internal_encoding();
	}

}
184