JaCompoundGroupTokenizer   A
last analyzed

Complexity

Total Complexity 13

Size/Duplication

Total Lines 216
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 2

Test Coverage

Coverage 91.89%

Importance

Changes 0
Metric Value
wmc 13
lcom 1
cbo 2
dl 0
loc 216
ccs 34
cts 37
cp 0.9189
rs 10
c 0
b 0
f 0

6 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 3 1
A setOption() 0 5 2
A isWordTokenizer() 0 3 1
C tokenize() 0 27 7
A splitByCharacterGroup() 0 15 1
A insertSpace() 0 3 1
1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
use Onoi\Tesa\CharacterExaminer;
6
7
/**
 * Tokenizer for Japanese text that needs no dictionary: it blanks out every
 * string listed in $compound, inserts spaces at the boundaries between runs
 * of hiragana, katakana and kanji, and returns the remaining segments.
 *
 * An optional inner tokenizer can be supplied; when present its tokens are
 * joined with spaces and used as the input for the steps above.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class JaCompoundGroupTokenizer implements Tokenizer {

	/**
	 * Strings (particles, auxiliaries, conjunctions, punctuation) that are
	 * replaced by a space before the text is segmented.
	 *
	 * NOTE(review): str_replace() applies these entries in array order, so an
	 * entry that starts with an earlier entry (e.g. "その他" listed after
	 * "その") can no longer match as a whole once the shorter entry has been
	 * replaced. Confirm whether longest-first matching was intended before
	 * reordering, since that would change the produced tokens.
	 *
	 * @var array
	 */
	private $compound = array(
		"あっ",
		"あり",
		"ある",
		"い",
		"いう",
		"いる",
		"う",
		"うち",
		"お",
		"および",
		"おり",
		"か",
		"かつて",
		"から",
		"が",
		"き",
		"ここ",
		"こと",
		"この",
		"これ",
		"これら",
		"さ",
		"さらに",
		"し",
		"しかし",
		"する",
		"ず",
		"せ",
		"せる",
		"そして",
		"その",
		"その他",
		"その後",
		"それ",
		"それぞれ",
		"た",
		"ただし",
		"たち",
		"ため",
		"たり",
		"だ",
		"だっ",
		"つ",
		"て",
		"で",
		"でき",
		"できる",
		"です",
		"では",
		"でも",
		"と",
		"という",
		"といった",
		"とき",
		"ところ",
		"として",
		"とともに",
		"とも",
		"と共に",
		"な",
		"ない",
		"なお",
		"なかっ",
		"ながら",
		"なく",
		"なっ",
		"など",
		"なら",
		"なり",
		"なる",
		"に",
		"において",
		"における",
		"について",
		"にて",
		"によって",
		"により",
		"による",
		"に対して",
		"に対する",
		"に関する",
		"の",
		"ので",
		"のみ",
		"は",
		"ば",
		"へ",
		"ほか",
		"ほとんど",
		"ほど",
		"ます",
		"また",
		"または",
		"まで",
		"も",
		"もの",
		"ものの",
		"や",
		"よう",
		"より",
		"ら",
		"られ",
		"られる",
		"れ",
		"れる",
		"を",
		"ん",
		"及び",
		"特に",
		"、",
		"。",
		"「",
		"」"
	);

	/**
	 * Optional inner tokenizer that pre-processes the input; null when none
	 * was supplied.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer optional tokenizer applied before this one
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the inner tokenizer, if there is one. This class
	 * keeps no options of its own.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Splits the given text into a list of tokens.
	 *
	 * Steps: (1) run the inner tokenizer, if any, and re-join its tokens with
	 * spaces; (2) replace every $compound entry with a space; (3) separate
	 * runs of hiragana, katakana and kanji by spaces; (4) split on spaces and
	 * drop empty segments as well as single hiragana/katakana characters.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 *
	 * @return array sequentially indexed list of tokens, possibly empty
	 */
	public function tokenize( $string ) {

		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		$segments = explode(
			" ",
			$this->splitByCharacterGroup( str_replace( $this->compound, ' ', $string ) )
		);

		// Collect the kept segments into a fresh list; explode() with a
		// non-empty separator always yields an array, so no failure branch
		// is needed here.
		$result = array();

		foreach ( $segments as $segment ) {

			// Consecutive or leading/trailing spaces produce empty segments
			if ( $segment === '' ) {
				continue;
			}

			// Single katakana/hiragana are exempted
			if ( mb_strlen( $segment ) === 1 && CharacterExaminer::contains( CharacterExaminer::HIRAGANA_KATAKANA, $segment ) ) {
				continue;
			}

			$result[] = $segment;
		}

		return $result;
	}

	/**
	 * Surrounds each run of hiragana, katakana or kanji with spaces so that
	 * the text can afterwards be split on the space character.
	 *
	 * The character classes are written as UTF-8 byte sequences and the
	 * pattern is used without the "u" modifier, hence it matches on bytes.
	 *
	 * @see MediaWiki LanguageJa::segmentByWord
	 *
	 * @since 0.1
	 *
	 * @param string $string
	 *
	 * @return string the text with runs separated by single spaces
	 */
	public function splitByCharacterGroup( $string ) {

		// Space strings of like hiragana/katakana/kanji
		$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
		$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
		$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
			. '|[\xe4-\xe8][\x80-\xbf]{2}'
			. '|\xe9[\x80-\xa5][\x80-\xbf]'
			. '|\xe9\xa6[\x80-\x99])';
			# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99

		$reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";

		return $this->insertSpace( $string, $reg );
	}

	/**
	 * Wraps every match of $pattern in spaces and collapses runs of spaces
	 * into a single space.
	 *
	 * preg_replace() returns null when PCRE fails; in that case the last
	 * successfully produced string is returned instead, so that a regex
	 * failure does not silently discard the whole input.
	 *
	 * @param string $string
	 * @param string $pattern regular expression with one capture group
	 *
	 * @return string
	 */
	private function insertSpace( $string, $pattern ) {

		$spaced = preg_replace( $pattern, " $1 ", $string );

		if ( $spaced === null ) {
			return $string;
		}

		$collapsed = preg_replace( '/ +/', ' ', $spaced );

		return $collapsed === null ? $spaced : $collapsed;
	}

}
229