JaCompoundGroupTokenizer::tokenize()   C
last analyzed

Complexity

Conditions 7
Paths 20

Size

Total Lines 27
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 17
CRAP Score 7.0084

Importance

Changes 0
Metric Value
dl 0
loc 27
ccs 17
cts 18
cp 0.9444
rs 6.7272
c 0
b 0
f 0
cc 7
eloc 13
nc 20
nop 1
crap 7.0084
1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
use Onoi\Tesa\CharacterExaminer;
6
7
/**
8
 * @license GNU GPL v2+
9
 * @since 0.1
10
 *
11
 * @author mwjames
12
 */
13
class JaCompoundGroupTokenizer implements Tokenizer {

	/**
	 * Substrings that tokenize() replaces with a single space before the
	 * remaining text is split into character groups.
	 *
	 * NOTE(review): str_replace() applies the entries in array order, therefore
	 * an entry that contains an earlier entry (e.g. "その他" listed after
	 * "その", or "に対して" listed after "に") can never match as a whole.
	 * Confirm whether a longest-first order is intended before reordering the
	 * list, since doing so changes the produced tokens.
	 *
	 * @var array
	 */
	private $compound = array(
		"あっ",
		"あり",
		"ある",
		"い",
		"いう",
		"いる",
		"う",
		"うち",
		"お",
		"および",
		"おり",
		"か",
		"かつて",
		"から",
		"が",
		"き",
		"ここ",
		"こと",
		"この",
		"これ",
		"これら",
		"さ",
		"さらに",
		"し",
		"しかし",
		"する",
		"ず",
		"せ",
		"せる",
		"そして",
		"その",
		"その他",
		"その後",
		"それ",
		"それぞれ",
		"た",
		"ただし",
		"たち",
		"ため",
		"たり",
		"だ",
		"だっ",
		"つ",
		"て",
		"で",
		"でき",
		"できる",
		"です",
		"では",
		"でも",
		"と",
		"という",
		"といった",
		"とき",
		"ところ",
		"として",
		"とともに",
		"とも",
		"と共に",
		"な",
		"ない",
		"なお",
		"なかっ",
		"ながら",
		"なく",
		"なっ",
		"など",
		"なら",
		"なり",
		"なる",
		"に",
		"において",
		"における",
		"について",
		"にて",
		"によって",
		"により",
		"による",
		"に対して",
		"に対する",
		"に関する",
		"の",
		"ので",
		"のみ",
		"は",
		"ば",
		"へ",
		"ほか",
		"ほとんど",
		"ほど",
		"ます",
		"また",
		"または",
		"まで",
		"も",
		"もの",
		"ものの",
		"や",
		"よう",
		"より",
		"ら",
		"られ",
		"られる",
		"れ",
		"れる",
		"を",
		"ん",
		"及び",
		"特に",
		"、",
		"。",
		"「",
		"」"
	);

	/**
	 * Optional tokenizer that runs before this one; null when none was given.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer tokenizer whose output is fed into
	 *  this one, or null to work on the raw input string
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the wrapped tokenizer; does nothing when no
	 * tokenizer was injected.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Replaces every entry of the compound list with a space, separates runs
	 * of hiragana, katakana and kanji, and returns the remaining non-empty
	 * segments as a zero-indexed list. Segments that consist of exactly one
	 * hiragana/katakana character are dropped.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function tokenize( $string ) {

		// Run the wrapped tokenizer first and rejoin its tokens so that the
		// replacement below operates on one string
		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		$segments = explode(
			" ",
			$this->splitByCharacterGroup( str_replace( $this->compound, ' ', $string ) )
		);

		// explode() with a non-empty delimiter always yields an array, hence
		// no `false` check is required; collecting into a fresh list also
		// makes a trailing array_values() unnecessary
		$tokens = array();

		foreach ( $segments as $segment ) {

			if ( $segment === '' ) {
				continue;
			}

			// Single katakana/hiragana are exempted; the encoding is passed
			// explicitly because the patterns used by this class are UTF-8
			// byte sequences and must not depend on mb_internal_encoding()
			if ( mb_strlen( $segment, 'UTF-8' ) === 1 && CharacterExaminer::contains( CharacterExaminer::HIRAGANA_KATAKANA, $segment ) ) {
				continue;
			}

			$tokens[] = $segment;
		}

		return $tokens;
	}

	/**
	 * Surrounds each run of hiragana, each run of katakana and each run of
	 * characters from the kanji pattern below with spaces. The patterns match
	 * raw UTF-8 byte sequences, so the input is expected to be UTF-8 encoded.
	 *
	 * @see MediaWiki LanguageJa::segmentByWord
	 *
	 * @since 0.1
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	public function splitByCharacterGroup( $string ) {

		// Space strings of like hiragana/katakana/kanji
		$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
		$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
		$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
			. '|[\xe4-\xe8][\x80-\xbf]{2}'
			. '|\xe9[\x80-\xa5][\x80-\xbf]'
			. '|\xe9\xa6[\x80-\x99])';
			# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99

		$reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";

		return $this->insertSpace( $string, $reg );
	}

	/**
	 * Wraps every match of $pattern in spaces and collapses consecutive
	 * spaces into one.
	 *
	 * preg_replace() returns null when PCRE fails (e.g. a backtrack or
	 * recursion limit is hit); in that case the step is skipped and the text
	 * from before that step is kept, instead of letting null turn the whole
	 * input into an empty string.
	 *
	 * @param string $string
	 * @param string $pattern
	 *
	 * @return string
	 */
	private function insertSpace( $string, $pattern ) {

		$spaced = preg_replace( $pattern, " $1 ", $string );

		if ( $spaced === null ) {
			$spaced = $string;
		}

		$collapsed = preg_replace( '/ +/', ' ', $spaced );

		return $collapsed === null ? $spaced : $collapsed;
	}

}
229