Completed
Push — master ( 9401a4...2cc7a2 )
by mw
13:59
created

JaCompoundGroupTokenizer::tokenize()   C

Complexity

Conditions 7
Paths 20

Size

Total Lines 27
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 27
rs 6.7272
cc 7
eloc 13
nc 20
nop 1
1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
use Onoi\Tesa\CharacterExaminer;
6
7
/**
 * Tokenizer for Japanese text.
 *
 * Every string listed in `$compound` (what appear to be frequent function
 * words, inflection fragments and punctuation) is replaced by a space, the
 * remaining text is split into runs of hiragana, katakana or kanji, and
 * leftover single hiragana/katakana characters are dropped.
 *
 * An optional inner tokenizer can be supplied; when present its tokens are
 * joined with spaces and used as the input for the steps above.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class JaCompoundGroupTokenizer implements Tokenizer {

	/**
	 * Strings that are replaced by a single space before the text is split
	 * into character groups.
	 *
	 * NOTE(review): the list is handed to str_replace() as-is, which applies
	 * the entries in array order. An entry that contains an earlier entry
	 * (e.g. "特に" is listed after "に") can therefore never match as a whole,
	 * because the shorter entry has already been replaced. Reordering would
	 * change the produced tokens, so the order is left untouched here.
	 *
	 * @var string[]
	 */
	private $compound = [
		"あっ",
		"あり",
		"ある",
		"い",
		"いう",
		"いる",
		"う",
		"うち",
		"お",
		"および",
		"おり",
		"か",
		"かつて",
		"から",
		"が",
		"き",
		"ここ",
		"こと",
		"この",
		"これ",
		"これら",
		"さ",
		"さらに",
		"し",
		"しかし",
		"する",
		"ず",
		"せ",
		"せる",
		"そして",
		"その",
		"その他",
		"その後",
		"それ",
		"それぞれ",
		"た",
		"ただし",
		"たち",
		"ため",
		"たり",
		"だ",
		"だっ",
		"つ",
		"て",
		"で",
		"でき",
		"できる",
		"です",
		"では",
		"でも",
		"と",
		"という",
		"といった",
		"とき",
		"ところ",
		"として",
		"とともに",
		"とも",
		"と共に",
		"な",
		"ない",
		"なお",
		"なかっ",
		"ながら",
		"なく",
		"なっ",
		"など",
		"なら",
		"なり",
		"なる",
		"に",
		"において",
		"における",
		"について",
		"にて",
		"によって",
		"により",
		"による",
		"に対して",
		"に対する",
		"に関する",
		"の",
		"ので",
		"のみ",
		"は",
		"ば",
		"へ",
		"ほか",
		"ほとんど",
		"ほど",
		"ます",
		"また",
		"または",
		"まで",
		"も",
		"もの",
		"ものの",
		"や",
		"よう",
		"より",
		"ら",
		"られ",
		"られる",
		"れ",
		"れる",
		"を",
		"ん",
		"及び",
		"特に",
		"、",
		"。",
		"「",
		"」"
	];

	/**
	 * Optional inner tokenizer that pre-processes the input; null when unset.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer inner tokenizer applied before this one
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the inner tokenizer; a no-op when none is set,
	 * as this class keeps no options of its own.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Splits the text into tokens, skipping empty fragments and single
	 * hiragana/katakana characters.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 *
	 * @return string[] sequentially indexed list of tokens, possibly empty
	 */
	public function tokenize( $string ) {

		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		$grouped = $this->splitByCharacterGroup(
			str_replace( $this->compound, ' ', $string )
		);

		$result = [];

		// preg_replace() yields null on a PCRE error; the cast turns that into
		// an empty string so that explode() always receives a string and the
		// outcome is simply an empty token list.
		foreach ( explode( " ", (string)$grouped ) as $value ) {

			// Fragments produced by leading, trailing or adjacent separators
			if ( $value === '' ) {
				continue;
			}

			// Single katakana/hiragana are exempted
			if ( mb_strlen( $value ) === 1 && CharacterExaminer::contains( CharacterExaminer::HIRAGANA_KATAKANA, $value ) ) {
				continue;
			}

			$result[] = $value;
		}

		return $result;
	}

	/**
	 * Surrounds every run of hiragana, katakana or kanji with spaces so that
	 * neighbouring runs of different scripts end up space separated.
	 *
	 * The patterns are written against raw UTF-8 byte sequences (no `u`
	 * modifier is used), hence the input is expected to be UTF-8 encoded.
	 *
	 * @see MediaWiki LanguageJa::segmentByWord
	 *
	 * @since 0.1
	 *
	 * @param string $string
	 *
	 * @return string|null as returned by preg_replace(), null on a PCRE error
	 */
	public function splitByCharacterGroup( $string ) {

		// Space strings of like hiragana/katakana/kanji
		$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
		$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
		$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
			. '|[\xe4-\xe8][\x80-\xbf]{2}'
			. '|\xe9[\x80-\xa5][\x80-\xbf]'
			. '|\xe9\xa6[\x80-\x99])';
			# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99

		$reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";

		return $this->insertSpace( $string, $reg );
	}

	/**
	 * Wraps each match of the pattern's first capture group in spaces and
	 * then collapses consecutive spaces into a single one.
	 *
	 * @param string $string
	 * @param string $pattern PCRE pattern with at least one capture group
	 *
	 * @return string|null
	 */
	private function insertSpace( $string, $pattern ) {
		return preg_replace( '/ +/', ' ', preg_replace( $pattern, " $1 ", $string ) );
	}

}
226