1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Onoi\Tesa\Tokenizer; |
4
|
|
|
|
5
|
|
|
use RuntimeException; |
6
|
|
|
|
7
|
|
|
/** |
8
|
|
|
* PHP Version of the TinySegmenter as a super compact Japanese tokenizer. |
9
|
|
|
* - https://github.com/setchi/codeute/blob/71c09c86cd1ce1cf9c8ca4d20b1db60b3784227a/fuel/app/classes/model/lib/tiny_segmenter.php |
10
|
|
|
* |
11
|
|
|
* TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>. |
12
|
|
|
* Published under the BSD license http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt |
13
|
|
|
* |
14
|
|
|
* PHP Version was developed by xnights <programming.magic(at)gmail.com>. |
15
|
|
|
* For details, see http://programming-magic.com/?id=172 |
16
|
|
|
* |
17
|
|
|
* The model is based on the http://research.nii.ac.jp/src/list.html corpus |
18
|
|
|
* together with an optimized L1-norm regularization. |
19
|
|
|
* |
20
|
|
|
* - https://github.com/shogo82148/TinySegmenterMaker |
21
|
|
|
* |
22
|
|
|
* @since 0.1 |
23
|
|
|
*/ |
24
|
|
|
class JaTinySegmenterTokenizer implements Tokenizer {

	/**
	 * Character class (PCRE, without delimiters) => single-letter type code.
	 *
	 * Order matters: ctype_() returns the code of the FIRST matching pattern,
	 * and the numeral class ("M") overlaps the kanji range ("H"), so it has to
	 * be listed first.
	 *
	 * The full-width letter/digit ranges and the half-width katakana range are
	 * written as \x{...} escapes so that they survive Unicode normalisation of
	 * the source text; the literal members that were already present are kept
	 * unchanged, which makes every class a superset of its previous form.
	 *
	 * @var array
	 */
	private $patterns_ = array(
		"[一二三四五六七八九十百千万億兆]" => "M", // numbers (japanese)
		"[一-龠々〆ヵヶ]" => "H", // kanji & misc characters
		"[ぁ-ん]" => "I", // hiragana
		'[ァ-ヴーア-ン゙ー\x{FF71}-\x{FF9D}\x{FF9E}\x{FF70}]' => "K", // katakana (incl. half-width forms)
		'[a-zA-Z\x{FF41}-\x{FF5A}\x{FF21}-\x{FF3A}]' => "A", // ascii / full-width romaji letters
		'[0-9\x{FF10}-\x{FF19}]' => "N", // ascii / full-width digits
	);

	/**
	 * Optional tokenizer that is run before the segmentation.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Decoded model, shared by all instances so that the JSON file is read
	 * and decoded only once per process. This is kept static on purpose.
	 *
	 * @var array|null
	 */
	private static $model;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer tokenizer whose output is fed into the segmenter
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the wrapped tokenizer; this class itself has no
	 * options and silently ignores the call when no tokenizer was injected.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Runs the wrapped tokenizer (if any), joins its tokens with a single
	 * space and segments the resulting string.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 *
	 * @throws RuntimeException when the model file cannot be read or decoded
	 */
	public function tokenize( $string ) {

		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		return $this->loadModel()->segment( $string );
	}

	/**
	 * Loads and decodes the model file on first use.
	 *
	 * @return self
	 * @throws RuntimeException when the file is unreadable or is not a JSON object/array
	 */
	private function loadModel() {

		if ( self::$model !== null ) {
			return $this;
		}

		$file = __DIR__ . '/model/rwcp.model.json';

		// The warning is suppressed on purpose: a failed read is reported
		// through the exception below instead.
		$contents = @file_get_contents( $file );
		$model = $contents !== false ? json_decode( $contents, true ) : null;

		// json_decode() yields null on malformed input; a scalar would be
		// valid JSON but is useless as a model. Only publish the model to the
		// static cache once it is known to be an array.
		if ( !is_array( $model ) ) {
			throw new RuntimeException( "Couldn't read the model from {$file}." );
		}

		self::$model = $model;

		return $this;
	}

	/**
	 * Splits the input into tokens by scoring, for every character position,
	 * whether a word boundary precedes it. A positive score closes the
	 * current word.
	 *
	 * Requires self::$model to be loaded (see loadModel()).
	 *
	 * @param string $input
	 * @param string|null $encoding encoding of $input; detected when omitted
	 *
	 * @return string[] tokens, converted back to $encoding when it is not UTF-8
	 */
	protected function segment( $input, $encoding = null ) {

		// "0" is falsy in PHP but is a legitimate one-character input.
		if ( !$input && $input !== '0' ) {
			return array();
		}

		if ( !$encoding ) {
			$encoding = mb_detect_encoding( $input );
		}

		// Detection can fail (false); passing that on to
		// mb_convert_encoding() is an error, so process the bytes as UTF-8.
		if ( !$encoding ) {
			$encoding = 'UTF-8';
		}

		if ( $encoding !== 'UTF-8' ) {
			$input = mb_convert_encoding( $input, 'UTF-8', $encoding );
		}

		// Three sentinel entries pad both ends so that the sliding window
		// ($i - 3 .. $i + 2) never leaves the arrays.
		$seg = array( "B3", "B2", "B1" );
		$ctype = array( "O", "O", "O" );

		foreach ( $this->mb_string_to_array_( $input ) as $char ) {
			$seg[] = $char;
			$ctype[] = $this->ctype_( $char );
		}

		array_push( $seg, "E1", "E2", "E3" );
		array_push( $ctype, "O", "O", "O" );

		$result = array();

		// Blanks take part in the scoring but are never part of a token.
		$word = $this->isBlank_( $seg[3] ) ? '' : $seg[3];

		// Previous three boundary decisions: "U" unknown, "B" boundary, "O" none.
		$p1 = "U";
		$p2 = "U";
		$p3 = "U";

		$end = count( $seg ) - 3;

		for ( $i = 4; $i < $end; ++$i ) {
			$w1 = $seg[$i - 3];
			$w2 = $seg[$i - 2];
			$w3 = $seg[$i - 1];
			$w4 = $seg[$i];
			$w5 = $seg[$i + 1];
			$w6 = $seg[$i + 2];
			$c1 = $ctype[$i - 3];
			$c2 = $ctype[$i - 2];
			$c3 = $ctype[$i - 1];
			$c4 = $ctype[$i];
			$c5 = $ctype[$i + 1];
			$c6 = $ctype[$i + 2];

			$score = $this->score_( array(
				"UP1" => $p1,
				"UP2" => $p2,
				"UP3" => $p3,
				"BP1" => $p1 . $p2,
				"BP2" => $p2 . $p3,
				"UW1" => $w1,
				"UW2" => $w2,
				"UW3" => $w3,
				"UW4" => $w4,
				"UW5" => $w5,
				"UW6" => $w6,
				"BW1" => $w2 . $w3,
				"BW2" => $w3 . $w4,
				"BW3" => $w4 . $w5,
				"TW1" => $w1 . $w2 . $w3,
				"TW2" => $w2 . $w3 . $w4,
				"TW3" => $w3 . $w4 . $w5,
				"TW4" => $w4 . $w5 . $w6,
				"UC1" => $c1,
				"UC2" => $c2,
				"UC3" => $c3,
				"UC4" => $c4,
				"UC5" => $c5,
				"UC6" => $c6,
				"BC1" => $c2 . $c3,
				"BC2" => $c3 . $c4,
				"BC3" => $c4 . $c5,
				"TC1" => $c1 . $c2 . $c3,
				"TC2" => $c2 . $c3 . $c4,
				"TC3" => $c3 . $c4 . $c5,
				"TC4" => $c4 . $c5 . $c6,
				"UQ1" => $p1 . $c1,
				"UQ2" => $p2 . $c2,
				// Was a second "UQ1" lookup, which left the UQ3 weights unused.
				"UQ3" => $p3 . $c3,
				"BQ1" => $p2 . $c2 . $c3,
				"BQ2" => $p2 . $c3 . $c4,
				"BQ3" => $p3 . $c2 . $c3,
				"BQ4" => $p3 . $c3 . $c4,
				"TQ1" => $p2 . $c1 . $c2 . $c3,
				"TQ2" => $p2 . $c2 . $c3 . $c4,
				"TQ3" => $p3 . $c1 . $c2 . $c3,
				"TQ4" => $p3 . $c2 . $c3 . $c4,
			) );

			$p = "O";

			if ( $score > 0 ) {

				if ( !$this->isBlank_( $word ) ) {
					$result[] = $word;
				}

				$word = "";
				$p = "B";
			}

			$p1 = $p2;
			$p2 = $p3;
			$p3 = $p;

			if ( !$this->isBlank_( $w4 ) ) {
				$word .= $w4;
			}
		}

		// Same guard as inside the loop, so that no empty or blank token is
		// emitted for input that ends in (or consists of) a space.
		if ( !$this->isBlank_( $word ) ) {
			$result[] = $word;
		}

		if ( $encoding !== 'UTF-8' ) {
			foreach ( $result as $index => $token ) {
				$result[$index] = mb_convert_encoding( $token, $encoding, 'UTF-8' );
			}
		}

		return $result;
	}

	/**
	 * Sums the bias and the model weight of every feature that is present in
	 * the model; unknown templates or keys contribute nothing.
	 *
	 * @param array $features feature template name => lookup key
	 *
	 * @return integer|float
	 */
	private function score_( array $features ) {

		$score = self::$model["BIAS"];

		foreach ( $features as $template => $key ) {
			if ( isset( self::$model[$template][$key] ) ) {
				$score += self::$model[$template][$key];
			}
		}

		return $score;
	}

	/**
	 * @param string $char
	 *
	 * @return boolean true for the empty string and for a single space
	 */
	private function isBlank_( $char ) {
		return $char === '' || $char === ' ';
	}

	/**
	 * Returns the type code of the first character class in $patterns_ that
	 * matches, or "O" (other) when none does.
	 *
	 * @param string $str a single UTF-8 character
	 *
	 * @return string
	 */
	private function ctype_( $str ) {

		foreach ( $this->patterns_ as $pattern => $type ) {
			if ( preg_match( '/' . $pattern . '/u', $str ) ) {
				return $type;
			}
		}

		return "O";
	}

	/**
	 * Normalises a model lookup: any falsy value (including a missing entry
	 * passed in as null) becomes 0.
	 *
	 * @param mixed $v
	 *
	 * @return mixed
	 */
	private function ts_( $v ) {
		return $v ? $v : 0;
	}

	/**
	 * Splits a string into an array of single characters.
	 *
	 * @param string $str
	 * @param string $encoding
	 *
	 * @return string[]
	 */
	private function mb_string_to_array_( $str, $encoding = 'UTF-8' ) {

		// Fast path: one linear pass instead of one mb_substr() scan per
		// character. preg_split() returns false for malformed UTF-8, in which
		// case the character-wise fallback below is used.
		if ( $encoding === 'UTF-8' ) {
			$chars = preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY );

			if ( $chars !== false ) {
				return $chars;
			}
		}

		$result = array();
		$length = mb_strlen( $str, $encoding );

		for ( $i = 0; $i < $length; ++$i ) {
			$result[] = mb_substr( $str, $i, 1, $encoding );
		}

		return $result;
	}

}
268
|
|
|
|
This check marks private properties in classes that are never used. Those properties can be removed.