Completed
Push — master ( 9401a4...2cc7a2 )
by mw
13:59
created

JaTinySegmenterTokenizer::segment()   D

Complexity

Conditions 13
Paths 113

Size

Total Lines 125
Code Lines 96

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 125
rs 4.7679
cc 13
eloc 96
nc 113
nop 2

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
use RuntimeException;
6
7
/**
8
 * PHP Version of the TinySegmenter as a super compact Japanese tokenizer.
9
 * - https://github.com/setchi/codeute/blob/71c09c86cd1ce1cf9c8ca4d20b1db60b3784227a/fuel/app/classes/model/lib/tiny_segmenter.php
10
 *
11
 * TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>.
12
 * Published under the BSD license http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
13
 *
14
 * PHP Version was developed by xnights <programming.magic(at)gmail.com>.
15
 * For details, see http://programming-magic.com/?id=172
16
 *
17
 * The model is based on the http://research.nii.ac.jp/src/list.html corpus
18
 * together with an optimized L1-norm regularization.
19
 *
20
 * - https://github.com/shogo82148/TinySegmenterMaker
21
 *
22
 * @since 0.1
23
 */
24
class JaTinySegmenterTokenizer implements Tokenizer {

	/**
	 * Character-class patterns (PCRE bodies, matched with the /u modifier)
	 * mapped to the single-letter character type used as a model feature.
	 *
	 * Order matters: ctype_() returns the type of the first matching pattern,
	 * so the numeral class is listed before the general kanji class.
	 *
	 * NOTE(review): the previous "K", "A" and "N" patterns listed the same
	 * ranges twice, which presumably are full-width / half-width variants lost
	 * to character normalisation. They are spelled out as \x{...} escapes here
	 * so they survive re-encoding; confirm against the reference TinySegmenter.
	 *
	 * @var array
	 */
	private $patterns_ = array(
		'[一二三四五六七八九十百千万億兆]' => 'M', // numbers (japanese)
		'[一-龠々〆ヵヶ]' => 'H', // kanji & misc characters
		'[ぁ-ん]' => 'I', // hiragana
		'[ァ-ヴー\x{3099}\x{FF70}-\x{FF9E}]' => 'K', // katakana (incl. half-width forms)
		'[a-zA-Z\x{FF41}-\x{FF5A}\x{FF21}-\x{FF3A}]' => 'A', // ascii / full-width letters
		'[0-9\x{FF10}-\x{FF19}]' => 'N', // ascii / full-width digits
	);

	/**
	 * Optional tokenizer whose output is joined with spaces and then segmented.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Decoded model (feature name => key => weight), shared by all instances.
	 * This is kept static on purpose.
	 *
	 * @var array|null
	 */
	private static $model;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer $tokenizer
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the wrapped tokenizer, if there is one.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Runs the wrapped tokenizer first (when present), joins its tokens with a
	 * single space and segments the resulting string with the model.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function tokenize( $string ) {

		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		return $this->loadModel()->segment( $string );
	}

	/**
	 * Loads and caches the JSON model on first use.
	 *
	 * @return self
	 * @throws RuntimeException when the file cannot be read or does not decode
	 *  to an array
	 */
	private function loadModel() {

		if ( self::$model !== null ) {
			return $this;
		}

		$file = __DIR__ . '/model/rwcp.model.json';

		// The warning is suppressed on purpose: a failed read is reported
		// through the exception below instead.
		$contents = @file_get_contents( $file );
		$model = $contents !== false ? json_decode( $contents, true ) : null;

		// Covers an unreadable file, invalid JSON (json_decode yields null) and
		// valid JSON that is not an object/array. Only a usable model is cached.
		if ( !is_array( $model ) ) {
			throw new RuntimeException( "Couldn't read the model from {$file}." );
		}

		self::$model = $model;

		return $this;
	}

	/**
	 * Splits the input into segments by scoring a possible boundary in front
	 * of every character (from the second one on) against the loaded model.
	 *
	 * Space characters are never added to a segment, and segments that are
	 * empty or a single space are not returned.
	 *
	 * @param string $input
	 * @param string|null $encoding encoding of $input; detected when omitted
	 *
	 * @return string[] segments, converted back to $encoding
	 */
	protected function segment( $input, $encoding = null ) {

		// Compare against the empty string explicitly so that "0" is not
		// mistaken for empty input.
		$input = (string)$input;

		if ( $input === '' ) {
			return array();
		}

		if ( !$encoding ) {
			$encoding = mb_detect_encoding( $input );
		}

		// mb_detect_encoding() returns false when detection fails; treat the
		// input as UTF-8 rather than passing false on as an encoding name.
		if ( !$encoding ) {
			$encoding = 'UTF-8';
		}

		if ( $encoding !== 'UTF-8' ) {
			$input = mb_convert_encoding( $input, 'UTF-8', $encoding );
		}

		$chars = $this->mb_string_to_array_( $input );

		if ( $chars === array() ) {
			return array();
		}

		// Three sentinel entries on either side keep the window used by
		// scoreBoundary_() inside the arrays.
		$seg = array( "B3", "B2", "B1" );
		$ctype = array( "O", "O", "O" );

		foreach ( $chars as $char ) {
			$seg[] = $char;
			$ctype[] = $this->ctype_( $char );
		}

		array_push( $seg, "E1", "E2", "E3" );
		array_push( $ctype, "O", "O", "O" );

		$result = array();
		$word = $seg[3];

		// Boundary decisions for the three preceding positions
		// ("U" = unknown, "B" = boundary, "O" = no boundary).
		$p1 = "U";
		$p2 = "U";
		$p3 = "U";

		for ( $i = 4, $end = count( $seg ) - 3; $i < $end; ++$i ) {

			$p = "O";

			if ( $this->scoreBoundary_( $seg, $ctype, $i, $p1, $p2, $p3 ) > 0 ) {

				if ( $word !== '' && $word !== ' ' ) {
					$result[] = $word;
				}

				$word = "";
				$p = "B";
			}

			$p1 = $p2;
			$p2 = $p3;
			$p3 = $p;

			if ( $seg[$i] !== '' && $seg[$i] !== ' ' ) {
				$word .= $seg[$i];
			}
		}

		// Same filter as inside the loop, so no empty trailing segment is added.
		if ( $word !== '' && $word !== ' ' ) {
			$result[] = $word;
		}

		if ( $encoding !== 'UTF-8' ) {
			foreach ( $result as $key => $segment ) {
				$result[$key] = mb_convert_encoding( $segment, $encoding, 'UTF-8' );
			}
		}

		return $result;
	}

	/**
	 * Sums the model bias and the weights of all features present at
	 * position $i; a positive result means a boundary before $seg[$i].
	 *
	 * @param array $seg characters padded with three sentinels on each side
	 * @param array $ctype character types aligned with $seg
	 * @param integer $i position being scored
	 * @param string $p1 boundary decision three positions back
	 * @param string $p2 boundary decision two positions back
	 * @param string $p3 boundary decision one position back
	 *
	 * @return integer|float
	 */
	private function scoreBoundary_( array $seg, array $ctype, $i, $p1, $p2, $p3 ) {

		$w1 = $seg[$i - 3];
		$w2 = $seg[$i - 2];
		$w3 = $seg[$i - 1];
		$w4 = $seg[$i];
		$w5 = $seg[$i + 1];
		$w6 = $seg[$i + 2];

		$c1 = $ctype[$i - 3];
		$c2 = $ctype[$i - 2];
		$c3 = $ctype[$i - 1];
		$c4 = $ctype[$i];
		$c5 = $ctype[$i + 1];
		$c6 = $ctype[$i + 2];

		// Model feature name => lookup key for this position.
		$features = array(
			"UP1" => $p1,
			"UP2" => $p2,
			"UP3" => $p3,
			"BP1" => $p1 . $p2,
			"BP2" => $p2 . $p3,
			"UW1" => $w1,
			"UW2" => $w2,
			"UW3" => $w3,
			"UW4" => $w4,
			"UW5" => $w5,
			"UW6" => $w6,
			"BW1" => $w2 . $w3,
			"BW2" => $w3 . $w4,
			"BW3" => $w4 . $w5,
			"TW1" => $w1 . $w2 . $w3,
			"TW2" => $w2 . $w3 . $w4,
			"TW3" => $w3 . $w4 . $w5,
			"TW4" => $w4 . $w5 . $w6,
			"UC1" => $c1,
			"UC2" => $c2,
			"UC3" => $c3,
			"UC4" => $c4,
			"UC5" => $c5,
			"UC6" => $c6,
			"BC1" => $c2 . $c3,
			"BC2" => $c3 . $c4,
			"BC3" => $c4 . $c5,
			"TC1" => $c1 . $c2 . $c3,
			"TC2" => $c2 . $c3 . $c4,
			"TC3" => $c3 . $c4 . $c5,
			"TC4" => $c4 . $c5 . $c6,
			"UQ1" => $p1 . $c1,
			"UQ2" => $p2 . $c2,
			// Was looked up under "UQ1" before, which duplicated the feature
			// name used for $p1 . $c1.
			"UQ3" => $p3 . $c3,
			"BQ1" => $p2 . $c2 . $c3,
			"BQ2" => $p2 . $c3 . $c4,
			"BQ3" => $p3 . $c2 . $c3,
			"BQ4" => $p3 . $c3 . $c4,
			"TQ1" => $p2 . $c1 . $c2 . $c3,
			"TQ2" => $p2 . $c2 . $c3 . $c4,
			"TQ3" => $p3 . $c1 . $c2 . $c3,
			"TQ4" => $p3 . $c2 . $c3 . $c4,
		);

		$score = isset( self::$model["BIAS"] ) ? self::$model["BIAS"] : 0;

		// A feature or key missing from the model contributes nothing.
		foreach ( $features as $name => $key ) {
			if ( isset( self::$model[$name][$key] ) ) {
				$score += self::$model[$name][$key];
			}
		}

		return $score;
	}

	/**
	 * Returns the character type of the first pattern in $patterns_ that
	 * matches, or "O" when none does.
	 *
	 * @param string $str a single character
	 *
	 * @return string
	 */
	private function ctype_( $str ) {

		foreach ( $this->patterns_ as $pattern => $type ) {
			if ( preg_match( '/' . $pattern . '/u', $str ) ) {
				return $type;
			}
		}

		return "O";
	}

	/**
	 * Splits a string into an array of single characters.
	 *
	 * @param string $str
	 * @param string $encoding
	 *
	 * @return string[]
	 */
	private function mb_string_to_array_( $str, $encoding = 'UTF-8' ) {

		// One pass over the string instead of one mb_substr() call per
		// character. preg_split() returns false for input that is not valid
		// UTF-8, in which case the per-character loop below is used instead.
		if ( $encoding === 'UTF-8' ) {
			$chars = preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY );

			if ( $chars !== false ) {
				return $chars;
			}
		}

		$result = array();
		$length = mb_strlen( $str, $encoding );

		for ( $i = 0; $i < $length; ++$i ) {
			$result[] = mb_substr( $str, $i, 1, $encoding );
		}

		return $result;
	}

}
268