JaTinySegmenterTokenizer::segment() - Code Metrics - Inspection of "Add SanitizerFactory" - onoi/tesa - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 9401a4...2cc7a2 )

by mw

created 2016-08-02 14:22 UTC

JaTinySegmenterTokenizer::segment() D

↳ Parent: JaTinySegmenterTokenizer

Complexity

Conditions	13
Paths	113

Size

Total Lines	125
Code Lines	96

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
c	1
b	0
f	1
dl	0
loc	125
rs	4.7679
cc	13
eloc	96
nc	113
nop	2

How to fix Long Method Complexity

<?php

namespace Onoi\Tesa\Tokenizer;

use RuntimeException;

/**
 * PHP Version of the TinySegmenter as a super compact Japanese tokenizer.
 * - https://github.com/setchi/codeute/blob/71c09c86cd1ce1cf9c8ca4d20b1db60b3784227a/fuel/app/classes/model/lib/tiny_segmenter.php
 *
 * TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>.
 * Published under the BSD license http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
 *
 * PHP Version was developed by xnights <programming.magic(at)gmail.com>.
 * For details, see http://programming-magic.com/?id=172
 *
 * The model is based on the http://research.nii.ac.jp/src/list.html corpus
 * together with an optimized L1-norm regularization.
 *
 * - https://github.com/shogo82148/TinySegmenterMaker
 *
 * @since 0.1
 */
class JaTinySegmenterTokenizer implements Tokenizer {

	/**
	 * Character-class patterns mapped to the one-letter type codes used as
	 * model features. The order matters: ctype_() returns the type of the
	 * first pattern that matches, so the kanji numerals ("M") have to be
	 * tested before the general kanji range ("H") that also contains them.
	 *
	 * @var array
	 */
	private $patterns_ = array(
		"[一二三四五六七八九十百千万億兆]"=>"M", // numbers (japanese)
		"[一-龠々〆ヵヶ]"=>"H", // kanji & misc characters
		"[ぁ-ん]"=>"I", // hiragana
		"[ァ-ヴーｱ-ﾝﾞｰ]"=>"K", // katakana
		"[a-zA-Zａ-ｚＡ-Ｚ]"=>"A", // ascii / romaji letters
		"[0-9０-９]"=>"N", // ascii / romaji numbers
	);

	/**
	 * Optional tokenizer that is run first; its tokens are joined with a
	 * single space before being segmented (see tokenize()).
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Decoded model (feature group => feature key => weight, plus "BIAS").
	 * This is kept static on purpose so that the model file is read and
	 * decoded only once per process.
	 *
	 * @var array|null
	 */
	private static $model;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer $tokenizer
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the wrapped tokenizer, if there is one. This
	 * class itself has no options.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Splits the string into segments. When a tokenizer was injected, it is
	 * applied first and its tokens are joined with a space before the
	 * segmentation runs.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function tokenize( $string ) {

		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		return $this->loadModel()->segment( $string );
	}

	/**
	 * Reads and decodes the JSON model on first use and caches it in the
	 * static $model property. Nothing is cached when loading fails.
	 *
	 * @return self
	 * @throws RuntimeException when the file cannot be read or does not
	 *  contain a JSON object/array
	 */
	private function loadModel() {

		if ( self::$model !== null ) {
			return $this;
		}

		$file = __DIR__ . '/model/rwcp.model.json';

		// Warnings are suppressed because a failed read is reported through
		// the exception below.
		$contents = @file_get_contents( $file );

		if ( $contents === false ) {
			throw new RuntimeException( "Couldn't read the model from {$file}." );
		}

		$model = json_decode( $contents, true );

		// A scalar is valid JSON but not a usable model, hence the is_array check
		if ( json_last_error() !== JSON_ERROR_NONE || !is_array( $model ) ) {
			throw new RuntimeException( "Couldn't read the model from {$file}." );
		}

		self::$model = $model;

		return $this;
	}

	/**
	 * Segments the input by scoring, for every character after the first,
	 * whether a boundary precedes it. A positive score closes the current
	 * word and starts a new one.
	 *
	 * Requires loadModel() to have been called beforehand.
	 *
	 * @param string $input
	 * @param string|null $encoding encoding of $input; detected when omitted
	 *
	 * @return string[] segments, converted back to $encoding
	 */
	protected function segment( $input, $encoding = null ) {

		// Compare explicitly: a plain truthiness test would discard "0"
		if ( $input === null || $input === false || $input === '' ) {
			return array();
		}

		if ( !$encoding ) {
			$encoding = mb_detect_encoding( $input );
		}

		// mb_detect_encoding returns false when detection fails; `false` is
		// not a valid encoding name for mb_convert_encoding, so assume UTF-8.
		if ( !$encoding ) {
			$encoding = 'UTF-8';
		}

		if ( $encoding !== 'UTF-8' ) {
			$input = mb_convert_encoding( $input, 'UTF-8', $encoding );
		}

		$chars = $this->mb_string_to_array_( $input );

		if ( $chars === array() ) {
			return array();
		}

		// Pad both sequences with three sentinels on each side so that the
		// scoring window [$i-3, $i+2] is always defined.
		$seg = array( "B3", "B2", "B1" );
		$ctype = array( "O", "O", "O" );

		foreach ( $chars as $char ) {
			$seg[] = $char;
			$ctype[] = $this->ctype_( $char );
		}

		array_push( $seg, "E1", "E2", "E3" );
		array_push( $ctype, "O", "O", "O" );

		$result = array();
		$word = $seg[3];

		// Boundary decisions for the three preceding positions
		// ("U" = not decided yet, "B" = boundary, "O" = no boundary)
		$p1 = $p2 = $p3 = "U";

		for ( $i = 4, $end = count( $seg ) - 3; $i < $end; ++$i ) {

			$p = "O";

			if ( $this->score_( $seg, $ctype, $i, $p1, $p2, $p3 ) > 0 ) {

				if ( $word !== '' && $word !== ' ' ) {
					$result[] = $word;
				}

				$word = "";
				$p = "B";
			}

			$p1 = $p2;
			$p2 = $p3;
			$p3 = $p;

			if ( $seg[$i] !== '' && $seg[$i] !== ' ' ) {
				$word .= $seg[$i];
			}
		}

		$result[] = $word;

		if ( $encoding !== 'UTF-8' ) {
			// Index-based on purpose: a by-reference foreach would leave the
			// last element of the returned array bound to a reference.
			foreach ( $result as $key => $str ) {
				$result[$key] = mb_convert_encoding( $str, $encoding, 'UTF-8' );
			}
		}

		return $result;
	}

	/**
	 * Computes the boundary score for position $i: the model bias plus the
	 * weight of every feature of the surrounding window that is present in
	 * the model. Features missing from the model contribute nothing.
	 *
	 * @param array $seg padded character sequence
	 * @param array $ctype padded character type sequence
	 * @param integer $i position being scored
	 * @param string $p1 boundary decision three positions back
	 * @param string $p2 boundary decision two positions back
	 * @param string $p3 boundary decision one position back
	 *
	 * @return integer|float
	 */
	private function score_( array $seg, array $ctype, $i, $p1, $p2, $p3 ) {

		$w1 = $seg[$i-3];
		$w2 = $seg[$i-2];
		$w3 = $seg[$i-1];
		$w4 = $seg[$i];
		$w5 = $seg[$i+1];
		$w6 = $seg[$i+2];

		$c1 = $ctype[$i-3];
		$c2 = $ctype[$i-2];
		$c3 = $ctype[$i-1];
		$c4 = $ctype[$i];
		$c5 = $ctype[$i+1];
		$c6 = $ctype[$i+2];

		// Feature group => lookup key
		$features = array(
			"UP1" => $p1,
			"UP2" => $p2,
			"UP3" => $p3,
			"BP1" => $p1 . $p2,
			"BP2" => $p2 . $p3,
			"UW1" => $w1,
			"UW2" => $w2,
			"UW3" => $w3,
			"UW4" => $w4,
			"UW5" => $w5,
			"UW6" => $w6,
			"BW1" => $w2 . $w3,
			"BW2" => $w3 . $w4,
			"BW3" => $w4 . $w5,
			"TW1" => $w1 . $w2 . $w3,
			"TW2" => $w2 . $w3 . $w4,
			"TW3" => $w3 . $w4 . $w5,
			"TW4" => $w4 . $w5 . $w6,
			"UC1" => $c1,
			"UC2" => $c2,
			"UC3" => $c3,
			"UC4" => $c4,
			"UC5" => $c5,
			"UC6" => $c6,
			"BC1" => $c2 . $c3,
			"BC2" => $c3 . $c4,
			"BC3" => $c4 . $c5,
			"TC1" => $c1 . $c2 . $c3,
			"TC2" => $c2 . $c3 . $c4,
			"TC3" => $c3 . $c4 . $c5,
			"TC4" => $c4 . $c5 . $c6,
			"UQ1" => $p1 . $c1,
			"UQ2" => $p2 . $c2,
			// Was looked up in "UQ1" before, which left the UQ3 weights unused
			"UQ3" => $p3 . $c3,
			"BQ1" => $p2 . $c2 . $c3,
			"BQ2" => $p2 . $c3 . $c4,
			"BQ3" => $p3 . $c2 . $c3,
			"BQ4" => $p3 . $c3 . $c4,
			"TQ1" => $p2 . $c1 . $c2 . $c3,
			"TQ2" => $p2 . $c2 . $c3 . $c4,
			"TQ3" => $p3 . $c1 . $c2 . $c3,
			"TQ4" => $p3 . $c2 . $c3 . $c4,
		);

		$score = isset( self::$model["BIAS"] ) ? self::$model["BIAS"] : 0;

		foreach ( $features as $group => $key ) {
			if ( isset( self::$model[$group][$key] ) ) {
				$score += self::$model[$group][$key];
			}
		}

		return $score;
	}

	/**
	 * Returns the type code of the first pattern in $patterns_ that matches
	 * the given character, or "O" (other) when none does.
	 *
	 * @param string $str a single UTF-8 character
	 *
	 * @return string
	 */
	private function ctype_( $str ) {

		foreach ( $this->patterns_ as $pattern => $type ) {
			if ( preg_match( '/' . $pattern . '/u', $str ) ) {
				return $type;
			}
		}

		return "O";
	}

	/**
	 * Splits a multibyte string into an array of single characters.
	 *
	 * @param string $str
	 * @param string $encoding
	 *
	 * @return string[]
	 */
	private function mb_string_to_array_( $str, $encoding = 'UTF-8' ) {

		// Fast path: one linear pass instead of one mb_substr call per
		// character. preg_split returns false on malformed UTF-8, in which
		// case the mb_substr loop below is used.
		if ( $encoding === 'UTF-8' ) {
			$chars = preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY );

			if ( $chars !== false ) {
				return $chars;
			}
		}

		$result = array();
		$length = mb_strlen( $str, $encoding );

		for ( $i = 0; $i < $length; ++$i ) {
			$result[] = mb_substr( $str, $i, 1, $encoding );
		}

		return $result;
	}

}


1			<?php
2
3			namespace Onoi\Tesa\Tokenizer;
4
5			use RuntimeException;
6
7			/**
8			* PHP Version of the TinySegmenter as a super compact Japanese tokenizer.
9			* - https://github.com/setchi/codeute/blob/71c09c86cd1ce1cf9c8ca4d20b1db60b3784227a/fuel/app/classes/model/lib/tiny_segmenter.php
10			*
11			* TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>.
12			* Published under the BSD license http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
13			*
14			* PHP Version was developed by xnights <programming.magic(at)gmail.com>.
15			* For details, see http://programming-magic.com/?id=172
16			*
17			* The model is based on the http://research.nii.ac.jp/src/list.html corpus
18			* together with an optimized L1-norm regularization.
19			*
20			* - https://github.com/shogo82148/TinySegmenterMaker
21			*
22			* @since 0.1
23			*/
24			class JaTinySegmenterTokenizer implements Tokenizer {
25
26			private $patterns_ = array(
27			"[一二三四五六七八九十百千万億兆]"=>"M", // numbers (japanese)
28			"[一-龠々〆ヵヶ]"=>"H", // kanji & misc characters
29			"[ぁ-ん]"=>"I", // hiragana
30			"[ァ-ヴーｱ-ﾝﾞｰ]"=>"K", // katakana
31			"[a-zA-Zａ-ｚＡ-Ｚ]"=>"A", // ascii / romaji letters
32			"[0-9０-９]"=>"N", // ascii / romaji numbers
33			);
34
35			/**
36			* @var Tokenizer
37			*/
38			private $tokenizer;
39
40			/**
41			* This is kept static on purpose.
42			* @var array
43			*/
44			private static $model;
45
46			/**
47			* @var string
48			*/
49			private $modelFile;
			0 ignored issues – show Unused Code introduced 2016-08-02 14:35 UTC by Report Bug Copy Issue Report The property `$modelFile` is not used and could be removed. This check marks private properties in classes that are never used. Those properties can be removed. Loading history...
50
51			/**
52			* @since 0.1
53			*
54			* @param Tokenizer $tokenizer
55			*/
56			public function __construct( Tokenizer $tokenizer = null ) {
57			$this->tokenizer = $tokenizer;
58			}
59
60			/**
61			* @since 0.1
62			*
63			* {@inheritDoc}
64			*/
65			public function setOption( $name, $value ) {
66			if ( $this->tokenizer !== null ) {
67			$this->tokenizer->setOption( $name, $value );
68			}
69			}
70
71			/**
72			* @since 0.1
73			*
74			* {@inheritDoc}
75			*/
76			public function isWordTokenizer() {
77			return false;
78			}
79
80			/**
81			* @since 0.1
82			*
83			* {@inheritDoc}
84			*/
85			public function tokenize( $string ) {
86
87			if ( $this->tokenizer !== null ) {
88			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
89			}
90
91			return $this->loadModel()->segment( $string );
92			}
93
94			private function loadModel() {
95
96			if ( self::$model !== null ) {
97			return $this;
98			}
99
100			$contents = null;
			0 ignored issues – show Unused Code introduced 2016-08-02 14:35 UTC by Report Bug Copy Issue Report `$contents` is not used, you could remove the assignment. This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently. $myVar = 'Value'; $higher = false; if (rand(1, 6) > 3) { $higher = true; } else { $higher = false; } Both the `$myVar` assignment in line 1 and the `$higher` assignment in line 2 are dead. The first because `$myVar` is never used and the second because `$higher` is always overwritten for every possible time line. Loading history...
101			$file = __DIR__ . '/model/rwcp.model.json';
102
103			if ( ( $contents = @file_get_contents( $file ) ) !== false ) {
104			self::$model = json_decode( $contents, true );
			0 ignored issues – show Documentation Bug introduced 2016-08-02 14:35 UTC by Report Bug Copy Issue Report It seems like `json_decode($contents, true)` of type `*` is incompatible with the declared type `array` of property `$model`. Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property. Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property. Loading history...
105			}
106
107			if ( $contents === false || json_last_error() !== JSON_ERROR_NONE ) {
108			throw new RuntimeException( "Couldn't read the model from {$file}." );
109			}
110
111			return $this;
112			}
113
114			protected function segment( $input, $encoding = null ) {
115
116			if ( !$input ) {
117			return array();
118			}
119
120			if ( !$encoding ) {
121			$encoding = mb_detect_encoding( $input );
122			}
123
124			if ( $encoding !== 'UTF-8' ) {
125			$input = mb_convert_encoding( $input, 'UTF-8', $encoding );
126			}
127
128			$result = array();
129			$seg = array( "B3", "B2", "B1" );
130
131			$ctype = array( "O", "O", "O" );
132			$o = $this->mb_string_to_array_( $input );
133
134			for ( $i = 0; $i<count($o); ++$i ) {
			0 ignored issues – show Performance Best Practice introduced 2016-08-02 14:35 UTC by Report Bug Copy Issue Report It seems like you are calling the size function `count()` as part of the test condition. You might want to compute the size beforehand, and not on each iteration. If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration } // Better for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once } Loading history... Performance Best Practice introduced 2016-08-02 14:35 UTC by Report Bug Copy Issue Report Consider avoiding function calls on each iteration of the `for` loop. If you have a function call in the test part of a `for` loop, this function is executed on each iteration. Often such a function, can be moved to the initialization part and be cached. // count() is called on each iteration for ($i=0; $i < count($collection); $i++) { } // count() is only called once for ($i=0, $c=count($collection); $i<$c; $i++) { } Loading history...
135			$seg[] = $o[$i];
136			$ctype[] = $this->ctype_( $o[$i] );
137			}
138
139			$seg[] = "E1";
140			$seg[] = "E2";
141			$seg[] = "E3";
142			$ctype[] = "O";
143			$ctype[] = "O";
144			$ctype[] = "O";
145			$word = $seg[3];
146			$p1 = "U";
147			$p2 = "U";
148			$p3 = "U";
149
150			for($i = 4; $i<count($seg)-3; ++$i){
			0 ignored issues – show Performance Best Practice introduced 2016-08-02 14:35 UTC by Report Bug Copy Issue Report Consider avoiding function calls on each iteration of the `for` loop. If you have a function call in the test part of a `for` loop, this function is executed on each iteration. Often such a function, can be moved to the initialization part and be cached. // count() is called on each iteration for ($i=0; $i < count($collection); $i++) { } // count() is only called once for ($i=0, $c=count($collection); $i<$c; $i++) { } Loading history...
151			$score = self::$model["BIAS"];
152			$w1 = $seg[$i-3];
153			$w2 = $seg[$i-2];
154			$w3 = $seg[$i-1];
155			$w4 = $seg[$i];
156			$w5 = $seg[$i+1];
157			$w6 = $seg[$i+2];
158			$c1 = $ctype[$i-3];
159			$c2 = $ctype[$i-2];
160			$c3 = $ctype[$i-1];
161			$c4 = $ctype[$i];
162			$c5 = $ctype[$i+1];
163			$c6 = $ctype[$i+2];
164			$score += $this->ts_(@self::$model["UP1"][$p1]);
165			$score += $this->ts_(@self::$model["UP2"][$p2]);
166			$score += $this->ts_(@self::$model["UP3"][$p3]);
167			$score += $this->ts_(@self::$model["BP1"][$p1 . $p2]);
168			$score += $this->ts_(@self::$model["BP2"][$p2 . $p3]);
169			$score += $this->ts_(@self::$model["UW1"][$w1]);
170			$score += $this->ts_(@self::$model["UW2"][$w2]);
171			$score += $this->ts_(@self::$model["UW3"][$w3]);
172			$score += $this->ts_(@self::$model["UW4"][$w4]);
173			$score += $this->ts_(@self::$model["UW5"][$w5]);
174			$score += $this->ts_(@self::$model["UW6"][$w6]);
175			$score += $this->ts_(@self::$model["BW1"][$w2 . $w3]);
176			$score += $this->ts_(@self::$model["BW2"][$w3 . $w4]);
177			$score += $this->ts_(@self::$model["BW3"][$w4 . $w5]);
178			$score += $this->ts_(@self::$model["TW1"][$w1 . $w2 . $w3]);
179			$score += $this->ts_(@self::$model["TW2"][$w2 . $w3 . $w4]);
180			$score += $this->ts_(@self::$model["TW3"][$w3 . $w4 . $w5]);
181			$score += $this->ts_(@self::$model["TW4"][$w4 . $w5 . $w6]);
182			$score += $this->ts_(@self::$model["UC1"][$c1]);
183			$score += $this->ts_(@self::$model["UC2"][$c2]);
184			$score += $this->ts_(@self::$model["UC3"][$c3]);
185			$score += $this->ts_(@self::$model["UC4"][$c4]);
186			$score += $this->ts_(@self::$model["UC5"][$c5]);
187			$score += $this->ts_(@self::$model["UC6"][$c6]);
188			$score += $this->ts_(@self::$model["BC1"][$c2 . $c3]);
189			$score += $this->ts_(@self::$model["BC2"][$c3 . $c4]);
190			$score += $this->ts_(@self::$model["BC3"][$c4 . $c5]);
191			$score += $this->ts_(@self::$model["TC1"][$c1 . $c2 . $c3]);
192			$score += $this->ts_(@self::$model["TC2"][$c2 . $c3 . $c4]);
193			$score += $this->ts_(@self::$model["TC3"][$c3 . $c4 . $c5]);
194			$score += $this->ts_(@self::$model["TC4"][$c4 . $c5 . $c6]);
195			// $score += $this->ts_(@self::$model["TC5"][$c4 . $c5 . $c6]);
196			$score += $this->ts_(@self::$model["UQ1"][$p1 . $c1]);
197			$score += $this->ts_(@self::$model["UQ2"][$p2 . $c2]);
198			$score += $this->ts_(@self::$model["UQ1"][$p3 . $c3]);
199			$score += $this->ts_(@self::$model["BQ1"][$p2 . $c2 . $c3]);
200			$score += $this->ts_(@self::$model["BQ2"][$p2 . $c3 . $c4]);
201			$score += $this->ts_(@self::$model["BQ3"][$p3 . $c2 . $c3]);
202			$score += $this->ts_(@self::$model["BQ4"][$p3 . $c3 . $c4]);
203			$score += $this->ts_(@self::$model["TQ1"][$p2 . $c1 . $c2 . $c3]);
204			$score += $this->ts_(@self::$model["TQ2"][$p2 . $c2 . $c3 . $c4]);
205			$score += $this->ts_(@self::$model["TQ3"][$p3 . $c1 . $c2 . $c3]);
206			$score += $this->ts_(@self::$model["TQ4"][$p3 . $c2 . $c3 . $c4]);
207
208			$p = "O";
209
210			if ( $score > 0 ) {
211
212			if ( $word !== '' && $word !== ' ' ) {
213			$result[] = $word;
214			}
215
216			$word = "";
217			$p = "B";
218			}
219
220			$p1 = $p2;
221			$p2 = $p3;
222			$p3 = $p;
223
224			if ( $seg[$i] !== '' && $seg[$i] !== ' ' ) {
225			$word .= $seg[$i];
226			}
227			}
228
229			$result[] = $word;
230
231			if ( $encoding !== 'UTF-8') {
232			foreach( $result as &$str ) {
233			$str = mb_convert_encoding( $str, $encoding, 'UTF-8' );
234			}
235			}
236
237			return $result;
238			}
239
240			private function ctype_( $str ) {
241
242			foreach( $this->patterns_ as $pattern => $type ) {
243			if( preg_match( '/'.$pattern.'/u', $str ) ) {
244			return $type;
245			}
246			}
247
248			return "O";
249			}
250
251			private function ts_( $v ) {
252			return $v ? $v : 0;
253			}
254
255			private function mb_string_to_array_( $str, $encoding = 'UTF-8' ) {
256
257			$result = array();
258			$length = mb_strlen( $str, $encoding );
259
260			for ( $i=0; $i < $length; ++$i ) {
261			$result[] = mb_substr( $str, $i, 1, $encoding );
262			}
263
264			return $result;
265			}
266
267			}
268

onoi / tesa

Push — master ( 9401a4...2cc7a2 )

JaTinySegmenterTokenizer::segment() D

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like