Completed
Push — master ( 9401a4...2cc7a2 )
by mw
13:59
created

CJKSimpleCharacterRegExTokenizer   A

Complexity

Total Complexity 8

Size/Duplication

Total Lines 74
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Importance

Changes 1
Bugs 0 Features 1
Metric Value
wmc 8
c 1
b 0
f 1
lcom 1
cbo 1
dl 0
loc 74
rs 10

4 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 3 1
A setOption() 0 10 3
A isWordTokenizer() 0 3 1
A tokenize() 0 21 3
1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
/**
 * Regular-expression based tokenizer that splits a string on a fixed list of
 * CJK punctuation marks, a set of frequently used Chinese characters, and a
 * handful of ASCII delimiters.
 *
 * An optional inner tokenizer can be injected; when present, its output is
 * joined with spaces and used as the input for the split.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class CJKSimpleCharacterRegExTokenizer implements Tokenizer {

	/**
	 * Optional tokenizer that is applied before this one (may be null).
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * String that is cut out of the delimiter pattern before it is applied.
	 *
	 * @var string
	 */
	private $patternExemption = '';

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer tokenizer to run first, or null for none
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the inner tokenizer (when one was injected) and
	 * records the value locally when the option is REGEX_EXEMPTION.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {

		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}

		if ( $name === self::REGEX_EXEMPTION ) {
			$this->patternExemption = $value;
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Splits the string on the delimiter pattern and returns the non-empty
	 * pieces.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 *
	 * @return array list of tokens; empty when preg_split reports a failure
	 */
	public function tokenize( $string ) {

		// Let the inner tokenizer go first and continue with its tokens
		// re-joined into a single space separated string.
		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		// Filter is based on https://github.com/kitech/cms-drupal/blob/master/modules/csplitter/filter.txt
		//
		// The leading punctuation run uses the FULLWIDTH forms (U+FF01 - U+FF5E).
		// Their ASCII look-alikes must not appear unescaped at this position:
		// - ' would terminate this single-quoted PHP string,
		// - [ ] would close the character class early,
		// - / would end the pattern at the "/" delimiter added below,
		// - the ASCII range !-= would turn every ASCII digit into a delimiter
		//   and defeat the decimal-point guard at the end of the pattern.
		// The ASCII delimiters that are wanted are listed explicitly (and
		// escaped) at the tail of the class.
		//
		// NOTE(review): the fullwidth run was reconstructed from a copy in which
		// those characters had been folded to ASCII; where the fold left an
		// adjacent duplicate, one ASCII and one fullwidth form were kept.
		// Confirm the exact characters against the upstream file.
		//
		// Any text set via REGEX_EXEMPTION is removed from the pattern first,
		// so it no longer acts as a delimiter.
		$pattern = str_replace(
			$this->patternExemption,
			'',
			'([\s\、,,。/?《》〈〉;:“”"〃'`[]{}\|~!-=_+)(()*…—─%¥…◆★◇□■【】#·啊吧把并被才从的得当对但到地而该过个给还和叫将就可来了啦里没你您哪那呢去却让使是时省随他我为现县向像象要由矣已以也又与于在之这则最乃\/\(\)\[\]{}<>\r\n"]|(?<!\d)\.(?!\d))'
		);

		// -1 means "no limit"; passing null for the int $limit parameter is
		// deprecated as of PHP 8.1.
		$result = preg_split( '/' . $pattern . '/u', $string, -1, PREG_SPLIT_NO_EMPTY );

		// preg_split returns false on failure (e.g. an invalid pattern after
		// the exemption was removed); callers always receive an array.
		if ( $result !== false ) {
			return $result;
		}

		return array();
	}

}
85