GenericRegExTokenizer::tokenize()   A
last analyzed

Complexity

Conditions 3
Paths 4

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 3.0327

Importance

Changes 0
Metric Value
dl 0
loc 22
ccs 11
cts 13
cp 0.8462
rs 9.2
c 0
b 0
f 0
cc 3
eloc 11
nc 4
nop 1
crap 3.0327
1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
/**
 * Tokenizer that splits a string with a single regular expression.
 *
 * The split pattern covers whitespace, a fixed set of punctuation characters,
 * a dot that is neither preceded nor followed by a digit, and the position
 * between a letter and a directly following number. An optional wrapped
 * tokenizer is run first and its tokens are re-joined with a space before the
 * regular expression is applied.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class GenericRegExTokenizer implements Tokenizer {

	/**
	 * Optional tokenizer that pre-processes the input, or null when unset.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Text that is removed from the split pattern before it is used; set via
	 * setOption() with the REGEX_EXEMPTION option name.
	 *
	 * @var string
	 */
	private $patternExemption = '';

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the wrapped tokenizer (if any) and, for the
	 * REGEX_EXEMPTION option, stores the value as the pattern exemption.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {

		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}

		if ( $name === self::REGEX_EXEMPTION ) {
			$this->patternExemption = $value;
		}
	}

	/**
	 * Delegates to the wrapped tokenizer when one is set, otherwise reports
	 * true.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return $this->tokenizer !== null ? $this->tokenizer->isWordTokenizer() : true;
	}

	/**
	 * Splits the string into non-empty tokens.
	 *
	 * @since 0.1
	 *
	 * @param string $string
	 *
	 * @return array list of tokens; empty when nothing could be split or
	 *  when the regular expression failed
	 */
	public function tokenize( $string ) {

		if ( $this->tokenizer !== null ) {
			$tokens = $this->tokenizer->tokenize( $string );

			// The wrapped tokenizer may not return an array (e.g. false);
			// implode() would throw a TypeError on PHP 8 in that case, so
			// treat it as "no tokens".
			$string = is_array( $tokens ) ? implode( ' ', $tokens ) : '';
		}

		// (?<=\p{L})(?=\p{N}) to split alphanumeric and numeric
		// (?<!\d)\.(?!\d) splits on a dot unless a digit is adjacent to it
		//
		// The exemption is removed from the pattern text itself, hence
		// characters listed in it are no longer treated as separators.
		$pattern = str_replace(
			$this->patternExemption,
			'',
			'([\s\-_,:;?!%\'\|\/\(\)\[\]{}<>\r\n"]|(?<!\d)\.(?!\d)|(?<=\p{L})(?=\p{N}))'
		);

		// -1 means "no limit"; passing null for the int $limit parameter is
		// deprecated as of PHP 8.1.
		$result = preg_split( '/' . $pattern . '/u', $string, -1, PREG_SPLIT_NO_EMPTY );

		// preg_split() returns false on failure (e.g. an exemption that left
		// the pattern invalid, or malformed UTF-8 input with the /u modifier).
		if ( $result === false ) {
			$result = array();
		}

		return $result;
	}

}
88