IcuWordBoundaryTokenizerTest::testCanConstruct()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 7
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 4
nc 1
nop 0
1
<?php
2
3
namespace Onoi\Tesa\Tests;
4
5
use Onoi\Tesa\Tokenizer\IcuWordBoundaryTokenizer;
6
7
/**
8
 * @covers \Onoi\Tesa\Tokenizer\IcuWordBoundaryTokenizer
9
 * @group onoi-tesa
10
 *
11
 * @license GNU GPL v2+
12
 * @since 0.1
13
 *
14
 * @author mwjames
15
 */
16
class IcuWordBoundaryTokenizerTest extends \PHPUnit_Framework_TestCase {
17
18
	/**
	 * Skips every test in this class unless the ICU-backed tokenizer reports
	 * that it is available and the ICU library version is exactly 54.1.
	 *
	 * NOTE(review): the version pin presumably exists because the expected
	 * tokens in stringProvider() depend on the ICU data of that release —
	 * confirm before relaxing it.
	 */
	protected function setUp() {

		$instance = new IcuWordBoundaryTokenizer();

		// Availability is checked first; the || short-circuits, so the
		// INTL_ICU_VERSION constant is only read once the tokenizer says it
		// is available.
		//
		// The comparison is strict on purpose: both operands are numeric
		// strings, and a loose != would compare them as numbers, treating
		// e.g. '54.10' as equal to '54.1'.
		if ( !$instance->isAvailable() || INTL_ICU_VERSION !== '54.1' ) {
			$this->markTestSkipped( 'ICU extension is not available or does not match the expected version constraint.' );
		}
	}
25
26
	/**
	 * A tokenizer created without constructor arguments is an instance of
	 * the expected class.
	 */
	public function testCanConstruct() {

		$tokenizer = new IcuWordBoundaryTokenizer();

		$this->assertInstanceOf(
			'\Onoi\Tesa\Tokenizer\IcuWordBoundaryTokenizer',
			$tokenizer
		);
	}
33
34
	/**
	 * Tokenizes a sample text and compares the result with the expected
	 * token list.
	 *
	 * @dataProvider stringProvider
	 *
	 * @param string $string text handed to the tokenizer
	 * @param array $expected tokens the tokenizer is expected to return
	 */
	public function testTokenize( $string, $expected ) {

		$tokenizer = new IcuWordBoundaryTokenizer();
		$tokens = $tokenizer->tokenize( $string );

		$this->assertEquals( $expected, $tokens );
	}
46
47
	/**
	 * Setting an option on the ICU tokenizer results in exactly one
	 * setOption() call on the tokenizer that was passed to its constructor.
	 */
	public function testSetOption() {

		$builder = $this->getMockBuilder( '\Onoi\Tesa\Tokenizer\Tokenizer' );
		$builder->disableOriginalConstructor();

		$innerTokenizer = $builder->getMockForAbstractClass();

		// The mock fails the test unless setOption() is invoked exactly once.
		$innerTokenizer->expects( $this->once() )->method( 'setOption' );

		$instance = new IcuWordBoundaryTokenizer( $innerTokenizer );

		$instance->setOption( IcuWordBoundaryTokenizer::REGEX_EXEMPTION, array( 'Foo' ) );
	}
65
66
	/**
	 * After a locale is set and the word tokenizer attribute is switched
	 * off, isWordTokenizer() reports false.
	 */
	public function testGeneralSetters() {

		$builder = $this->getMockBuilder( '\Onoi\Tesa\Tokenizer\Tokenizer' );
		$builder->disableOriginalConstructor();

		$innerTokenizer = $builder->getMockForAbstractClass();

		$instance = new IcuWordBoundaryTokenizer( $innerTokenizer );

		$instance->setLocale( 'en' );
		$instance->setWordTokenizerAttribute( false );

		$isWordTokenizer = $instance->isWordTokenizer();

		$this->assertFalse( $isWordTokenizer );
	}
83
84
	/**
	 * Data sets for testTokenize().
	 *
	 * @return array[] list of array( string $text, string[] $expectedTokens )
	 */
	public function stringProvider() {

		// Initialised explicitly instead of relying on PHP creating the array
		// implicitly on the first append (flagged by static analysis).
		$provider = array();

		$provider[] = array(
			"安全テスト",
			array( '安全', 'テスト' )
		);

		// Would expect 'すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち', '。'
		$provider[] = array(
			"すもももももももものうち。",
			array( 'すもも', 'も', 'も', 'も', 'も', 'も', 'もの', 'うち', '。' )
		);

		$provider[] = array(
			"李も桃も桃のうち。",
			array( '李', 'も', '桃', 'も', '桃', 'の', 'うち', '。' )
		);

		$provider[] = array(
			"إسرائيل",
			array( 'إسرائيل' )
		);

		$provider[] = array(
			"검색엔ㅇㅏ진",
			array( '검색엔', 'ㅇㅏ', '진' )
		);

		$provider[] = array(
			"검색엔ㅇㅏ진1234abcdfrA",
			array( '검색엔', 'ㅇㅏ', '진', '1234abcdfrA' )
		);

		$provider[] = array(
			"1234abcdfrA",
			array( '1234abcdfrA' )
		);

		$provider[] = array(
			"公明執ようなSNSもストーカー行為の対象に",
			array(
				'公明', '執よう', 'な', 'SNS', 'も',
				'ストーカー', '行為', 'の', '対象', 'に'
			)
		);

		$provider[] = array(
			"公明執",
			array( '公明', '執' )
		);

		$provider[] = array(
			"IQテスト",
			array( 'IQ', 'テスト' )
		);

		$provider[] = array(
			"foo テスト bar",
			array( 'foo', 'テスト', 'bar' )
		);

		$provider[] = array(
			"foo テスト bar 123abc ^&'",
			array( 'foo', 'テスト', 'bar', '123abc', '^', '&', "'" )
		);

		$provider[] = array(
			"was discovered in 1957 and first sold as a medication in 1971",
			array(
				'was', 'discovered', 'in', '1957', 'and',
				'first', 'sold', 'as', 'a', 'medication', 'in', '1971'
			)
		);

		// See JaTinySegmenterTokenizerTest for comparison
		$provider[] = array(
			'日本語の新聞記事であれば文字単位で95%程度の精度で分かち書きが行えます。 ',
			array(
				'日本語', 'の', '新聞', '記事', 'で',
				'あれ', 'ば', '文字', '単位',
				'で', '95', '%', '程度',
				'の', '精度', 'で', '分かち書き',
				'が', '行', 'え', 'ます', '。'
			)
		);

		return $provider;
	}
172
173
}
174