Completed
Push — master ( 9401a4...2cc7a2 )
by mw
13:59
created

NGramTokenizerTest   A

Complexity

Total Complexity 6

Size/Duplication

Total Lines 203
Duplicated Lines 0 %

Coupling/Cohesion

Components 0
Dependencies 1

Importance

Changes 1
Bugs 0 Features 1
Metric Value
wmc 6
c 1
b 0
f 1
lcom 0
cbo 1
dl 0
loc 203
rs 10

6 Methods

Rating   Name   Duplication   Size   Complexity  
A testCanConstruct() 0 7 1
A testTokenize() 0 13 1
A testTokenizeWithStartEndMarker() 0 21 1
A testTokenizeWithStartEndMarker2() 0 19 1
B testTokenizeWithOption() 0 28 1
B stringProvider() 0 104 1
1
<?php
2
3
namespace Onoi\Tesa\Tests;
4
5
use Onoi\Tesa\Tokenizer\NGramTokenizer;
6
7
/**
8
 * @covers \Onoi\Tesa\Tokenizer\NGramTokenizer
9
 * @group onoi-tesa
10
 *
11
 * @license GNU GPL v2+
12
 * @since 0.1
13
 *
14
 * @author mwjames
15
 */
16
class NGramTokenizerTest extends \PHPUnit_Framework_TestCase {
17
18
	/**
	 * A tokenizer created without constructor arguments is an instance of
	 * the NGramTokenizer class.
	 */
	public function testCanConstruct() {

		$instance = new NGramTokenizer();

		$this->assertInstanceOf(
			'\Onoi\Tesa\Tokenizer\NGramTokenizer',
			$instance
		);
	}
25
26
	/**
	 * Tokenizes every sample from the data provider and compares the result
	 * with the expected token list; also checks that the tokenizer does not
	 * report itself as a word tokenizer.
	 *
	 * @dataProvider stringProvider
	 *
	 * @param string $string text handed to tokenize()
	 * @param string $ngram value passed as second constructor argument
	 * @param array $expected tokens that tokenize() has to return
	 */
	public function testTokenize( $string, $ngram, $expected ) {

		// No inner tokenizer (null), only the n-gram size from the provider
		$tokenizer = new NGramTokenizer( null, $ngram );
		$tokens = $tokenizer->tokenize( $string );

		$this->assertEquals( $expected, $tokens );
		$this->assertFalse( $tokenizer->isWordTokenizer() );
	}
42
43
	/**
	 * With withMarker( true ) the 4-grams of 'TEXT' are expected to carry
	 * `_` padding at the start and at the end.
	 */
	public function testTokenizeWithStartEndMarker() {

		// Sample taken from http://cloudmark.github.io/Language-Detection
		$tokenizer = new NGramTokenizer( null, 4 );
		$tokenizer->withMarker( true );

		$tokens = $tokenizer->tokenize( 'TEXT' );

		$this->assertEquals(
			array( '_tex', 'text', 'ext_', 'xt__', 't___' ),
			$tokens
		);
	}
64
65
	/**
	 * Same marker check as for the Latin sample, but with a multibyte
	 * (Japanese) input and an n-gram size of 3.
	 */
	public function testTokenizeWithStartEndMarker2() {

		$tokenizer = new NGramTokenizer( null, 3 );
		$tokenizer->withMarker( true );

		$tokens = $tokenizer->tokenize( '教授は' );

		$this->assertEquals(
			array( '_教授', '教授は', '授は_', 'は__' ),
			$tokens
		);
	}
84
	/**
	 * Wraps a mocked Tokenizer: the mock has to receive setOption() and
	 * tokenize() exactly once each, and the single token it returns is
	 * expected to come back as two-character grams.
	 */
	public function testTokenizeWithOption() {

		$text = '红色中华';

		$innerTokenizer = $this->getMockBuilder( '\Onoi\Tesa\Tokenizer\Tokenizer' )
			->disableOriginalConstructor()
			->getMockForAbstractClass();

		// Setting an option on the n-gram tokenizer must reach the inner one
		$innerTokenizer->expects( $this->once() )
			->method( 'setOption' );

		// The inner tokenizer hands the text back as one unsplit token
		$innerTokenizer->expects( $this->once() )
			->method( 'tokenize' )
			->with( $this->equalTo( $text ) )
			->will( $this->returnValue( array( $text ) ) );

		$instance = new NGramTokenizer( $innerTokenizer );
		$instance->setOption( NGramTokenizer::REGEX_EXEMPTION, array( 'Foo' ) );

		$this->assertEquals(
			array( '红色', '色中', '中华' ),
			$instance->tokenize( $text )
		);
	}
112
113
	/**
	 * Data sets for testTokenize().
	 *
	 * @return array list of array( input string, n-gram size, expected tokens )
	 */
	public function stringProvider() {

		// Initialise explicitly: appending to an undefined variable works in
		// PHP, but an explicit array makes the starting state unambiguous and
		// addresses the "$provider was never initialized" review finding.
		$provider = array();

		// Input exactly as long as the n-gram size, expected in lower case
		$provider[] = array(
			'TEXT',
			'4',
			array(
				'text'
			)
		);

		// Digits, bigrams
		$provider[] = array(
			'12345678',
			'2',
			array(
				'12',
				'23',
				'34',
				'45',
				'56',
				'67',
				'78'
			)
		);

		// Digits, trigrams
		$provider[] = array(
			'12345678',
			'3',
			array(
				'123',
				'234',
				'345',
				'456',
				'567',
				'678'
			)
		);

		// Single lower-case word
		$provider[] = array(
			'hello',
			'3',
			array(
				'hel',
				'ell',
				'llo'
			)
		);

		// Mixed case with space and punctuation; both stay part of the grams
		$provider[] = array(
			'Hello World!',
			'3',
			array(
				'hel',
				'ell',
				'llo',
				'lo ',
				'o w',
				' wo',
				'wor',
				'orl',
				'rld',
				'ld!'
			)
		);

		// Cyrillic, expected in lower case
		$provider[] = array(
			'Новости',
			'3',
			array(
				'нов',
				'ово',
				'вос',
				'ост',
				'сти'
			)
		);

		// Mixed digits and CJK characters
		$provider[] = array(
			'1時36分更新',
			'3',
			array(
				'1時3',
				'時36',
				'36分',
				'6分更',
				'分更新'
			)
		);

		// Japanese with trailing punctuation, bigrams
		$provider[] = array(
			'こんにちは世界!',
			'2',
			array(
				'こん',
				'んに',
				'にち',
				'ちは',
				'は世',
				'世界',
				'界!'
			)
		);

		return $provider;
	}
217
218
}
219