testNgramWithBeginEndMarker()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 18
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 18
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 10
nc 1
nop 4
1
<?php
2
3
namespace Onoi\Tesa\Tests\Integration;
4
5
use Onoi\Tesa\SanitizerFactory;
6
use Onoi\Tesa\Tokenizer\NGramTokenizer;
7
8
/**
 * Integration test for an NGramTokenizer that wraps the generic regex
 * tokenizer, both obtained from SanitizerFactory, with begin/end markers
 * switched on.
 *
 * @group onoi-tesa
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class CombinedNGramTokenizerTest extends \PHPUnit_Framework_TestCase {

	/**
	 * Tokenizes $text with markers enabled and the given n-gram size, then
	 * compares the resulting token list with $expected.
	 *
	 * @dataProvider textProvider
	 *
	 * @param string $languageCode supplied by the provider; not used by this test
	 * @param string|integer $ngramSize value handed to setNgramSize()
	 * @param string $text input to be tokenized
	 * @param array $expected n-grams expected from tokenize(), in order
	 */
	public function testNgramWithBeginEndMarker( $languageCode, $ngramSize, $text, $expected ) {

		$sanitizerFactory = new SanitizerFactory();

		// The n-gram tokenizer is layered on top of the generic regex tokenizer
		$tokenizer = $sanitizerFactory->newNGramTokenizer(
			$sanitizerFactory->newGenericRegExTokenizer()
		);

		$tokenizer->withMarker( true );
		$tokenizer->setNgramSize( $ngramSize );

		$tokens = $tokenizer->tokenize( $text );

		$this->assertEquals(
			$expected,
			$tokens
		);
	}

	/**
	 * Data sets for testNgramWithBeginEndMarker.
	 *
	 * @return array list of array( languageCode, ngramSize, text, expected )
	 */
	public function textProvider() {

		// Initialize explicitly instead of relying on PHP creating the array
		// implicitly on the first `$provider[] = ...` append.
		$provider = array();

		// https://en.wikipedia.org/wiki/Stop_words
		$provider[] = array(
			'en',
			'2',
			//
			'In computing, stop words are words which are filtered ...',
			// Expected bigrams, grouped by the word they stem from; `_` marks
			// the begin/end of a word.
			array(
				'_i', 'in', 'n_',
				'_c', 'co', 'om', 'mp', 'pu', 'ut', 'ti', 'in', 'ng', 'g_',
				'_s', 'st', 'to', 'op', 'p_',
				'_w', 'wo', 'or', 'rd', 'ds', 's_',
				'_a', 'ar', 're', 'e_',
				'_w', 'wo', 'or', 'rd', 'ds', 's_',
				'_w', 'wh', 'hi', 'ic', 'ch', 'h_',
				'_a', 'ar', 're', 'e_',
				'_f', 'fi', 'il', 'lt', 'te', 'er', 're', 'ed', 'd_',
			)
		);

		return $provider;
	}

}
110