CombinedSanitizerTextStopwordTest   A
last analyzed

Complexity

Total Complexity 2

Size/Duplication

Total Lines 88
Duplicated Lines 0 %

Coupling/Cohesion

Components 0
Dependencies 2

Importance

Changes 0
Metric Value
wmc 2
lcom 0
cbo 2
dl 0
loc 88
rs 10
c 0
b 0
f 0

2 Methods

Rating   Name   Duplication   Size   Complexity  
A testByLanguage() 0 18 1
A textByLanguageProvider() 0 62 1
1
<?php
2
3
namespace Onoi\Tesa\Tests\Integration;
4
5
use Onoi\Tesa\SanitizerFactory;
6
7
/**
 * Integration test that runs sample texts in several languages through a
 * sanitizer combined with a tokenizer, a language-specific stopword analyzer
 * and a null synonymizer, and compares the result with a fixed expectation.
 *
 * @group onoi-tesa
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class CombinedSanitizerTextStopwordTest extends \PHPUnit_Framework_TestCase {

	/**
	 * Lowercases the text, sanitizes it with the generic regex tokenizer, the
	 * CDB stopword analyzer for the given language and the null synonymizer,
	 * then asserts that the outcome equals the expected string.
	 *
	 * @dataProvider textByLanguageProvider
	 *
	 * @param string $languageCode language code handed to the stopword analyzer
	 * @param string $text raw input text
	 * @param string $expected text expected after sanitization
	 */
	public function testByLanguage( $languageCode, $text, $expected ) {

		$sanitizerFactory = new SanitizerFactory();

		$sanitizer = $sanitizerFactory->newSanitizer( $text );
		$sanitizer->toLowercase();

		$text = $sanitizer->sanitizeWith(
			$sanitizerFactory->newGenericRegExTokenizer(),
			$sanitizerFactory->newCdbStopwordAnalyzer( $languageCode ),
			$sanitizerFactory->newNullSynonymizer()
		);

		$this->assertEquals(
			$expected,
			$text
		);
	}

	/**
	 * Supplies the data sets for testByLanguage().
	 *
	 * Each entry is an array of ( language code, input text, expected text ).
	 * The expected texts are the lowercased inputs without punctuation and
	 * without the words treated as stop words for that language.
	 *
	 * @return array[]
	 */
	public function textByLanguageProvider() {

		// Initialize explicitly instead of relying on PHP creating the array
		// implicitly on the first `$provider[] = ...` append.
		$provider = array();

		// https://en.wikipedia.org/wiki/Stop_words
		$provider[] = array(
			'en',
			// input
			'In computing, stop words are words which are filtered out before or after processing of ' .
			'natural language data (text).[1] Though stop words usually refer to the most common words ' .
			'in a language, there is no single universal list of stop words used by all natural language ' .
			'processing tools, and indeed not all tools even use such a list. Some tools specifically avoid ' .
			'removing these stop words to support phrase search.',
			// expected
			'computing stop words filtered processing natural language data text stop words refer common ' .
			'words language single universal list stop words natural language processing tools list tools ' .
			'specifically avoid removing stop words support phrase search'
		);

		// https://es.wikipedia.org/wiki/Palabra_vac%C3%ADa
		$provider[] = array(
			'es',
			// input
			'Palabras vacías es el nombre que reciben las palabras sin significado como artículos, pronombres, ' .
			'preposiciones, etc. que son filtradas antes o después del procesamiento de datos en lenguaje natural ' .
			'(texto). A Hans Peter Luhn, uno de los pioneros en recuperación de información, se le atribuye la ' .
			'acuñación de la locución inglesa stop words y el uso del concepto en su diseño. Está controlada por ' .
			'introducción humana y no automática.',
			// expected
			'palabras vacías nombre que reciben palabras significado artículos pronombres preposiciones etc que ' .
			'son filtradas después del procesamiento datos lenguaje natural texto hans peter luhn pioneros ' .
			'recuperación información atribuye acuñación locución inglesa stop words del concepto diseño está ' .
			'controlada introducción humana automática'
		);

		// https://de.wikipedia.org/wiki/Stoppwort
		$provider[] = array(
			'de',
			// input
			'Stoppwörter nennt man im Information Retrieval Wörter, die bei einer Volltextindexierung nicht beachtet ' .
			'werden, da sie sehr häufig auftreten und gewöhnlich keine Relevanz für die Erfassung des Dokumentinhalts ' .
			'besitzen.',
			// expected
			'stoppwörter nennt information retrieval wörter volltextindexierung beachtet häufig auftreten gewöhnlich ' .
			'relevanz erfassung dokumentinhalts besitzen'
		);

		// https://en.wikipedia.org/wiki/Query_expansion
		$provider[] = array(
			'en',
			// input
			"The goal of query expansion in this regard is by increasing recall, precision can potentially increase " .
			"(rather than decrease as mathematically equated), by including in the result set pages which are more " .
			"relevant (of higher quality), or at least equally relevant. Pages which would not be included in the " .
			"result set, which have the potential to be more relevant to the user's desired query, are included, and " .
			"without query expansion would not have, regardless of relevance.",
			// expected
			"goal query expansion regard increasing recall precision potentially increase decrease mathematically " .
			"equated including result set pages relevant quality equally relevant pages included result set potential " .
			"relevant user desired query included query expansion relevance"
		);

		return $provider;
	}

}
103