1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Onoi\Tesa\Tests\Integration; |
4
|
|
|
|
5
|
|
|
use Onoi\Tesa\SanitizerFactory; |
6
|
|
|
|
7
|
|
|
/**
 * Integration test that runs sample texts in several languages through the
 * combined sanitizer pipeline (lowercasing, tokenizing, stopword removal)
 * and compares the result with a fixed expected string.
 *
 * @group onoi-tesa
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class CombinedSanitizerTextStopwordTest extends \PHPUnit_Framework_TestCase {

	/**
	 * Sanitizes the given text with the stopword analyzer created for the
	 * language code and asserts that the outcome equals the expected string.
	 *
	 * @dataProvider textByLanguageProvider
	 *
	 * @param string $languageCode language code passed to newCdbStopwordAnalyzer
	 * @param string $text raw input text
	 * @param string $expected expected text after sanitization
	 */
	public function testByLanguage( $languageCode, $text, $expected ) {

		$sanitizerFactory = new SanitizerFactory();

		$sanitizer = $sanitizerFactory->newSanitizer( $text );
		$sanitizer->toLowercase();

		// Keep the sanitized result separate from the raw $text input
		$actual = $sanitizer->sanitizeWith(
			$sanitizerFactory->newGenericRegExTokenizer(),
			$sanitizerFactory->newCdbStopwordAnalyzer( $languageCode ),
			$sanitizerFactory->newNullSynonymizer()
		);

		$this->assertEquals(
			$expected,
			$actual
		);
	}

	/**
	 * Supplies the data sets for testByLanguage.
	 *
	 * @return array[] list of array( languageCode, text, expected )
	 */
	public function textByLanguageProvider() {

		// Initialize explicitly instead of relying on `$provider[] = ...`
		// to create the array implicitly on first use
		$provider = array();

		// https://en.wikipedia.org/wiki/Stop_words
		$provider[] = array(
			'en',
			// Input text
			'In computing, stop words are words which are filtered out before or after processing of ' .
			'natural language data (text).[1] Though stop words usually refer to the most common words ' .
			'in a language, there is no single universal list of stop words used by all natural language ' .
			'processing tools, and indeed not all tools even use such a list. Some tools specifically avoid ' .
			'removing these stop words to support phrase search.',
			// Expected text
			'computing stop words filtered processing natural language data text stop words refer common ' .
			'words language single universal list stop words natural language processing tools list tools ' .
			'specifically avoid removing stop words support phrase search'
		);

		// https://es.wikipedia.org/wiki/Palabra_vac%C3%ADa
		$provider[] = array(
			'es',
			// Input text
			'Palabras vacías es el nombre que reciben las palabras sin significado como artículos, pronombres, ' .
			'preposiciones, etc. que son filtradas antes o después del procesamiento de datos en lenguaje natural ' .
			'(texto). A Hans Peter Luhn, uno de los pioneros en recuperación de información, se le atribuye la ' .
			'acuñación de la locución inglesa stop words y el uso del concepto en su diseño. Está controlada por ' .
			'introducción humana y no automática.',
			// Expected text
			'palabras vacías nombre que reciben palabras significado artículos pronombres preposiciones etc que ' .
			'son filtradas después del procesamiento datos lenguaje natural texto hans peter luhn pioneros ' .
			'recuperación información atribuye acuñación locución inglesa stop words del concepto diseño está ' .
			'controlada introducción humana automática'
		);

		// https://de.wikipedia.org/wiki/Stoppwort
		$provider[] = array(
			'de',
			// Input text
			'Stoppwörter nennt man im Information Retrieval Wörter, die bei einer Volltextindexierung nicht beachtet ' .
			'werden, da sie sehr häufig auftreten und gewöhnlich keine Relevanz für die Erfassung des Dokumentinhalts ' .
			'besitzen.',
			// Expected text
			'stoppwörter nennt information retrieval wörter volltextindexierung beachtet häufig auftreten gewöhnlich ' .
			'relevanz erfassung dokumentinhalts besitzen'
		);

		// https://en.wikipedia.org/wiki/Query_expansion
		$provider[] = array(
			'en',
			// Input text
			"The goal of query expansion in this regard is by increasing recall, precision can potentially increase " .
			"(rather than decrease as mathematically equated), by including in the result set pages which are more " .
			"relevant (of higher quality), or at least equally relevant. Pages which would not be included in the " .
			"result set, which have the potential to be more relevant to the user's desired query, are included, and " .
			"without query expansion would not have, regardless of relevance.",
			// Expected text
			"goal query expansion regard increasing recall precision potentially increase decrease mathematically " .
			"equated including result set pages relevant quality equally relevant pages included result set potential " .
			"relevant user desired query included query expansion relevance"
		);

		return $provider;
	}

}
103
|
|
|
|
Adding an explicit array definition is generally preferable to implicit array definition as it guarantees a stable state of the code.
Let’s take a look at an example:
As you can see in this example, the array $myArray is initialized the first time when the foreach loop is entered. You can also see that the value of the bar key is only written conditionally; thus, its value might result from a previous iteration. This might or might not be intended. To make your intention clear, your code more readable, and to avoid accidental bugs, we recommend adding an explicit initialization $myArray = array() either outside or inside the foreach loop.