|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/**
 * Base Elasticsearch index settings.
 *
 * Collects stopwords, token filters and analyzers, and renders them as the
 * nested array returned by generateConfig().
 *
 * Synonyms
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
 *
 * ASCII folding
 * https://www.elastic.co/guide/en/elasticsearch/guide/current/asciifolding-token-filter.html
 *
 * Snowball
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-snowball-analyzer.html
 *
 * Thai tokenizer
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-thai-tokenizer.html
 *
 * Reverser
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-reverse-tokenfilter.html
 *
 * Elisions, possibly suitable for French
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-elision-tokenfilter.html
 *
 * Common grams
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-common-grams-tokenfilter.html
 *
 * This page has a long list
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#german-analyzer
 *
 * Boost weight and mix of stem/unstemmed
 * https://www.elastic.co/guide/en/elasticsearch/guide/current/most-fields.html
 *
 * Extend this class to create your own index settings
 */
class BaseIndexSettings {

    /**
     * When true, generateConfig() adds a 'folded' analyzer that lowercases
     * terms and strips accents, e.g. está is converted to esta.
     * @var boolean
     */
    private $foldedAscii = false;

    /**
     * Stopwords for this index.
     * Stored here but not read by generateConfig() in this class;
     * presumably consumed by language specific subclasses - TODO confirm.
     * @var array
     */
    protected $stopWords = array();

    /**
     * Filters added by the language specific settings, keyed by filter name.
     * @var array
     */
    private $filters = array();

    /**
     * Analyzers added by the language specific settings, keyed by analyzer name.
     * @var array
     */
    private $analyzers = array();

    /**
     * Name of the stopword filter appended to the 'unstemmed' analyzer's
     * filter chain. Null (the default) means no stopword filter is appended.
     * @var string|null
     */
    protected $stopWordFilter = null;


    /**
     * Set to true to add an extra analyzer containing a folded version of terms,
     * i.e. no accents on the letters
     * @param boolean $newFolding true for an extra analyzer with no accents
     */
    public function setAsciiFolding($newFolding) {
        $this->foldedAscii = $newFolding;
    }


    /**
     * Accessor for the ASCII folding flag
     * @return boolean the value last passed to setAsciiFolding(), false by default
     */
    public function getAsciiFolding() {
        return $this->foldedAscii;
    }


    /**
     * NOTE: Test with _german_ or _english_
     * Set the stopwords for this index
     * @param array|string $newStopWords An array of stopwords or a CSV string of stopwords.
     *                                   Entries of a CSV string are trimmed and empty
     *                                   entries are dropped; an array is stored as given.
     * @throws Exception if $newStopWords is neither a string nor an array
     */
    public function setStopwords($newStopWords) {
        if (is_array($newStopWords)) {
            $this->stopWords = $newStopWords;
            return;
        }

        if (!is_string($newStopWords)) {
            throw new Exception("ERROR: Stopwords must be a string or an array");
        }

        // "a, the" must yield "the" and not " the"; 'strlen' keeps "0" but
        // removes the empty strings produced by "" or by stray commas.
        $words = array_map('trim', explode(',', $newStopWords));
        $this->stopWords = array_values(array_filter($words, 'strlen'));
    }


    /**
     * Accessor for stopwords
     * @return array the stopwords currently set
     */
    public function getStopwords() {
        return $this->stopWords;
    }


    /**
     * Add a filter, expressed as an array. An existing filter of the same
     * name is replaced.
     * @param string $name The name of the filter
     * @param array $properties The filter modelled as an array
     */
    public function addFilter($name, $properties) {
        $this->filters[$name] = $properties;
    }


    /**
     * Add an analyzer, expressed as an array. An existing analyzer of the
     * same name is replaced.
     * @param string $name The name of the analyzer
     * @param array $properties The analyzer modelled as an array
     */
    public function addAnalyzer($name, $properties) {
        $this->analyzers[$name] = $properties;
    }


    /**
     * Generate an Elasticsearch config representing the configurations previously set.
     *
     * Registers the built-in filters (no_single_chars, autocomplete,
     * filter_shingle) and analyzers (autocomplete_index_analyzer,
     * autocomplete_search_analyzer, unstemmed, optionally folded, shingles)
     * on this object, after any filters and analyzers already added.
     * A 'stemmed' analyzer is not created here; it is expected to be set by
     * the specific language provider.
     *
     * @return array of the form
     *         array('index' => array('analysis' => array('analyzer' => ..., 'filter' => ...)))
     */
    public function generateConfig() {
        // Drop tokens shorter than two characters
        $this->addFilter('no_single_chars', array(
            'type' => 'length',
            'min' => 2
        ));

        // Autocomplete filter
        // NOTE(review): token_chars is documented for the ngram tokenizer;
        // confirm the ngram token filter accepts it on the targeted
        // Elasticsearch version.
        $this->addFilter('autocomplete', array(
            'type' => 'nGram',
            'min_gram' => 2,
            'max_gram' => 20,
            'token_chars' => array('letter', 'digit','punctuation', 'symbol')
        ));

        // n-grams are generated at index time only; the search side just
        // normalises the query terms.
        $this->addAnalyzer('autocomplete_index_analyzer', array(
            'type' => 'custom',
            'tokenizer' => 'whitespace',
            'filter' => array(
                'lowercase',
                'asciifolding',
                'autocomplete'
            )
        ));

        $this->addAnalyzer('autocomplete_search_analyzer', array(
            'type' => 'custom',
            'tokenizer' => 'whitespace',
            'filter' => array(
                'lowercase',
                'asciifolding'
            )
        ));

        // HTML needs to have been removed for all indexes
        // stemmed is set by the specific language provider
        $unstemmedFilters = array('no_single_chars', 'lowercase');
        if ($this->stopWordFilter !== null) {
            // Only reference a stopword filter when one has been configured,
            // otherwise the filter chain would contain a null entry.
            $unstemmedFilters[] = $this->stopWordFilter;
        }
        $this->addAnalyzer('unstemmed', array(
            'type' => 'custom',
            'tokenizer' => 'uax_url_email',
            'filter' => $unstemmedFilters
        ));

        // Folded analyzer
        if ($this->foldedAscii) {
            $this->addAnalyzer('folded', array(
                'type' => 'custom',
                'tokenizer' => 'uax_url_email',
                'filter' => array('lowercase', 'asciifolding')
            ));
        }

        // Store bigrams in the index, namely pairs of words
        $this->addFilter('filter_shingle', array(
            'type' => 'shingle',
            'min_shingle_size' => 2,
            'max_shingle_size' => 2,
            'output_unigrams' => false
        ));

        // See https://www.elastic.co/blog/searching-with-shingles?q=shingle for details
        $this->addAnalyzer('shingles', array(
            // Ensure URLs happily tokenized
            'tokenizer' => 'uax_url_email',
            'filter' => array("lowercase", "filter_shingle"),
            'type' => 'custom'
        ));

        return array(
            'index' => array(
                'analysis' => array(
                    'analyzer' => $this->analyzers,
                    'filter' => $this->filters
                )
            )
        );
    }
}
|
278
|
|
|
|
This check marks private properties in classes that are never used. Those properties can be removed.