1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/**
 *
 * Synonyms
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
 *
 * ASCII folding
 * https://www.elastic.co/guide/en/elasticsearch/guide/current/asciifolding-token-filter.html
 *
 * Snowball
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-snowball-analyzer.html
 *
 * Thai tokenizer
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-thai-tokenizer.html
 *
 * Reverser
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-reverse-tokenfilter.html
 *
 * Elisions, possibly suitable for French
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-elision-tokenfilter.html
 *
 * Common grams
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-common-grams-tokenfilter.html
 *
 * This page has a long list of language analyzers
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#german-analyzer
 *
 * Boost weight and mix of stemmed/unstemmed
 * https://www.elastic.co/guide/en/elasticsearch/guide/current/most-fields.html
 *
 * Extend this class to create your own index settings
 *
 */
34
|
|
|
class BaseIndexSettings {

    /**
     * When true, generateConfig() registers an extra analyzer called 'folded'
     * that lowercases and ASCII-folds terms, e.g. está is treated as esta
     * @var boolean
     */
    private $foldedAscii = false;

    /**
     * Stopwords for this index
     * @var array
     */
    protected $stopWords = array();

    /**
     * Synonyms for this index in form of CSV terms => actual term
     * NOTE(review): nothing in this class reads or writes this property yet.
     * @var array
     */
    private $synonyms = array();

    /**
     * Filters keyed by filter name, added via addFilter() (for example by the
     * language specific settings) and by generateConfig() itself
     * @var array
     */
    private $filters = array();

    /**
     * Analyzers keyed by analyzer name, added via addAnalyzer() (for example by
     * the language specific settings) and by generateConfig() itself
     * @var array
     */
    private $analyzers = array();

    /**
     * Name of the stop word filter appended to the filter chain of the
     * 'unstemmed' analyzer. When null, no stop word filter is appended.
     * NOTE(review): presumably assigned by language specific subclasses - confirm.
     * @var string|null
     */
    protected $stopWordFilter = null;


    /**
     * Set to true to add an extra analyzer containing a folded version of terms,
     * i.e. no accents on the letters
     * @param boolean $newFolding true for an extra analyzer with no accents
     */
    public function setAsciiFolding($newFolding) {
        $this->foldedAscii = $newFolding;
    }


    /**
     * Accessor for the ASCII folding flag
     * @return boolean the value last passed to setAsciiFolding(), false by default
     */
    public function getAsciiFolding() {
        return $this->foldedAscii;
    }


    /**
     * NOTE: Test with _german_ or _english_
     * Set the stopwords for this index
     * @param array|string $newStopWords An array of stopwords or a CSV string of stopwords.
     *                                   For a CSV string, whitespace around each entry is
     *                                   removed and empty entries are dropped. Arrays are
     *                                   stored as given.
     * @throws InvalidArgumentException if $newStopWords is neither a string nor an array
     */
    public function setStopwords($newStopWords) {
        if (is_array($newStopWords)) {
            $this->stopWords = $newStopWords;
        } elseif (is_string($newStopWords)) {
            $words = array();
            foreach (explode(',', $newStopWords) as $word) {
                $word = trim($word);
                // "a, the" must not produce " the", and "" must not produce array('')
                if ($word !== '') {
                    $words[] = $word;
                }
            }
            $this->stopWords = $words;
        } else {
            throw new InvalidArgumentException("ERROR: Stopwords must be a string or an array");
        }
    }


    /**
     * Accessor for stopwords
     * @return array the stopwords currently set for this index
     */
    public function getStopwords() {
        return $this->stopWords;
    }


    /**
     * Add a filter, expressed as an array. An existing filter with the same
     * name is replaced.
     * @param string $name The name of the filter
     * @param array $properties The filter modelled as an array
     */
    public function addFilter($name, $properties) {
        $this->filters[$name] = $properties;
    }


    /**
     * Add an analyzer, expressed as an array. An existing analyzer with the
     * same name is replaced.
     * @param string $name The name of the analyzer
     * @param array $properties The analyzer modelled as an array
     */
    public function addAnalyzer($name, $properties) {
        $this->analyzers[$name] = $properties;
    }


    /**
     * Generate an Elasticsearch config representing the configurations previously set.
     *
     * On top of whatever was registered through addFilter() and addAnalyzer(),
     * this registers the filters 'no_single_chars', 'autocomplete' and
     * 'filter_shingle' and the analyzers 'autocomplete_index_analyzer',
     * 'autocomplete_search_analyzer', 'unstemmed', 'shingles' and, when ASCII
     * folding is enabled, 'folded'. Entries already registered under those
     * names are overwritten.
     *
     * @return array settings of the form
     *               array('index' => array('analysis' => array('analyzer' => ..., 'filter' => ...)))
     */
    public function generateConfig() {
        // Discard tokens shorter than two characters
        $this->addFilter('no_single_chars', array(
            'type' => 'length',
            'min' => 2
        ));

        // Autocomplete: index time produces n-grams, search time does not
        // NOTE(review): 'token_chars' is documented for the ngram tokenizer;
        // confirm the token filter honours it on the targeted Elasticsearch version.
        $this->addFilter('autocomplete', array(
            'type' => 'nGram',
            'min_gram' => 2,
            'max_gram' => 20,
            'token_chars' => array('letter', 'digit','punctuation', 'symbol')
        ));

        $this->addAnalyzer('autocomplete_index_analyzer', array(
            'type' => 'custom',
            'tokenizer' => 'whitespace',
            'filter' => array(
                'lowercase',
                'asciifolding',
                'autocomplete'
            )
        ));

        $this->addAnalyzer('autocomplete_search_analyzer', array(
            'type' => 'custom',
            'tokenizer' => 'whitespace',
            'filter' => array(
                'lowercase',
                'asciifolding'
            )
        ));

        // HTML needs to have been removed for all indexes
        // stemmed is set by the specific language provider
        $unstemmedFilters = array('no_single_chars', 'lowercase');
        if ($this->stopWordFilter !== null) {
            // Only reference a stop word filter when one has actually been
            // named, otherwise the filter chain would contain a null entry
            $unstemmedFilters[] = $this->stopWordFilter;
        }
        $this->addAnalyzer('unstemmed', array(
            'type' => 'custom',
            'tokenizer' => 'uax_url_email',
            'filter' => $unstemmedFilters
        ));

        // Folded analyzer, only present when ASCII folding was requested
        if ($this->foldedAscii) {
            $this->addAnalyzer('folded', array(
                'type' => 'custom',
                'tokenizer' => 'uax_url_email',
                'filter' => array('lowercase', 'asciifolding')
            ));
        }

        // Store bigrams in the index, namely pairs of words
        $this->addFilter('filter_shingle', array(
            'type' => 'shingle',
            'min_shingle_size' => 2,
            'max_shingle_size' => 2,
            'output_unigrams' => false
        ));

        // See https://www.elastic.co/blog/searching-with-shingles?q=shingle for details
        $this->addAnalyzer('shingles', array(
            // Ensure URLs happily tokenized
            'tokenizer' => 'uax_url_email',
            'filter' => array("lowercase", "filter_shingle"),
            'type' => 'custom'
        ));

        return array(
            'index' => array(
                'analysis' => array(
                    'analyzer' => $this->analyzers,
                    'filter' => $this->filters
                )
            )
        );
    }
}
278
|
|
|
|
This check marks private properties in classes that are never used. Those properties can be removed.