Completed
Push — dev2 ( f6a7ae...88484c )
by Gordon
07:43
created

BaseIndexSettings   A

Complexity

Total Complexity 10

Size/Duplication

Total Lines 244
Duplicated Lines 0 %

Coupling/Cohesion

Components 2
Dependencies 0

Test Coverage

Coverage 91.78%
Metric Value
wmc 10
lcom 2
cbo 0
dl 0
loc 244
ccs 67
cts 73
cp 0.9178
rs 10

7 Methods

Rating   Name   Duplication   Size   Complexity  
A setAsciiFolding() 0 3 1
A getAsciiFolding() 0 3 1
A setStopwords() 0 9 3
A getStopwords() 0 3 1
A addFilter() 0 3 1
A addAnalyzer() 0 3 1
B generateConfig() 0 147 2
1
<?php
2
3
/**
4
*
5
* Synonyms
6
* https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
7
*
8
* ASCII folding
9
* https://www.elastic.co/guide/en/elasticsearch/guide/current/asciifolding-token-filter.html
10
*
11
* Snowball
12
* https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-snowball-analyzer.html
13
*
14
* Thai tokenizer
15
* https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-thai-tokenizer.html
16
*
17
* Reverser
18
* https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-reverse-tokenfilter.html
19
*
20
* Elisions, possibly suitable for French
21
* https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-elision-tokenfilter.html
22
* Common grams
23
* https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-common-grams-tokenfilter.html
24
*
25
* This page has a long list
26
* https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#german-analyzer
27
*
28
* Boost weight and mix of stem/unstemmed
29
* https://www.elastic.co/guide/en/elasticsearch/guide/current/most-fields.html
30
*
31
* Extend this class to create your own index settings
32
*
33
*/
34
class BaseIndexSettings {

	// Collects stopwords, token filters and analyzers, and renders them as an
	// Elasticsearch index settings array via generateConfig().

	/**
	 * If true add a field called folded with likes of está converted to esta
	 * @var boolean
	 */
	private $foldedAscii = false;

	/**
	 * Stopwords for this index
	 * @var array
	 */
	protected $stopWords = array();

	/**
	 * Synonyms for this index in form of CSV terms => actual term.
	 * NOTE: not read or written anywhere in this class yet.
	 * @var array
	 */
	private $synonyms = array();

	/**
	 * Filters added by the language specific settings, keyed by filter name
	 * @var array
	 */
	private $filters = array();

	/**
	 * Analyzers added by the language specific settings, keyed by analyzer name
	 * @var array
	 */
	private $analyzers = array();

	/**
	 * Name of a stopword filter to append to the filter chain of the
	 * 'unstemmed' analyzer, or null for none. This class never assigns it;
	 * presumably language specific subclasses do.
	 * @var string|null
	 */
	protected $stopWordFilter = null;


	/**
	 * Set to true to add an extra analyzer producing a folded version of terms,
	 * i.e. no accents on the letters
	 * @param boolean $newFolding true for an extra 'folded' analyzer with no accents
	 */
	public function setAsciiFolding($newFolding) {
		$this->foldedAscii = $newFolding;
	}


	/**
	 * Accessor for the ASCII folding flag
	 * @return boolean the value last passed to setAsciiFolding(), false by default
	 */
	public function getAsciiFolding() {
		return $this->foldedAscii;
	}


	/**
	 * NOTE: Test with _german_ or _english_
	 * Set the stopwords for this index
	 * @param array|string $newStopWords An array of stopwords (stored as is) or a
	 *                                   CSV string of stopwords (split on commas,
	 *                                   surrounding whitespace and empty terms removed)
	 * @throws InvalidArgumentException if $newStopWords is neither a string nor an array
	 */
	public function setStopwords($newStopWords) {
		if (is_array($newStopWords)) {
			$this->stopWords = $newStopWords;
			return;
		}

		if (!is_string($newStopWords)) {
			// InvalidArgumentException extends Exception, so existing catch blocks still match
			throw new InvalidArgumentException("ERROR: Stopwords must be a string or an array");
		}

		// "a, b" must yield 'b' rather than ' b', and "" must yield no stopwords
		// rather than a single empty one
		$words = array();
		foreach (explode(',', $newStopWords) as $word) {
			$word = trim($word);
			if ($word !== '') {
				$words[] = $word;
			}
		}
		$this->stopWords = $words;
	}


	/**
	 * Accessor for stopwords
	 * @return array the stopwords as stored by setStopwords()
	 */
	public function getStopwords() {
		return $this->stopWords;
	}


	/**
	 * Add a filter, expressed as an array. A filter already registered under
	 * the same name is replaced.
	 * @param string $name The name of the filter
	 * @param array $properties The filter modelled as an array
	 */
	public function addFilter($name, $properties) {
		$this->filters[$name] = $properties;
	}


	/**
	 * Add an analyzer, expressed as an array. An analyzer already registered
	 * under the same name is replaced.
	 * @param string $name       The name of the analyzer
	 * @param array $properties The analyzer modelled as an array
	 */
	public function addAnalyzer($name, $properties) {
		$this->analyzers[$name] = $properties;
	}


	/**
	 * Generate an Elasticsearch config representing the configurations previously set.
	 *
	 * Besides returning the config, this registers the built in filters
	 * (no_single_chars, autocomplete, filter_shingle) and analyzers
	 * (autocomplete_index_analyzer, autocomplete_search_analyzer, unstemmed,
	 * shingles and, when ASCII folding is enabled, folded) on this instance.
	 * Entries are keyed by name, so calling the method repeatedly does not
	 * duplicate them.
	 *
	 * @return array of the form
	 *               array('index' => array('analysis' => array(
	 *                   'analyzer' => array(name => definition, ...),
	 *                   'filter' => array(name => definition, ...)
	 *               )))
	 */
	public function generateConfig() {
		// Drop single character tokens
		$this->addFilter('no_single_chars', array(
			'type' => 'length',
			'min' => 2
		));

		// The stopword filter is optional; a null entry in the chain would not
		// name any filter, so it is only appended when one has been configured
		$unstemmedFilters = array('no_single_chars', 'lowercase');
		if ($this->stopWordFilter !== null) {
			$unstemmedFilters[] = $this->stopWordFilter;
		}

		$analyzerNotStemmed = array(
			'type' => 'custom',
			'tokenizer' => 'uax_url_email',
			'filter' => $unstemmedFilters
		);

		//Autocomplete filter
		$this->addFilter('autocomplete', array(
			'type' => 'nGram',
			'min_gram' => 2,
			'max_gram' => 20,
			'token_chars' => array('letter', 'digit','punctuation', 'symbol')
		));

		// Index side analyzer applies the ngram filter ...
		$this->addAnalyzer('autocomplete_index_analyzer', array(
			'type' => 'custom',
			'tokenizer' => 'whitespace',
			'filter' => array(
				'lowercase',
				'asciifolding',
				'autocomplete'
			)
		));

		// ... whereas the search side analyzer omits it
		$this->addAnalyzer('autocomplete_search_analyzer', array(
			'type' => 'custom',
			'tokenizer' => 'whitespace',
			'filter' => array(
				'lowercase',
				'asciifolding'
			)
		));

		//HTML needs to have been removed for all indexes
		//stemmed is set by the specific language provider
		$this->analyzers['unstemmed'] = $analyzerNotStemmed;

		//Folded analyzer, only emitted when ASCII folding has been requested.
		//It is stored on $this->analyzers (not a local variable) so that it
		//actually reaches the generated config, and uses the 'filter' key like
		//every other analyzer defined here.
		if ($this->foldedAscii) {
			$this->analyzers['folded'] = array(
				'type' => 'custom',
				'tokenizer' => 'uax_url_email',
				'filter' => array('lowercase', 'asciifolding')
			);
		}

		//Store bigrams in the index, namely pairs of words
		$this->addFilter('filter_shingle', array(
			'type' => 'shingle',
			'min_shingle_size' => 2,
			'max_shingle_size' => 2,
			'output_unigrams' => false
		));

		//See https://www.elastic.co/blog/searching-with-shingles?q=shingle for details
		$this->addAnalyzer('shingles', array(
			// Ensure URLs happily tokenized
			'tokenizer' => 'uax_url_email',
			'filter' => array("lowercase", "filter_shingle"),
			'type' => 'custom'
		));

		return array(
			'index' => array(
				'analysis' => array(
					'analyzer' => $this->analyzers,
					'filter' => $this->filters
				)
			)
		);
	}
}
278