Passed
Push — master ( 3f6c85...9c6499 )
by Jens
02:40
created

Search   A

Complexity

Total Complexity 31

Size/Duplication

Total Lines 278
Duplicated Lines 9.35 %

Coupling/Cohesion

Components 1
Dependencies 3

Importance

Changes 7
Bugs 0 Features 0
Metric Value
c 7
b 0
f 0
dl 26
loc 278
rs 9.8
wmc 31
lcom 1
cbo 3

11 Methods

Rating   Name   Duplication   Size   Complexity  
A flattenResults() 0 20 4
A scoreCompare() 0 6 3
B getResultsForToken() 3 30 3
B getQueryNorm() 3 22 5
A getDocumentsForTokenizer() 0 13 1
A queryTokens() 0 11 2
A applyFilters() 9 9 2
A applyQueryCoordination() 0 12 2
B getSearchSuggestions() 6 30 4
A getTokens() 0 13 2
A getIndexedDocuments() 5 20 3

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
<?php
2
/**
3
 * User: jensk
4
 * Date: 21-2-2017
5
 * Time: 17:05
6
 */
7
8
namespace library\search;
9
use library\search\results\SearchResult;
10
11
/**
12
 * Class Search
13
 * Formula:
14
 * score(q,d)  =
15
 *		queryNorm(q)
16
 *		· coord(q,d)
17
 *		· ∑ (
18
 *			tf(t in d)
19
 *			· idf(t)²
20
 *			· t.getBoost()
21
 *			· norm(t,d)
22
 *		) (t in q)
23
 *
24
 * @see https://www.elastic.co/guide/en/elasticsearch/guide/current/practical-scoring-function.html
25
 * @package library\search
26
 */
27
class Search extends SearchDbConnected
{
	/**
	 * Tokenizer holding the tokens of the query that is being processed.
	 * Assigned by getDocumentsForTokenizer().
	 * @var Tokenizer
	 */
	protected $tokenizer;

	/**
	 * Not read or written by any method of this class.
	 * @var array
	 */
	protected $results = array();

	/**
	 * An array containing classes implementing \library\search\Filters
	 * These will be applied to all tokenizers, in the order listed.
	 * Each name is resolved inside the \library\search\filters namespace.
	 * @var array
	 */
	protected $filters = array(
		'DutchStopWords',
		'EnglishStopWords'
	);

	/**
	 * Returns an array of SearchResult and / or SearchSuggestion objects,
	 * based on the tokens in the Tokenizer.
	 * Suggestions come first, followed by the results ordered by
	 * descending score.
	 * @param Tokenizer $tokenizer
	 *
	 * @return array
	 * @throws \Exception
	 */
	public function getDocumentsForTokenizer(Tokenizer $tokenizer)
	{
		$this->tokenizer = $tokenizer;
		$resultsPerTokens = $this->queryTokens();

		$flatResults = $this->flattenResults($resultsPerTokens);
		$flatResults = $this->applyQueryCoordination($flatResults);
		// usort() drops the documentPath keys, leaving a plain ordered list.
		usort($flatResults, array($this, "scoreCompare"));

		return array_merge($this->getSearchSuggestions(), $flatResults);
	}

	/**
	 * Returns the amount of distinct documents
	 * that are currently in the search index.
	 * @return int
	 * @throws \Exception
	 */
	public function getIndexedDocuments()
	{
		$db = $this->getSearchDbHandle();
		$sql = '
			SELECT count(DISTINCT documentPath) as indexedDocuments
			  FROM term_frequency
		';
		$stmt = $db->query($sql);
		// Errors are reported on the handle that raised them: the connection
		// when query() fails, the statement when fetch() fails.
		$errorSource = $db;
		$result = false;
		if ($stmt) {
			$errorSource = $stmt;
			$result = $stmt->fetch(\PDO::FETCH_COLUMN);
		}
		if (false === $result) {
			$errorInfo = $errorSource->errorInfo();
			$errorMsg = $errorInfo[2];
			throw new \Exception('SQLite Exception: ' . $errorMsg . ' in SQL: <br /><pre>' . $sql . '</pre>');
		}
		return intval($result);
	}

	/**
	 * Queries each token present in the Tokenizer
	 * and returns SearchResult objects for the found
	 * documents, grouped per token.
	 * @return array token => array of SearchResult
	 * @throws \Exception
	 */
	private function queryTokens()
	{
		$tokens = $this->getTokens();

		$queryNorm = $this->getQueryNorm($tokens);
		$results = array();
		foreach ($tokens as $token) {
			$results[$token] = $this->getResultsForToken($token, $queryNorm);
		}
		return $results;
	}

	/**
	 * Applies the Filter objects in the filter array to the
	 * tokens in the Tokenizer. Every filter receives the output of the
	 * previous one.
	 * @param $tokens
	 *
	 * @return mixed
	 */
	protected function applyFilters($tokens)
	{
		foreach ($this->filters as $filterName) {
			$filterClassName = '\library\search\filters\\' . $filterName;
			$filter = new $filterClassName($tokens);
			$tokens = $filter->getFilterResults();
		}
		return $tokens;
	}

	/**
	 * Queries the search index for a given token
	 * and the query norm.
	 * The query yields one row per document that contains the token.
	 * @param $token
	 * @param $queryNorm
	 *
	 * @return array array of SearchResult, highest score first
	 * @throws \Exception
	 */
	public function getResultsForToken($token, $queryNorm)
	{
		$db = $this->getSearchDbHandle();
		$sql = '
			SELECT (:queryNorm * 
						(SUM(term_frequency.frequency) --TF
						* inverse_document_frequency.inverseDocumentFrequency -- IDF
						* SUM(term_frequency.termNorm) -- norm
						) 
				    )as score,
				   SUM(term_frequency.frequency) as TF,
				   inverse_document_frequency.inverseDocumentFrequency as IDF,
				   SUM(term_frequency.termNorm) as norm,
				   term_frequency.documentPath
			  FROM term_frequency
		 LEFT JOIN inverse_document_frequency
		 		ON inverse_document_frequency.term = term_frequency.term
			 WHERE term_frequency.term = :query
		  GROUP BY term_frequency.documentPath, term_frequency.term
		  ORDER BY score DESC
		';
		$stmt = $this->prepareSearchStatement($db, $sql);
		$stmt->bindValue(':query', $token);
		$stmt->bindValue(':queryNorm', $queryNorm);
		$this->executeSearchStatement($stmt, $sql);
		return $stmt->fetchAll(\PDO::FETCH_CLASS, '\library\search\results\SearchResult');
	}

	/**
	 * Merges the per-token results into one SearchResult per document.
	 * The scores of all tokens matching a document are summed and the
	 * matching tokens are collected on the result.
	 * @param $resultsPerTokens
	 *
	 * @return array documentPath => SearchResult
	 */
	private function flattenResults($resultsPerTokens)
	{
		$finalResults = array();
		foreach ($resultsPerTokens as $token => $resultPerToken) {
			foreach ($resultPerToken as $result) {
				$path = $result->documentPath;
				if (isset($finalResults[$path])) {
					// floatval() keeps the accumulation consistent with the
					// initial assignment below.
					$finalResults[$path]->score += floatval($result->score);
					$finalResults[$path]->matchingTokens[] = $token;
				} else {
					$resultObj = new SearchResult();
					$resultObj->documentPath = $path;
					$resultObj->matchingTokens = array($token);
					$resultObj->score = floatval($result->score);
					$resultObj->setStorage($this->storage);
					$finalResults[$path] = $resultObj;
				}
			}
		}
		return $finalResults;
	}

	/**
	 * usort() callback that orders results by descending score.
	 * @param $a
	 * @param $b
	 *
	 * @return int
	 */
	private function scoreCompare($a, $b)
	{
		if ($a->score == $b->score) {
			return 0;
		}
		return ($a->score > $b->score) ? -1 : 1;
	}

	/**
	 * Calculates the query norm for all tokens in the Tokenizer:
	 * 1 / sqrt(sum of the inverse document frequencies of the tokens).
	 * Falls back to 1 when no norm can be calculated.
	 * @param $tokens
	 *
	 * @return float|int
	 * @throws \Exception
	 */
	private function getQueryNorm($tokens)
	{
		if (empty($tokens)) {
			// Nothing to look up; avoids building an empty "IN ()" list.
			return 1;
		}
		$db = $this->getSearchDbHandle();
		// Exposes PHP's sqrt() to the SQL below.
		$db->sqliteCreateFunction('sqrt', 'sqrt', 1);
		$quotedTokens = array();
		foreach ($tokens as $token) {
			$quotedTokens[] = $db->quote($token);
		}
		$terms = implode(',', $quotedTokens);
		$sql = '
			SELECT (1 / sqrt(SUM(inverseDocumentFrequency))) as queryNorm
			  FROM inverse_document_frequency
			 WHERE term IN (' . $terms . ') 
		';
		$stmt = $this->prepareSearchStatement($db, $sql);
		$this->executeSearchStatement($stmt, $sql);
		$result = $stmt->fetch(\PDO::FETCH_OBJ);
		if (false === $result || $result->queryNorm == null) {
			return 1;
		}
		return $result->queryNorm;
	}

	/**
	 * Applies query coordination to all results: every score is
	 * multiplied by the fraction of tokens in the Tokenizer's token
	 * vector that matched the document.
	 * @param $flatResults
	 *
	 * @return mixed
	 */
	private function applyQueryCoordination($flatResults)
	{
		$tokenCount = count($this->tokenizer->getTokenVector());
		if ($tokenCount === 0) {
			// No tokens to coordinate against; prevents a division by zero.
			return $flatResults;
		}
		foreach ($flatResults as $result) {
			$matchCount = count($result->matchingTokens);
			$result->score = ($matchCount / $tokenCount) * $result->score;
		}
		return $flatResults;
	}

	/**
	 * Uses the levenshtein algorithm to determine the term that is
	 * closest to the token that was input for the search.
	 * A token that is itself an indexed term (edit distance 0) yields
	 * no suggestion.
	 * @return array array of SearchSuggestion
	 * @throws \Exception
	 */
	private function getSearchSuggestions()
	{
		$tokens = $this->getTokens();
		$allResults = array();
		if (empty($tokens)) {
			return $allResults;
		}

		$db = $this->getSearchDbHandle();
		// Exposes PHP's levenshtein() to the SQL below.
		$db->sqliteCreateFunction('levenshtein', 'levenshtein', 2);
		$sql = '
				SELECT *
				  FROM (
				  	SELECT :token as original, term, levenshtein(term, :token) as editDistance
				  	  FROM inverse_document_frequency
			  	  ORDER BY editDistance ASC
			  	     LIMIT 0, 1
			  	     )
			  	   WHERE editDistance > 0
			';
		// The SQL is the same for every token, so it is prepared once and
		// re-executed with a new binding per token.
		$stmt = $this->prepareSearchStatement($db, $sql);
		foreach ($tokens as $token) {
			$stmt->bindValue(':token', $token);
			$this->executeSearchStatement($stmt, $sql);
			$result = $stmt->fetchAll(\PDO::FETCH_CLASS, '\library\search\results\SearchSuggestion');
			$allResults = array_merge($result, $allResults);
		}
		return $allResults;
	}

	/**
	 * Retrieves all tokens from the tokenizer, after the filters
	 * have been applied.
	 * @return array
	 */
	private function getTokens()
	{
		$tokenVector = array(
			'query' => $this->tokenizer->getTokenVector(),
		);
		$tokens = $this->applyFilters($tokenVector);
		if (!empty($tokens)) {
			$tokens = array_keys($tokens['query']);
		}

		return $tokens;
	}

	/**
	 * Prepares $sql on the given database handle.
	 * @param $db
	 * @param $sql
	 *
	 * @return mixed the prepared statement
	 * @throws \Exception when the statement cannot be prepared
	 */
	private function prepareSearchStatement($db, $sql)
	{
		$stmt = $db->prepare($sql);
		if (!$stmt) {
			throw $this->createSqlException($db->errorInfo(), $sql);
		}
		return $stmt;
	}

	/**
	 * Executes a prepared statement.
	 * @param $stmt
	 * @param $sql the SQL the statement was prepared from, used in the error message
	 *
	 * @throws \Exception when execution fails
	 */
	private function executeSearchStatement($stmt, $sql)
	{
		if (!$stmt->execute()) {
			// Execution errors are recorded on the statement handle,
			// not on the connection.
			throw $this->createSqlException($stmt->errorInfo(), $sql);
		}
	}

	/**
	 * Builds the exception describing a failed prepare or execute.
	 * @param array $errorInfo result of errorInfo() on the failing handle
	 * @param $sql
	 *
	 * @return \Exception
	 */
	private function createSqlException(array $errorInfo, $sql)
	{
		return new \Exception('SQLite exception: <pre>' . print_r($errorInfo, true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
	}
}