1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* User: jensk |
4
|
|
|
* Date: 21-2-2017 |
5
|
|
|
* Time: 17:05 |
6
|
|
|
*/ |
7
|
|
|
|
8
|
|
|
namespace library\search; |
9
|
|
|
use library\search\results\SearchResult; |
10
|
|
|
|
11
|
|
|
/**
 * Class Search
 *
 * Ranks the documents in the search index against the tokens of a query.
 *
 * The scoring follows the shape of the practical scoring function:
 *
 *     score(q,d) =
 *         queryNorm(q)
 *       · coord(q,d)
 *       · ∑ (
 *             tf(t in d)
 *           · idf(t)²
 *           · t.getBoost()
 *           · norm(t,d)
 *         ) (t in q)
 *
 * How the terms map onto this class:
 *  - queryNorm(q) is computed by getQueryNorm()
 *  - tf, idf and norm are multiplied per token in the SQL of getResultsForToken()
 *  - the per token scores are summed per document in flattenResults()
 *  - coord(q,d) is applied by applyQueryCoordination()
 *  - no boost factor is applied anywhere in this class
 *
 * NOTE(review): the SQL multiplies by the stored inverseDocumentFrequency
 * once; whether that stored value is already squared cannot be told from
 * this class - confirm against the code that fills the index.
 *
 * @see https://www.elastic.co/guide/en/elasticsearch/guide/current/practical-scoring-function.html
 * @package library\search
 */
class Search extends SearchDbConnected
{
    /**
     * The tokenizer of the query that is currently being handled;
     * set by getDocumentsForTokenizer()
     * @var Tokenizer
     */
    protected $tokenizer;

    /**
     * Not read or written by any method of this class.
     * @var array
     */
    protected $results = array();

    /**
     * An array containing the names of classes implementing \library\search\Filters.
     * The names are resolved inside the \library\search\filters namespace
     * and are applied, in this order, to all tokenizers
     * @var array
     */
    protected $filters = array(
        'DutchStopWords',
        'EnglishStopWords'
    );

    /**
     * Returns an array of SearchResult and / or SearchSuggestion objects,
     * based on the tokens in the Tokenizer.
     * The suggestions come first, followed by the results ordered by
     * descending score.
     * @param Tokenizer $tokenizer
     *
     * @return array
     * @throws \Exception
     */
    public function getDocumentsForTokenizer(Tokenizer $tokenizer)
    {
        $this->tokenizer = $tokenizer;
        $resultsPerTokens = $this->queryTokens();

        $flatResults = $this->flattenResults($resultsPerTokens);
        $flatResults = $this->applyQueryCoordination($flatResults);
        // usort re-indexes the array, so the documentPath keys are dropped here
        usort($flatResults, array($this, "scoreCompare"));

        // getSearchSuggestions() takes no arguments, it reads $this->tokenizer
        return array_merge($this->getSearchSuggestions(), $flatResults);
    }

    /**
     * Returns the amount of distinct documents
     * that are currently in the search index.
     * @return int
     * @throws \Exception
     */
    public function getIndexedDocuments()
    {
        $db = $this->getSearchDbHandle();
        $sql = '
            SELECT count(DISTINCT documentPath) as indexedDocuments
              FROM term_frequency
        ';
        $stmt = $db->query($sql);
        if (!$stmt) {
            $errorInfo = $db->errorInfo();
            throw new \Exception('SQLite Exception: ' . $errorInfo[2] . ' in SQL: <br /><pre>' . $sql . '</pre>');
        }
        $result = $stmt->fetch(\PDO::FETCH_COLUMN);
        if (false === $result) {
            // A failing fetch is an error of the statement, not of the handle
            $errorInfo = $stmt->errorInfo();
            throw new \Exception('SQLite Exception: ' . $errorInfo[2] . ' in SQL: <br /><pre>' . $sql . '</pre>');
        }
        return intval($result);
    }

    /**
     * Queries each token present in the Tokenizer
     * and returns the SearchResult objects for the found
     * documents, grouped by token
     * @return array
     * @throws \Exception
     */
    private function queryTokens()
    {
        $tokens = $this->getTokens();

        $queryNorm = $this->getQueryNorm($tokens);
        $results = array();
        foreach ($tokens as $token) {
            $results[$token] = $this->getResultsForToken($token, $queryNorm);
        }
        return $results;
    }

    /**
     * Applies the Filter objects in the filter array to the
     * tokens in the Tokenizer.
     * Each filter is constructed with the tokens left over by the
     * previous filter.
     * @param $tokens
     *
     * @return mixed
     */
    protected function applyFilters($tokens)
    {
        foreach ($this->filters as $filterName) {
            $filterClassName = '\library\search\filters\\' . $filterName;
            $filter = new $filterClassName($tokens);
            $tokens = $filter->getFilterResults();
        }
        return $tokens;
    }

    /**
     * Queries the search index for a given token
     * and the query norm.
     * Returns one SearchResult per document containing the token, with
     * score = queryNorm * TF * IDF * norm, highest score first.
     * @param $token
     * @param $queryNorm
     *
     * @return array
     * @throws \Exception
     */
    public function getResultsForToken($token, $queryNorm)
    {
        $db = $this->getSearchDbHandle();
        $sql = '
            SELECT (:queryNorm *
                    (SUM(term_frequency.frequency) --TF
                     * inverse_document_frequency.inverseDocumentFrequency -- IDF
                     * SUM(term_frequency.termNorm) -- norm
                    )
                   )as score,
                   SUM(term_frequency.frequency) as TF,
                   inverse_document_frequency.inverseDocumentFrequency as IDF,
                   SUM(term_frequency.termNorm) as norm,
                   term_frequency.documentPath
              FROM term_frequency
         LEFT JOIN inverse_document_frequency
                ON inverse_document_frequency.term = term_frequency.term
             WHERE term_frequency.term = :query
          GROUP BY term_frequency.documentPath, term_frequency.term
          ORDER BY score DESC
        ';
        $stmt = $this->executeSearchStatement($db, $sql, array(
            ':query'     => $token,
            ':queryNorm' => $queryNorm,
        ));
        return $stmt->fetchAll(\PDO::FETCH_CLASS, '\library\search\results\SearchResult');
    }

    /**
     * Merges the results per token into one SearchResult per document.
     * The scores of all tokens matching a document are summed and
     * every matching token is recorded in matchingTokens.
     * @param $resultsPerTokens
     *
     * @return array SearchResult objects, keyed by documentPath
     */
    private function flattenResults($resultsPerTokens)
    {
        $finalResults = array();
        foreach ($resultsPerTokens as $token => $resultPerToken) {
            foreach ($resultPerToken as $result) {
                $path = $result->documentPath;
                if (isset($finalResults[$path])) {
                    $finalResults[$path]->score += floatval($result->score);
                    $finalResults[$path]->matchingTokens[] = $token;
                    continue;
                }
                $resultObj = new SearchResult();
                $resultObj->documentPath = $path;
                $resultObj->matchingTokens = array($token);
                $resultObj->score = floatval($result->score);
                $resultObj->setStorage($this->storage);
                $finalResults[$path] = $resultObj;
            }
        }
        return $finalResults;
    }

    /**
     * Comparison callback for usort; orders by score, highest first
     * @param $a
     * @param $b
     *
     * @return int
     */
    private function scoreCompare($a, $b)
    {
        if ($a->score == $b->score) {
            return 0;
        }
        return ($a->score > $b->score) ? -1 : 1;
    }

    /**
     * Calculates the query norm for all tokens in the Tokenizer:
     * 1 / sqrt(sum of the inverse document frequencies of the tokens).
     * Falls back to 1 when the norm can not be determined, for instance
     * when none of the tokens is present in the index.
     * @param $tokens
     *
     * @return float|int
     * @throws \Exception
     */
    private function getQueryNorm($tokens)
    {
        $db = $this->getSearchDbHandle();
        // The SQL below calls sqrt(); expose the PHP function under that name
        $db->sqliteCreateFunction('sqrt', 'sqrt', 1);
        if (empty($tokens)) {
            return 1;
        }

        // One positional placeholder per token, so no token ends up in the SQL text
        $placeholders = array();
        $bindings = array();
        $position = 1;
        foreach ($tokens as $token) {
            $placeholders[] = '?';
            $bindings[$position] = (string) $token;
            $position += 1;
        }
        $sql = '
            SELECT (1 / sqrt(SUM(inverseDocumentFrequency))) as queryNorm
              FROM inverse_document_frequency
             WHERE term IN (' . implode(',', $placeholders) . ')
        ';
        $stmt = $this->executeSearchStatement($db, $sql, $bindings);
        $result = $stmt->fetch(\PDO::FETCH_OBJ);
        if ($result === false || $result->queryNorm == null) {
            return 1;
        }
        return $result->queryNorm;
    }

    /**
     * Applies query coordination to all results: the score of each
     * result is multiplied by the fraction of query tokens it matches.
     * The token count is taken from the unfiltered token vector of the
     * Tokenizer.
     * @param $flatResults
     *
     * @return mixed
     */
    private function applyQueryCoordination($flatResults)
    {
        $tokenCount = count($this->tokenizer->getTokenVector());
        if ($tokenCount === 0) {
            // Nothing to coordinate against; also prevents a division by zero
            return $flatResults;
        }
        foreach ($flatResults as $result) {
            $matchCount = count($result->matchingTokens);
            // $result is an object, so the entry in $flatResults is updated in place
            $result->score = ($matchCount / $tokenCount) * $result->score;
        }
        return $flatResults;
    }

    /**
     * Uses the levenshtein algorithm to determine the term that is
     * closest to the token that was input for the search.
     * A token that is present in the index itself (edit distance 0)
     * yields no suggestion.
     * @return array
     * @throws \Exception
     */
    private function getSearchSuggestions()
    {
        $tokens = $this->getTokens();
        $allResults = array();
        if (empty($tokens)) {
            return $allResults;
        }

        $db = $this->getSearchDbHandle();
        // The SQL below calls levenshtein(); expose the PHP function under that name
        $db->sqliteCreateFunction('levenshtein', 'levenshtein', 2);
        $sql = '
            SELECT *
              FROM (
                    SELECT :token as original, term, levenshtein(term, :token) as editDistance
                      FROM inverse_document_frequency
                  ORDER BY editDistance ASC
                     LIMIT 0, 1
                   )
             WHERE editDistance > 0
        ';
        foreach ($tokens as $token) {
            $stmt = $this->executeSearchStatement($db, $sql, array(':token' => $token));
            $result = $stmt->fetchAll(\PDO::FETCH_CLASS, '\library\search\results\SearchSuggestion');
            // Suggestions for later tokens are placed in front of earlier ones
            $allResults = array_merge($result, $allResults);
        }
        return $allResults;
    }

    /**
     * Retrieves all tokens from the tokenizer, after the filters
     * have been applied to them
     * @return array
     */
    private function getTokens()
    {
        $tokenVector = array(
            'query' => $this->tokenizer->getTokenVector(),
        );
        $filtered = $this->applyFilters($tokenVector);
        if (!is_array($filtered) || !isset($filtered['query']) || !is_array($filtered['query'])) {
            // The filters left nothing to search for
            return array();
        }

        // The tokens are the keys of the token vector
        return array_keys($filtered['query']);
    }

    /**
     * Prepares the SQL on the given database handle, binds the values
     * and executes the statement.
     * @param       $db       The handle returned by getSearchDbHandle(); presumably a \PDO instance
     * @param       $sql
     * @param array $bindings Values by placeholder name, or by 1-based position
     *
     * @return mixed The executed statement
     * @throws \Exception When preparing or executing fails
     */
    private function executeSearchStatement($db, $sql, array $bindings = array())
    {
        $stmt = $db->prepare($sql);
        if (!$stmt) {
            throw new \Exception('SQLite exception: <pre>' . print_r($db->errorInfo(), true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        foreach ($bindings as $placeholder => $value) {
            $stmt->bindValue($placeholder, $value);
        }
        if (!$stmt->execute()) {
            // Errors raised while executing are recorded on the statement, not on the handle
            throw new \Exception('SQLite exception: <pre>' . print_r($stmt->errorInfo(), true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        return $stmt;
    }
}
This check compares calls to functions or methods with their respective definitions. If the call has more arguments than are defined, it raises an issue.
If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is WordPress.
In this case you can add the `@ignore` PhpDoc annotation to the duplicate definition and it will be ignored.