Passed
Push — develop ( 5bae1d...5b2d1e )
by Jens
02:48
created

Search::getSearchSuggestions()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 5
nc 2
nop 0
dl 0
loc 8
rs 9.4285
c 0
b 0
f 0
1
<?php
2
/**
3
 * User: jensk
4
 * Date: 21-2-2017
5
 * Time: 17:05
6
 */
7
8
namespace CloudControl\Cms\search;
9
10
use CloudControl\Cms\search\results\SearchResult;
11
12
/**
13
 * Class Search
14
 * Formula:
15
 * score(q,d)  =
16
 *        queryNorm(q)
17
 *        · coord(q,d)
18
 *        · ∑ (
19
 *            tf(t in d)
20
 *            · idf(t)²
21
 *            · t.getBoost()
22
 *            · norm(t,d)
23
 *        ) (t in q)
24
 *
25
 * @see https://www.elastic.co/guide/en/elasticsearch/guide/current/practical-scoring-function.html
26
 * @package CloudControl\Cms\search
27
 */
28
class Search extends SearchDbConnected
29
{
30
    /**
31
     * @var Tokenizer
32
     */
33
    protected $tokenizer;
34
    protected $results = array();
35
36
    /**
37
     * An array containing the names of filter classes from the \CloudControl\Cms\search\filters namespace
38
     * These will be applied to all tokenizers
39
     * @var array
40
     */
41
    protected $filters = array(
42
        'DutchStopWords',
43
        'EnglishStopWords'
44
    );
45
46
    /**
     * Runs a search for the tokens held by the given Tokenizer.
     *
     * The matching documents are scored, weighted by query coordination and
     * sorted with scoreCompare() (highest score first). The search
     * suggestions are placed in front of the sorted SearchResult objects.
     *
     * @param Tokenizer $tokenizer
     *
     * @return array SearchSuggestion objects followed by SearchResult objects
     * @throws \Exception
     */
    public function getDocumentsForTokenizer(Tokenizer $tokenizer)
    {
        $this->tokenizer = $tokenizer;

        // query -> merge per document -> weigh by the share of matched tokens
        $scoredDocuments = $this->applyQueryCoordination(
            $this->flattenResults($this->queryTokens())
        );
        usort($scoredDocuments, array($this, 'scoreCompare'));

        return array_merge($this->getSearchSuggestions(), $scoredDocuments);
    }
67
68
    /**
     * Returns the amount of distinct documents
     * that are currently in the search index.
     *
     * @return int
     * @throws \RuntimeException when the count query cannot be run or fetched
     */
    public function getIndexedDocuments()
    {
        $db = $this->getSearchDbHandle();
        $sql = '
			SELECT count(DISTINCT documentPath) AS indexedDocuments
			  FROM term_frequency
		';
        $stmt = $db->query($sql);
        if ($stmt === false) {
            // query() failed on the connection itself, so the handle holds the error
            $errorInfo = $db->errorInfo();
            throw new \RuntimeException('SQLite Exception: ' . $errorInfo[2] . ' in SQL: <br /><pre>' . $sql . '</pre>');
        }
        $result = $stmt->fetch(\PDO::FETCH_COLUMN);
        if (false === $result) {
            // A failed fetch is reported on the statement, not on the connection handle
            $errorInfo = $stmt->errorInfo();
            throw new \RuntimeException('SQLite Exception: ' . $errorInfo[2] . ' in SQL: <br /><pre>' . $sql . '</pre>');
        }
        return (int)$result;
    }
94
95
    /**
     * Looks up every (filtered) token of the Tokenizer in the search index.
     *
     * The query norm is calculated once for the complete token set and is
     * passed along with each individual token lookup.
     *
     * @return array the results of getResultsForToken(), keyed by token
     * @throws \Exception
     */
    private function queryTokens()
    {
        $terms = $this->getTokens();
        $norm = $this->getQueryNorm($terms);

        $resultsPerToken = array();
        foreach ($terms as $term) {
            $resultsPerToken[$term] = $this->getResultsForToken($term, $norm);
        }

        return $resultsPerToken;
    }
113
114
    /**
     * Runs the tokens through every filter listed in the filter array.
     *
     * Each filter is constructed with the current tokens and the outcome of
     * its getFilterResults() becomes the input for the next filter.
     *
     * @param $tokens
     *
     * @return mixed the tokens as returned by the last filter
     */
    protected function applyFilters($tokens)
    {
        $filterNamespace = '\CloudControl\Cms\search\filters\\';

        foreach ($this->filters as $filterName) {
            $className = $filterNamespace . $filterName;
            $filterInstance = new $className($tokens);
            $tokens = $filterInstance->getFilterResults();
        }

        return $tokens;
    }
130
131
    /**
     * Queries the search index for a given token
     * and the query norm.
     *
     * Per document the score is queryNorm * (summed term frequency
     * * inverse document frequency * summed term norm).
     *
     * @param $token
     * @param $queryNorm
     *
     * @return array SearchResult objects, ordered by score (highest first)
     * @throws \RuntimeException when the statement cannot be prepared or executed
     */
    public function getResultsForToken($token, $queryNorm)
    {
        $db = $this->getSearchDbHandle();
        $sql = '
			SELECT (:queryNorm * 
						(SUM(term_frequency.frequency) --TF
						* inverse_document_frequency.inverseDocumentFrequency -- IDF
						* SUM(term_frequency.termNorm) -- norm
						) 
				    )AS score,
				   SUM(term_frequency.frequency) AS TF,
				   inverse_document_frequency.inverseDocumentFrequency AS IDF,
				   SUM(term_frequency.termNorm) AS norm,
				   term_frequency.documentPath
			  FROM term_frequency
		 LEFT JOIN inverse_document_frequency
		 		ON inverse_document_frequency.term = term_frequency.term
			 WHERE term_frequency.term = :query
		  GROUP BY term_frequency.documentPath, term_frequency.term
		  ORDER BY score DESC
		';
        $stmt = $db->prepare($sql);
        if ($stmt === false) {
            // prepare() fails on the connection, so the handle carries the error
            throw new \RuntimeException('SQLite exception: <pre>' . print_r($db->errorInfo(),
                    true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        $stmt->bindValue(':query', $token);
        $stmt->bindValue(':queryNorm', $queryNorm);
        if (!$stmt->execute()) {
            // execute() errors live on the statement; the connection handle does not reflect them
            throw new \RuntimeException('SQLite exception: <pre>' . print_r($stmt->errorInfo(),
                    true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        return $stmt->fetchAll(\PDO::FETCH_CLASS, SearchResult::class);
    }
173
174
    /**
     * Merges the per-token results into one SearchResult per document.
     *
     * A document that matches several tokens gets the sum of its per-token
     * scores and collects every token it matched in matchingTokens.
     *
     * @param $resultsPerTokens results keyed by token
     *
     * @return array SearchResult objects keyed by document path
     */
    private function flattenResults($resultsPerTokens)
    {
        $documents = array();

        foreach ($resultsPerTokens as $token => $tokenResults) {
            foreach ($tokenResults as $row) {
                if (!isset($documents[$row->documentPath])) {
                    // First hit for this document: start a fresh result object
                    $document = new SearchResult();
                    $document->documentPath = $row->documentPath;
                    $document->matchingTokens = array($token);
                    $document->score = (float)$row->score;
                    $document->setStorage($this->storage);
                    $documents[$row->documentPath] = $document;
                    continue;
                }

                // Document seen before: accumulate score and remember the token
                $documents[$row->documentPath]->score += $row->score;
                $documents[$row->documentPath]->matchingTokens[] = $token;
            }
        }

        return $documents;
    }
199
200
    /**
     * Comparison callback that orders results by score, highest first.
     *
     * @param $a
     * @param $b
     * @return int 0 for identical scores, -1 when $a scores higher, 1 otherwise
     */
    private function scoreCompare($a, $b)
    {
        if ($a->score === $b->score) {
            return 0;
        }
        if ($a->score > $b->score) {
            return -1;
        }
        return 1;
    }
212
213
    /**
     * Calculates the query norm for all tokens in the Tokenizer:
     * 1 / sqrt(sum of the inverse document frequencies of the tokens).
     *
     * @param $tokens
     *
     * @return int|float|string 1 when no norm can be calculated, otherwise the
     *                          norm as returned by the database driver
     * @throws \RuntimeException when the statement cannot be prepared or executed
     */
    private function getQueryNorm($tokens)
    {
        $db = $this->getSearchDbHandle();
        // SQLite has no built-in sqrt(); expose PHP's implementation to the query
        $db->/** @scrutinizer ignore-call */
        sqliteCreateFunction('sqrt', 'sqrt', 1);
        if (empty($tokens)) {
            // Without terms there is nothing to normalise; skip the "IN ()" query
            return 1;
        }
        foreach ($tokens as $key => $token) {
            $tokens[$key] = $db->quote($token);
        }
        $terms = implode(',', $tokens);
        $sql = '
			SELECT (1 / sqrt(SUM(inverseDocumentFrequency))) AS queryNorm
			  FROM inverse_document_frequency
			 WHERE term IN (' . $terms . ') 
		';
        $stmt = $db->prepare($sql);
        if ($stmt === false) {
            // prepare() fails on the connection, so the handle carries the error
            throw new \RuntimeException('SQLite exception: <pre>' . print_r($db->errorInfo(),
                    true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        if (!$stmt->execute()) {
            // execute() errors live on the statement; the connection handle does not reflect them
            throw new \RuntimeException('SQLite exception: <pre>' . print_r($stmt->errorInfo(),
                    true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        $result = $stmt->fetch(\PDO::FETCH_OBJ);
        // No row, or a NULL norm (none of the terms is indexed): fall back to the neutral norm
        if ($result === false || $result->queryNorm == null) {
            return 1;
        }
        return $result->queryNorm;
    }
245
246
    /**
     * Applies query coordination to all results: every score is multiplied
     * by the fraction of the Tokenizer's tokens that the document matched.
     *
     * @param $flatResults
     *
     * @return mixed the same results with their scores adjusted
     */
    private function applyQueryCoordination($flatResults)
    {
        $queryTokens = array_keys($this->tokenizer->getTokenVector());
        $tokenCount = count($queryTokens);

        foreach ($flatResults as $path => $document) {
            $matched = count($document->matchingTokens);
            $document->score = ($matched / $tokenCount) * $document->score;
            $flatResults[$path] = $document;
        }

        return $flatResults;
    }
264
265
    /**
     * Collects the search suggestions for all (filtered) tokens of the
     * Tokenizer by handing each token to getSearchSuggestion(), which
     * accumulates the suggestions found so far.
     *
     * @return array
     * @throws \Exception
     */
    private function getSearchSuggestions()
    {
        $suggestions = array();

        foreach ($this->getTokens() as $term) {
            $suggestions = $this->getSearchSuggestion($term, $suggestions);
        }

        return $suggestions;
    }
280
281
    /**
     * Retrieves all tokens from the tokenizer.
     *
     * The token vector is wrapped under the 'query' key, passed through the
     * filters, and the keys of the filtered 'query' vector are returned.
     * An empty filter outcome is returned as is.
     *
     * @return array
     */
    private function getTokens()
    {
        $filtered = $this->applyFilters(array(
            'query' => $this->tokenizer->getTokenVector(),
        ));

        if (empty($filtered)) {
            return $filtered;
        }

        return array_keys($filtered['query']);
    }
298
299
    /**
     * Prepares the given SQL, binds the token to the :token placeholder
     * and executes the statement.
     *
     * @param \PDO $db
     * @param $sql
     * @param $token
     * @return mixed the executed statement
     * @throws \RuntimeException when the statement cannot be prepared or executed
     */
    private function getSearchSuggestionStatement($db, $sql, $token)
    {
        $stmt = $db->prepare($sql);
        if ($stmt === false) {
            // prepare() fails on the connection, so the handle carries the error
            throw new \RuntimeException('SQLite exception: <pre>' . print_r($db->errorInfo(),
                    true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        $stmt->bindValue(':token', $token);
        if (!$stmt->execute()) {
            // execute() errors live on the statement; the connection handle does not reflect them
            throw new \RuntimeException('SQLite exception: <pre>' . print_r($stmt->errorInfo(),
                    true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
        }
        return $stmt;
    }
320
321
    /**
     * Looks up the indexed term with the smallest levenshtein distance to
     * the token. When that closest term is the token itself (distance 0),
     * no suggestion is produced.
     *
     * @param $token
     * @param $allResults suggestions collected so far
     * @return array the suggestion for this token (if any) followed by $allResults
     */
    private function getSearchSuggestion($token, $allResults)
    {
        $db = $this->getSearchDbHandle();
        // Make PHP's levenshtein() callable from within the SQLite query
        $db->/** @scrutinizer ignore-call */
        sqliteCreateFunction('levenshtein', 'levenshtein', 2);
        $sql = '
				SELECT *
				  FROM (
				  	SELECT :token AS original, term, levenshtein(term, :token) AS editDistance
				  	  FROM inverse_document_frequency
			  	  ORDER BY editDistance ASC
			  	     LIMIT 0, 1
			  	     )
			  	   WHERE editDistance > 0
			';
        $suggestions = $this->getSearchSuggestionStatement($db, $sql, $token)
            ->fetchAll(\PDO::FETCH_CLASS, results\SearchSuggestion::class);

        return array_merge($suggestions, $allResults);
    }
346
}