Search   A
last analyzed

Complexity

Total Complexity 34

Size/Duplication

Total Lines 322
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 120
c 1
b 0
f 0
dl 0
loc 322
rs 9.68
wmc 34

13 Methods

Rating   Name   Duplication   Size   Complexity  
A getDocumentsForTokenizer() 0 12 1
A queryTokens() 0 10 2
A flattenResults() 0 19 4
A applyFilters() 0 8 2
A scoreCompare() 0 6 3
A applyQueryCoordination() 0 11 2
A getSearchSuggestions() 0 8 2
A getResultsForToken() 0 37 3
A getIndexedDocuments() 0 19 3
A getTokens() 0 12 2
A getSearchSuggestionStatement() 0 13 4
A getQueryNorm() 0 24 5
A getSearchSuggestion() 0 19 1
1
<?php
2
/**
3
 * User: jensk
4
 * Date: 21-2-2017
5
 * Time: 17:05
6
 */
7
8
namespace CloudControl\Cms\search;
9
10
use CloudControl\Cms\search\results\SearchResult;
11
12
/**
 * Class Search
 *
 * Ranks indexed documents for the tokens of a Tokenizer and adds
 * spelling suggestions for those tokens.
 *
 * Formula the scoring is modelled on:
 * score(q,d)  =
 *        queryNorm(q)
 *        · coord(q,d)
 *        · ∑ (
 *            tf(t in d)
 *            · idf(t)²
 *            · t.getBoost()
 *            · norm(t,d)
 *        ) (t in q)
 *
 * NOTE(review): the SQL in getResultsForToken() multiplies by the stored
 * inverseDocumentFrequency once and applies no per-term boost; whether the
 * stored value is already squared cannot be told from this class.
 *
 * @see https://www.elastic.co/guide/en/elasticsearch/guide/current/practical-scoring-function.html
 * @package CloudControl\Cms\search
 */
class Search extends SearchDbConnected
{
    /**
     * The tokenizer of the search currently being executed,
     * set by getDocumentsForTokenizer().
     * @var Tokenizer
     */
    protected $tokenizer;

    /**
     * @var array
     */
    protected $results = array();

    /**
     * Short class names of the filters that are applied to the tokens.
     * applyFilters() resolves every name inside the
     * \CloudControl\Cms\search\filters namespace and applies the filters
     * in the order in which they are listed here.
     * @var array
     */
    protected $filters = array(
        'DutchStopWords',
        'EnglishStopWords'
    );

    /**
     * Returns an array of SearchSuggestion and / or SearchResult objects,
     * based on the tokens in the Tokenizer.
     *
     * The suggestions come first, followed by the search results ordered
     * by descending score.
     *
     * @param Tokenizer $tokenizer
     *
     * @return array
     * @throws \RuntimeException when one of the index queries fails
     */
    public function getDocumentsForTokenizer(Tokenizer $tokenizer)
    {
        $this->tokenizer = $tokenizer;

        $flatResults = $this->flattenResults($this->queryTokens());
        $flatResults = $this->applyQueryCoordination($flatResults);
        // usort() re-indexes the array, dropping the documentPath keys
        usort($flatResults, array($this, 'scoreCompare'));

        return array_merge($this->getSearchSuggestions(), $flatResults);
    }

    /**
     * Returns the amount of distinct documents
     * that are currently in the search index.
     *
     * @return int
     * @throws \RuntimeException when the count query fails
     */
    public function getIndexedDocuments()
    {
        $db = $this->getSearchDbHandle();
        $sql = '
			SELECT count(DISTINCT documentPath) AS indexedDocuments
			  FROM term_frequency
		';
        $stmt = $db->query($sql);
        if (!$stmt) {
            $errorInfo = $db->errorInfo();
            $errorMsg = $errorInfo[2];
            throw new \RuntimeException('SQLite Exception: ' . $errorMsg . ' in SQL: <br /><pre>' . $sql . '</pre>');
        }
        $result = $stmt->fetch(\PDO::FETCH_COLUMN);
        if (false === $result) {
            // A failed fetch is recorded on the statement, not on the connection
            $errorInfo = $stmt->errorInfo();
            $errorMsg = $errorInfo[2];
            throw new \RuntimeException('SQLite Exception: ' . $errorMsg . ' in SQL: <br /><pre>' . $sql . '</pre>');
        }
        return (int)$result;
    }

    /**
     * Queries each (filtered) token present in the Tokenizer
     * and returns the rows found for it.
     *
     * @return array token => array of SearchResult objects
     * @throws \RuntimeException when one of the index queries fails
     */
    private function queryTokens()
    {
        $tokens = $this->getTokens();
        // The query norm is identical for every token, so it is calculated once
        $queryNorm = $this->getQueryNorm($tokens);

        $results = array();
        foreach ($tokens as $token) {
            $results[$token] = $this->getResultsForToken($token, $queryNorm);
        }
        return $results;
    }

    /**
     * Applies the filters listed in the filters property to the tokens.
     *
     * Every filter is constructed with the current tokens and its
     * getFilterResults() output is passed on to the next filter.
     *
     * @param $tokens
     *
     * @return mixed whatever the last filter returned
     */
    protected function applyFilters($tokens)
    {
        foreach ($this->filters as $filterName) {
            $filterClassName = '\CloudControl\Cms\search\filters\\' . $filterName;
            $filter = new $filterClassName($tokens);
            $tokens = $filter->getFilterResults();
        }
        return $tokens;
    }

    /**
     * Queries the search index for a given token
     * and the query norm.
     *
     * Only rows that join to a documents_published row with a
     * publicationDate that is not in the future are returned.
     *
     * @param $token
     * @param $queryNorm
     *
     * @return array of SearchResult objects, ordered by descending score
     * @throws \RuntimeException when the query cannot be prepared or executed
     */
    public function getResultsForToken($token, $queryNorm)
    {
        $db = $this->getSearchDbHandle();
        $this->attachContentDatabase($db);
        $sql = '
			SELECT (:queryNorm * 
						(SUM(term_frequency.frequency) --TF
						* inverse_document_frequency.inverseDocumentFrequency -- IDF
						* SUM(term_frequency.termNorm) -- norm
						) 
				    )AS score,
				   SUM(term_frequency.frequency) AS TF,
				   inverse_document_frequency.inverseDocumentFrequency AS IDF,
				   SUM(term_frequency.termNorm) AS norm,
				   term_frequency.documentPath
			  FROM term_frequency
		 LEFT JOIN inverse_document_frequency
		 		ON inverse_document_frequency.term = term_frequency.term
		 LEFT JOIN documents_published
		        ON documents_published.path = term_frequency.documentPath
			 WHERE term_frequency.term = :query
			   AND documents_published.publicationDate <= ' . time() . '
		  GROUP BY term_frequency.documentPath, term_frequency.term
		  ORDER BY score DESC
		';
        $stmt = $db->prepare($sql);
        if (!$stmt) {
            throw $this->buildSqlException($db->errorInfo(), $sql);
        }
        $stmt->bindValue(':query', $token);
        $stmt->bindValue(':queryNorm', $queryNorm);
        if (!$stmt->execute()) {
            // Execution errors are recorded on the statement, not on the connection
            throw $this->buildSqlException($stmt->errorInfo(), $sql);
        }
        return $stmt->fetchAll(\PDO::FETCH_CLASS, SearchResult::class);
    }

    /**
     * Merges the rows found per token into one SearchResult per document.
     *
     * The scores of all tokens matching a document are summed and the
     * matching tokens are collected on the result.
     *
     * @param $resultsPerTokens token => array of rows, as returned by queryTokens()
     *
     * @return array documentPath => SearchResult
     */
    private function flattenResults($resultsPerTokens)
    {
        $finalResults = array();
        foreach ($resultsPerTokens as $token => $resultPerToken) {
            foreach ($resultPerToken as $result) {
                $path = $result->documentPath;
                if (isset($finalResults[$path])) {
                    $finalResults[$path]->score += $result->score;
                    $finalResults[$path]->matchingTokens[] = $token;
                    continue;
                }
                $resultObj = new SearchResult();
                $resultObj->documentPath = $path;
                $resultObj->matchingTokens = array($token);
                $resultObj->score = (float)$result->score;
                $resultObj->setStorage($this->storage);
                $finalResults[$path] = $resultObj;
            }
        }
        return $finalResults;
    }

    /**
     * usort() callback that orders results by descending score.
     *
     * Only < and > are used, so equal scores compare as equal even
     * when their types differ.
     *
     * @param $a
     * @param $b
     * @return int
     */
    private function scoreCompare($a, $b)
    {
        if ($a->score > $b->score) {
            return -1;
        }
        if ($a->score < $b->score) {
            return 1;
        }
        return 0;
    }

    /**
     * Calculates the query norm for the given tokens:
     * 1 / sqrt(sum of the inverse document frequencies of the tokens).
     *
     * @param $tokens
     *
     * @return mixed 1 when no norm can be calculated, otherwise the value
     *               as it is returned by the database driver
     * @throws \RuntimeException when the query cannot be prepared or executed
     */
    private function getQueryNorm($tokens)
    {
        if (empty($tokens)) {
            // Nothing to sum, fall back to the neutral norm without querying
            return 1;
        }
        $db = $this->getSearchDbHandle();
        $db->/** @scrutinizer ignore-call */
        sqliteCreateFunction('sqrt', 'sqrt', 1);
        // One positional placeholder per token, the tokens are bound on execute
        $placeholders = implode(',', array_fill(0, count($tokens), '?'));
        $sql = '
			SELECT (1 / sqrt(SUM(inverseDocumentFrequency))) AS queryNorm
			  FROM inverse_document_frequency
			 WHERE term IN (' . $placeholders . ') 
		';
        $stmt = $db->prepare($sql);
        if (!$stmt) {
            throw $this->buildSqlException($db->errorInfo(), $sql);
        }
        if (!$stmt->execute(array_values($tokens))) {
            throw $this->buildSqlException($stmt->errorInfo(), $sql);
        }
        $result = $stmt->fetch(\PDO::FETCH_OBJ);
        // queryNorm is NULL when none of the tokens is present in the index
        if ($result === false || $result->queryNorm === null) {
            return 1;
        }
        return $result->queryNorm;
    }

    /**
     * Applies query coordination to all results: every score is
     * multiplied by (matching tokens / tokens in the token vector).
     *
     * NOTE(review): the divisor is the size of the unfiltered token vector,
     * while the matching tokens come from the filtered tokens - confirm
     * that this is intended.
     *
     * @param $flatResults
     *
     * @return mixed
     */
    private function applyQueryCoordination($flatResults)
    {
        $tokenCount = count($this->tokenizer->getTokenVector());
        if ($tokenCount === 0) {
            // Without tokens there is nothing to coordinate (and nothing to divide by)
            return $flatResults;
        }
        foreach ($flatResults as $key => $result) {
            $matchCount = count($result->matchingTokens);
            $result->score = ($matchCount / $tokenCount) * $result->score;
            $flatResults[$key] = $result;
        }
        return $flatResults;
    }

    /**
     * Collects the search suggestions for every (filtered) token
     * in the Tokenizer.
     *
     * @return array of SearchSuggestion objects
     * @throws \RuntimeException when one of the suggestion queries fails
     */
    private function getSearchSuggestions()
    {
        $allResults = array();
        foreach ($this->getTokens() as $token) {
            $allResults = $this->getSearchSuggestion($token, $allResults);
        }
        return $allResults;
    }

    /**
     * Retrieves the tokens from the tokenizer, after the filters
     * have been applied to them.
     *
     * @return array the keys of the filtered token vector
     */
    private function getTokens()
    {
        // The filters receive the token vector under the 'query' key
        $tokenVector = array(
            'query' => $this->tokenizer->getTokenVector(),
        );
        $filtered = $this->applyFilters($tokenVector);
        if (empty($filtered) || !isset($filtered['query']) || !is_array($filtered['query'])) {
            return array();
        }

        return array_keys($filtered['query']);
    }

    /**
     * Prepares the suggestion query, binds the token to it and executes it.
     *
     * @param \PDO $db
     * @param $sql
     * @param $token
     * @return mixed the executed statement
     * @throws \RuntimeException when the query cannot be prepared or executed
     */
    private function getSearchSuggestionStatement($db, $sql, $token)
    {
        $stmt = $db->prepare($sql);
        if ($stmt === false) {
            throw $this->buildSqlException($db->errorInfo(), $sql);
        }
        $stmt->bindValue(':token', $token);
        if (!$stmt->execute()) {
            // Execution errors are recorded on the statement, not on the connection
            throw $this->buildSqlException($stmt->errorInfo(), $sql);
        }
        return $stmt;
    }

    /**
     * Uses the levenshtein algorithm to determine the indexed term that is
     * closest to the token and prepends it to the suggestions found so far.
     *
     * No suggestion is added when the closest term has an edit distance
     * of 0, meaning the token itself is present in the index.
     *
     * @param $token
     * @param $allResults
     * @return array
     * @throws \RuntimeException when the suggestion query fails
     */
    private function getSearchSuggestion($token, $allResults)
    {
        $db = $this->getSearchDbHandle();
        $db->/** @scrutinizer ignore-call */
        sqliteCreateFunction('levenshtein', 'levenshtein', 2);
        $sql = '
				SELECT *
				  FROM (
				  	SELECT :token AS original, term, levenshtein(term, :token) AS editDistance
				  	  FROM inverse_document_frequency
			  	  ORDER BY editDistance ASC
			  	     LIMIT 0, 1
			  	     )
			  	   WHERE editDistance > 0
			';
        $stmt = $this->getSearchSuggestionStatement($db, $sql, $token);
        $result = $stmt->fetchAll(\PDO::FETCH_CLASS, results\SearchSuggestion::class);

        return array_merge($result, $allResults);
    }

    /**
     * Attaches content.db from the storage directory as `content_db`,
     * unless a database with that name is already attached.
     *
     * getResultsForToken() runs once per token; attaching an alias that is
     * already in use fails in SQLite, which raises an exception when the
     * connection is in exception mode.
     * NOTE(review): this matters only if getSearchDbHandle() hands out the
     * same connection on every call - confirm against SearchDbConnected.
     *
     * @param \PDO $db
     */
    private function attachContentDatabase($db)
    {
        $attached = $db->query('PRAGMA database_list');
        if ($attached !== false) {
            foreach ($attached->fetchAll(\PDO::FETCH_ASSOC) as $database) {
                if (isset($database['name']) && $database['name'] === 'content_db') {
                    return;
                }
            }
        }
        $contentDbPath = $this->storageDir . DIRECTORY_SEPARATOR . 'content.db';
        $db->exec('ATTACH ' . $db->quote($contentDbPath) . ' as `content_db`;');
    }

    /**
     * Builds the exception that reports a failed prepare or execute.
     *
     * @param array $errorInfo the errorInfo() of the handle the failure occurred on
     * @param string $sql
     * @return \RuntimeException
     */
    private function buildSqlException($errorInfo, $sql)
    {
        return new \RuntimeException('SQLite exception: <pre>' . print_r($errorInfo,
                true) . '</pre> for SQL:<pre>' . $sql . '</pre>');
    }
}