|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* User: jensk |
|
4
|
|
|
* Date: 21-2-2017 |
|
5
|
|
|
* Time: 10:29 |
|
6
|
|
|
*/ |
|
7
|
|
|
|
|
8
|
|
|
namespace library\search; |
|
9
|
|
|
|
|
10
|
|
|
|
|
11
|
|
|
use library\search\indexer\InverseDocumentFrequency; |
|
12
|
|
|
use library\search\indexer\TermCount; |
|
13
|
|
|
use library\search\indexer\TermFieldLengthNorm; |
|
14
|
|
|
use library\search\indexer\TermFrequency; |
|
15
|
|
|
|
|
16
|
|
|
/** |
|
17
|
|
|
* Class Indexer |
|
18
|
|
|
* Responsible for creating the search index based on the |
|
19
|
|
|
* existing documents |
|
20
|
|
|
* |
|
21
|
|
|
* @package library\search |
|
22
|
|
|
*/ |
|
23
|
|
|
class Indexer extends SearchDbConnected
{
    const SQLITE_MAX_COMPOUND_SELECT = 100;

    /**
     * File name of the temporary database the index is built in
     * before it replaces the live search database.
     */
    const SEARCH_TEMP_DB = 'search_tmp.db';

    /**
     * Names of the filters handed to the term counter.
     *
     * @var array
     */
    protected $filters = array(
        'DutchStopWords',
        'EnglishStopWords'
    );

    /**
     * Storage directory, relative to two levels above this file's directory.
     *
     * @var string
     */
    protected $storageDir;

    /**
     * Timestamp in milliseconds at which logging started.
     *
     * @var double
     */
    protected $loggingStart;

    /**
     * Accumulated log lines.
     *
     * @var string
     */
    protected $log = '';

    /**
     * Timestamp in milliseconds of the most recent log line.
     *
     * @var double
     */
    protected $lastLog;

    /**
     * Builds the search index in the temporary search db and, once
     * all indexing steps have run, replaces the current search db
     * with the temporary one. Every step is written to the log,
     * which is returned in string format.
     *
     * When replacing the old index fails, a failure line is logged
     * instead of the completion line.
     *
     * @return string
     */
    public function updateIndex()
    {
        $this->startLogging();
        $this->addLog('Indexing start.');

        $this->addLog('Clearing index.');
        $this->resetIndex();

        $this->addLog('Cleaning Published Deleted Documents');
        $this->storage->getDocuments()->cleanPublishedDeletedDocuments();

        $this->addLog('Retrieving documents to be indexed.');
        $documents = $this->storage->getDocuments()->getPublishedDocumentsNoFolders();

        $this->addLog('Start Document Term Count for ' . count($documents) . ' documents');
        $this->createDocumentTermCount($documents);

        $this->addLog('Start Document Term Frequency.');
        $this->createDocumentTermFrequency();

        $this->addLog('Start Term Field Length Norm.');
        $this->createTermFieldLengthNorm();

        $this->addLog('Start Inverse Document Frequency.');
        $this->createInverseDocumentFrequency();

        $this->addLog('Replacing old index.');
        // Compare strictly against false so that an overriding
        // implementation that returns nothing is still treated as success.
        if ($this->replaceOldIndex() === false) {
            $this->addLog('Replacing old index failed.');
        } else {
            $this->addLog('Indexing complete.');
        }

        return $this->log;
    }

    /**
     * Count how often a term is used in a document
     *
     * @param $documents
     */
    public function createDocumentTermCount($documents)
    {
        $termCount = new TermCount($this->getSearchDbHandle(), $documents, $this->filters, $this->storage);
        $termCount->execute();
    }

    /**
     * Calculate the frequency index for a term with
     * a field
     */
    public function createDocumentTermFrequency()
    {
        $termFrequency = new TermFrequency($this->getSearchDbHandle());
        $termFrequency->execute();
    }

    /**
     * Resets the entire index: empties the term_count, term_frequency
     * and inverse_document_frequency tables and sets their
     * sqlite_sequence counters back to 0.
     *
     * NOTE(review): the result of exec() is not checked, so a failing
     * statement goes unnoticed unless the PDO handle is configured to
     * throw - confirm the error mode used for this connection.
     */
    public function resetIndex()
    {
        $db = $this->getSearchDbHandle();
        $sql = '
            DELETE FROM term_count;
            DELETE FROM term_frequency;
            DELETE FROM inverse_document_frequency;
            UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'term_count\';
            UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'term_frequency\';
            UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'inverse_document_frequency\';
        ';
        $db->exec($sql);
    }

    /**
     * Calculates the inverse document frequency for each
     * term. This is a representation of how often a certain
     * term is used in comparison to all terms.
     */
    public function createInverseDocumentFrequency()
    {
        $documentCount = $this->getTotalDocumentCount();
        $inverseDocumentFrequency = new InverseDocumentFrequency($this->getSearchDbHandle(), $documentCount);
        $inverseDocumentFrequency->execute();
    }

    /**
     * Returns the total document count as reported by the storage.
     *
     * @return int|mixed
     */
    private function getTotalDocumentCount()
    {
        return $this->storage->getDocuments()->getTotalDocumentCount();
    }

    /**
     * Calculates the Term Field Length Norm.
     * This is an index determining how important a
     * term is, based on the total length of the field
     * it comes from.
     */
    public function createTermFieldLengthNorm()
    {
        $termFieldLengthNorm = new TermFieldLengthNorm($this->getSearchDbHandle());
        $termFieldLengthNorm->execute();
    }

    /**
     * Stores the time the indexing started in memory
     * (in milliseconds) and uses it as the time of the last log.
     */
    private function startLogging()
    {
        $this->loggingStart = round(microtime(true) * 1000);
        $this->lastLog = $this->loggingStart;
    }

    /**
     * Adds a logline with the time since last log and
     * the time since logging started.
     *
     * @param $string
     */
    private function addLog($string)
    {
        $currentTime = round(microtime(true) * 1000);
        $this->log .= date('d-m-Y H:i:s - ') . str_pad($string, 50, " ", STR_PAD_RIGHT) . "\t" . ($currentTime - $this->lastLog) . 'ms since last log. ' . "\t" . ($currentTime - $this->loggingStart) . 'ms since start.' . PHP_EOL;
        // Reuse the timestamp that was just reported, so the time spent
        // building this line is counted in the next "since last log" delta.
        $this->lastLog = $currentTime;
    }

    /**
     * Creates the SQLite \PDO object for the temporary search
     * database if it doesn't exist yet and returns it.
     *
     * NOTE(review): this method was flagged as duplicated code; presumably
     * it mirrors the parent implementation apart from the database file
     * name - confirm against SearchDbConnected.
     *
     * @return \PDO
     */
    protected function getSearchDbHandle()
    {
        if ($this->searchDbHandle === null) {
            $this->searchDbHandle = new \PDO('sqlite:' . $this->buildSearchDbDirectory() . self::SEARCH_TEMP_DB);
        }
        return $this->searchDbHandle;
    }

    /**
     * Replaces the old search index database with the new one.
     *
     * @return bool true when the temporary database was renamed to
     *              search.db, false when the rename failed
     */
    public function replaceOldIndex()
    {
        // Drop this object's reference to the PDO handle before the
        // database file is moved.
        $this->searchDbHandle = null;
        $path = $this->buildSearchDbDirectory();
        return rename($path . self::SEARCH_TEMP_DB, $path . 'search.db');
    }

    /**
     * Builds the path of the directory holding the search databases,
     * including the trailing directory separator.
     *
     * @return string
     */
    private function buildSearchDbDirectory()
    {
        return __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . $this->storageDir . DIRECTORY_SEPARATOR;
    }
}
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.