|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* User: jensk |
|
4
|
|
|
* Date: 21-2-2017 |
|
5
|
|
|
* Time: 10:29 |
|
6
|
|
|
*/ |
|
7
|
|
|
|
|
8
|
|
|
namespace library\search; |
|
9
|
|
|
|
|
10
|
|
|
|
|
11
|
|
|
use library\search\indexer\InverseDocumentFrequency; |
|
12
|
|
|
use library\search\indexer\TermCount; |
|
13
|
|
|
use library\search\indexer\TermFieldLengthNorm; |
|
14
|
|
|
use library\search\indexer\TermFrequency; |
|
15
|
|
|
|
|
16
|
|
|
/**
 * Class Indexer
 * Responsible for creating the search index based on the
 * existing documents.
 *
 * The index is built in a temporary SQLite database (SEARCH_TEMP_DB) and,
 * once every indexing step has run, that file is renamed to `search.db`.
 *
 * NOTE(review): `$this->storage` and `$this->searchDbHandle` are not declared
 * here; they are presumably provided by SearchDbConnected - confirm.
 *
 * @package library\search
 */
class Indexer extends SearchDbConnected
{
    /**
     * Not referenced inside this class.
     * NOTE(review): presumably mirrors SQLite's compound SELECT limit and is
     * used by the indexer helpers - confirm against its consumers.
     */
    const SQLITE_MAX_COMPOUND_SELECT = 100;

    /**
     * File name of the temporary database the index is built in.
     */
    const SEARCH_TEMP_DB = 'search_tmp.db';

    /**
     * Filter names handed to TermCount.
     *
     * @var array
     */
    protected $filters = array(
        'DutchStopWords',
        'EnglishStopWords'
    );

    /**
     * Storage directory, resolved relative to two levels above this file.
     */
    protected $storageDir;

    /**
     * Millisecond timestamp of the moment logging started.
     * Kept as float: round() returns a float and a millisecond epoch does
     * not fit in a 32-bit integer.
     *
     * @var float
     */
    protected $loggingStart;

    /**
     * Accumulated log text, one line per addLog() call.
     *
     * @var string
     */
    protected $log = '';

    /**
     * Millisecond timestamp of the most recent log line.
     *
     * @var float
     */
    protected $lastLog;

    /**
     * Creates a new temporary search db, cleans it if it exists
     * then calculates and stores the search index in this db
     * and finally if indexing completed replaces the current search
     * db with the temporary one. Returns the log in string format.
     *
     * The final log line reports whether the temporary database could
     * actually be moved into place.
     *
     * @return string
     */
    public function updateIndex()
    {
        $this->startLogging();
        $this->addLog('Indexing start.');

        $this->addLog('Clearing index.');
        $this->resetIndex();

        $this->addLog('Retrieving documents to be indexed.');
        $documents = $this->storage->getDocuments();

        $this->addLog('Start Document Term Count for ' . count($documents) . ' documents');
        $this->createDocumentTermCount($documents);

        $this->addLog('Start Document Term Frequency.');
        $this->createDocumentTermFrequency();

        $this->addLog('Start Term Field Length Norm.');
        $this->createTermFieldLengthNorm();

        $this->addLog('Start Inverse Document Frequency.');
        $this->createInverseDocumentFrequency();

        $this->addLog('Replacing old index.');
        // Strict comparison: an overriding implementation that returns
        // nothing (null) is still treated as success.
        if ($this->replaceOldIndex() === false) {
            $this->addLog('Replacing old index failed.');
        } else {
            $this->addLog('Indexing complete.');
        }

        return $this->log;
    }

    /**
     * Count how often a term is used in a document
     *
     * @param $documents The documents to index, as returned by
     *                   $this->storage->getDocuments()
     */
    public function createDocumentTermCount($documents)
    {
        $termCount = new TermCount(
            $this->getSearchDbHandle(),
            $documents,
            $this->filters,
            $this->storage
        );
        $termCount->execute();
    }

    /**
     * Calculate the frequency index for a term with
     * a field
     */
    public function createDocumentTermFrequency()
    {
        $termFrequency = new TermFrequency($this->getSearchDbHandle());
        $termFrequency->execute();
    }

    /**
     * Resets the entire index: empties the term_count, term_frequency and
     * inverse_document_frequency tables and sets their sqlite_sequence
     * counters back to 0.
     */
    public function resetIndex()
    {
        $db = $this->getSearchDbHandle();
        $sql = '
            DELETE FROM term_count;
            DELETE FROM term_frequency;
            DELETE FROM inverse_document_frequency;
            UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'term_count\';
            UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'term_frequency\';
            UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'inverse_document_frequency\';
        ';
        $db->exec($sql);
    }

    /**
     * Calculates the inverse document frequency for each
     * term. This is a representation of how often a certain
     * term is used in comparison to all terms.
     */
    public function createInverseDocumentFrequency()
    {
        $documentCount = $this->getTotalDocumentCount();
        $inverseDocumentFrequency = new InverseDocumentFrequency(
            $this->getSearchDbHandle(),
            $documentCount
        );
        $inverseDocumentFrequency->execute();
    }

    /**
     * Returns the total document count as reported by the storage.
     *
     * @return int|mixed
     */
    private function getTotalDocumentCount()
    {
        return $this->storage->getTotalDocumentCount();
    }

    /**
     * Calculates the Term Field Length Norm.
     * This is an index determining how important a
     * term is, based on the total length of the field
     * it comes from.
     */
    public function createTermFieldLengthNorm()
    {
        $termFieldLengthNorm = new TermFieldLengthNorm($this->getSearchDbHandle());
        $termFieldLengthNorm->execute();
    }

    /**
     * Stores the time the indexing started in memory
     */
    private function startLogging()
    {
        $this->loggingStart = round(microtime(true) * 1000);
        $this->lastLog = $this->loggingStart;
    }

    /**
     * Adds a logline with the time since last log
     * and the time since logging started.
     *
     * @param $string The message; right-padded with spaces to 50 characters
     */
    private function addLog($string)
    {
        $currentTime = round(microtime(true) * 1000);
        $sinceLastLog = $currentTime - $this->lastLog;
        $sinceStart = $currentTime - $this->loggingStart;

        $this->log .= date('d-m-Y H:i:s - ')
            . str_pad($string, 50, " ", STR_PAD_RIGHT)
            . "\t" . $sinceLastLog . 'ms since last log. '
            . "\t" . $sinceStart . 'ms since start.'
            . PHP_EOL;

        // Reuse the timestamp taken above instead of reading the clock again,
        // so the time spent building this line counts towards the next delta.
        $this->lastLog = $currentTime;
    }

    /**
     * Builds the directory path (with trailing separator) in which both the
     * temporary and the live search database files are located.
     *
     * @return string
     */
    private function getIndexStoragePath()
    {
        return __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . $this->storageDir . DIRECTORY_SEPARATOR;
    }

    /**
     * Creates the SQLite \PDO object if it doesnt
     * exist and returns it. The handle points at the temporary
     * database (SEARCH_TEMP_DB), not at the live index.
     *
     * @return \PDO
     */
    protected function getSearchDbHandle()
    {
        if ($this->searchDbHandle === null) {
            $this->searchDbHandle = new \PDO('sqlite:' . $this->getIndexStoragePath() . self::SEARCH_TEMP_DB);
        }

        return $this->searchDbHandle;
    }

    /**
     * Replaces the old search index database with the new one.
     *
     * @return bool True when the temporary database was renamed to
     *              search.db, false when rename() failed
     */
    public function replaceOldIndex()
    {
        // Drop our reference to the PDO handle before moving the file, so the
        // connection can be closed (provided nothing else still references it).
        $this->searchDbHandle = null;
        $path = $this->getIndexStoragePath();

        return rename($path . self::SEARCH_TEMP_DB, $path . 'search.db');
    }
}
It seems like the type of the argument is not accepted by the function/method which you are calling.
In some cases, in particular if PHP’s automatic type-juggling kicks in, this might be fine. In other cases, however, this might be a bug.
We suggest adding an explicit type cast, as in the following example: