|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace tomzx\IRCStats; |
|
4
|
|
|
|
|
5
|
|
|
use Illuminate\Database\Connection; |
|
6
|
|
|
use Psr\Log\LoggerAwareInterface; |
|
7
|
|
|
use Psr\Log\LoggerInterface; |
|
8
|
|
|
use Psr\Log\NullLogger; |
|
9
|
|
|
|
|
10
|
|
|
/**
 * Builds the `logs_words` table by matching the words of each `logs` message
 * against the `words` dictionary table, seeding that dictionary from a text
 * file the first time it is found empty.
 */
class Processor implements LoggerAwareInterface {
    /**
     * Proxy used to obtain the database connection.
     *
     * @var \tomzx\IRCStats\DatabaseProxy
     */
    protected $databaseProxy;

    /**
     * Logger receiving progress messages; a NullLogger until setLogger() is called.
     *
     * @var \Psr\Log\LoggerInterface
     */
    protected $logger;

    /**
     * @param \tomzx\IRCStats\DatabaseProxy $databaseProxy
     */
    public function __construct(DatabaseProxy $databaseProxy)
    {
        $this->databaseProxy = $databaseProxy;
        $this->logger = new NullLogger();
    }

    /**
     * Returns the connection exposed by the database proxy.
     *
     * @return \Illuminate\Database\Connection
     */
    protected function getDatabase()
    {
        return $this->databaseProxy->getConnection();
    }

    /**
     * Replaces the logger used for progress output.
     *
     * @param \Psr\Log\LoggerInterface $logger
     * @return void
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Seeds the dictionary if needed, then processes all pending logs.
     *
     * @return void
     */
    public function run()
    {
        $db = $this->getDatabase();
        $this->initializeDictionary($db);
        $this->generateLogsWords($db);
    }

    /**
     * Walks the `logs` table in batches, starting after the highest `logs_id`
     * already present in `logs_words`, and inserts one `logs_words` row per
     * message word that exists in the dictionary.
     *
     * @param \Illuminate\Database\Connection $db
     * @return void
     * @throws \Exception
     */
    protected function generateLogsWords(Connection $db)
    {
        // Find last processed logs id (the cast turns "no rows" into 0)
        $lastLogId = (int)$db->table('logs_words')->max('logs_id');

        $dictionary = null;
        $batchSize = 250;
        $currentId = $lastLogId;
        while (true) {
            $this->logger->debug('Processing id > '.$currentId.' (batch of '.$batchSize.')');
            $fetchStart = microtime(true);

            $logs = $this->getLogs($db, $currentId, $batchSize);

            $fetchDuration = microtime(true) - $fetchStart;

            // Count explicitly instead of relying on array-to-boolean conversion:
            // a result object (e.g. a Collection) is always truthy, which would
            // make this loop run forever.
            if (count($logs) === 0) {
                // No more data available
                $this->logger->debug('End of data');
                break;
            }

            // Load lazily, and only once even when the dictionary is empty
            if ($dictionary === null) {
                $dictionary = $this->loadDictionary($db);
            }

            $insertStart = microtime(true);
            $data = [];
            foreach ($logs as $log) {
                // Advance the cursor here rather than reading $log after the loop
                $currentId = $log->id;

                // TODO: Replace this with preg_split
                $words = explode(' ', $log->message);
                foreach ($words as $word) {
                    // TODO: Support case insensitive
                    if ( ! isset($dictionary[$word])) {
                        // Words missing from the dictionary are ignored
                        continue;
                    }

                    $data[] = [
                        'logs_id' => $log->id,
                        'word_id' => $dictionary[$word],
                    ];
                }
            }

            $this->batchInsert($db->table('logs_words'), $data);
            $insertDuration = microtime(true) - $insertStart;
            $this->logger->debug('fetch: '.round($fetchDuration, 6).'s, insert: '.round($insertDuration, 6).'s');
        }
    }

    /**
     * Seeds the `words` table from data/dictionary.txt (one word per line)
     * when the table is empty; does nothing otherwise.
     *
     * @param \Illuminate\Database\Connection $db
     * @return void
     * @throws \RuntimeException when the dictionary file cannot be read
     */
    protected function initializeDictionary(Connection $db)
    {
        $dictionarySize = $db->table('words')->count();

        if ($dictionarySize > 0) {
            return;
        }

        $this->logger->info('Seeding words table...');
        $dictionarySeedStart = microtime(true);

        $dictionaryPath = __DIR__ . '/../../../data/dictionary.txt';
        $dictionary = file($dictionaryPath);
        if ($dictionary === false) {
            // file() returns false on failure; fail loudly instead of seeding nothing
            throw new \RuntimeException('Unable to read dictionary file: '.$dictionaryPath);
        }

        $data = [];
        foreach ($dictionary as $line) {
            $word = trim($line);
            if ($word === '') {
                // Blank lines would otherwise seed an empty-string word
                continue;
            }

            $data[] = [
                'word' => $word,
            ];
        }
        $this->batchInsert($db->table('words'), $data);

        $dictionarySeedDuration = microtime(true) - $dictionarySeedStart;
        $this->logger->info('Finished seeding words table in '.round($dictionarySeedDuration, 6).'s');
    }

    /**
     * Fetches the dictionary from the `words` table.
     *
     * NOTE(review): presumably the result maps word => id, since it is indexed
     * by word in generateLogsWords(); lists() is not available in every
     * Illuminate release -- confirm against the installed version.
     *
     * @param \Illuminate\Database\Connection $db
     * @return array
     */
    protected function getDictionary(Connection $db)
    {
        return $db->table('words')
            ->select('id', 'word')
            ->lists('id', 'word');
    }

    /**
     * Fetches the dictionary and logs how long the fetch took.
     *
     * @param \Illuminate\Database\Connection $db
     * @return array
     */
    protected function loadDictionary(Connection $db)
    {
        $dictionaryStart = microtime(true);
        $dictionary = $this->getDictionary($db);
        $dictionaryDuration = microtime(true) - $dictionaryStart;
        $this->logger->info('Dictionary loaded in ' . round($dictionaryDuration, 6) . 's');
        return $dictionary;
    }

    /**
     * Fetches up to $batchSize rows (id, message) from `logs` whose id is
     * greater than $currentId, ordered by ascending id.
     *
     * @param \Illuminate\Database\Connection $db
     * @param int $currentId
     * @param int $batchSize
     * @return array
     */
    protected function getLogs(Connection $db, $currentId, $batchSize)
    {
        return $db->table('logs')
            ->select('id', 'message')
            ->where('id', '>', $currentId)
            ->orderBy('id', 'asc')
            ->limit($batchSize)
            ->get();
    }

    /**
     * Inserts $data through $builder inside a single transaction, split into
     * groups of 250 rows.
     *
     * @param \Illuminate\Database\Query\Builder $builder
     * @param array $data
     * @return void
     */
    protected function batchInsert(\Illuminate\Database\Query\Builder $builder, array $data)
    {
        $builder->getConnection()->transaction(function () use ($builder, $data) {
            // Batch in group of 250 entries to prevent "Too many SQL variables" SQL error
            $insertBatchSize = 250;
            foreach (array_chunk($data, $insertBatchSize) as $insertedData) {
                $builder->insert($insertedData);
            }
        });
    }
}
|
197
|
|
|
|
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `! empty(...)` instead.