1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace tomzx\IRCStats; |
4
|
|
|
|
5
|
|
|
use Illuminate\Database\Connection; |
6
|
|
|
use Psr\Log\LoggerAwareInterface; |
7
|
|
|
use Psr\Log\LoggerInterface; |
8
|
|
|
use Psr\Log\NullLogger; |
9
|
|
|
|
10
|
|
|
/**
 * Indexes log messages by dictionary word.
 *
 * A run seeds the `words` table from the bundled dictionary file when that
 * table is empty, then walks the `logs` table in batches and records one
 * `logs_words` row (`logs_id`, `word_id`) for every known word found in each
 * message.
 */
class Processor implements LoggerAwareInterface
{
    /**
     * Source of the database connection used by every query in this class.
     *
     * @var \tomzx\IRCStats\DatabaseProxy
     */
    protected $databaseProxy;

    /**
     * Receives progress and timing messages; a NullLogger until setLogger() is called.
     *
     * @var \Psr\Log\LoggerInterface
     */
    protected $logger;

    /**
     * @param \tomzx\IRCStats\DatabaseProxy $databaseProxy
     */
    public function __construct(DatabaseProxy $databaseProxy)
    {
        $this->databaseProxy = $databaseProxy;
        // Default to a no-op logger so logging calls never need a null check.
        $this->logger = new NullLogger();
    }

    /**
     * Fetches the connection from the database proxy.
     *
     * @return \Illuminate\Database\Connection
     */
    protected function getDatabase()
    {
        return $this->databaseProxy->getConnection();
    }

    /**
     * Replaces the logger used for progress and timing messages.
     *
     * @param \Psr\Log\LoggerInterface $logger
     * @return void
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Seeds the dictionary if needed, then indexes the logs that are not yet processed.
     *
     * @return void
     * @throws \RuntimeException when the dictionary file cannot be read
     * @throws \Exception
     */
    public function run()
    {
        $db = $this->getDatabase();
        $this->initializeDictionary($db);
        $this->generateLogsWords($db);
    }

    /**
     * Populates `logs_words` for every log whose id is greater than the
     * highest `logs_id` already stored there.
     *
     * Logs are fetched in batches of 250 and each batch is written through
     * batchInsert(). Words that are not in the dictionary are ignored.
     *
     * @param \Illuminate\Database\Connection $db
     * @throws \Exception
     */
    protected function generateLogsWords(Connection $db)
    {
        // Find last processed logs id (the cast presumably maps "no rows yet" to 0).
        $lastLogId = (int)$db->table('logs_words')->max('logs_id');

        // null means "not loaded yet"; the dictionary is only fetched once
        // there is at least one batch of logs to process.
        $dictionary = null;
        $batchSize = 250;
        $currentId = $lastLogId;
        while (true) {
            $this->logger->debug('Processing id > '.$currentId.' (batch of '.$batchSize.')');

            $fetchStart = microtime(true);
            $logs = $this->getLogs($db, $currentId, $batchSize);
            $fetchDuration = microtime(true) - $fetchStart;

            if (empty($logs)) {
                // No more data available
                $this->logger->debug('End of data');
                break;
            }

            // Strict null check: an empty dictionary is a valid (if useless)
            // result and must not trigger a reload on every batch.
            if ($dictionary === null) {
                $dictionary = $this->loadDictionary($db);
            }

            $insertStart = microtime(true);
            $data = [];
            foreach ($logs as $log) {
                // TODO: Replace this with preg_split
                $words = explode(' ', $log->message);
                foreach ($words as $word) {
                    // TODO: Support case insensitive
                    if ( ! isset($dictionary[$word])) {
                        continue;
                    }

                    $data[] = [
                        'logs_id' => $log->id,
                        'word_id' => $dictionary[$word],
                    ];
                }

                // Advance the cursor as we go; after the loop it holds the id
                // of the last log in this batch (batches are ordered by id).
                $currentId = $log->id;
            }

            $this->batchInsert($db->table('logs_words'), $data);
            $insertDuration = microtime(true) - $insertStart;
            $this->logger->debug('fetch: '.round($fetchDuration, 6).'s, insert: '.round($insertDuration, 6).'s');
        }
    }

    /**
     * Seeds the `words` table from the bundled dictionary file.
     *
     * Does nothing when the table already contains at least one row. Each
     * line of the file is trimmed and stored as one word; blank lines are
     * skipped.
     *
     * @param \Illuminate\Database\Connection $db
     * @return void
     * @throws \RuntimeException when the dictionary file cannot be read
     */
    protected function initializeDictionary(Connection $db)
    {
        $dictionarySize = $db->table('words')->count();

        if ($dictionarySize > 0) {
            return;
        }

        $this->logger->info('Seeding words table...');
        $dictionarySeedStart = microtime(true);

        $dictionaryPath = __DIR__ . '/../../../data/dictionary.txt';
        $dictionary = is_readable($dictionaryPath)
            ? file($dictionaryPath, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
            : false;
        if ($dictionary === false) {
            // Fail loudly: continuing would leave the words table empty while
            // still reporting a successful seed.
            throw new \RuntimeException('Unable to read dictionary file: '.$dictionaryPath);
        }

        $data = [];
        foreach ($dictionary as $line) {
            $word = trim($line);
            if ($word === '') {
                // Whitespace-only lines are not words.
                continue;
            }

            $data[] = [
                'word' => $word,
            ];
        }
        $this->batchInsert($db->table('words'), $data);

        $dictionarySeedDuration = microtime(true) - $dictionarySeedStart;
        $this->logger->info('Finished seeding words table in '.round($dictionarySeedDuration, 6).'s');
    }

    /**
     * Reads the `words` table as a lookup map.
     *
     * NOTE(review): relies on the query builder's lists($column, $key)
     * returning the `id` values keyed by `word`; that method is not available
     * in every framework version - confirm before upgrading.
     *
     * @param \Illuminate\Database\Connection $db
     * @return array
     */
    protected function getDictionary(Connection $db)
    {
        return $db->table('words')
            ->select('id', 'word')
            ->lists('id', 'word');
    }

    /**
     * Loads the dictionary through getDictionary() and logs how long it took.
     *
     * @param \Illuminate\Database\Connection $db
     * @return array
     */
    protected function loadDictionary(Connection $db)
    {
        $dictionaryStart = microtime(true);
        $dictionary = $this->getDictionary($db);
        $dictionaryDuration = microtime(true) - $dictionaryStart;
        $this->logger->info('Dictionary loaded in ' . round($dictionaryDuration, 6) . 's');
        return $dictionary;
    }

    /**
     * Fetches the next batch of logs (`id`, `message`) after the given id,
     * in ascending id order.
     *
     * @param \Illuminate\Database\Connection $db
     * @param int $currentId only logs with an id strictly greater than this are returned
     * @param int $batchSize maximum number of logs to return
     * @return array
     */
    protected function getLogs(Connection $db, $currentId, $batchSize)
    {
        return $db->table('logs')
            ->select('id', 'message')
            ->where('id', '>', $currentId)
            ->orderBy('id', 'asc')
            ->limit($batchSize)
            ->get();
    }

    /**
     * Inserts the given rows through the builder inside a single transaction.
     *
     * @param \Illuminate\Database\Query\Builder $builder query builder targeting the destination table
     * @param array $data list of rows to insert; an empty list is a no-op
     */
    protected function batchInsert(\Illuminate\Database\Query\Builder $builder, array $data)
    {
        if ( ! $data) {
            // Nothing to write: do not open a transaction for zero rows.
            return;
        }

        $builder->getConnection()->transaction(function () use ($builder, $data) {
            // Batch in group of 250 entries to prevent "Too many SQL variables" SQL error
            $insertBatchSize = 250;
            foreach (array_chunk($data, $insertBatchSize) as $insertedData) {
                $builder->insert($insertedData);
            }
        });
    }
}
197
|
|
|
|
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `! empty(...)` instead.