1
|
|
|
<?php |
2
|
|
|
namespace AOE\Crawler\Domain\Repository; |
3
|
|
|
|
4
|
|
|
/*************************************************************** |
5
|
|
|
* Copyright notice |
6
|
|
|
* |
7
|
|
|
* (c) 2017 AOE GmbH <[email protected]> |
8
|
|
|
* |
9
|
|
|
* All rights reserved |
10
|
|
|
* |
11
|
|
|
* This script is part of the TYPO3 project. The TYPO3 project is |
12
|
|
|
* free software; you can redistribute it and/or modify |
13
|
|
|
* it under the terms of the GNU General Public License as published by |
14
|
|
|
* the Free Software Foundation; either version 3 of the License, or |
15
|
|
|
* (at your option) any later version. |
16
|
|
|
* |
17
|
|
|
* The GNU General Public License can be found at |
18
|
|
|
* http://www.gnu.org/copyleft/gpl.html. |
19
|
|
|
* |
20
|
|
|
* This script is distributed in the hope that it will be useful, |
21
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
22
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23
|
|
|
* GNU General Public License for more details. |
24
|
|
|
* |
25
|
|
|
* This copyright notice MUST APPEAR in all copies of the script! |
26
|
|
|
***************************************************************/ |
27
|
|
|
|
28
|
|
|
use AOE\Crawler\Domain\Model\Process; |
29
|
|
|
use AOE\Crawler\Domain\Model\Queue; |
30
|
|
|
use TYPO3\CMS\Core\Utility\MathUtility; |
31
|
|
|
|
32
|
|
|
/**
 * Class QueueRepository
 *
 * Read-only access to the crawler queue table: lookups of single entries,
 * counters for executed/pending entries and a few aggregated statistics.
 *
 * @package AOE\Crawler\Domain\Repository
 */
class QueueRepository extends AbstractRepository
{
    /**
     * Name of the database table every query of this repository runs against.
     *
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * Returns the executed entry of the given process with the LOWEST exec_time
     * (the query sorts by "exec_time ASC" and takes the first row).
     *
     * @param Process $process
     *
     * @return Queue entry object; built from an empty row when nothing matches
     */
    public function findYoungestEntryForProcess(Process $process)
    {
        return $this->getFirstOrLastObjectByProcess($process, 'exec_time ASC');
    }

    /**
     * Returns the executed entry of the given process with the HIGHEST exec_time
     * (the query sorts by "exec_time DESC" and takes the first row).
     *
     * @param Process $process
     *
     * @return Queue entry object; built from an empty row when nothing matches
     */
    public function findOldestEntryForProcess(Process $process)
    {
        return $this->getFirstOrLastObjectByProcess($process, 'exec_time DESC');
    }

    /**
     * Internal helper that loads the first executed entry (exec_time > 0) completed
     * by the given process, according to the given sorting, and wraps it in a Queue object.
     *
     * @param Process $process
     * @param string $orderby ORDER BY expression; the first matching row is returned as object
     *
     * @return Queue
     */
    protected function getFirstOrLastObjectByProcess($process, $orderby)
    {
        $db = $this->getDB();
        $where = 'process_id_completed=' . $db->fullQuoteStr($process->getProcessId(), $this->tableName)
            . ' AND exec_time > 0 ';

        $rows = $db->exec_SELECTgetRows('*', $this->tableName, $where, '', $orderby, 1);

        // No match (or a failed query) still yields a Queue object, just without row data.
        $firstRow = empty($rows) ? [] : $rows[0];

        return new Queue($firstRow);
    }

    /**
     * Counts all executed items (exec_time > 0) that were completed by a process.
     *
     * @param Process $process
     *
     * @return int
     */
    public function countExecutedItemsByProcess($process)
    {
        $quotedProcessId = $this->getDB()->fullQuoteStr($process->getProcessId(), $this->tableName);

        return $this->countItemsByWhereClause('exec_time > 0 AND process_id_completed = ' . $quotedProcessId);
    }

    /**
     * Counts the items assigned to a process which have not been executed yet (exec_time = 0).
     *
     * @param Process $process
     *
     * @return int
     */
    public function countNonExecutedItemsByProcess($process)
    {
        $quotedProcessId = $this->getDB()->fullQuoteStr($process->getProcessId(), $this->tableName);

        return $this->countItemsByWhereClause('exec_time = 0 AND process_id = ' . $quotedProcessId);
    }

    /**
     * Returns all queue rows that have not been executed yet (exec_time = 0),
     * sorted by page_id and scheduled.
     *
     * @return array list of rows; empty when nothing matches or the query fails
     */
    public function getUnprocessedItems()
    {
        $rows = $this->getDB()->exec_SELECTgetRows(
            '*',
            $this->tableName,
            'exec_time = 0',
            '',
            'page_id, scheduled'
        );

        // Honour the documented array return type even if the query did not deliver rows.
        return is_array($rows) ? $rows : [];
    }

    /**
     * Counts the items which have not been executed yet (exec_time = 0).
     *
     * The count is done by the database instead of loading every row into memory.
     *
     * @return int
     */
    public function countUnprocessedItems()
    {
        return (int)$this->countItemsByWhereClause('exec_time = 0');
    }

    /**
     * Counts all queue entries which are not executed yet and whose
     * scheduled time lies before the current time.
     *
     * @return int
     */
    public function countAllPendingItems()
    {
        return $this->countItemsByWhereClause($this->getPendingItemsWhereClause());
    }

    /**
     * Counts all pending queue entries (see countAllPendingItems) which are
     * assigned to a process (non-empty process_id).
     *
     * @return int
     */
    public function countAllAssignedPendingItems()
    {
        return $this->countItemsByWhereClause($this->getPendingItemsWhereClause() . " AND process_id != ''");
    }

    /**
     * Counts all pending queue entries (see countAllPendingItems) which are
     * not assigned to a process (empty process_id).
     *
     * @return int
     */
    public function countAllUnassignedPendingItems()
    {
        return $this->countItemsByWhereClause($this->getPendingItemsWhereClause() . " AND process_id = ''");
    }

    /**
     * Internal method to count the rows of the queue table matching a where clause.
     *
     * @param string $where SQL condition; the caller is responsible for quoting values
     *
     * @return int number of matching rows, 0 if the query fails
     */
    protected function countItemsByWhereClause($where)
    {
        // exec_SELECTcountRows() yields false on failure, which the cast turns into 0.
        return (int)$this->getDB()->exec_SELECTcountRows('*', $this->tableName, $where);
    }

    /**
     * Counts pending queue entries grouped by configuration key.
     *
     * @return array list of rows with the keys "configuration", "unprocessed"
     *               and "assignedButUnprocessed"
     */
    public function countPendingItemsGroupedByConfigurationKey()
    {
        $res = $this->getDB()->exec_SELECTquery(
            "configuration, count(*) as unprocessed, sum(process_id != '') as assignedButUnprocessed",
            $this->tableName,
            $this->getPendingItemsWhereClause(),
            'configuration'
        );

        return $this->fetchAllRows($res);
    }

    /**
     * Returns the ids of all sets that still contain pending entries.
     *
     * @return array list of set ids (integers)
     */
    public function getSetIdWithUnprocessedEntries()
    {
        $res = $this->getDB()->exec_SELECTquery(
            'set_id',
            $this->tableName,
            $this->getPendingItemsWhereClause(),
            'set_id'
        );

        $setIds = [];
        foreach ($this->fetchAllRows($res) as $row) {
            $setIds[] = intval($row['set_id']);
        }

        return $setIds;
    }

    /**
     * Returns the number of queue entries per configuration for the given sets,
     * restricted to entries scheduled before the current time.
     *
     * @param array $setIds set ids; every value is cast to an integer before it is put into the query
     *
     * @return array totals, indexed by configuration key
     */
    public function getTotalQueueEntriesByConfiguration(array $setIds)
    {
        $totals = [];
        if (count($setIds) === 0) {
            return $totals;
        }

        // The ids end up unquoted in an IN() list, so only integers may get there.
        $idList = implode(',', array_map('intval', $setIds));

        $res = $this->getDB()->exec_SELECTquery(
            'configuration, count(*) as c',
            $this->tableName,
            'set_id in (' . $idList . ') AND scheduled < ' . time(),
            'configuration'
        );

        foreach ($this->fetchAllRows($res) as $row) {
            $totals[$row['configuration']] = $row['c'];
        }

        return $totals;
    }

    /**
     * Returns the exec_time values of the queue entries with the highest exec_time.
     *
     * @param int $limit maximum number of entries
     *
     * @return array list of timestamps (integers), highest first
     */
    public function getLastProcessedEntriesTimestamps($limit = 100)
    {
        $res = $this->getDB()->exec_SELECTquery(
            'exec_time',
            $this->tableName,
            '',
            '',
            'exec_time desc',
            $limit
        );

        $timestamps = [];
        foreach ($this->fetchAllRows($res) as $row) {
            $timestamps[] = intval($row['exec_time']);
        }

        return $timestamps;
    }

    /**
     * Returns the queue entries with the highest exec_time.
     *
     * @param string $selectFields field list, inserted into the query as given
     * @param int $limit maximum number of entries
     *
     * @return array list of rows, highest exec_time first
     */
    public function getLastProcessedEntries($selectFields = '*', $limit = 100)
    {
        $res = $this->getDB()->exec_SELECTquery(
            $selectFields,
            $this->tableName,
            '',
            '',
            'exec_time desc',
            $limit
        );

        return $this->fetchAllRows($res);
    }

    /**
     * Returns performance statistics per completing process for a time frame.
     *
     * @param int $start timestamp (inclusive)
     * @param int $end timestamp (inclusive)
     *
     * @return array rows with the keys "process_id_completed", "start", "end" and
     *               "urlcount", indexed by process_id_completed
     */
    public function getPerformanceData($start, $end)
    {
        $res = $this->getDB()->exec_SELECTquery(
            'process_id_completed, min(exec_time) as start, max(exec_time) as end, count(*) as urlcount',
            $this->tableName,
            'exec_time != 0 and exec_time >= ' . intval($start) . ' and exec_time <= ' . intval($end),
            'process_id_completed'
        );

        $performanceData = [];
        foreach ($this->fetchAllRows($res) as $row) {
            $performanceData[$row['process_id_completed']] = $row;
        }

        return $performanceData;
    }

    /**
     * Determines if a page is queued.
     *
     * Each of the three optional filters is applied unless it is strictly false.
     *
     * @param int $uid uid of the page
     * @param bool $unprocessed_only only consider entries which are not executed yet (exec_time = 0)
     * @param bool $timed_only only consider entries with a scheduled time (scheduled != 0)
     * @param int|bool $timestamp only consider entries scheduled for exactly this timestamp, false to skip
     *
     * @return bool
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $whereClause = 'page_id = ' . (int)$uid;

        if (false !== $unprocessed_only) {
            $whereClause .= ' AND exec_time = 0';
        }

        if (false !== $timed_only) {
            $whereClause .= ' AND scheduled != 0';
        }

        if (false !== $timestamp) {
            $whereClause .= ' AND scheduled = ' . (int)$timestamp;
        }

        $count = $this->getDB()->exec_SELECTcountRows('*', $this->tableName, $whereClause);

        // A failed query (false) is treated like "not in queue".
        return false !== $count && $count > 0;
    }

    /**
     * Checks whether an unprocessed entry for the given page is in the queue.
     *
     * NOTE(review): the method name suggests that only timed entries should match,
     * but $timed_only is not passed to isPageInQueue() - confirm the intention
     * before changing this, callers may rely on the current behaviour.
     *
     * @param int $uid uid of the page
     *
     * @return boolean
     */
    public function isPageInQueueTimed($uid)
    {
        return $this->isPageInQueue(intval($uid), true);
    }

    /**
     * Counts the entries of this repository that match the given where clause;
     * the counting itself is delegated to countByWhere().
     *
     * @param string $where Where clause
     *
     * @return integer
     */
    public function countAll($where = '1 = 1')
    {
        return $this->countByWhere($where);
    }

    /**
     * Builds the condition shared by all "pending" queries: not executed yet
     * and scheduled before the current time.
     *
     * @return string
     */
    private function getPendingItemsWhereClause()
    {
        return 'exec_time = 0 AND scheduled < ' . time();
    }

    /**
     * Fetches all rows of a query result as associative arrays and frees the result.
     *
     * @param mixed $res result of exec_SELECTquery(); a falsy value (failed query) yields no rows
     *
     * @return array list of rows
     */
    private function fetchAllRows($res)
    {
        $rows = [];
        if (!$res) {
            return $rows;
        }

        $db = $this->getDB();
        // A fetched row is never an empty array, so the truthiness check ends the loop
        // on false as well as on null.
        while ($row = $db->sql_fetch_assoc($res)) {
            $rows[] = $row;
        }
        $db->sql_free_result($res);

        return $rows;
    }
}
427
|
|
|
|
This method has been deprecated. The supplier of the class has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the method will be removed from the class and what other method or class to use instead.