Passed
Push — typo3v9 ( c71e93...5cd459 )
by Tomas Norre
05:30
created

CrawlerController::checkIfPageShouldBeSkipped()   F

Complexity

Conditions 14
Paths 360

Size

Total Lines 52
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 18.5707

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 14
eloc 29
c 1
b 0
f 0
nc 360
nop 1
dl 0
loc 52
ccs 20
cts 28
cp 0.7143
crap 18.5707
rs 3.4333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2019 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
72
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
73
    public const CLI_STATUS_REMAIN = 1; //queue not empty
74
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
75
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
76
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
77
78
    /**
79
     * @var integer
80
     */
81
    public $setID = 0;
82
83
    /**
84
     * @var string
85
     */
86
    public $processID = '';
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $extensionSettings = [];
127
128
    /**
129
     * Mount Point
130
     *
131
     * @var bool
132
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
133
     */
134
    public $MP = false;
135
136
    /**
137
     * @var string
138
     */
139
    protected $processFilename;
140
141
    /**
142
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
143
     *
144
     * @var string
145
     */
146
    protected $accessMode;
147
148
    /**
149
     * @var BackendUserAuthentication|null
150
     */
151
    private $backendUser;
152
153
    /**
154
     * @var integer
155
     */
156
    private $scheduledTime = 0;
157
158
    /**
159
     * @var integer
160
     */
161
    private $reqMinute = 0;
162
163
    /**
164
     * @var bool
165
     */
166
    private $submitCrawlUrls = false;
167
168
    /**
169
     * @var bool
170
     */
171
    private $downloadCrawlUrls = false;
172
173
    /**
174
     * @var QueueRepository
175
     */
176
    protected $queueRepository;
177
178
    /**
179
     * @var ProcessRepository
180
     */
181
    protected $processRepository;
182
183
    /**
184
     * @var ConfigurationRepository
185
     */
186
    protected $configurationRepository;
187
188
    /**
189
     * @var string
190
     */
191
    protected $tableName = 'tx_crawler_queue';
192
193
    /**
194
     * @var QueueExecutor
195
     */
196
    protected $queueExecutor;
197
198
    /**
199
     * @var int
200
     */
201
    protected $maximumUrlsToCompile = 10000;
202
203
    /**
204
     * @var IconFactory
205
     */
206
    protected $iconFactory;
207
208
    /**
209
     * Returns the accessMode; it can be gui, cli or cli_im
210
     *
211
     * @return string
212
     */
213 1
    public function getAccessMode()
    {
        // Plain accessor: hands back whatever setAccessMode() stored (the property has no default value).
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
217
218
    /**
219
     * @param string $accessMode
220
     */
221 1
    public function setAccessMode($accessMode): void
    {
        // Stored as given; no validation against the documented 'gui', 'cli' and 'cli_im' values happens here.
        $this->accessMode = $accessMode;
    }
225
226
    /**
227
     * Set disabled status to prevent processes from being processed
228
     *
229
     * @param bool $disabled (optional, defaults to true)
230
     * @return void
231
     */
232 3
    public function setDisabled($disabled = true): void
    {
        // The marker file itself is the flag: getDisabled() reports "disabled" while it exists.
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Re-enabling: remove the marker, but only when it is actually present.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
242
243
    /**
244
     * Get disable status
245
     *
246
     * @return bool true if disabled
247
     */
248 3
    public function getDisabled()
    {
        // "Disabled" is represented purely by the existence of the process marker file.
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
252
253
    /**
254
     * @param string $filenameWithPath
255
     *
256
     * @return void
257
     */
258 4
    public function setProcessFilename($filenameWithPath): void
    {
        // Overrides the marker-file location that the constructor initialised.
        $this->processFilename = $filenameWithPath;
    }
262
263
    /**
264
     * @return string
265
     */
266 1
    public function getProcessFilename()
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        $markerFile = $this->processFilename;

        return $markerFile;
    }
270
271
    /************************************
272
     *
273
     * Getting URLs based on Page TSconfig
274
     *
275
     ************************************/
276
277 39
    public function __construct()
    {
        // Collaborators are resolved through the Extbase object manager / makeInstance.
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // Marker file whose existence signals "processing disabled" (see setDisabled()).
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        // Defensive fallback kept from the original code, even though the provider is expected to return an array.
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The `??` fallbacks keep an incomplete settings array from raising
        // undefined-index notices; the resulting values are the same as before.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
301
302
    /**
303
     * @return BackendUserAuthentication
304
     */
305 1
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded; this runs on every call, before the cached value is consulted.
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        // First access: remember the global backend user for subsequent calls.
        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
314
315
    /**
316
     * Sets the extension settings (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
317
     *
318
     * @param array $extensionSettings
319
     * @return void
320
     */
321 12
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the whole settings array; the defaults applied in the constructor are not re-applied here.
        $this->extensionSettings = $extensionSettings;
    }
325
326
    /**
327
     * Check if the given page should be crawled
328
     *
329
     * @param array $pageRow
330
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message explaining why it is skipped
331
     */
332 8
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // The checks run in a fixed order and the first one that matches decides the skip message.

        // 1) Hidden pages are skipped unless the extension is configured to crawl them.
        //    empty() is used so that a missing setting / column does not raise a notice.
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        // A missing doktype is treated as null, which yields the same comparisons as before without the notice.
        $doktype = $pageRow['doktype'] ?? null;

        // 2) Doktypes 3 and 4 and everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        // 3) Additional doktype lists registered by other extensions.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // 4) Veto hooks: a hook returns false if the page is ok, and true or a skip message otherwise.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // No need to execute further hooks once one has vetoed.
            if (is_string($veto)) {
                return $veto;
            }

            // The key is cast explicitly: hooks registered without a name have integer keys, and
            // htmlspecialchars() rejects integers when strict_types is enabled.
            return 'Veto from hook "' . htmlspecialchars((string)$key) . '"';
        }

        return false;
    }
385
386
    /**
387
     * Wrapper method for getUrlsForPageId()
388
     * It returns an array of configurations and no urls!
389
     *
390
     * @param array $pageRow Page record with at least dok-type and uid columns.
391
     * @param string $skipMessage
392
     * @return array
393
     * @see getUrlsForPageId()
394
     */
395 4
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        // Page is excluded: report the reason through the by-reference argument and return no configurations.
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file runs with strict_types,
        // so a uid delivered as a numeric string must be cast explicitly
        // (urlListFromUrlArray() casts $pageRow['uid'] the same way).
        $res = $this->getUrlsForPageId((int)$pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
409
410
    /**
411
     * This method is used to count if there are ANY unprocessed queue entries
412
     * of a given page_id and the configuration which matches a given hash.
413
     * If there are none, we can skip an inner detail check
414
     *
415
     * @param int $uid
416
     * @param string $configurationHash
417
     * @return boolean
418
     */
419 5
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Count queue rows for this page + configuration hash whose exec_time is still 0.
        $pendingEntries = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', (int)$uid),
                $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $queryBuilder->expr()->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Any truthy count means at least one such entry exists, so the answer is then "false".
        return !$pendingEntries;
    }
441
442
    /**
443
     * Creates a list of URLs from input array (and submits them to queue if asked for)
444
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
445
     *
446
     * @param array $vv Information about URLs from pageRow to crawl.
447
     * @param array $pageRow Page row
448
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
449
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
450
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
451
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), $downloadUrls will be filled with entries
452
     * @param array $duplicateTrack Array which is passed by reference and contains an id per URL, to ensure we do not crawl duplicates
453
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
454
     * @param array $incomingProcInstructions Array of processing instructions
455
     * @return string List of URLs (meant for display in backend module)
456
     *
457
     */
458 2
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        // `??` avoids an undefined-index notice when the configuration produced no URL list at all.
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The processing-instruction filter depends only on the configuration, not on the single URL,
        // so it is evaluated once: if it rejects, every URL would have been skipped anyway.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Seconds between two scheduled requests. A non-positive $reqMinute would otherwise divide
        // by zero; in that case no interleave is applied.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it and do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs tracked so far and cut down to a full minute.
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the unspread $scheduledTime (not $schTime) is handed to addUrl(),
                    // exactly as before - confirm that this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
525
526
    /**
527
     * Returns true if input processing instruction is among registered ones.
528
     *
529
     * @param string $piString PI to test
530
     * @param array $incomingProcInstructions Processing instructions
531
     * @return boolean
532
     */
533 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without any incoming processing instructions nothing is filtered out.
        $accepted = $incomingProcInstructions === [];

        // Otherwise a single incoming instruction contained in the comma list $piString is enough.
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
546
547 3
    public function getPageTSconfigForId($id): array
    {
        // Without a mount point the TSconfig of the page itself is used; otherwise that of the
        // page id found after the '-' in the MP value.
        $tsConfigPageId = $id;
        if ($this->MP) {
            // NOTE(review): $MP is documented as bool but handled as a "<id>-<id>" string here - confirm
            // the expected format. The second part is read defensively, so a value without '-' falls
            // back to 0 instead of raising an undefined-offset notice.
            $mountPointParts = explode('-', (string)$this->MP);
            $tsConfigPageId = (int)($mountPointParts[1] ?? 0);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call hooks to alter the configuration. The lookup uses `??` like the other hook lookups in
        // this class, so no notice is raised when no hook is registered.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            // pageTSConfig is handed over by reference so that hooks can modify the result in place.
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
569
570
    /**
571
     * This method returns an array of configurations.
572
     * Adds no urls!
573
     */
574 2
    public function getUrlsForPageId(int $pageId): array
    {
        $res = [];

        // ---- Source 1: paramSets from the page TSconfig of this page ----
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            // Only the "<name>." sub-arrays start a configuration; the plain "<name>" value is read below.
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array)$crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            // Normalise the comma lists. Missing keys are read as '' instead of being passed to
            // strcmp(), which rejects null when strict_types is enabled.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Process the configuration only if it is not page-specific or if the current page is listed.
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // Recognize MP value: it is appended to the base query when set.
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // ---- Source 2: tx_crawler_configuration records up the rootline ----
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // Check access to the configuration record: unrestricted records, admins, or a matching
            // backend group (evaluated in this order, later checks only if the earlier ones fail).
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (!$hasAccess) {
                continue;
            }

            // A NULL / missing pidsonly column is read as '' (strcmp() would reject null under strict_types).
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Process the configuration only if it is not page-specific or if the current page is listed.
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // Don't overwrite previously defined paramSets (page TSconfig wins, as do earlier records).
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Loose comparison kept on purpose: the element type returned by expandExcludeString()
            // is not visible from here.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // ---- Hooks may post-process the collected configurations (passed by reference) ----
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
666
667
    /**
668
     * Find all configurations of subpages of a page
669
     * TODO: Write Functional Tests
670
     */
671 1
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // 1) Names of the paramSets defined in the page TSconfig of the root page
        //    (a single trailing dot of the TypoScript key is stripped).
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $configurationNames[] = substr($setKey, -1) == '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // 2) Collect the page ids to search: the rootline of the root page ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLineNode) {
            $pageIds[] = $rootLineNode['uid'];
        }

        // ... plus the pages of the subtree (down to $depth) that the page tree yields for the backend user.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $treeClause = '';
        if (!empty($permissionClause)) {
            $treeClause = 'AND ' . $permissionClause;
        }
        $tree->init($treeClause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // 3) Names of all configuration records stored on one of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        $row = $statement->fetch();
        while ($row) {
            $configurationNames[] = $row['name'];
            $row = $statement->fetch();
        }

        return $configurationNames;
    }
710
711
    /**
712
     * Get querybuilder for given table
713
     *
714
     * @param string $table
715
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
716
     */
717 17
    private function getQueryBuilder(string $table)
    {
        // Convenience wrapper around the connection pool lookup.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
721
722
    /**
723
     * Check if a user has access to an item
724
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
725
     *
726
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
727
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
728
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
729
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
730
     */
731 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list does not restrict anything.
        if (empty($accessList)) {
            return true;
        }

        // Otherwise one of the user's group uids has to appear in the access list.
        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
743
744
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further optional sub-parts read by the implementation:
     *             _RECURSIVE:[depth]  include the page tree below the lookup page down to the given depth
     *             _PIDFIELD:[field]   field compared against the page id(s), default "pid"
     *             _FIELD:[field]      field whose values are collected, default "uid" (must be "uid" or a TCA column of the table)
     *             _WHERE:[sql]        additional constraint handed to the query as-is
     *             _ADDTABLE:[table]   additional FROM part handed to the query as-is
     *         - Default: Literal value
     *
     * After each processed part the registered "expandParameters" hooks
     * ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Same keys as the input, every value replaced by the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded, everything else is literal:
            if (!(substr($v, 0, 1) === '[' && substr($v, -1) === ']')) {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // So, find the value inside brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // The helper is an instance method, so it is called on $this rather than statically.
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) === '_TABLE:') {
                    // Parse parameters ("KEY:value" pairs separated by ";"):
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                        // A sub-part without ":" has no value; store null instead of reading a missing offset.
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }

                    $table = (string)($subpartParams['_TABLE'] ?? '');

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int)$subpartParams['_PID'] : (int)$pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int)$subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';
                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // Only "uid" or a field configured in TCA may be selected:
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string)$lookUpPid];
                            }

                            // Only deleted records are filtered out, all other default restrictions are removed:
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // NOTE: $where comes from the crawler configuration and is passed to the query unescaped.
                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that every value is collected only once:
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
889
890
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped per recursion level at $this->maximumUrlsToCompile.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The input URLs, each combined with every value of every parameter
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter off the stack; the remaining ones are handled by the recursive call:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $val = (string)$val;
                // Empty values do not add a GET parameter, the URL is taken over unchanged.
                $newUrls[] = $url . ($val !== '' ? '&' . rawurlencode((string)$varName) . '=' . rawurlencode($val) : '');

                // Leave BOTH loops once the limit is reached; breaking only the inner loop
                // would let every further URL add one more entry beyond the maximum.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
921
922
    /************************************
923
     *
924
     * Crawler log
925
     *
926
     ************************************/
927
928
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, any other value => all entries
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), all queue entries are deleted, ignoring filter and page ID
     * @param int $itemsPerPage Limit the amount of entries per page default is 10, values <= 0 disable the limit
     * @return array The matching queue records, or an empty array when the entries were flushed
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // Mirrors the constraints of the select query as an AND-combined expression,
        // which serves as WHERE clause if the entries are to be flushed.
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('page_id', (int)$id));
            $this->flushQueue($doFullFlush ? '1=1' : $flushConstraints);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
982
983
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, any other value => all entries
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), all queue entries are deleted, ignoring filter and set ID
     * @param int $itemsPerPage Limit the amount of entries per page default is 10, values <= 0 disable the limit
     * @return array The matching queue records, or an empty array when the entries were flushed
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // Mirrors the constraints of the select query as an AND-combined expression,
        // which serves as WHERE clause if the entries are to be flushed.
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('set_id', (int)$set_id));
            // An empty constraint makes flushQueue() remove every entry.
            $this->flushQueue($doFullFlush ? '' : $flushConstraints);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1036
1037
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the affected entries are
     * looked up first and posted to the event dispatcher grouped by their set_id, before
     * the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed (string or
     *                      string-castable expression object); an empty filter removes ALL entries
     * @return void
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // select() quotes its arguments as identifiers, so "DISTINCT set_id" cannot be passed
            // to it; grouping by set_id yields the same distinct list of sets.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                // A separate builder per statement keeps FROM/WHERE parts from piling up.
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): other queries on the queue table in this class use "qid" as
                // identifier column - confirm that "uid" is the intended column here.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1077
1078
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function (anything but an array is replaced by an empty array)
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate, the current time is used if empty
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        // The call back reference travels inside the serialized parameters of the entry.
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = (int)$schedule;
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1108
1109
    /************************************
1110
     *
1111
     * URL setting
1112
     *
1113
     ************************************/
1114
1115
    /**
     * Setting a URL for crawling:
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the queue entry is either only
     * collected in $this->queueEntries or written to the queue table. For written entries the
     * "urlAddedToQueue" event is posted; if an equal entry already exists, nothing is written
     * and "duplicateUrlInQueue" is posted instead.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups",
     *                      "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // Check if there is already an equal entry, unless the caller opted out:
        $duplicateRows = $skipInnerDuplicationCheck ? [] : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        if (!empty($duplicateRows)) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                strval($this->setID),
                ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]
            );
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            strval($this->setID),
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        return true;
    }
1194
1195
    /**
     * Looks up not yet processed queue entries which equal the given one (same page and same
     * parameters hash) with regard to the given timestamp.
     * For a timestamp that is not in the future, every entry scheduled up to now counts (with the
     * "enableTimeslot" extension setting additionally entries scheduled within 100 seconds around now).
     * For a timestamp in the future, only entries scheduled for exactly that timestamp count.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue entry; "page_id" and "parameters_hash" are compared
     *
     * @return array List of the qid values of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $duplicateQueueIds = [];
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // The entry is scheduled with "now" (or earlier)
            if ($this->extensionSettings['enableTimeslot']) {
                $timeslotStart = $now - 100;
                $timeslotEnd = $now + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $timeslotStart . ' AND ' . $timeslotEnd);
                $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $now));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $samePage = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
        );
        $sameParameters = $queryBuilder->expr()->eq(
            'parameters_hash',
            $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
        );

        // Only entries which are neither executed nor assigned to a process are of interest:
        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere($samePage);
        $queryBuilder->andWhere($sameParameters);

        $statement = $queryBuilder->execute();
        while ($queueRow = $statement->fetch()) {
            $duplicateQueueIds[] = $queueRow['qid'];
        }

        return $duplicateQueueIds;
    }
1255
1256
    /**
     * Returns the current system time as Unix timestamp.
     * All time lookups of this class go through this method.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1265
1266
    /************************************
1267
     *
1268
     * URL reading
1269
     *
1270
     ************************************/
1271
1272
    /**
     * Read URL for single queue entry
     *
     * The entry is locked by setting its exec_time, executed through the queue executor and the
     * serialized result is stored in its result_data afterwards. Signals are emitted before the
     * processing and before the result is stored.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int Bitmask: 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered "pollSuccess"
     *             instruction reported success; 0 as well if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without force only entries are taken which are not executed yet, but scheduled for a process:
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process; return the (empty) status instead of null, as documented.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // The executor result does not necessarily carry a serialized "content" string;
        // passing anything else to unserialize() would raise a TypeError under strict_types.
        // NOTE(review): the content originates from the executed request - confirm that it is
        // trustworthy enough for unserialize().
        $resultData = is_array($result) && is_string($result['content'] ?? null)
            ? unserialize($result['content'])
            : false;

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1363
1364
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table (locked through its exec_time), executed through
     * the queue executor and finally updated with the serialized result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|string The result exactly as returned by $this->queueExecutor->executeQueueItem()
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueTable = $this->tableName;
        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $connectionForCrawlerQueue->insert($queueTable, $field_array);

        // The executor gets the entry including the id it just received:
        $queueId = $connectionForCrawlerQueue->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        // Slots receive the fields by reference and may adjust them before they are stored:
        $signalArguments = [$queueId, &$field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $connectionForCrawlerQueue->update($queueTable, $field_array, ['qid' => $queueId]);

        return $result;
    }
1401
1402
    /*****************************
1403
     *
1404
     * Compiling URLs to crawl - tools
1405
     *
1406
     *****************************/
1407
1408
    /**
     * Walks the page tree below a root page and compiles the table rows / crawl URLs
     * for every page found, including the pages reachable through mount points.
     *
     * The passed options are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there for every page.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows of all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            // $this->MP is false for regular pages and "<mount_pid>-<uid>" while mounted pages are compiled
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // The lookup reads from "pages", so the builder has to come from the connection of that table
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1510
1511
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list. Every entry is either "pid" (only that
     * page), "pid+" (the page and its subtree down to depth 99) or "pid+depth" (the
     * page and its subtree down to the given depth). Expanded strings and the
     * fetched subtrees are kept in static caches for the rest of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids: the pids as written in the string plus the uids found in the subtrees
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // An entry without "+" yields a single element, so the depth needs a default
                    $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = $pidAndDepth[0];
                    $depth = $pidAndDepth[1] ?? '';

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1562
1563
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations that are not part of the incoming configuration selection are
     * dropped first. Each remaining configuration produces one table row; the page
     * title cell is emitted with the first row only and spans all of them. Pages
     * listed in the "exclude" setting of a configuration get a short notice instead
     * of a URL list.
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Ready-to-print HTML for the title cell (inserted without further escaping)
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (!empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Array keys that look like integers are integers; htmlspecialchars() needs a string under strict_types
                $confKeyLabel = htmlspecialchars((string)$confKey);
                // Every sub configuration key is optional, read them with defaults instead of raising notices
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column:
                if (!$c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (values come from the configuration, so they are escaped before printing)
                    $optionValues = '';
                    if (!empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
                    }
                    if (!empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1673
1674
    /*****************************
1675
     *
1676
     * CLI functions
1677
     *
1678
     *****************************/
1679
1680
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the registered CLI hooks, purge executed entries older than the
     * "purgeQueueDays" setting (if that is greater than 0), pick the due and not yet
     * assigned queue entries, assign them to this process and process them one by one.
     *
     * @param int $countInARun Maximum number of queue entries picked for this run
     * @param int $sleepTime Pause after each processed entry, handed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, handed to sleep()
     * @return int Bitmask made of the readUrl() results and the CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if ((int)$this->extensionSettings['purgeQueueDays'] > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * (int)$this->extensionSettings['purgeQueueDays'];

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from($this->tableName)
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            // cast like the sleep arguments below: the value may arrive as a numeric string
            ->setMaxResults((int)$countInARun)
            ->execute()
            ->fetchAll();

        if (!empty($rows)) {
            $quidList = array_column($rows, 'qid');

            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for process
            // NOTE: no transaction is opened here, the former BEGIN/COMMIT/ROLLBACK calls are disabled
            //TODO make sure we're not taking assigned queue-entires

            //save the number of assigned queue entrys to determine who many have been processed later
            $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = (int)$queryBuilderUpdate
                ->update($this->tableName)
                ->where(
                    $queryBuilderUpdate->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    ['assigned_items_count' => $numberOfAffectedRows],
                    ['process_id' => $processId]
                );

            // Not every selected entry could be assigned: treat it as a collision and give up
            if ($numberOfAffectedRows !== count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep((int)$sleepTime); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep((int)$sleepAfterFinish);

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1806
1807
    /**
     * Instantiates every registered CLI hook class and calls crawler_init() on it.
     *
     * The class names are read from
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks']; when nothing
     * is registered there, the method does nothing.
     *
     * @return void
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }

            // The controller itself is handed to the hook
            $hook->crawler_init($this);
        }
    }
1821
1822
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past count as orphans: they do not
     * count against the process limit and are released at the end of this call,
     * whether or not the new process could be registered.
     *
     * @param string $id identification string for the process
     * @return boolean true if the process was registered, false if no system process id
     *                 is available or the "processLimit" setting is already reached
     * @todo preemption might not be the most elegant way to clean up
     *
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        // The statement reads the process table, so take the builder from that table's connection
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_process');
        $ret = true;

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $processCount = 0;
        $orphanProcesses = [];

        // NOTE: no transaction is opened here, the former BEGIN call is disabled

        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        if ($processCount < (int)$this->extensionSettings['processLimit']) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($processCount + 1) . "/" . (int)$this->extensionSettings['processLimit'] . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($processCount) . "/" . (int)$this->extensionSettings['processLimit'] . ")");
            $ret = false;
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock

        return $ret;
    }
1886
1887
    /**
     * Releases one or more processes together with the queue entries assigned to them.
     *
     * Four UPDATE statements are sent through one shared query builder, in this order:
     *  1. queue entries that belong to a non-active process lose their assignment
     *  2. non-active processes that still own unprocessed queue entries get system_process_id 0
     *  3. the requested, not deleted processes without unprocessed queue entries become non-active
     *  4. unprocessed queue entries of the requested processes lose their assignment
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @param boolean $withinLock show whether the DB-actions are included within an existing lock
     *                            (without effect at the moment, the transaction statements are disabled)
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $sharedBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($releaseIds)) {
            return false; // nothing to release
        }

        // NOTE: the BEGIN / COMMIT that used to wrap the statements for !$withinLock is disabled.

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. detach queue entries from processes that are not active
        $sharedBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // (the previous version flagged processes with active=0 AND deleted=0 that own no
        // unprocessed queue entry as deleted=1 and set their system_process_id to 0)
        $sharedBuilder->resetQueryPart('set');

        // 2. reset the system process id of non-active processes that still own unprocessed entries
        $sharedBuilder
            ->update('tx_crawler_process')
            ->where(
                $sharedBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // 3. mark all requested processes as non-active
        // NOTE(review): the "set" part is not reset before this statement, so it presumably
        // still carries the system_process_id assignment of statement 2 - confirm this is intended.
        $sharedBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $sharedBuilder->expr()->in('process_id', $sharedBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $sharedBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $sharedBuilder->resetQueryPart('set');

        // 4. detach the unprocessed queue entries of the requested processes
        $sharedBuilder
            ->update($this->tableName)
            ->where(
                $sharedBuilder->expr()->eq('exec_time', 0),
                $sharedBuilder->expr()->in('process_id', $sharedBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1981
1982
    /**
     * Returns the id of the current process and creates it on first use.
     *
     * A new id is a short MD5 hash of the current microtime; once created it is
     * kept in $this->processID and handed out again on every later call.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1994
1995
    /**
     * Writes a message followed by a line break to the output and flushes it.
     * Nothing is printed unless the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int)$this->extensionSettings['processDebug'];
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2007
2008
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting
     * - entries scheduled longer ago than the "cleanUpScheduledAge" extension setting
     *
     * Both settings are multiplied by 86400, i.e. they are given in days.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $processedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAge;
        $scheduledBefore = $now - $scheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2024
2025
    /**
     * Builds an md5 fingerprint of a configuration array.
     *
     * The entries "paramExpanded" and "URLs" are left out, everything else is
     * serialized and hashed.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // The array is passed by value, so dropping the keys does not touch the caller's copy
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2038
2039
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site the "id" parameter is dropped, an "L" parameter is handed to the
     * router as "_language", and host, scheme and port of a given alternative base
     * URL replace those of the generated URL. Without a Site the URL is assembled as
     * "<base>/index.php<query>&cHash=<hash>".
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https, other values keep the scheme)
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            // (the language object formerly looked up here was never used and has been dropped)
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeUri = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeUri->getHost());
                $url = $url->withScheme($alternativeUri->getScheme());
                $url = $url->withPort($alternativeUri->getPort());
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2092
2093 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2.
     *
     * @param array $reg Array holding the two boundaries at index 1 and 2
     * @return array The array with both values exchanged if they were in descending order, unchanged otherwise
     */
    private function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if (!($reg[1] > $reg[2])) {
            return $reg;
        }

        // Exchange both boundaries in one step
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
2104
}
2105