Passed
Push — refactor/crawlerController ( aa4668 )
by Tomas Norre
07:48
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 93
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 93
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
34
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
35
use AOE\Crawler\Domain\Repository\ProcessRepository;
36
use AOE\Crawler\Domain\Repository\QueueRepository;
37
use AOE\Crawler\QueueExecutor;
38
use AOE\Crawler\Service\ProcessService;
39
use AOE\Crawler\Service\UrlService;
40
use AOE\Crawler\Utility\SignalSlotUtility;
41
use Psr\Http\Message\UriInterface;
42
use Psr\Log\LoggerAwareInterface;
43
use Psr\Log\LoggerAwareTrait;
44
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
45
use TYPO3\CMS\Backend\Utility\BackendUtility;
46
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
47
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
48
use TYPO3\CMS\Core\Core\Bootstrap;
49
use TYPO3\CMS\Core\Core\Environment;
50
use TYPO3\CMS\Core\Database\Connection;
51
use TYPO3\CMS\Core\Database\ConnectionPool;
52
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
53
use TYPO3\CMS\Core\Imaging\Icon;
54
use TYPO3\CMS\Core\Imaging\IconFactory;
55
use TYPO3\CMS\Core\Site\Entity\Site;
56
use TYPO3\CMS\Core\Type\Bitmask\Permission;
57
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
58
use TYPO3\CMS\Core\Utility\DebugUtility;
59
use TYPO3\CMS\Core\Utility\GeneralUtility;
60
use TYPO3\CMS\Core\Utility\MathUtility;
61
use TYPO3\CMS\Extbase\Object\ObjectManager;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
74
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
75
76
    //queue not empty
77
    public const CLI_STATUS_REMAIN = 1;
78
79
    //(some) queue items were processed
80
    public const CLI_STATUS_PROCESSED = 2;
81
82
    //instance didn't finish
83
    public const CLI_STATUS_ABORTED = 4;
84
85
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
86
87
    /**
88
     * @var integer
89
     */
90
    public $setID = 0;
91
92
    /**
93
     * @var string
94
     */
95
    public $processID = '';
96
97
    /**
98
     * @var array
99
     */
100
    public $duplicateTrack = [];
101
102
    /**
103
     * @var array
104
     */
105
    public $downloadUrls = [];
106
107
    /**
108
     * @var array
109
     */
110
    public $incomingProcInstructions = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $incomingConfigurationSelection = [];
116
117
    /**
118
     * @var bool
119
     */
120
    public $registerQueueEntriesInternallyOnly = false;
121
122
    /**
123
     * @var array
124
     */
125
    public $queueEntries = [];
126
127
    /**
128
     * @var array
129
     */
130
    public $urlList = [];
131
132
    /**
133
     * @var array
134
     */
135
    public $extensionSettings = [];
136
137
    /**
138
     * Mount Point
139
     *
140
     * @var bool
141
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
142
     */
143
    public $MP = false;
144
145
    /**
146
     * @var string
147
     */
148
    protected $processFilename;
149
150
    /**
151
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
152
     *
153
     * @var string
154
     */
155
    protected $accessMode;
156
157
    /**
158
     * @var QueueRepository
159
     */
160
    protected $queueRepository;
161
162
    /**
163
     * @var ProcessRepository
164
     */
165
    protected $processRepository;
166
167
    /**
168
     * @var ConfigurationRepository
169
     */
170
    protected $configurationRepository;
171
172
    /**
173
     * @var string
174
     */
175
    protected $tableName = 'tx_crawler_queue';
176
177
    /**
178
     * @var QueueExecutor
179
     */
180
    protected $queueExecutor;
181
182
    /**
183
     * @var int
184
     */
185
    protected $maximumUrlsToCompile = 10000;
186
187
    /**
188
     * @var IconFactory
189
     */
190
    protected $iconFactory;
191
192
    /**
193
     * @var string[]
194
     */
195
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
196
        'CLI_buildProcessId' => 'Using CrawlerController->CLI_buildProcessId() is deprecated since 9.1.3 and will be removed in v11.x, please use ProcessService::createProcessId() instead',
197
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
198
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
199
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
200
    ];
201
202
    /**
203
     * @var BackendUserAuthentication|null
204
     */
205
    private $backendUser;
206
207
    /**
208
     * @var integer
209
     */
210
    private $scheduledTime = 0;
211
212
    /**
213
     * @var integer
214
     */
215
    private $reqMinute = 0;
216
217
    /**
218
     * @var bool
219
     */
220
    private $submitCrawlUrls = false;
221
222
    /**
223
     * @var bool
224
     */
225
    private $downloadCrawlUrls = false;
226
227
    /************************************
228
     *
229
     * Getting URLs based on Page TSconfig
230
     *
231
     ************************************/
232
233 36
    /**
     * Instantiates the repositories and services used by the controller,
     * sets the default process lock file and loads the extension settings.
     *
     * Settings that are missing or evaluate to zero are replaced by defaults:
     * 'countInARun' becomes 100, 'processLimit' is forced into 1..99 and the
     * URL compile limit is forced into 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The "?? 0" fallbacks keep an empty or partial settings
        // array from raising undefined-index notices; 0 triggers the same
        // default handling as a missing value.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
258
259
    /**
260
     * Returns the current access mode; documented values are 'gui', 'cli' or 'cli_im'.
261
     *
262
     * @return string
263
     */
264 1
    public function getAccessMode()
265
    {
266 1
        return $this->accessMode;
267
    }
268
269
    /**
     * Sets the access mode. The value is stored as given, without validation.
     *
270
     * @param string $accessMode Documented values are 'gui', 'cli' or 'cli_im'
271
     */
272 1
    public function setAccessMode($accessMode): void
273
    {
274 1
        $this->accessMode = $accessMode;
275 1
    }
276
277
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * Disabling writes an empty file at the configured process filename;
     * enabling removes that file again if it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
292
293
    /**
294
     * Get disable status: the crawler counts as disabled while the process file exists.
295
     *
296
     * @return bool true if disabled
297
     */
298 2
    public function getDisabled()
299
    {
300 2
        return is_file($this->processFilename);
301
    }
302
303
    /**
     * Overrides the path of the file whose existence marks the crawler as disabled.
     *
304
     * @param string $filenameWithPath
305
     */
306 3
    public function setProcessFilename($filenameWithPath): void
307
    {
308 3
        $this->processFilename = $filenameWithPath;
309 3
    }
310
311
    /**
     * Returns the path of the file whose existence marks the crawler as disabled.
     *
312
     * @return string
313
     */
314 1
    public function getProcessFilename()
315
    {
316 1
        return $this->processFilename;
317
    }
318
319
    /**
320
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array replaces the current settings as a whole; the defaults
     * applied in the constructor are not re-applied here.
321
     */
322 14
    public function setExtensionSettings(array $extensionSettings): void
323
    {
324 14
        $this->extensionSettings = $extensionSettings;
325 14
    }
326
327
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and stop at the first match:
     * 1. the page is hidden and the 'crawlHiddenPages' setting is off,
     * 2. the doktype is one of 3, 4, 199, 254, 255,
     * 3. the doktype is in a list registered under EXTCONF 'crawler' > 'excludeDoktype',
     * 4. a hook registered under EXTCONF 'crawler' > 'pageVeto' returns anything but false.
     *
     * @param array $pageRow Page record; the 'hidden' and 'doktype' columns are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        if (! $this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks may be registered with numeric array keys; with strict_types
            // enabled htmlspecialchars() rejects an int, so cast explicitly.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
386
387
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first passed to checkIfPageShouldBeSkipped(); when it is
     * skipped, an empty array is returned and $skipMessage carries the reason.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file uses
        // strict_types, so a uid delivered as a numeric string would raise a
        // TypeError; cast it the same way urlListFromUrlArray() does.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
409
410
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; reads 'URLs' and 'subCfg'.
     * @param array $pageRow Page row; only 'uid' is read
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests).
     *                       A value of zero or less disables the interleave instead of dividing by zero.
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs, HTML-escaped and joined by "<br>" (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Configurations do not necessarily define these keys; fall back to ''
        // which concatenates exactly like the missing value did.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // Seconds between two scheduled requests; guards against division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
                continue;
            }
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): addUrl() receives the un-interleaved $scheduledTime;
                    // the interleaved $schTime is only used for display - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
494
495
    /**
     * Tells whether a configuration's processing instructions match the requested ones.
     *
     * An empty request list matches everything; otherwise at least one requested
     * instruction has to be contained in the comma-separated $piString.
     *
     * @param string $piString Comma-separated processing instructions to test against
     * @param array $incomingProcInstructions Requested processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing requested means nothing to filter on.
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $requestedInstruction) {
            if (GeneralUtility::inList($piString, $requestedInstruction)) {
                return true;
            }
        }

        return false;
    }
515
516 5
    /**
     * Returns the page TSconfig for the given page, or for the mount point page
     * when $this->MP is set, after passing it through the hooks registered under
     * EXTCONF 'crawler' > 'getPageTSconfigForId'.
     *
     * Hooks receive 'pageId' and a reference to 'pageTSConfig' and may modify the latter.
     *
     * @param int $id Page uid
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            // NOTE: static analysis flags BackendUtility::getPagesTSconfig() as deprecated.
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but is treated here as a string whose
            // second "-"-separated part is the mount point page id.
            // TODO: confirm the real format and fix the property type.
            // The casts keep explode() from failing under strict_types and avoid
            // an undefined offset when the value contains no "-".
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
538
539
    /**
     * This method returns an array of configurations.
     * Adds no urls to the queue!
     *
     * Configurations are collected from two sources, in this order:
     * 1. page TSconfig (tx_crawler.crawlerCfg.paramSets),
     * 2. tx_crawler_configuration records found via the rootline; a record never
     *    overwrites a TSconfig set with the same name and is skipped when the
     *    backend user has no access to it or the page is in its exclude list.
     *
     * Finally the hooks registered under EXTCONF 'crawler' > 'processUrls'
     * receive the result by reference.
     *
     * @return array Map of configuration key => ['subCfg', 'paramParsed', 'paramExpanded', 'URLs', 'origin']
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // 'pidsOnly' is optional; treat a missing value like an empty one
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration only if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc. A set may consist of sub-configuration only,
            // in which case there is no parameter string to parse.
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            // NOTE: $this->MP is declared as bool but concatenated as a string here.
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast before comparing: with strict_types a NULL column value would make strcmp() fail
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration only if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
637
638
    /**
     * Find all configurations of subpages of a page.
     *
     * Returns the names of the paramSets defined in the page TSconfig of $rootid,
     * followed by the names of all tx_crawler_configuration records stored on a
     * page of the rootline of $rootid or of its page tree down to $depth levels.
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // Names of the TSconfig paramSets; a trailing dot is stripped from the key
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the subtree visible to the backend user
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
681
682
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * An empty access list grants access to everybody.
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
703
704
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the rules below, otherwise the value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their results are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, bounds included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further sub parameters read by this method: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD (defaults to "uid")
     *         - Default: Literal value
     *
     * Optional sub parameters, TCA entries and the hook registry are read defensively, so a missing
     * key does not raise an "undefined index/offset" notice.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array in which every value has been replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded; everything else is one literal value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and restart the value list of this parameter.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values; the brake limits a single range to 1000 values.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the "key:value" sub parameters, which are separated by ";".
                    $subparts = GeneralUtility::trimExplode(';', $pV);
                    $subpartParams = [];
                    foreach ($subparts as $spV) {
                        // Padding makes a sub part without ":" yield a null value instead of an undefined offset.
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $tableName = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are handed to the query as raw SQL fragments;
                        // this presumes the parameter configuration is trusted - confirm.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        // Any field other than "uid" must be configured in the TCA columns of the table.
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only the "deleted" restriction stays active for the lookup.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                // Keep only records that do not point to a translation original.
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Collect the field values as array keys, which removes duplicates.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key (GET var name):
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
851
852
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every parameter multiplies the URL list by the number of its values: each value is appended
     * as "&name=value" (both parts raw-url-encoded) to every URL collected so far. A value that is an
     * empty string leaves the URL unchanged. The method recurses once per parameter.
     *
     * Compiling a level stops as soon as the list holds more entries than $this->maximumUrlsToCompile.
     * The limit is checked after a URL has been appended, so a level can hold one entry more than the limit.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The compiled URLs
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift(),
        // because array_shift() renumbers integer keys and would thereby change numeric GET var names.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Array keys can be integers; rawurlencode() only accepts strings under strict_types.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $value = (string) $val;
                $newUrls[] = $url . ($value !== '' ? '&' . $encodedVarName . '=' . rawurlencode($value) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave both loops, otherwise every remaining base URL would still add an entry.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
883
884
    /************************************
885
     *
886
     * Crawler log
887
     *
888
     ************************************/
889
890
    /**
     * Returns the records of the crawler queue that belong to the given page ID, newest "scheduled" first.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, any other value => all entries
     * @param bool $doFlush If TRUE, the queue is flushed through the queue repository before the entries are read
     * @param bool $doFullFlush Only evaluated together with $doFlush: if TRUE the flush is requested with "all" instead of $filter
     * @param int $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): the page ID is not passed to flushQueue() - confirm that flushing
            // independently of $id is intended here.
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
941
942
    /**
     * Returns the records of the crawler queue that belong to the given set ID, newest "scheduled" first.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, any other value => all entries
     * @param bool $doFlush If TRUE, the entries are flushed via flushQueue() instead of being selected; an empty array is returned
     * @param bool $doFullFlush Only evaluated together with $doFlush: if TRUE flushQueue() is called with an empty condition
     * @param int $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where(
                $selectQuery->expr()->eq('set_id', $selectQuery->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // AND-combined conditions for the flush; the filter condition (if any) is added first,
        // the set ID condition follows below when a flush is requested.
        $flushConditions = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
            $flushConditions->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
            $flushConditions->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushConditions->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int) $itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
994
995
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored under the key "_CALLBACKOBJ" inside the JSON encoded parameters.
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 (or any empty value) means "current time"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert(
            'tx_crawler_queue',
            [
                'page_id' => (int) $page_id,
                'parameters' => json_encode($callBackParameters),
                // The cast binds tighter than "?:", so a schedule of 0 falls back to the current time.
                'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
                'exec_time' => 0,
                'set_id' => (int) $setId,
                'result_data' => '',
            ]
        );
    }
1024
1025
    /************************************
1026
     *
1027
     * URL setting
1028
     *
1029
     ************************************/
1030
1031
    /**
     * Sets a URL for crawling.
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless an
     * equal entry already exists. A signal is emitted for an inserted as well as for a duplicate URL.
     *
     * The sub configuration keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are
     * optional; a missing key is treated like an empty value.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, FALSE if it was a duplicate or only registered internally
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1121
1122
    /**
     * Provides the current system time.
     *
     * Used by this class wherever a queue record needs a "now" timestamp
     * (the "scheduled" fallback and the "exec_time" lock).
     *
     * @return int Current Unix timestamp as reported by time()
     */
    public function getCurrentTime()
    {
        return time();
    }
1131
1132
    /************************************
1133
     *
1134
     * URL reading
1135
     *
1136
     ************************************/
1137
1138
    /**
     * Reads the URL of a single queue entry.
     *
     * The entry is locked by writing exec_time, executed through the queue executor, and the JSON
     * encoded result is written back to the entry. Signals are emitted before and after processing.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bit mask: self::CLI_STATUS_POLLABLE_PROCESSED is set when a configured "pollSuccess"
     *                  instruction reported success, otherwise 0; NULL when no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries are taken that have not been executed yet and are assigned to a process.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the content only when the executor delivered some. Without content there is
        // nothing to poll; $resultData stays null and is never indexed like an array.
        $resultData = null;
        if (($result['content'] ?? null) !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1235
1236
    /**
     * Reads the URL for a log entry that has not been inserted into the queue yet.
     *
     * The entry is inserted with exec_time set, executed through the queue executor, and finally
     * updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result of the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Insert the record already locked, i.e. with exec_time set:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly generated queue ID.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        // The fields are passed by reference within the signal arguments.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1273
1274
    /*****************************
1275
     *
1276
     * Compiling URLs to crawl - tools
1277
     *
1278
     *****************************/
1279
1280
    /**
     * Walks the page tree below the given page and collects the output of
     * drawURLs_addRowsForPage() for every page, including the pages mounted by mount points.
     *
     * The crawl settings are stored on the controller first; the duplicate tracking and the
     * download URL list are reset.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the crawl settings and start with empty tracking lists.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Set up the tree, restricted to the pages the backend user is allowed to see:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The start page itself becomes the first tree entry, if it is accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $output = '';

        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            // Recognize mount points.
            // NOTE(review): the comparison is strict - it only matches when "doktype" has the same
            // type as the constant; confirm the tree rows deliver it that way.
            if ($treeItem['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $pagesQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $pagesQuery->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPages = $pagesQuery
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $pagesQuery->expr()->eq('uid', $pagesQuery->createNamedParameter($treeItem['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $pagesQuery->resetRestrictions();

                // Fetch mounted pages; the MP value has the form "<mount_pid>-<uid of the mount point>".
                $mountPid = $mountPages[0]['mount_pid'];
                $this->MP = $mountPid . '-' . $treeItem['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountItem) {
                    $mountTitle = $mountItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountItem['row'], true);
                    $output .= $this->drawURLs_addRowsForPage($mountItem['row'], $mountTitle);
                }

                if ($mountPages[0]['mount_pid_ol']) {
                    // Replace page when mount_pid_ol is enabled
                    $treeItem['row']['uid'] = $mountPid;
                } else {
                    // If the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $output .= $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return $output;
    }
1382
1383
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The exclude string is a comma-separated list of entries of the form "<pid>" or "<pid>+<depth>":
     * - "<pid>" covers only that page (depth 0),
     * - "<pid>+" or "<pid>+0" covers the page and its subtree down to 99 levels,
     * - "<pid>+<depth>" covers the page and its subtree down to <depth> levels.
     *
     * Results are memoised in function-static caches, per exclude string and per pid/depth pair.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page uids (int); keys are not re-indexed
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // Callers may pass null when no exclude setting exists; normalise so it is a valid cache key.
        $excludeString = (string) $excludeString;

        // isset() rather than empty(): an empty result is a valid cached value and must not be recomputed.
        if (isset($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            // Empty entries (e.g. from a trailing comma) are dropped so they do not turn into page uid 0.
            $excludeParts = GeneralUtility::trimExplode(',', $excludeString, true);

            foreach ($excludeParts as $excludePart) {
                // Pad to two elements: an entry without "+" has no depth part.
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // getTree() expects integers, and the values are also used as cache keys.
                $pid = (int) $pid;
                $depth = (int) $depth;

                // default is "page only" = "depth=0"; a "+" without a usable depth means "whole subtree"
                if ($depth === 0) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    // isset() so that a page without subpages (empty tree) is fetched only once.
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
1434
1435
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is rendered per crawler configuration that applies to the page. The page title cell is
     * emitted once and spans all of those rows. When no configuration applies, a single "No entries" row is
     * returned, followed by the skip message reported by getUrlsForPageRow() if there is one.
     *
     * @param array $pageRow Page record; its "uid" is used for the exclude check
     * @param string $pageTitle Ready-to-print HTML for the title cell (inserted without escaping)
     * @return string HTML <tr> rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // The exclude list holds integers; database rows may deliver the uid as a string, which would never
        // match in a strict comparison.
        $pageUid = (int) $pageRow['uid'];
        $configurationCount = count($configurations);
        $isFirstRow = true;
        $content = '';

        // Traverse parameter combinations:
        foreach ($configurations as $confKey => $confArray) {
            $subCfg = $confArray['subCfg'] ?? [];

            // Title column: only on the first row, spanning all configuration rows
            $titleClm = $isFirstRow ? '<td rowspan="' . $configurationCount . '">' . $pageTitle . '</td>' : '';
            $isFirstRow = false;

            if (in_array($pageUid, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                // if empty the urls won't be filtered by processing instructions
                $this->incomingProcInstructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options (configuration values are escaped before they are printed into the table)
            $optionValues = '';
            if (! empty($subCfg['userGroups'])) {
                $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
            }
            if (! empty($subCfg['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
            }

            $parsedParameters = rawurldecode(trim(str_replace(
                '&',
                chr(10) . '&',
                GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])
            )));

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars($parsedParameters)) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
        }

        return $content;
    }
1546
1547
    /*****************************
1548
     *
1549
     * CLI functions
1550
     *
1551
     *****************************/
1552
1553
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun queue entries, assigns them to the
     * current process and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Microseconds to sleep after each processed entry
     * @param int $sleepAfterFinish Seconds to sleep once the batch is done
     * @return int Bitmask built from the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // Hooks first, then queue housekeeping
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // If the CLI was disabled while this batch was running, return right away
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                continue;
            }

            // Process is no longer active: possible timeout
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
            $status |= self::CLI_STATUS_ABORTED;
            break;
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1623
1624
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its crawler_init() method
     * with this controller as argument.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1636
1637
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted process records whose ttl lies in the past are treated as orphans and released at the
     * end. The remaining active records count against the "processLimit" extension setting. Only when that
     * limit is not reached is a new process record inserted for $id.
     *
     * @param string $id identification string for the process
     * @return bool true if a process record was inserted; false if the limit is reached or no system pid exists
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        // getmypid() returns false on error
        $systemProcessId = getmypid();
        if ($systemProcessId === false || $systemProcessId < 1) {
            return false;
        }

        $processLimit = (int) ($this->extensionSettings['processLimit'] ?? 0);
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Use the builder that belongs to the table which is actually queried.
        $queryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        $processCount = 0;
        $orphanProcesses = [];
        while ($row = $statement->fetch()) {
            if ((int) $row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $processCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($processCount + 1) . '/' . $processLimit . ')');

            $connectionPool->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) ($this->extensionSettings['processMaxRunTime'] ?? 0),
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . $processCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1698
1699
    /**
     * Release a process and the required resources
     *
     * Runs two housekeeping updates before the requested ids are released:
     * 1. queue entries that belong to a process with active = 0 are detached (process_scheduled = 0,
     *    process_id = ''),
     * 2. processes with active = 0 that are still referenced by a queue entry with exec_time = 0 get their
     *    system_process_id reset to 0.
     * The requested processes are then marked as not active and their queue entries are detached as well.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Each statement gets its own query builder, obtained for the table it updates, so that no
        // where/set/parameter state of one statement can leak into the other.
        $queueQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueQueryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1749
1750
    /**
     * Create a unique Id for the current process
     *
     * Delegates to ProcessService::createProcessId(), passing the controller's current $processID.
     *
     * @return string the ID
     * @deprecated
     */
    public function CLI_buildProcessId(): string
    {
        $processId = ProcessService::createProcessId($this->processID);

        return $processId;
    }
1760
1761
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting; a missing setting counts as disabled.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        // ?? 0: do not raise an undefined-index notice when the setting is absent
        if ((int) ($this->extensionSettings['processDebug'] ?? 0) === 0) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1773
1774
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400;
        $currentTimestamp = time();

        $processedThreshold = $currentTimestamp - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledThreshold = $currentTimestamp - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            sprintf('(exec_time<>0 AND exec_time<%s) OR scheduled<=%s', $processedThreshold, $scheduledThreshold)
        );
    }
1791
1792
    /**
     * Removes queue entries
     *
     * Before the rows are deleted, SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH is emitted once per distinct
     * set_id among the matching rows, with the affected qid/set_id pairs as payload ("subSet").
     *
     * @param string $where SQL related filter for the entries which should be removed; inserted into the
     *                      statements verbatim, so it must never contain untrusted input. Empty = all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? (string) $where : '1=1';

        // A fresh query builder per statement keeps select/from/where/parameter state from leaking between them.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // Bound as a parameter: a raw literal yields invalid SQL when set_id is NULL or empty.
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        $subSetQueryBuilder->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1837
1838
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that are neither executed (exec_time = 0) nor assigned to a process (process_id = '') and
     * that match the given page_id and parameters_hash are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Must contain the keys "page_id" and "parameters_hash"
     *
     * @return array List of qids of the duplicate queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $tstamp = (int) $tstamp;
        $currentTime = (int) $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            //if this entry is scheduled with "now"
            $scheduledUpToNow = $queryBuilder->expr()->lte(
                'scheduled',
                $queryBuilder->createNamedParameter($currentTime, \PDO::PARAM_INT)
            );

            if (! empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere($scheduledUpToNow);
            } else {
                $queryBuilder->where($scheduledUpToNow);
            }
        } else {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        $queryBuilder
            // Equivalent to the former "NOT exec_time" for an integer column, but portable SQL.
            ->andWhere($queryBuilder->expr()->eq('exec_time', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
            // The former "NOT process_id" coerced the string to a number, so a non-numeric process id was
            // treated like an empty one. Compare against the empty string explicitly.
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1898
1899
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1910
1911
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that forwards all arguments unchanged to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1926
1927 1
    /**
     * Returns $reg with the values at index 1 and index 2 exchanged if the value at index 1 is larger than the
     * value at index 2; otherwise $reg is returned unchanged.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1938
1939
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use and keeping it in
     * $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1951
1952
    /**
     * Get querybuilder for given table
     *
     * Obtained from the ConnectionPool via getQueryBuilderForTable().
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1961
}
1962