Passed
Push — refactor/add-CrawlStrategyFact... ( 6331a4 )
by Tomas Norre
07:44
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 66
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 33
CRAP Score 8.1458

Importance

Changes 0
Metric Value
cc 8
eloc 38
c 0
b 0
f 0
nc 8
nop 9
dl 0
loc 66
ccs 33
cts 38
cp 0.8684
crap 8.1458
rs 8.0675

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into its own, well-named method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

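There are several approaches to avoid long parameter lists, for example replacing a parameter with a method call or grouping related parameters into a parameter object.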

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
34
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
35
use AOE\Crawler\Domain\Repository\ProcessRepository;
36
use AOE\Crawler\Domain\Repository\QueueRepository;
37
use AOE\Crawler\QueueExecutor;
38
use AOE\Crawler\Utility\SignalSlotUtility;
39
use Psr\Http\Message\UriInterface;
40
use Psr\Log\LoggerAwareInterface;
41
use Psr\Log\LoggerAwareTrait;
42
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
43
use TYPO3\CMS\Backend\Utility\BackendUtility;
44
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
45
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
46
use TYPO3\CMS\Core\Core\Bootstrap;
47
use TYPO3\CMS\Core\Core\Environment;
48
use TYPO3\CMS\Core\Database\Connection;
49
use TYPO3\CMS\Core\Database\ConnectionPool;
50
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
51
use TYPO3\CMS\Core\Http\Uri;
52
use TYPO3\CMS\Core\Imaging\Icon;
53
use TYPO3\CMS\Core\Imaging\IconFactory;
54
use TYPO3\CMS\Core\Routing\SiteMatcher;
55
use TYPO3\CMS\Core\Site\Entity\Site;
56
use TYPO3\CMS\Core\Type\Bitmask\Permission;
57
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
58
use TYPO3\CMS\Core\Utility\DebugUtility;
59
use TYPO3\CMS\Core\Utility\GeneralUtility;
60
use TYPO3\CMS\Core\Utility\MathUtility;
61
use TYPO3\CMS\Extbase\Object\ObjectManager;
62
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
63
use TYPO3\CMS\Frontend\Page\PageRepository;
64
65
/**
66
 * Class CrawlerController
67
 *
68
 * @package AOE\Crawler\Controller
69
 */
70
class CrawlerController implements LoggerAwareInterface
71
{
72
    use LoggerAwareTrait;
73
    use PublicMethodDeprecationTrait;
74
75
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
76
77
    public const CLI_STATUS_REMAIN = 1; //queue not empty
78
79
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
80
81
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
82
83
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
84
85
    /**
86
     * @var integer
87
     */
88
    public $setID = 0;
89
90
    /**
91
     * @var string
92
     */
93
    public $processID = '';
94
95
    /**
96
     * @var array
97
     */
98
    public $duplicateTrack = [];
99
100
    /**
101
     * @var array
102
     */
103
    public $downloadUrls = [];
104
105
    /**
106
     * @var array
107
     */
108
    public $incomingProcInstructions = [];
109
110
    /**
111
     * @var array
112
     */
113
    public $incomingConfigurationSelection = [];
114
115
    /**
116
     * @var bool
117
     */
118
    public $registerQueueEntriesInternallyOnly = false;
119
120
    /**
121
     * @var array
122
     */
123
    public $queueEntries = [];
124
125
    /**
126
     * @var array
127
     */
128
    public $urlList = [];
129
130
    /**
131
     * @var array
132
     */
133
    public $extensionSettings = [];
134
135
    /**
136
     * Mount Point
137
     *
138
     * @var bool
139
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
140
     */
141
    public $MP = false;
142
143
    /**
144
     * @var string
145
     */
146
    protected $processFilename;
147
148
    /**
149
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
150
     *
151
     * @var string
152
     */
153
    protected $accessMode;
154
155
    /**
156
     * @var QueueRepository
157
     */
158
    protected $queueRepository;
159
160
    /**
161
     * @var ProcessRepository
162
     */
163
    protected $processRepository;
164
165
    /**
166
     * @var ConfigurationRepository
167
     */
168
    protected $configurationRepository;
169
170
    /**
171
     * @var string
172
     */
173
    protected $tableName = 'tx_crawler_queue';
174
175
    /**
176
     * @var QueueExecutor
177
     */
178
    protected $queueExecutor;
179
180
    /**
181
     * @var int
182
     */
183
    protected $maximumUrlsToCompile = 10000;
184
185
    /**
186
     * @var IconFactory
187
     */
188
    protected $iconFactory;
189
190
    /**
191
     * @var string[]
192
     */
193
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
194
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
195
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
196
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
197
    ];
198
199
    /**
200
     * @var BackendUserAuthentication|null
201
     */
202
    private $backendUser;
203
204
    /**
205
     * @var integer
206
     */
207
    private $scheduledTime = 0;
208
209
    /**
210
     * @var integer
211
     */
212
    private $reqMinute = 0;
213
214
    /**
215
     * @var bool
216
     */
217
    private $submitCrawlUrls = false;
218
219
    /**
220
     * @var bool
221
     */
222
    private $downloadCrawlUrls = false;
223
224
    /************************************
225
     *
226
     * Getting URLs based on Page TSconfig
227
     *
228
     ************************************/
229
230 44
    /**
     * Creates the repositories, the queue executor and the icon factory,
     * sets the default process lock file and loads the extension settings.
     *
     * Missing or invalid settings are replaced by defaults:
     * "countInARun" => 100, "processLimit" => 1..99 (default 1),
     * maximum URLs to compile => 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategy = $objectManager->get(CrawlStrategyFactory::class)->create();
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategy);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The null-coalescing fallbacks keep a partially filled
        // configuration from raising "undefined index" notices.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 10000,
            1,
            1000000000,
            10000
        );
    }
255
256
    /**
     * Returns the current access mode.
     *
     * @return string One of 'gui', 'cli' or 'cli_im'
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
265
266
    /**
     * Stores the access mode.
     *
     * @param string $accessMode One of 'gui', 'cli' or 'cli_im'
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
273
274
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the file at $this->processFilename: it is written (empty)
     * to disable processing and removed to enable it again.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker when it actually exists.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
289
290
    /**
     * Tells whether processing is currently disabled.
     *
     * @return bool true if the marker file at $this->processFilename exists
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
299
300
    /**
     * Overrides the path of the file used as "disabled" marker.
     *
     * @param string $filenameWithPath File name including its path
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
307
308
    /**
     * Returns the path of the file used as "disabled" marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
315
316
    /**
     * Replaces the extension settings as a whole
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The defaults applied in the constructor are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
323
324
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and stop at the first match:
     *  1. hidden page, unless the "crawlHiddenPages" setting is enabled
     *  2. built-in list of doktypes that are never crawled
     *  3. doktypes registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     *  4. veto hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the message telling why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (! ($this->extensionSettings['crawlHiddenPages'] ?? false) && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? '';

        // Doktypes that are never crawled
        // (presumably link, shortcut, spacer, sysfolder and recycler - verify against TYPO3 core).
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one return a veto
            if (is_string($veto)) {
                return $veto;
            }

            // The cast is required: with strict_types an integer hook key would
            // make htmlspecialchars() throw a TypeError.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
382
383
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: set to the skip reason, or to '' when the page is not skipped
     * @return array Configurations for the page, empty if the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses
        // strict_types, so a uid delivered as string must be cast explicitly.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
405
406
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the keys "URLs" and "subCfg" are read.
     * @param array $pageRow Page row; only "uid" is read
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        // Not every configuration source sets these keys, so default them
        // instead of raising "undefined index" notices.
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The processing instruction filter does not depend on the single URL,
        // so it is evaluated once instead of once per URL.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two consecutive requests. A rate of zero (or less)
        // disables the interleave instead of dividing by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry gets $scheduledTime, not the interleaved
                    // $schTime shown in the list - kept as is, confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
488
489
    /**
     * Tells whether a configuration passes the processing instruction filter.
     *
     * An empty list of incoming instructions accepts everything; otherwise at
     * least one incoming instruction has to be an item of the given list.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
509
510 5
    /**
     * Returns the page TSconfig for a page and lets the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] alter it.
     *
     * When $this->MP is set, the TSconfig of the page id found after the "-"
     * in $this->MP is used instead of the one of $id.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly altered by the hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: $this->MP is documented as bool but handled as a "<id>-<id>" string here.
            // Cast it so explode() does not throw under strict_types, and fall back to 0
            // when there is no second part instead of reading an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig((int) ($mountPointParts[1] ?? 0));
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // passed by reference so the hooks can modify the result
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
532
533
    /**
     * This methods returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources, page TSconfig first:
     *  - "tx_crawler.crawlerCfg.paramSets." in the page TSconfig (origin "pagets")
     *  - tx_crawler_configuration records up the rootline
     *    (origin "tx_crawler_configuration_<uid>"); a record never overwrites an
     *    already collected configuration with the same key
     * Finally the "processUrls" hooks may alter the result.
     *
     * @param int $pageId Page uid
     * @return array Configurations indexed by their key; each holds "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin"
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional, default it instead of reading an undefined index.
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // A parameter set may consist of the sub configuration only.
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            if (! $this->MP) {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            } else {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast first: with strict_types strcmp() would throw on a null column value.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Skip the record when the current page is part of its exclude list.
            if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
            }
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                // passed by reference so the hooks can modify the result
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
631
632
    /**
     * Collects the names of all crawler configurations available for a page branch.
     *
     * The names come from the "paramSets." keys of the page TSconfig of $rootid
     * and from the tx_crawler_configuration records stored on the pages of the
     * rootline of $rootid or on its subpages down to $depth levels.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of page tree levels to descend
     * @return array Configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Names of the parameter sets defined in page TSconfig; only sub arrays count.
        $paramSets = $this->getPageTSconfigForId($rootid)['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                // strip the trailing dot of the TypoScript key, if any
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... followed by the ids of the subpages visible to the backend user.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
675
676
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of group UIDs from a user
     * @param string $accessList Comma-separated list of group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        if (! empty($accessList)) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    return true;
                }
            }

            return false;
        }

        // An item without access restriction is accessible for everybody.
        return true;
    }
697
698
    /**
     * Expands the parameter configuration to individual values, following a small syntax for each parameter value.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the rules below, otherwise it is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their results are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between (both ends included, low to high,
     *           max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up
     *           of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further optional sub-parts read by this method: _RECURSIVE (tree depth below the PID),
     *           _PIDFIELD (column used instead of "pid"), _FIELD (column to collect instead of "uid"),
     *           _WHERE (additional where clause) and _ADDTABLE (additional FROM part).
     *         - Default: Literal value
     * - After each part the "expandParameters" hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php'] are called.
     *
     * Optional sub-parts and optional TCA/hook entries are read defensively, so a missing entry
     * behaves like an empty one without raising undefined-index notices.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array with every value replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets describe ranges/lookups, everything else is literal.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and collect the expanded values for this parameter.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range (e.g. 1-34 or -40--30, read as minus 40 to minus 30)
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values; the brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse "key:value" sub-parts separated by ";".
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // A sub-part without ":" gets a NULL value instead of triggering an undefined offset.
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are passed to the query as raw SQL fragments;
                        // presumably they only come from trusted crawler configuration — confirm.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only deleted records are filtered out; all other default restrictions are dropped.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Restrict to original-language records when requested and the table is translatable.
                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Keying by the field value de-duplicates the values found for this part.
                            $foundValues = [];
                            while ($row = $statement->fetch()) {
                                $foundValues[$row[$fieldName]] = true;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($foundValues));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
843
844
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter, then the method
     * recurses with the remaining parameters. Without a limit the number of URLs would be the
     * product of the number of values of all parameters; compilation of one parameter level
     * stops completely as soon as the number of URLs exceeds $this->maximumUrlsToCompile.
     * Empty values do not add a query part to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter (name and value set) off the stack.
        reset($paramArray);
        // Array keys that look like integers are stored as int; rawurlencode() needs a string under strict_types.
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . (strcmp((string) $val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode((string) $val) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave both loops: a plain "break" would let every remaining base URL add one more entry.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
875
876
    /************************************
877
     *
878
     * Crawler log
879
     *
880
     ************************************/
881
882
    /**
     * Returns records from the crawler queue for the given page ID, newest scheduled first.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => all entries
     * @param bool $doFlush If TRUE, the queue is flushed through the queue repository before the entries are read.
     *                      Note that only the filter (not the page ID) is handed to flushQueue().
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with "all" instead of $filter
     * @param int $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            if ($doFullFlush) {
                $this->queueRepository->flushQueue('all');
            } else {
                $this->queueRepository->flushQueue($filter);
            }
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
933
934
    /**
     * Returns records from the crawler queue for the given set ID, newest scheduled first.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => all entries
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition
     * @param int $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $logQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $logQuery
            ->select('*')
            ->from($this->tableName)
            ->where(
                $logQuery->expr()->eq('set_id', $logQuery->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time in an AND-composite,
        // because the flush below takes a where-expression rather than the query builder.
        $expressions = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressions->andX();

        if ($filter === 'pending') {
            $logQuery->andWhere($logQuery->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressions->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $logQuery->andWhere($logQuery->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressions->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushWhere = $flushConstraints->add($expressions->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $logQuery->setMaxResults($itemsPerPage);
        }

        return $logQuery->execute()->fetchAll();
    }
986
987
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored inside the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to the call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach the entry to
     * @param int $schedule Time at which to activate; 0 means "now" (current time)
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1016
1017
    /************************************
1018
     *
1019
     * URL setting
1020
     *
1021
     ************************************/
1022
1023
    /**
     * Registers a URL for crawling.
     *
     * Builds the queue record for the URL and either keeps it in $this->queueEntries
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless an equal entry is reported by getDuplicateRowsIfExist(). A signal is emitted for both
     * the "added" and the "duplicate" case.
     *
     * The sub configuration keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are
     * optional; a missing key is treated like an empty value.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // Check if there is already an equal entry (unless the caller opted out).
        $rows = $skipInnerDuplicationCheck ? [] : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);

        if (! empty($rows)) {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1113
1114
    /**
     * Returns the current system time as reported by time().
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1123
1124
    /************************************
1125
     *
1126
     * URL reading
1127
     *
1128
     ************************************/
1129
1130
    /**
     * Reads the URL for a single queue entry.
     *
     * The entry is locked by setting its exec_time, executed through the queue executor and
     * finally updated with the JSON encoded result. Signals are emitted before and after processing.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bit mask with self::CLI_STATUS_POLLABLE_PROCESSED set when a registered pollable
     *                  processing instruction reported success, otherwise 0. Nothing (NULL) is returned
     *                  when no matching queue record was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that were not executed yet and are assigned to a process are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // $resultData is only inspected when it is an array: in the error case above it is a plain string,
        // and indexing a string with non-numeric offsets raises warnings or errors depending on the PHP version.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        // Atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1227
1228
    /**
     * Reads the URL for a log entry that has not been inserted yet.
     *
     * The entry is inserted (already locked through exec_time), executed by the queue executor
     * and then updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array
     *
     * @return array|bool|string The value returned by the queue executor's executeQueueItem()
     *                           (static analysis reports array|boolean here — TODO confirm and narrow)
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly generated queue id.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1265
1266
    /*****************************
1267
     *
1268
     * Compiling URLs to crawl - tools
1269
     *
1270
     *****************************/
1271
1272
    /**
     * Walks the page tree below a root page and collects the output of drawURLs_addRowsForPage() for every page.
     *
     * The given settings are stored on the controller before the tree is traversed. For mount point
     * pages the mounted tree is traversed as well and $this->MP is set to "<mount_pid>-<uid>" while doing so.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The cast is needed because, depending on the database driver,
            // the doktype may arrive as a string, which would never be identical to the constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // Without a page record there is nothing mounted to traverse.
                $mountRow = $mountpage[0] ?? null;
                if (is_array($mountRow)) {
                    // Fetch mounted pages
                    $this->MP = $mountRow['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountRow['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    if ($mountRow['mount_pid_ol']) {
                        // Replace page when mount_pid_ol is enabled
                        $data['row']['uid'] = $mountRow['mount_pid'];
                    } else {
                        // If the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1374
1375
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" entries:
     * - "<pid>" contributes only that page,
     * - "<pid>+" (or an empty/zero depth after the "+") is expanded with depth 99,
     * - "<pid>+<depth>" additionally contributes the uids of the tree loaded
     *   below <pid> with the given depth.
     *
     * Results are memoised per exclude string, and loaded trees per pid/depth,
     * in function-static caches.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page uids as integers
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache;
        static $treeCache;

        $excludeString = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // Pad to two elements so an entry without "+" does not read a missing depth index.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // Work with integers from here on: they are used as cache keys and handed to getTree().
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    // default is "page only" = "depth=0"; a "+" without a usable depth means depth 99
                    if ($depth === 0) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1426
1427
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per configuration of the page. If the page has no
     * configuration (after applying the incoming configuration selection) a single
     * "No entries" row is produced instead.
     *
     * @param array $pageRow Page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle Markup for the title cell; it is inserted into the row as-is
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference here — confirm against getUrlsForPageRow()
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // expandExcludeString() returns integer uids, so the strict in_array() below
        // only matches when the page uid is an integer as well.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Integer-like array keys are stored as int; htmlspecialchars() needs a string under strict_types.
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column: only the first row carries the title cell, spanning all configuration rows
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageUid, $this->expandExcludeString($confArray['subCfg']['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (configuration values are escaped before being put into the markup)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $confArray['subCfg']['userGroups']) . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $confArray['subCfg']['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1537
1538
    /*****************************
1539
     *
1540
     * CLI functions
1541
     *
1542
     *****************************/
1543
1544
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, fetches a batch of queue records,
     * assigns them to the current process id and reads them one by one.
     *
     * @param int $countInARun Handed to the queue repository when fetching the records to crawl
     * @param int $sleepTime Microseconds to pause (usleep) after each processed entry
     * @param int $sleepAfterFinish Seconds to pause (sleep) once the batch loop has ended
     * @return int Bitmask combining the readUrl() results with the CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // Hooks first, then queue clean-up, then select the batch to work on.
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Keep the number of assigned queue entries to determine later how many have been processed.
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Not every selected entry could be assigned to this process: give up on the batch.
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);

            ++$processedCount;
            usleep($sleepTime); // Just to relax the system

            // If the CLI got disabled since the start, return right away
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break; // possible timeout
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1612
1613
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * crawler_init() on it with this controller as argument.
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1625
1626
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted process rows whose ttl is lower than the current time are
     * collected as orphans and released at the end; all other rows count towards the
     * configured "processLimit". A new process row is only inserted while that limit
     * is not reached.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process row was inserted, false if the limit is reached
     *                 or no system process id is available
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        $runningCount = 0;
        $expiredProcessIds = [];
        while ($processRow = $activeProcesses->fetch()) {
            if ($processRow['ttl'] < $currentTime) {
                $expiredProcessIds[] = $processRow['process_id'];
                continue;
            }

            ++$runningCount;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            // fewer active processes than allowed: register a new one
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        // Clean-up runs regardless of whether a process could be acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1687
1688
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Accept a single id as well as a list of ids.
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($processIds)) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Step 1: detach queue entries that still reference a process which is no longer active.
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused below, so the SET part of step 1 is dropped first.
        $queryBuilder->resetQueryPart('set');

        // Step 2: reset the system process id of inactive processes that are still
        // referenced by queue entries with exec_time = 0.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: hand the requested ids to the repositories.
        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1737
1738
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime and then reused on
     * every further call.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1750
1751
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is read from the "processDebug" extension setting.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1763
1764
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries that are over 1.5 days in age
     * - scheduled entries that are over 7 days old
     *
     * The actual ages are taken (in days) from the "cleanUpProcessedAge" and
     * "cleanUpScheduledAge" extension settings.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        // Entries older than these timestamps are flushed.
        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $condition = '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore;

        $this->flushQueue($condition);
    }
1780
1781
    /**
     * Removes queue entries
     *
     * Before deleting, the matching entries are grouped by set_id and a
     * SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once per group with the
     * qid/set_id rows of that group as payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Each statement gets its own builder so no query parts or parameters leak between them.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // Bind the set_id instead of concatenating it into the SQL string.
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1826
1827
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Must provide the keys "page_id" and "parameters_hash"
     *
     * @return array qid values of the matching queue entries (empty if there is no duplicate)
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // The parameter is untyped; normalise it once so it can be compared and bound as an integer.
        $tstamp = (int) $tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        } elseif ($this->extensionSettings['enableTimeslot']) {
            // entry is scheduled with "now": accept a window of +/- 100 around the current time as well
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // entry is scheduled with "now"
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        }

        // NOTE(review): process_id is reset to '' elsewhere in this class, so 'NOT process_id'
        // relies on the database coercing a string to a boolean — confirm this matches the intent.
        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1887
1888
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before hashing and
     * therefore do not influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1899
1900
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" query argument is dropped, "L" is passed on as "_language" and the URI is generated
     * by the site router; host, scheme, port and user info are then taken over from $alternativeBaseUrl if given.
     * Without a Site, an "index.php" URL with an appended cHash is built on top of $alternativeBaseUrl or the
     * TYPO3 site URL.
     *
     * @param int $pageId Page to build the URL for
     * @param string $queryString Query string, may start with "?" or "&"
     * @param string|null $alternativeBaseUrl Optional base URL overriding the generated one
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https)
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // The returned language is not needed; the lookup itself is kept.
                // NOTE(review): presumably this rejects unknown language ids by throwing — confirm against the Site API
                $site->getLanguageById((int) $queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                $alternativeBaseUri = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUri->getHost());
                $url = $url->withScheme($alternativeBaseUri->getScheme());
                $url = $url->withPort($alternativeBaseUri->getPort());
                $userInfo = $alternativeBaseUri->getUserInfo();
                if ($userInfo) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        // Any other value of $httpsOrHttp leaves the scheme untouched.
        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1952
1953 1
    /**
     * Returns $reg with the values at index 1 and 2 exchanged when the value at
     * index 1 is larger than the one at index 2; otherwise $reg is returned unchanged.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        // Nothing to do unless the first value is larger than the second.
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1964
1965
    /**
     * Returns the backend user, taken from $GLOBALS['BE_USER'] on first use and
     * kept in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1977
1978
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1987
}
1988