Passed
Push — cleanup/internal-deprecated-fu... ( aedfa9 )
by Tomas Norre
05:59
created

CrawlerController::expandParameters()   F

Complexity

Conditions 25
Paths 831

Size

Total Lines 129
Code Lines 74

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 57
CRAP Score 28.2868

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 25
eloc 74
c 1
b 0
f 0
nc 831
nop 2
dl 0
loc 129
ccs 57
cts 69
cp 0.8261
crap 28.2868
rs 0.2347

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Utility\SignalSlotUtility;
43
use AOE\Crawler\Value\QueueFilter;
44
use PDO;
45
use Psr\Http\Message\UriInterface;
46
use Psr\Log\LoggerAwareInterface;
47
use Psr\Log\LoggerAwareTrait;
48
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
49
use TYPO3\CMS\Backend\Utility\BackendUtility;
50
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
51
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
52
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
53
use TYPO3\CMS\Core\Core\Bootstrap;
54
use TYPO3\CMS\Core\Core\Environment;
55
use TYPO3\CMS\Core\Database\Connection;
56
use TYPO3\CMS\Core\Database\ConnectionPool;
57
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
58
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
59
use TYPO3\CMS\Core\Database\QueryGenerator;
60
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
61
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
62
use TYPO3\CMS\Core\Imaging\Icon;
63
use TYPO3\CMS\Core\Imaging\IconFactory;
64
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
65
use TYPO3\CMS\Core\Site\Entity\Site;
66
use TYPO3\CMS\Core\Type\Bitmask\Permission;
67
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
82
    use PublicPropertyDeprecationTrait;
83
84
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
85
86
    //queue not empty
87
    public const CLI_STATUS_REMAIN = 1;
88
89
    //(some) queue items were processed
90
    public const CLI_STATUS_PROCESSED = 2;
91
92
    //instance didn't finish
93
    public const CLI_STATUS_ABORTED = 4;
94
95
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
96
97
    /**
98
     * @var integer
99
     */
100
    public $setID = 0;
101
102
    /**
103
     * @var string
104
     */
105
    public $processID = '';
106
107
    /**
108
     * @var array
109
     */
110
    public $duplicateTrack = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $downloadUrls = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $incomingProcInstructions = [];
121
122
    /**
123
     * @var array
124
     */
125
    public $incomingConfigurationSelection = [];
126
127
    /**
128
     * @var bool
129
     */
130
    public $registerQueueEntriesInternallyOnly = false;
131
132
    /**
133
     * @var array
134
     */
135
    public $queueEntries = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $urlList = [];
141
142
    /**
143
     * @var array
144
     */
145
    public $extensionSettings = [];
146
147
    /**
148
     * Mount Point
149
     *
150
     * @var bool
151
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
152
     */
153
    public $MP = false;
154
155
    /**
156
     * @var string
157
     * @deprecated
158
     */
159
    protected $processFilename;
160
161
    /**
162
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
163
     *
164
     * @var string
165
     * @deprecated
166
     */
167
    protected $accessMode;
168
169
    /**
170
     * @var QueueRepository
171
     */
172
    protected $queueRepository;
173
174
    /**
175
     * @var ProcessRepository
176
     */
177
    protected $processRepository;
178
179
    /**
180
     * @var ConfigurationRepository
181
     */
182
    protected $configurationRepository;
183
184
    /**
185
     * @var string
186
     */
187
    protected $tableName = 'tx_crawler_queue';
188
189
    /**
190
     * @var QueueExecutor
191
     */
192
    protected $queueExecutor;
193
194
    /**
195
     * @var int
196
     */
197
    protected $maximumUrlsToCompile = 10000;
198
199
    /**
200
     * @var IconFactory
201
     */
202
    protected $iconFactory;
203
204
    /**
205
     * @var string[]
206
     */
207
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
208
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
209
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
211
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
212
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
213
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
214
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
215
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
216
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
217
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
218
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
219
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
220
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
221
    ];
222
223
    /**
     * Deprecation messages keyed by property name.
     *
     * Each message must name the property it is keyed by.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
230
231
    /**
232
     * @var BackendUserAuthentication|null
233
     */
234
    private $backendUser;
235
236
    /**
237
     * @var integer
238
     */
239
    private $scheduledTime = 0;
240
241
    /**
242
     * @var integer
243
     */
244
    private $reqMinute = 0;
245
246
    /**
247
     * @var bool
248
     */
249
    private $submitCrawlUrls = false;
250
251
    /**
252
     * @var bool
253
     */
254
    private $downloadCrawlUrls = false;
255
256
    /**
257
     * @var PageRepository
258
     */
259
    private $pageRepository;
260
261
    /**
262
     * @var Crawler
263
     */
264
    private $crawler;
265
266
    /************************************
267
     *
268
     * Getting URLs based on Page TSconfig
269
     *
270
     ************************************/
271
272 36
    /**
     * Instantiates the repositories and services used by the controller and
     * loads the extension configuration, applying defaults for the numeric
     * settings "countInARun", "processLimit" and "maxCompileUrls".
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The settings array may be empty or incomplete, so every
        // key is read with a fallback of 0, which the helpers below turn into
        // the respective default value.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 0, 1, 1000000000, 10000)
        );
    }
299
300 40
    /**
     * Overrides the stored limit for the number of URLs to compile.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
304
305
    /**
     * Returns the internal access mode (documented on the property as 'gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
315
316
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
324
325
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * Disabling writes the marker file at $this->processFilename; enabling
     * removes that file when it exists.
     *
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
337
338
    /**
     * Get disable status: true while the marker file at $this->processFilename exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
346
347
    /**
     * Replaces the path of the file used as the "disabled" marker.
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
355
356
    /**
     * Returns the path of the file used as the "disabled" marker.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
364
365
    /**
     * Replaces the extension settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; no defaults are applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
372
373
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order: hidden page (unless the "crawlHiddenPages"
     * setting is enabled), the fixed list of doktypes that are never crawled,
     * doktypes excluded via
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'], and
     * finally the "pageVeto" hooks.
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read.
     * @return false|string false if the page should be crawled (not excluded), otherwise a message naming the reason for skipping it
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        $doktype = $pageRow['doktype'] ?? '';

        // if page is hidden
        $crawlHiddenPages = $this->extensionSettings['crawlHiddenPages'] ?? false;
        if (! $crawlHiddenPages && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks registered with "[] = ..." have integer keys; the cast keeps
            // htmlspecialchars() from throwing a TypeError under strict_types.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
412
413
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * The result is empty when the uid of the row is not an integer or when
     * checkIfPageShouldBeSkipped() reports a reason to skip the page.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason, or an empty string when the page is processed.
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];
        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
440
441
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the "URLs" and "subCfg" entries are read.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The processing-instruction filter depends only on the configuration,
        // not on the individual URL, so it is evaluated once for all URLs.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; guarded so a request rate of 0 cannot
        // trigger a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is queued here while
                    // $schTime is only displayed - confirm whether that is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
525
526
    /**
     * Tells whether one of the incoming processing instructions is contained
     * in the given comma separated list. An empty set of incoming
     * instructions always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
546
547 5
    /**
     * Loads the page TSconfig for a page and passes it through the
     * "getPageTSconfigForId" hooks, which receive it by reference.
     *
     * When $this->MP is set, the TSconfig is loaded for the id found after
     * the first "-" of $this->MP instead of for $id.
     *
     * @param int $id Page uid
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: $this->MP is documented as bool but used as a "<x>-<pageId>" string here.
            // The casts keep explode() and getPagesTSconfig() from receiving wrong types;
            // a value without "-" falls back to page id 0.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
569
570
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. the "tx_crawler.crawlerCfg.paramSets" section of the page TSconfig,
     *  2. the configuration records returned by the configuration repository
     *     for the rootline of the page (these never overwrite an entry with
     *     the same key from step 1).
     * The result is finally handed by reference to the "processUrls" hooks.
     *
     * @param int $pageId Page uid
     * @return array Configurations keyed by configuration name; each entry holds
     *               "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin".
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $startUrl = '?id=' . $pageId;
            if ($this->MP) {
                $startUrl .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$startUrl]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            // The cast protects against a NULL column value, which would make a
            // string comparison via strcmp() throw under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse((string) ($configurationRecord['processing_instruction_parameters_ts'] ?? ''));

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array((string) ($configurationRecord['configuration'] ?? ''));
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
668
669
    /**
     * Collects the names of the crawler configurations found for a page branch.
     *
     * Two sources are combined: the paramSets declared in the page TSconfig of $rootid
     * and the tx_crawler_configuration records stored on pages of the rootline of
     * $rootid or of the page tree below it (the tree is read with the "show page"
     * permission clause of the current backend user).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth handed to the page tree when collecting subpages
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Names from page TSconfig: only entries holding a sub-array are parameter sets,
        // their key carries a trailing dot that is not part of the name.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $hasTrailingDot = substr($setKey, -1) === '.';
                $names[] = $hasTrailingDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page uids of the rootline ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... and of the subtree below the root page.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $initClause = '';
        if (! empty($permissionClause)) {
            $initClause = 'AND ' . $permissionClause;
        }
        $pageTree->init($initClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageUids[] = $treeEntry['row']['uid'];
        }

        // Names of the configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $onCollectedPages = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($onCollectedPages)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
712
713
    /**
     * Checks whether a list of user groups grants access to an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        $accessGranted = true;

        // Only a non-empty access list restricts access at all.
        if (! empty($accessList)) {
            $accessGranted = false;
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    // One matching group is enough.
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
734
735
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[options]" = Look up of values from table records, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Options are separated by ";" and written as "key:value":
     *             _PID:        page id to look up records for (default: the current page)
     *             _RECURSIVE:  depth handed to QueryGenerator::getTreeList() to include subpages of the look-up page (default: 0 = only the look-up page)
     *             _PIDFIELD:   name of the field the page ids are matched against (default: "pid")
     *             _FIELD:      field whose values are used (default: "uid"); must be "uid" or a column configured in TCA
     *             _WHERE:      additional condition, added to the query as given
     *             _ADDTABLE:   additional table, added to the FROM part of the query as given
     *             _ENABLELANG: if set, only records whose transOrigPointerField is <= 0 are used (original records without their language overlays)
     *         - Default: Literal value
     *
     * After every part the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called; they receive the parameter array by reference ("paramArray").
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Array with the same keys, each holding the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // A value that is not encapsulated in square brackets is literal: it becomes the only value.
            if (! (strpos($v, '[') === 0 && substr($v, -1) === ']')) {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                $trimmedPart = trim($pV);

                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos($trimmedPart, '_TABLE:') === 0) {
                    // Parse the "key:value" options. An option written without ":" is stored with a
                    // null value, so the isset() based defaults below still apply to it.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }

                    $table = (string) ($subpartParams['_TABLE'] ?? '');
                    $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                    $fieldIsKnown = $fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName]);

                    // Only query tables that exist in TCA, and only for "uid" or a configured column.
                    if (isset($GLOBALS['TCA'][$table]) && $fieldIsKnown) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE: $where and $addTable are taken from the configuration and handed to the query as given.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $queryBuilder = $this->getQueryBuilder($table);

                        if ($recursiveDepth > 0) {
                            /** @var QueryGenerator $queryGenerator */
                            $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                            $pidArray = GeneralUtility::intExplode(',', $pidList);
                        } else {
                            $pidArray = [(string) $lookUpPid];
                        }

                        // Deleted records are the only ones filtered out by default.
                        $queryBuilder->getRestrictions()
                            ->removeAll()
                            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                        $queryBuilder
                            ->select($fieldName)
                            ->from($table)
                            ->where(
                                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                $where
                            );

                        if (! empty($addTable)) {
                            // TODO: Check if this works as intended!
                            $queryBuilder->add('from', $addTable);
                        }

                        // Tables without translation handling have no transOrigPointerField in TCA.
                        $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
                        if (! empty($subpartParams['_ENABLELANG']) && ! empty($transOrigPointerField)) {
                            $queryBuilder->andWhere(
                                $queryBuilder->expr()->lte(
                                    $transOrigPointerField,
                                    0
                                )
                            );
                        }

                        $statement = $queryBuilder->execute();

                        // Index the rows by the selected value so every value is collected once.
                        $rows = [];
                        while ($row = $statement->fetch()) {
                            $rows[$row[$fieldName]] = $row;
                        }

                        $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make the collected values unique and sort the parameter array by parameter name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
882
883
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped per parameter at the value returned by getMaximumUrlsToCompile().
     *
     * Each call handles the first parameter of $paramArray and recurses with the remaining ones.
     * Values that are an empty string do not add anything to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // reset() makes sure key() reports the name of the very element array_shift() removes,
        // independent of where the internal array pointer was left by the caller.
        reset($paramArray);
        // Numeric parameter names are integer array keys; rawurlencode() needs a string under strict_types.
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        $maximumUrls = $this->getMaximumUrlsToCompile();

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached, nothing more can be added on this level.
                    break 2;
                }
                $value = (string) $val;
                $newUrls[] = $value === ''
                    ? $url
                    : $url . '&' . rawurlencode($varName) . '=' . rawurlencode($value);
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
910
911
    /************************************
912
     *
913
     * Crawler log
914
     *
915
     ************************************/
916
917
    /**
     * Return array of records from crawler queue for input page ID, ordered by scheduled time (descending)
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter "pending" selects entries with exec_time = 0, "finished" entries with exec_time > 0, anything else selects all entries
     * @param bool $doFlush If TRUE, the queue is flushed via the queue repository (using $queueFilter) before the entries are read
     * @param bool $doFullFlush Not used, kept for backwards compatibility
     * @param int $itemsPerPage Limit the amount of entries per page default is 10, a value <= 0 disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the known filter names.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): the flush only receives the filter, not $id - confirm that it is
            // not meant to be limited to the entries of this page.
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
965
966
    /**
     * Return array of records from crawler queue for input set ID, ordered by scheduled time (descending)
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush Only used together with $doFlush: if TRUE, flushQueue() is called with an empty condition instead of the filter/set condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10, a value <= 0 disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Composite AND expression collecting the condition that is handed to flushQueue().
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        // The filter restricts both the select query and the flush condition.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $flushCondition->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1018
1019
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // The call back reference travels inside the JSON encoded parameters.
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1048
1049
    /************************************
1050
     *
1051
     * URL setting
1052
     *
1053
     ************************************/
1054
1055
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless the
     * duplication check finds an equal entry. A signal is emitted for an added URL as well
     * as for a detected duplicate.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the sub configuration does not have to define user groups):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                    (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                    $tstamp,
                    $this->getCurrentTime(),
                    $fieldArray['page_id'],
                    $fieldArray['parameters_hash']
                );
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
                $connectionForCrawlerQueue->insert(
                    'tx_crawler_queue',
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
                $urlAdded = true;

                $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                    $signalPayload
                );
            } else {
                $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                    $signalPayload
                );
            }
        }

        return $urlAdded;
    }
1151
1152
    /**
     * Returns the current system time as reported by time()
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1161
1162
    /************************************
1163
     *
1164
     * URL reading
1165
     *
1166
     ************************************/
1167
1168
    /**
     * Read URL for single queue entry
     *
     * Loads the queue entry, sets its exec_time (which locks the record), executes it through
     * the queue executor and writes the JSON encoded result back to the entry. Signals are
     * emitted before the execution and before the result is stored.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Status bitmask (self::CLI_STATUS_POLLABLE_PROCESSED is set when a registered
     *                  pollSuccess instruction reported success); null if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are scheduled for a process and not yet executed are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the result content for the poll check below. Without content there is nothing
        // to inspect; an empty array keeps the array accesses below valid.
        $resultData = [];
        if (($result['content'] ?? null) !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1265
1266
    /**
     * Executes a queue entry that has not been stored yet.
     *
     * The entry is inserted with exec_time set (which locks the record), executed
     * through the queue executor, and finally updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array
     *
     * @return array|bool|mixed|string The raw result returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Setting exec_time marks the record as taken before it is written.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Writing result_data also denotes the end of the processing of this entry.
        $logFields = ['result_data' => json_encode($result)];

        // The log fields are handed over by reference so that slots may alter
        // what gets persisted below.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$logFields]
        );

        $queueConnection->update($this->tableName, $logFields, ['qid' => $queueId]);

        return $result;
    }
1302
1303
    /*****************************
1304
     *
1305
     * Compiling URLs to crawl - tools
1306
     *
1307
     *****************************/
1308
1309
    /**
     * Walks the page tree below $id and collects the rendered table rows for every page.
     *
     * The incoming settings are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Request settings consumed later while the rows are rendered.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Reset the per-run collectors.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry when it is accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $tree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // A depth of 0 means "only the root page", so the branch is skipped then.
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        $renderedRows = [];
        foreach ($tree->tree as $data) {
            $this->MP = false;

            if ($data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the page record is indexed with [0] as if getPage()
                // returned a list of rows - confirm against the repository's return shape.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);
                $mountPid = $mountpage[0]['mount_pid'];

                // MP is "<mounted pid>-<mount point uid>" while the mounted pages are rendered.
                $this->MP = $mountPid . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountData) {
                    $renderedRows[] = $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                if ($mountpage[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mount point is replaced by the mounted page.
                    $data['row']['uid'] = $mountPid;
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself.
                    $this->MP = false;
                }
            }

            $renderedRows[] = $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return implode('', $renderedRows);
    }
1401
1402
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The string is a comma separated list of entries of the form "pid",
     * "pid+" or "pid+depth":
     *  - "pid"       excludes only that page (depth 0)
     *  - "pid+"      excludes the page and its subtree down to depth 99
     *  - "pid+depth" excludes the page and its subtree down to the given depth
     *
     * Results are memoised per exclude string, and fetched subtrees per
     * pid/depth, in static caches that live for the whole request.
     *
     * @param string $excludeString Exclude string
     * @return int[] Unique page uids (array keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;

        // isset() instead of empty(): an empty result is a valid cache hit too.
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                $segments = GeneralUtility::trimExplode('+', $excludePart);
                $pid = (int) $segments[0];
                // An entry without "+" has no second segment; reading it through ??
                // avoids the undefined offset that plain destructuring would raise.
                $rawDepth = $segments[1] ?? '';

                if (empty($rawDepth)) {
                    // default is "page only" = "depth=0"; a "+" without a depth means "whole subtree"
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                } else {
                    $depth = (int) $rawDepth;
                }

                $pidList[] = $pid;

                if ($depth <= 0) {
                    continue;
                }

                if (! isset($treeCache[$pid][$depth])) {
                    $tree->reset();
                    $tree->getTree($pid, $depth);
                    $treeCache[$pid][$depth] = $tree->tree;
                }

                foreach ($treeCache[$pid][$depth] as $data) {
                    $pidList[] = (int) $data['row']['uid'];
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1453
1454
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per crawler configuration that applies to the page.
     * The page title cell is emitted only on the first row and spans all of them.
     * A configuration whose exclude list contains the page gets a "No entries" row.
     * If no configuration applies at all, a single "No entries" row is returned,
     * with the skip message (when one was reported) appended.
     *
     * @param array $pageRow Page record; "uid" is read here
     * @param string $pageTitle Ready-to-output HTML for the title cell (not escaped again here)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // expandExcludeString() returns integers and the lookup below is strict, so
        // the uid has to be an integer as well; database rows may carry it as string.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $isFirstRow = true;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Optional sub configuration keys are read defensively to avoid undefined index errors.
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column:
                if ($isFirstRow) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageUid, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        // The number of URL combinations is the product of all value counts.
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $isFirstRow = false;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1557
1558
    /*****************************
1559
     *
1560
     * CLI functions
1561
     *
1562
     *****************************/
1563
1564
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Claims up to $countInARun queue entries for this process and reads them one
     * by one. The returned integer is a bitmask built from the CLI_STATUS_* constants.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Pause after each entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the run, passed to sleep()
     * @return int Bitmask of CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;

        // First, run hooks:
        $hookReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookReferences as $hookReference) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookReference);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($rows)) {
            // Nothing to crawl: no flag is set and no pause is taken.
            return $result;
        }

        $queueIds = array_column($rows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // The number of assigned queue entries is stored to determine later how many have been processed.
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Not every fetched entry could be assigned to this process: give up.
        if ($assignedCount !== count($queueIds)) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        $processedCount = 0;
        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler got disabled in the meantime, return right away;
            // the "processed" flag is intentionally not set in that case.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        if ($processedCount > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1637
1638
    /**
     * Instantiates every class registered under the crawler "cli_hooks"
     * configuration and calls its crawler_init() with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookReferences as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1651
1652
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * A process is registered only while fewer non-expired active processes exist
     * than the "processLimit" extension setting allows. Active processes whose ttl
     * has passed are released afterwards, whether or not the new process was registered.
     *
     * @param string $id identification string for the process
     * @return bool true when the process was registered, false otherwise
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        // Split the active processes into running ones and expired ("orphan") ones.
        $runningCount = 0;
        $orphanProcessIds = [];
        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcessIds[] = $process->getProcessId();
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1703
1704
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids, two clean-up statements are run first:
     * queue entries owned by inactive processes are detached, and inactive
     * processes that still own unexecuted queue entries get their
     * system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        // The same builder instance is reused for both clean-up statements below,
        // so the order of the calls on it matters.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($processIds)) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach all queue entries that belong to processes which are not active.
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // Drop the SET assignments of the first statement before the builder is reused.
        $queryBuilder->resetQueryPart('set');

        // Reset the system process id of inactive processes that still own unexecuted queue entries.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1754
1755
    /**
     * Returns the id of the current process, creating it on first use.
     *
     * The id is a short MD5 of the current microtime and is kept on the
     * controller, so repeated calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1767
1768
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting; any non-zero integer enables it.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        if ((int) $this->extensionSettings['processDebug'] === 0) {
            return;
        }

        echo $msg, "\n";
        flush();
    }
1782
1783
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1800
1801
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, one SIGNAL_QUEUE_ENTRY_FLUSH signal is
     * emitted per distinct set_id, carrying the matching qid/set_id rows.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used verbatim, an empty value matches every entry
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // One builder instance serves all statements of this method.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $groups = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSet = $queryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $queryBuilder->expr()->eq('set_id', $group['set_id'])
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1846
1847
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that have neither exec_time nor process_id set and that match
     * the given page_id and parameters_hash are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue fields; "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicates
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        } elseif ($this->extensionSettings['enableTimeslot']) {
            // entry is scheduled with "now": match a window of 100 seconds around
            // the current time, or anything scheduled up to now
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // entry is scheduled with "now": anything scheduled up to now counts
            $queryBuilder->where(
                $queryBuilder->expr()->lte('scheduled', $currentTime)
            );
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($row = $statement->fetch()) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1908
1909
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = array_flip(['paramExpanded', 'URLs']);
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1920
1921
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that forwards all arguments unchanged to a new UrlService.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1937
1938 1
    /**
     * Makes sure that element 1 is not larger than element 2 by exchanging the two if needed.
     *
     * @param array $reg Array whose entries at index 1 and 2 are compared
     * @return array The array with the entries at index 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1949
1950 6
    /**
     * Returns the value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $limit = $this->maximumUrlsToCompile;

        return $limit;
    }
1954
1955
    /**
     * Returns the backend user, taken from $GLOBALS['BE_USER'] on first use
     * and kept on the controller afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1967
1968
    /**
     * Creates a query builder for the given table through the connection pool.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1977
}
1978