Passed
Push — deprecate/cli_hooks ( 08657b...73b282 )
by Tomas Norre
05:41 queued 02:17
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 68
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 29
CRAP Score 8.2037

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 39
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 68
ccs 29
cts 34
cp 0.8529
crap 8.2037
rs 8.0515

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use PDO;
44
use Psr\Http\Message\UriInterface;
45
use Psr\Log\LoggerAwareInterface;
46
use Psr\Log\LoggerAwareTrait;
47
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
48
use TYPO3\CMS\Backend\Utility\BackendUtility;
49
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
50
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
51
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
52
use TYPO3\CMS\Core\Core\Bootstrap;
53
use TYPO3\CMS\Core\Core\Environment;
54
use TYPO3\CMS\Core\Database\Connection;
55
use TYPO3\CMS\Core\Database\ConnectionPool;
56
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Database\QueryGenerator;
59
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
60
use TYPO3\CMS\Core\Imaging\Icon;
61
use TYPO3\CMS\Core\Imaging\IconFactory;
62
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
63
use TYPO3\CMS\Core\Site\Entity\Site;
64
use TYPO3\CMS\Core\Type\Bitmask\Permission;
65
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
66
use TYPO3\CMS\Core\Utility\DebugUtility;
67
use TYPO3\CMS\Core\Utility\GeneralUtility;
68
use TYPO3\CMS\Core\Utility\MathUtility;
69
use TYPO3\CMS\Extbase\Object\ObjectManager;
70
use TYPO3\CMS\Frontend\Page\PageRepository;
71
72
/**
73
 * Class CrawlerController
74
 *
75
 * @package AOE\Crawler\Controller
76
 */
77
class CrawlerController implements LoggerAwareInterface
78
{
79
    use LoggerAwareTrait;
80
    use PublicMethodDeprecationTrait;
81
    use PublicPropertyDeprecationTrait;
82
83
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
84
85
    //queue not empty
86
    public const CLI_STATUS_REMAIN = 1;
87
88
    // (some) queue items were processed
89
    public const CLI_STATUS_PROCESSED = 2;
90
91
    //instance didn't finish
92
    public const CLI_STATUS_ABORTED = 4;
93
94
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
95
96
    /**
97
     * @var integer
98
     */
99
    public $setID = 0;
100
101
    /**
102
     * @var string
103
     */
104
    public $processID = '';
105
106
    /**
107
     * @var array
108
     */
109
    public $duplicateTrack = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $downloadUrls = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingProcInstructions = [];
120
121
    /**
122
     * @var array
123
     */
124
    public $incomingConfigurationSelection = [];
125
126
    /**
127
     * @var bool
128
     */
129
    public $registerQueueEntriesInternallyOnly = false;
130
131
    /**
132
     * @var array
133
     */
134
    public $queueEntries = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $urlList = [];
140
141
    /**
142
     * @var array
143
     */
144
    public $extensionSettings = [];
145
146
    /**
147
     * Mount Point
148
     *
149
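     * @var bool|string false while no mount point is set; otherwise a string that is split on "-" and appended as the "MP" URL parameter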
150
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
151
     */
152
    public $MP = false;
153
154
    /**
155
     * @var string
156
     * @deprecated
157
     */
158
    protected $processFilename;
159
160
    /**
161
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
162
     *
163
     * @var string
164
     * @deprecated
165
     */
166
    protected $accessMode;
167
168
    /**
169
     * @var QueueRepository
170
     */
171
    protected $queueRepository;
172
173
    /**
174
     * @var ProcessRepository
175
     */
176
    protected $processRepository;
177
178
    /**
179
     * @var ConfigurationRepository
180
     */
181
    protected $configurationRepository;
182
183
    /**
184
     * @var string
185
     */
186
    protected $tableName = 'tx_crawler_queue';
187
188
    /**
189
     * @var QueueExecutor
190
     */
191
    protected $queueExecutor;
192
193
    /**
194
     * @var int
195
     */
196
    protected $maximumUrlsToCompile = 10000;
197
198
    /**
199
     * @var IconFactory
200
     */
201
    protected $iconFactory;
202
203
    /**
     * Deprecation messages, keyed by the name of the method they belong to.
     *
     * NOTE(review): presumably consumed by PublicMethodDeprecationTrait (used by this class) to
     * emit the message when such a method is called from outside — confirm against TYPO3 core.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be removed in v11.x',
        'getLogEntriesForSetId' => 'Using CrawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be removed in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
    ];
221
222
    /**
     * Deprecation messages, keyed by the name of the property they belong to.
     *
     * NOTE(review): presumably consumed by PublicPropertyDeprecationTrait (used by this class) to
     * emit the message when such a property is accessed from outside — confirm against TYPO3 core.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message has to name the property it is registered for (it used to say "accessMode").
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
229
230
    /**
231
     * @var BackendUserAuthentication|null
232
     */
233
    private $backendUser;
234
235
    /**
236
     * @var integer
237
     */
238
    private $scheduledTime = 0;
239
240
    /**
241
     * @var integer
242
     */
243
    private $reqMinute = 0;
244
245
    /**
246
     * @var bool
247
     */
248
    private $submitCrawlUrls = false;
249
250
    /**
251
     * @var bool
252
     */
253
    private $downloadCrawlUrls = false;
254
255
    /**
256
     * @var PageRepository
257
     */
258
    private $pageRepository;
259
260
    /**
261
     * @var Crawler
262
     */
263
    private $crawler;
264
265
    /************************************
266
     *
267
     * Getting URLs based on Page TSconfig
268
     *
269
     ************************************/
270
271 36
    /**
     * Wires up the collaborators of the controller and normalises the extension settings.
     *
     * The repositories are fetched through the Extbase ObjectManager, the remaining services
     * through GeneralUtility::makeInstance(). Afterwards the extension configuration is loaded
     * and "countInARun", "processLimit" and "maxCompileUrls" are forced into usable values.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Every key is read with a fallback so that an incomplete configuration
        // does not raise "undefined index" notices; a missing key behaves like the value 0.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
298
299
    /**
     * Returns the stored access mode.
     *
     * The property documentation names 'gui', 'cli' and 'cli_im' as possible modes. The property
     * has no default value, so the result is null until a mode has been assigned.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentMode = $this->accessMode;

        return $currentMode;
    }
309
310
    /**
     * Stores the access mode; the value is kept as given, without validation.
     *
     * @param string $accessMode Mode to remember (the property documentation names 'gui', 'cli' and 'cli_im')
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        // Plain assignment, no normalisation of the incoming value.
        $this->accessMode = $accessMode;
    }
318
319
    /**
     * Switches the disabled state, which is represented by the existence of the process file.
     *
     * Disabling writes an empty file to the configured process filename; enabling removes that
     * file if it is present.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        $markerFile = $this->processFilename;

        if (! $disabled) {
            // Enabling: drop the marker file, but only when there is one.
            if (is_file($markerFile)) {
                unlink($markerFile);
            }

            return;
        }

        // Disabling: an empty file is enough, only its existence is checked.
        GeneralUtility::writeFile($markerFile, '');
    }
335
336
    /**
     * Reports the disabled state by checking whether the process file exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
346
347
    /**
     * Overrides the path of the process file used by setDisabled() and getDisabled().
     *
     * @param string $filenameWithPath Path including the file name; stored as given
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        // Plain assignment, the path is not checked for existence or writability.
        $this->processFilename = $filenameWithPath;
    }
355
356
    /**
     * Returns the path of the process file.
     *
     * The constructor initialises it to "<var path>/lock/tx_crawler.proc".
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
364
365
    /**
     * Replaces the extension settings wholesale.
     *
     * Unlike the constructor, this applies no defaults and no range checks to the given values.
     *
     * @param array $extensionSettings Complete set of settings to use from now on
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        // The previous settings are discarded, not merged.
        $this->extensionSettings = $extensionSettings;
    }
372
373
    /**
     * Decides whether a page has to be left out of the crawl.
     *
     * The checks run in this order and the first match wins:
     *  1. hidden pages, unless the extension setting "crawlHiddenPages" is enabled
     *  2. the doktypes 3, 4, 199, 254 and 255
     *  3. doktype lists registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     *  4. veto hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * @param array $pageRow Page record; the columns "hidden" and "doktype" are evaluated
     * @return false|string false if the page should be crawled, otherwise the reason why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden; empty() keeps the truthiness semantics but tolerates missing keys
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList((string) $doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks may be registered with numeric keys; with strict_types enabled
            // htmlspecialchars() rejects an integer, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
412
413
    /**
     * Returns the crawler configurations that apply to a page record.
     *
     * Thin wrapper around getUrlsForPageId(): the result is a list of configurations, not of
     * URLs. Nothing is returned for a page whose uid is not an integer or which
     * checkIfPageShouldBeSkipped() rejects.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the reason why the page was skipped, or '' if it was not
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];

        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';

            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        // Fetch first, reset the message afterwards: if the lookup throws, the caller's
        // message stays untouched.
        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
440
441
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL is turned into a unique key (URL, user groups, processing instruction filter).
     * A key that is already present in $duplicateTrack is only listed (muted), every other URL
     * is listed with its calculated schedule time and, depending on the flags, submitted to the
     * queue or collected in $downloadUrls.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL already seen, so that no URL is handled twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        // Optional configuration values; a missing key counts as an empty string.
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter depends on neither the URL nor the loop state, so it is evaluated once.
        // A rejected configuration yields an empty list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; guarded so that a rate of 0 cannot divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs seen so far and cut down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue receives $scheduledTime, not the interleaved
                    // $schTime that is displayed — confirm that this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
525
526
    /**
     * Tells whether at least one incoming processing instruction is part of the given list.
     *
     * An empty set of incoming instructions means "no restriction" and therefore always passes.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (count($incomingProcInstructions) === 0) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (! GeneralUtility::inList($piString, $instruction)) {
                continue;
            }

            // The first hit is enough.
            return true;
        }

        return false;
    }
546
547 5
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * Without a mount point ($this->MP is falsy) the TSconfig of the given page is used. With a
     * mount point, $this->MP is split on "-" and the TSconfig of the second part is used instead.
     * Functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] receive the page
     * id and a reference to the TSconfig array.
     *
     * @param int $id Page uid (the caller visible in this file passes an int)
     * @return array Page TSconfig, possibly modified by the hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is documented as bool but used as a string here. The cast keeps
            // explode() from failing under strict_types, and a value without "-" falls
            // back to id 0 instead of raising an undefined offset notice.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration; "?? []" matches the other hook loops of this
        // class and avoids a notice when no hook is registered.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // passed by reference so that hooks can modify the configuration in place
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
569
570
    /**
571
     * This methods returns an array of configurations.
572
     * Adds no urls!
573
     */
574 4
    public function getUrlsForPageId(int $pageId): array
575
    {
576
        // Get page TSconfig for page ID
577 4
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
578
579 4
        $res = [];
580
581
        // Fetch Crawler Configuration from pageTSconfig
582 4
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
583 4
        foreach ($crawlerCfg as $key => $values) {
584 3
            if (! is_array($values)) {
585 3
                continue;
586
            }
587 3
            $key = str_replace('.', '', $key);
588
            // Sub configuration for a single configuration string:
589 3
            $subCfg = (array) $crawlerCfg[$key . '.'];
590 3
            $subCfg['key'] = $key;
591
592 3
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
593 3
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
594
            }
595 3
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
596
597
            // process configuration if it is not page-specific or if the specific page is the current page:
598
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
599 3
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
600
601
                // Explode, process etc.:
602 3
                $res[$key] = [];
603 3
                $res[$key]['subCfg'] = $subCfg;
604 3
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
605 3
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
606 3
                $res[$key]['origin'] = 'pagets';
607
608
                // recognize MP value
609 3
                if (! $this->MP) {
610 3
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
611
                } else {
612
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

612
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
613
                }
614
            }
615
        }
616
617
        // Get configuration from tx_crawler_configuration records up the rootline
618 4
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
619 4
        foreach ($crawlerConfigurations as $configurationRecord) {
620
621
            // check access to the configuration record
622 1
            if (empty($configurationRecord['begroups']) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
623 1
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
624
625
                // process configuration if it is not page-specific or if the specific page is the current page:
626
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
627 1
                if (! strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
628 1
                    $key = $configurationRecord['name'];
629
630
                    // don't overwrite previously defined paramSets
631 1
                    if (! isset($res[$key])) {
632
633
                        /* @var $TSparserObject TypoScriptParser */
634 1
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
635 1
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
636
637
                        $subCfg = [
638 1
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
639 1
                            'procInstrParams.' => $TSparserObject->setup,
640 1
                            'baseUrl' => $configurationRecord['base_url'],
641 1
                            'force_ssl' => (int) $configurationRecord['force_ssl'],
642 1
                            'userGroups' => $configurationRecord['fegroups'],
643 1
                            'exclude' => $configurationRecord['exclude'],
644 1
                            'key' => $key,
645
                        ];
646
647 1
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
648 1
                            $res[$key] = [];
649 1
                            $res[$key]['subCfg'] = $subCfg;
650 1
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
651 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
652 1
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
653 1
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
654
                        }
655
                    }
656
                }
657
            }
658
        }
659
660 4
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
661
            $params = [
662
                'res' => &$res,
663
            ];
664
            GeneralUtility::callUserFunction($func, $params, $this);
665
        }
666 4
        return $res;
667
    }
668
669
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Names are gathered from two sources:
     * - the array-valued "paramSets." entries of the page TSconfig of $rootid
     * - tx_crawler_configuration records stored on a page of the rootline of
     *   $rootid or of the page tree below $rootid
     *
     * The result is not de-duplicated.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth handed to the page tree traversal below $rootid
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Page TSconfig: only array-valued entries count; a trailing dot of the key is stripped.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $hasTrailingDot = substr($setKey, -1) === '.';
                $names[] = $hasTrailingDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the page ids of the tree below the root page, limited to pages the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
712
713
    /**
     * Tells whether a user may access an item that is restricted to certain groups.
     * (The group list of the currently logged in user can e.g. be taken from $GLOBALS['TSFE']->gr_list.)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs the user belongs to
     * @param string $accessList Comma-separated list of (fe_)group UIDs that may access the item
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is part of it
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without any group restriction is accessible.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        $isMember = false;
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // One matching group is enough.
                $isMember = true;
                break;
            }
        }

        return $isMember;
    }
734
735
    /**
     * Expands the parameter configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After each configuration part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called; they receive $paramArray by reference and may add their own values.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Array with key (GET var name) and, for each, the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not encapsulated in square brackets: the literal value is the only value.
            if (! (strpos($v, '[') === 0 && substr($v, -1) === ']')) {
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the configuration inside the brackets and collect the expanded values as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values - limited to 1000 values per range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // NULL means "no lookup was possible"; then the collected values stay untouched.
                    $lookedUpValues = $this->expandParametersFromTableLookup($pV, $pid);
                    if ($lookedUpValues !== null) {
                        $paramArray[$p] = array_merge($paramArray[$p], $lookedUpValues);
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder.
                // The hook chain is optional, so it must not be indexed unconditionally.
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by parameter name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }

    /**
     * Resolves one "_TABLE:..." configuration part of expandParameters() to the field values found in the database.
     *
     * Supported sub parts (separated by ";", each as "KEY:value"): _TABLE, _PID, _RECURSIVE, _PIDFIELD,
     * _WHERE, _ADDTABLE, _FIELD and _ENABLELANG. Deleted records are filtered out, all other
     * restrictions are removed.
     *
     * @param string $configurationPart Configuration part, e.g. "_TABLE:tt_content; _PID:123"
     * @param int $pid Page ID used when the part has no _PID
     * @return array|null Unique field values; NULL if the table is not in TCA or _FIELD is neither "uid" nor a TCA column
     */
    private function expandParametersFromTableLookup(string $configurationPart, $pid): ?array
    {
        // Parse "KEY:value" sub parts; a sub part without ":" yields a NULL value.
        $subpartParams = [];
        foreach (GeneralUtility::trimExplode(';', $configurationPart) as $subpart) {
            $keyAndValue = GeneralUtility::trimExplode(':', $subpart);
            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
        }

        // Table exists?
        $table = $subpartParams['_TABLE'] ?? '';
        if (! isset($GLOBALS['TCA'][$table])) {
            return null;
        }

        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
        $where = $subpartParams['_WHERE'] ?? '';
        $addTable = $subpartParams['_ADDTABLE'] ?? '';
        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

        // Only "uid" or a field configured in TCA may be selected.
        if ($fieldName !== 'uid' && empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
            return null;
        }

        $queryBuilder = $this->getQueryBuilder($table);

        if ($recursiveDepth > 0) {
            /** @var QueryGenerator $queryGenerator */
            $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
            $pidArray = GeneralUtility::intExplode(',', $pidList);
        } else {
            $pidArray = [(string) $lookUpPid];
        }

        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $queryBuilder
            ->select($fieldName)
            ->from($table)
            ->where(
                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY))
            );

        // NOTE(review): _WHERE is raw SQL taken from the crawler configuration - it must only come from trusted editors.
        // It is only added when set, so that no empty predicate ends up in the WHERE clause.
        if (! empty($where)) {
            $queryBuilder->andWhere($where);
        }

        if (! empty($addTable)) {
            // TODO: Check if this works as intended!
            $queryBuilder->add('from', $addTable);
        }

        // Tables without translation support have no transOrigPointerField.
        $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
        if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
            $queryBuilder->andWhere(
                $queryBuilder->expr()->lte(
                    $transOrigPointerField,
                    0
                )
            );
        }

        $statement = $queryBuilder->execute();

        // Index by field value to get each value only once.
        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[$row[$fieldName]] = $row;
        }

        return array_keys($rows);
    }
882
883
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * Every URL in $urls is combined with every value of every parameter, so the number of URLs
     * is the multiplication of the number of parameter values for each key. For each parameter
     * the list is cut off as soon as it holds $this->maximumUrlsToCompile URLs.
     * Empty values do not add a GET parameter to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs the parameters are appended to
     * @return array The compiled URLs; $urls unchanged if $paramArray is empty
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Iterating (instead of array_shift() + recursion) keeps numeric GET var names intact:
        // array_shift() renumbers the integer keys of the remaining array.
        foreach ($paramArray as $varName => $valueSet) {
            $newUrls = [];
            foreach ($urls as $url) {
                foreach ($valueSet as $val) {
                    // Leave both loops: breaking only the inner one would let every
                    // remaining URL add yet another entry beyond the limit.
                    if (count($newUrls) >= $this->maximumUrlsToCompile) {
                        break 2;
                    }

                    $value = (string) $val;
                    // Array keys may be integers, rawurlencode() needs a string under strict_types.
                    $newUrls[] = $value === ''
                        ? $url
                        : $url . '&' . rawurlencode((string) $varName) . '=' . rawurlencode($value);
                }
            }
            $urls = $newUrls;
        }

        return $urls;
    }
914
915
    /************************************
916
     *
917
     * Crawler log
918
     *
919
     ************************************/
920
921
    /**
     * Return array of records from crawler queue for input page ID, ordered by "scheduled" descending
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => no restriction
     * @param bool $doFlush If TRUE, the queue repository is asked to flush the queue for $queueFilter before the entries are read
     * @param bool $doFullFlush Not used; only kept so that the signature stays backwards compatible
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; 0 or less means no limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // switch compares loosely, which is what lets the QueueFilter object match the plain filter names
        // (presumably through its string representation - verify against QueueFilter before changing this).
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
969
970
    /**
     * Return array of records from crawler queue for input set ID, ordered by "scheduled" descending
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the entries are handed to flushQueue() instead of being selected; an empty array is returned then
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition instead of the filter / set id condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; 0 or less means no limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($this->tableName);
        $setIdConstraint = $queryBuilder->expr()->eq(
            'set_id',
            $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT)
        );
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($setIdConstraint)
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-combined conditions for flushQueue(); kept in sync with the restrictions of the select query.
        $flushConditions = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConditions->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConditions->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushWhere = $flushConditions->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushWhere);

            // Nothing is selected when flushing.
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1022
1023
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        // The call back reference travels inside the stored parameters.
        $parameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $queueEntry);
    }
1052
1053
    /************************************
1054
     *
1055
     * URL setting
1056
     *
1057
     ************************************/
1058
1059
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless the
     * queue repository reports an equal entry. A signal is emitted for both an added and a duplicate URL.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, FALSE if it was only registered internally or is a duplicate
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $duplicates = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (! empty($duplicates)) {
            $signalPayload = ['rows' => $duplicates, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1155
1156
    /**
     * Provides the current system time as returned by time()
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1165
1166
    /************************************
1167
     *
1168
     * URL reading
1169
     *
1170
     ************************************/
1171
1172
    /**
     * Read URL for single queue entry
     *
     * The entry is locked by setting its exec_time, executed through the queue executor and
     * the JSON encoded result is stored in its result_data. Signals are emitted before the
     * execution and before the result is stored.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bit mask, self::CLI_STATUS_POLLABLE_PROCESSED is set if a registered "pollSuccess" instruction ran successfully; NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Without content there is nothing to evaluate; $resultData stays NULL then instead of
        // becoming an error string that would be accessed like an array further down.
        $resultData = null;
        if (is_array($result) && isset($result['content'])) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        // Both the hook registration and the converted result are optional / of unknown shape, so read them defensively.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1269
1270
    /**
     * Execute a queue entry that has not been stored in the queue table yet.
     *
     * The entry is inserted with exec_time set to the current time, handed to the
     * queue executor, and the JSON-encoded result is written back to the inserted
     * record once the post-process signal has been emitted.
     *
     * @param array $field_array Queue field array
     *
     * @return array|bool|mixed|string Result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // exec_time is set on insert to lock the record
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Writing the result to the log also denotes the end of the processing of this entry.
        // The array is handed to the signal by reference so slots can adjust what gets stored.
        $field_array = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1306
1307
    /*****************************
1308
     *
1309
     * Compiling URLs to crawl - tools
1310
     *
1311
     *****************************/
1312
1313
    /**
     * Render the table rows for a page tree together with the URLs of each page.
     *
     * The given crawl settings are stored on the controller, the duplicate tracking
     * and download URL lists are reset, and the rows produced by
     * drawURLs_addRowsForPage() for every page of the tree are concatenated.
     * For mount point pages the mounted tree is rendered as well.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the crawl settings for drawURLs_addRowsForPage()
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Set up the page tree, restricted to pages the backend user may see
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page becomes the first tree entry when it is accessible
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Add the branch beneath the root page
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Walk the tree and collect the rows of every page
        $rows = '';

        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            if ($treeItem['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // Mount point: render the mounted pages first
                $mountPointPage = $this->pageRepository->getPage($treeItem['row']['uid']);
                $mountPid = $mountPointPage[0]['mount_pid'];

                $this->MP = $mountPid . '-' . $treeItem['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountPid, $depth);

                foreach ($mountedTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML']
                        . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $rows .= $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if (! $mountPointPage[0]['mount_pid_ol']) {
                    // without mount_pid_ol the MP must not be used for the mount point page itself
                    $this->MP = false;
                } else {
                    // with mount_pid_ol the mounted page replaces the mount point page
                    $treeItem['row']['uid'] = $mountPid;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $rows .= $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return $rows;
    }
1405
1406
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries of the form "pid" or
     * "pid+depth". A bare "pid" covers only that page (depth 0). When a "+" is
     * present but the depth is empty, depth 99 is used. For depths greater than 0
     * the page tree beneath the pid is read and all its page uids are added.
     *
     * Results are memoised in static caches: per exclude string for the final
     * list and per pid/depth for the trees that were read.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids as integers
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $excludeString = (string) $excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry too
        if (isset($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                // Pad to two elements so that an entry without "+" does not read an undefined offset
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');
                $pid = (int) $pid;

                // default is "page only" = "depth=0"; a "+" without a depth means the whole sub tree
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }
                $depth = (int) $depth;

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
1457
1458
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per configuration returned by getUrlsForPageRow()
     * (limited to the incoming configuration selection, if any). The page title
     * cell spans all rows of the page. When the page is listed in the "exclude"
     * setting of a configuration, or when there is no configuration at all, a
     * "No entries" row is rendered instead.
     *
     * @param array $pageRow Page record; its "uid" is checked against the exclude list
     * @param string $pageTitle HTML for the title cell, inserted as given
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // expandExcludeString() returns integers, so the uid has to be an integer
        // as well for the strict in_array() check below to match.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                $subCfg = $confArray['subCfg'] ?? [];
                // numeric configuration keys arrive as integers; htmlspecialchars() needs a string
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column: only the first row carries the page title
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                if (! in_array($pageUid, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // Parsed parameters, one "&name=value" pair per line
                    $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? []);
                    $parsedParameters = rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedParameters)));

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars($parsedParameters)) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1569
1570
    /*****************************
1571
     *
1572
     * CLI functions
1573
     *
1574
     *****************************/
1575
1576
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the deprecated cli_hooks, cleans up the queue, fetches the records to
     * be crawled and assigns them to the current process id before reading them
     * one by one with readUrl().
     *
     * @param int $countInARun Passed to the queue repository when fetching the records to crawl
     * @param int $sleepTime Argument for usleep() after each processed entry
     * @param int $sleepAfterFinish Argument for sleep() once the loop over the entries has ended
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookClassNames as $hookClassName) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            // nothing was processed, so no status flag is added
            return $result;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $result | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $result |= $this->readUrl($queueRow['qid']);
            $counter++;

            // Just to relax the system
            usleep((int) $sleepTime);

            // if the cli has been disabled since the start we return right away
            // and do NOT mark the process as ended.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep((int) $sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $counter . ' (' . $this->CLI_buildProcessId() . ')');

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1655
1656
    /**
     * Activate hooks
     *
     * Instantiates every class registered under the crawler "cli_hooks"
     * configuration and calls its crawler_init() method with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1669
1670
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl is below the current time count as orphans and
     * are released at the end; the remaining ones are counted against the
     * configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true when a process record was added, false when the system
     *              process id is unavailable or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired (orphan) and still running ones
        $runningCount = 0;
        $orphanProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcessIds[] = $process->getProcessId();
                continue;
            }

            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < (int) $this->extensionSettings['processLimit'];

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . (int) $this->extensionSettings['processLimit'] . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningCount) . '/' . (int) $this->extensionSettings['processLimit'] . ')');
        }

        // Clean-up runs regardless of whether a process was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1724
1725
    /**
     * Release a process and the required resources
     *
     * Before the requested processes are marked as not active, two clean-up
     * statements are executed for processes that are already inactive
     * (active = 0): their queue entries lose the process assignment, and the
     * system_process_id is reset for those that still own queue entries with
     * exec_time = 0. Because this runs before the requested ids are deactivated,
     * a process only gets this clean-up on a later call ("2nd chance").
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            // nothing to release, so no query builder is needed either
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Each statement gets its own builder (both for the queue table's connection),
        // so no query parts or bound parameters leak from the first into the second.

        // unset the process assignment of queue entries that belong to inactive processes
        $connectionPool->getQueryBuilderForTable($this->tableName)
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // reset the system process id of inactive processes that still own unprocessed queue entries
        // FIXME: Not entirely sure that this is equivalent to the previous version
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1775
1776
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and then kept
     * in $this->processID, so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1788
1789
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is taken from the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1803
1804
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1821
1822
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, the flush signal is emitted once per
     * distinct set_id with the matching qid/set_id rows as "subSet" payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value matches all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($condition);
        $setRows = $queryBuilder->execute()->fetchAll();

        if (is_array($setRows)) {
            foreach ($setRows as $setRow) {
                $queryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $condition,
                        $queryBuilder->expr()->eq('set_id', $setRow['set_id'])
                    );

                $payLoad = ['subSet' => $queryBuilder->execute()->fetchAll()];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    $payLoad
                );
            }
        }

        $queryBuilder->delete($this->tableName);
        $queryBuilder->where($condition);
        $queryBuilder->execute();
    }
1867
1868
    /**
     * Determines duplicates of a queue entry with the same page id and parameters hash.
     *
     * Only entries without exec_time and without process_id are considered.
     * For a timestamp that is not in the future, entries scheduled up to the
     * current time match (with "enableTimeslot" also those scheduled within
     * 100 seconds around it). For a timestamp in the future the entry must have
     * exactly the same scheduled time.
     *
     * @param int $tstamp
     * @param array $fieldArray Read keys: "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        } else {
            // this entry is scheduled with "now"
            $scheduledUntilNow = $queryBuilder->expr()->lte('scheduled', $currentTime);

            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere($scheduledUntilNow);
            } else {
                $queryBuilder->where($scheduledUntilNow);
            }
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $duplicateQueueIds = [];
        foreach ($queryBuilder->execute()->fetchAll() as $row) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1929
1930
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @param array $configuration Configuration array to hash
     * @return string md5 hash of the serialized remaining configuration
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1941
1942
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * This method is only a pass-through: it creates a fresh UrlService and forwards all four
     * arguments, unchanged and in the same order, to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1958
1959 1
    /**
     * Ensures that the values stored under keys 1 and 2 are in ascending order.
     *
     * When $reg[1] compares greater than $reg[2] the two entries are exchanged;
     * otherwise the array is returned as it was passed in. All other keys are
     * left untouched.
     *
     * @param array $reg Array holding the two boundary values at index 1 and 2
     *
     * @return array The array with index 1 <= index 2
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both boundaries in one step.
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1970
1971
    /**
     * Returns the backend user held by this controller.
     *
     * Bootstrap::initializeBackendAuthentication() is invoked on every call.
     * If no backend user has been stored on the instance yet, the one found in
     * $GLOBALS['BE_USER'] is stored and returned; otherwise the stored one is returned.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1983
1984
    /**
     * Creates a query builder for the given table.
     *
     * The ConnectionPool is obtained through GeneralUtility::makeInstance() and
     * asked for the query builder belonging to $table.
     *
     * @param string $table Name of the database table
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1993
}
1994