Passed
Push — tests/adding-tests ( 71f9ed...f80e1c )
by Tomas Norre
08:38
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 68
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 29
CRAP Score 8.2037

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 39
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 68
ccs 29
cts 34
cp 0.8529
crap 8.2037
rs 8.0515

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: moving a coherent group of statements into a separate, well-named method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

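There are several approaches to avoid long parameter lists, such as introducing a parameter object, passing a whole object instead of several of its values, or replacing a parameter with a method call.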

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
69
use TYPO3\CMS\Core\Utility\DebugUtility;
70
use TYPO3\CMS\Core\Utility\GeneralUtility;
71
use TYPO3\CMS\Core\Utility\MathUtility;
72
use TYPO3\CMS\Extbase\Object\ObjectManager;
73
74
/**
75
 * Class CrawlerController
76
 *
77
 * @package AOE\Crawler\Controller
78
 */
79
class CrawlerController implements LoggerAwareInterface
80
{
81
    use LoggerAwareTrait;
82
    use PublicMethodDeprecationTrait;
83
    use PublicPropertyDeprecationTrait;
84
85
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
86
87
    //queue not empty
88
    public const CLI_STATUS_REMAIN = 1;
89
90
    //(some) queue items were processed
91
    public const CLI_STATUS_PROCESSED = 2;
92
93
    //instance didn't finish
94
    public const CLI_STATUS_ABORTED = 4;
95
96
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
97
98
    /**
99
     * @var integer
100
     */
101
    public $setID = 0;
102
103
    /**
104
     * @var string
105
     */
106
    public $processID = '';
107
108
    /**
109
     * @var array
110
     */
111
    public $duplicateTrack = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $downloadUrls = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingProcInstructions = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $incomingConfigurationSelection = [];
127
128
    /**
129
     * @var bool
130
     */
131
    public $registerQueueEntriesInternallyOnly = false;
132
133
    /**
134
     * @var array
135
     */
136
    public $queueEntries = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $urlList = [];
142
143
    /**
144
     * @var array
145
     */
146
    public $extensionSettings = [];
147
148
    /**
149
     * Mount Point
150
     *
151
     * @var bool
152
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
153
     */
154
    public $MP = false;
155
156
    /**
157
     * @var string
158
     * @deprecated
159
     */
160
    protected $processFilename;
161
162
    /**
163
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
164
     *
165
     * @var string
166
     * @deprecated
167
     */
168
    protected $accessMode;
169
170
    /**
171
     * @var QueueRepository
172
     */
173
    protected $queueRepository;
174
175
    /**
176
     * @var ProcessRepository
177
     */
178
    protected $processRepository;
179
180
    /**
181
     * @var ConfigurationRepository
182
     */
183
    protected $configurationRepository;
184
185
    /**
186
     * @var string
187
     */
188
    protected $tableName = 'tx_crawler_queue';
189
190
    /**
191
     * @var QueueExecutor
192
     */
193
    protected $queueExecutor;
194
195
    /**
196
     * @var int
197
     */
198
    protected $maximumUrlsToCompile = 10000;
199
200
    /**
201
     * @var IconFactory
202
     */
203
    protected $iconFactory;
204
205
    /**
     * Deprecation messages for formerly-public methods, keyed by method name.
     *
     * This property is not read anywhere in the visible part of the class;
     * presumably it is consumed by PublicMethodDeprecationTrait (which the
     * class uses) — TODO confirm. That would explain why static analysis
     * reports it as unused.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // The message previously named getLogEntriesForPageId() (copy-paste error).
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be remove in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
    ];
224
225
    /**
     * Deprecation messages for formerly-public properties, keyed by property name.
     *
     * Presumably consumed by PublicPropertyDeprecationTrait (which the class
     * uses) — TODO confirm.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named ->accessMode (copy-paste error).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
232
233
    /**
234
     * @var BackendUserAuthentication|null
235
     */
236
    private $backendUser;
237
238
    /**
239
     * @var integer
240
     */
241
    private $scheduledTime = 0;
242
243
    /**
244
     * @var integer
245
     */
246
    private $reqMinute = 0;
247
248
    /**
249
     * @var bool
250
     */
251
    private $submitCrawlUrls = false;
252
253
    /**
254
     * @var bool
255
     */
256
    private $downloadCrawlUrls = false;
257
258
    /**
259
     * @var PageRepository
260
     */
261
    private $pageRepository;
262
263
    /**
264
     * @var Crawler
265
     */
266
    private $crawler;
267
268
    /************************************
269
     *
270
     * Getting URLs based on Page TSconfig
271
     *
272
     ************************************/
273
274 41
    /**
     * Instantiates the repositories and services used by this controller and
     * loads the extension configuration, applying defaults for missing or
     * out-of-range values:
     *  - countInARun falls back to 100 when it is not a positive integer,
     *  - processLimit is forced into 1..99 (default 1),
     *  - maxCompileUrls is forced into 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The `??` fallbacks keep a configuration that lacks a key
        // from raising an "undefined index" notice; the resulting values are the
        // same as the defaults applied below.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 10000, 1, 1000000000, 10000)
        );
    }
301
302 45
    /**
     * Overrides the stored upper bound for the number of URLs to compile.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
306
307
    /**
     * Returns the stored access mode (documented on the property as
     * 'gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
317
318
    /**
     * Stores the access mode; the value is not validated here.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
326
327
    /**
     * Toggles the "disabled" marker file at $this->processFilename.
     *
     * A truthy $disabled writes the file with the content 'disabled'; a falsy
     * value (including null) removes the file when it exists.
     *
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
339
340
    /**
     * Reports whether the "disabled" marker file ($this->processFilename) exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
348
349
    /**
     * Overrides the path of the marker file used by setDisabled()/getDisabled().
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
357
358
    /**
     * Returns the path of the marker file used by setDisabled()/getDisabled().
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
366
367
    /**
     * Replaces the extension settings wholesale (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
374
375
    /**
     * Check if the given page should be skipped by the crawler.
     *
     * Checks, in order: hidden pages (unless the 'crawlHiddenPages' setting is
     * truthy), the fixed doktype list 3,4,199,254,255, the doktype lists
     * registered under EXTCONF/crawler/excludeDoktype, and finally the
     * EXTCONF/crawler/pageVeto hooks.
     *
     * @param array $pageRow Page record; the 'hidden' and 'doktype' keys are read
     * @return false|string false if the page should be crawled (not excluded),
     *                      otherwise a message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden (missing keys are treated as "not set")
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? '';

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks may be registered under numeric keys; with strict_types enabled
            // htmlspecialchars() rejects an int, so cast explicitly.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
414
415
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * $skipMessage is written by reference: it receives the reason when the page
     * is skipped and is reset to '' when configurations are returned.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage
     * @return array Empty when the uid is not an integer or the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];
        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        // Fetch first, then clear the message, as the original statement order did.
        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
442
443
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL that passes the processing-instruction filter is recorded in
     * $duplicateTrack (keyed by URL, user groups and filter); URLs already
     * recorded there are only listed, greyed out, and not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values <= 0 disable the interleave instead of dividing by zero
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we do not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Configurations do not necessarily define these keys; a missing key
        // behaves like an empty string in the filter and in the uniqueness key.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter depends only on the configuration, not on the single URL,
        // so it is evaluated once. A rejected configuration yields an empty list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests; guarded against division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is passed here, not
                    // $schTime — kept as in the original; confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
527
528
    /**
     * Decides whether a configuration's processing-instruction list passes the
     * incoming filter.
     *
     * Returns true when no incoming instructions are given at all, or when at
     * least one incoming instruction is an item of the comma-separated $piString.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
548
549 9
    /**
     * Returns the page TSconfig for a page, optionally altered by hooks.
     *
     * When $this->MP is set, the TSconfig is read for the id found after the
     * first '-' of the MP value instead of $id. Afterwards every function
     * registered under EXTCONF/crawler/getPageTSconfigForId is called with the
     * page id and a reference to the TSconfig array, so hooks can modify it.
     *
     * @param int $id Page id
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: $this->MP is declared as bool but is treated as a string of the
            // form "<x>-<id>" here. The casts keep this branch from raising a
            // TypeError under strict_types; a value without '-' resolves to id 0.
            $mountPointParts = explode('-', (string) $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig((int) ($mountPointParts[1] ?? 0));
        }

        // Call a hook to alter configuration. The `??` guard matches the other
        // hook lookups in this class and avoids a notice when none is registered.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
571
572
    /**
     * Returns the crawler configurations that apply to a page. Adds no urls
     * to the queue!
     *
     * Configurations come from two sources, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets.*), origin 'pagets';
     *  2. tx_crawler_configuration records found up the rootline, origin
     *     'tx_crawler_configuration_<uid>'. A record never overwrites a
     *     paramSet of the same name from page TSconfig.
     * Finally the EXTCONF/crawler/processUrls hooks receive the result by
     * reference and may alter it.
     *
     * @return array Map of configuration key => array with the keys 'subCfg',
     *               'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // 'pidsOnly' is optional; a missing value means "not page-specific".
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly === '' || GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                // Explode, process etc.:
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                // A paramSet may consist of sub-properties only; then there is no parameter string.
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                $res[$key]['origin'] = 'pagets';

                // recognize MP value
                if (! $this->MP) {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                } else {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
                }
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast like the page-TS branch above: with strict_types a null or
            // non-string 'pidsonly' would otherwise make strcmp() throw.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
            }
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
        return $res;
    }
670
671
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Two sources are combined, in this order:
     * 1. the "tx_crawler.crawlerCfg.paramSets." entries of the page TSconfig of $rootid
     *    (only array-valued entries, with a trailing "." removed from the key),
     * 2. the "name" column of tx_crawler_configuration records stored on a page of the
     *    root line of $rootid or on a page of the tree below it (limited to $depth levels).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page uid the branch starts at
     * @param int $depth Depth passed to the page tree lookup
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        // 1) Configurations defined in page TSconfig.
        $names = [];
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $hasTrailingDot = substr($setKey, -1) === '.';
                $names[] = $hasTrailingDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // 2) Page ids to search for configuration records: the root line first ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... followed by the sub pages the backend user is allowed to see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Look up the configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
714
715
    /**
     * Checks whether a user may access an item, based on group membership.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is in the access list
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is accessible for everybody.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // One matching group is sufficient.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
738
739
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the (trimmed) value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further optional sub parts read below: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD (defaults to "uid").
     *         - Default: Literal value
     *
     * After every part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each value replaced by the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names (foreach works on a copy, so $paramArray may be modified inside):
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not encapsulated in square brackets: the literal value is the only value.
            if (! (strpos($v, '[') === 0 && substr($v, -1) === ']')) {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the ";"-separated "KEY:value" sub parts.
                    // NOTE(review): only the text up to a second ":" is used as the value.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                        // A sub part without ":" has no value; null keeps the isset() checks below false.
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }
                    $tableName = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                        $recursiveDepth = (int) ($subpartParams['_RECURSIVE'] ?? 0);
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // All of the following keys are optional, so they are read without raising notices.
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only deleted records are filtered out; all other default restrictions are dropped.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // NOTE(review): $where comes straight from the configuration string and is added unescaped.
                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Use the field value as array key so that every value is collected only once.
                            $foundValues = [];
                            while ($row = $statement->fetch()) {
                                $foundValues[$row[$fieldName]] = true;
                            }
                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($foundValues));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => $this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key (GET var name):
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
886
887
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped per parameter level at the value returned by getMaximumUrlsToCompile().
     *
     * Every call consumes the first parameter of $paramArray, appends "&name=value" (both raw-url-encoded)
     * for each of its values to each URL and recurses with the remaining parameters.
     * Values that are an empty string leave the URL unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Invariant for all combinations of this level, so it is prepared only once.
        // The cast avoids a TypeError under strict_types for numeric GET var names (integer array keys).
        $parameterPrefix = '&' . rawurlencode((string) $varName) . '=';
        // NOTE(review): assumes the limit does not change while URLs are compiled - confirm against getMaximumUrlsToCompile().
        $maximumUrls = $this->getMaximumUrlsToCompile();

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached: nothing more can be added on this level.
                    break 2;
                }
                $val = (string) $val;
                $newUrls[] = $val !== '' ? $url . $parameterPrefix . rawurlencode($val) : $url;
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
914
915
    /************************************
916
     *
917
     * Crawler log
918
     *
919
     ************************************/
920
921
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by "scheduled", newest first.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Compared (loosely) against "pending" (exec_time = 0) and "finished" (exec_time > 0); any other value selects all entries
     * @param bool $doFlush If TRUE, the queue is flushed with $queueFilter before the entries are selected
     * @param bool $doFullFlush Not used, only kept for backwards compatibility of the signature
     * @param int $itemsPerPage Limit the amount of entries per page default is 10, values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // switch compares loosely, which is what allows matching the filter object against the strings.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        // The flush runs before the select, so the result reflects the queue after flushing.
        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
969
970
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected! An empty array is returned in that case.
     * @param bool $doFullFlush Only evaluated together with $doFlush: if TRUE, flushQueue() is called with an empty condition instead of the filter/set condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10, values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // The composite expression collects the conditions for flushQueue() and joins them with AND.
        $query = $expressionBuilder->andX();
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            // flushQueue() is deprecated, see the note on that method.
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1022
1023
    /**
     * Adds a call back entry to the crawler queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored as "_CALLBACKOBJ" inside the JSON encoded parameters of the new queue record.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function, anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate, 0 means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to the current time when no (or a zero) schedule time is given.
        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1052
1053
    /************************************
1054
     *
1055
     * URL setting
1056
     *
1057
     ************************************/
1058
1059
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (if registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless the duplication check finds an equal entry.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the database queue
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            // SignalSlotUtility::emitSignal() is deprecated, it is still used to notify listeners.
            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            // An equal entry exists already: only tell the listeners about the duplicate.
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1155
1156
    /**
     * Provides the current system time as Unix timestamp.
     *
     * @return int Result of time()
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1165
1166
    /************************************
1167
     *
1168
     * URL reading
1169
     *
1170
     ************************************/
1171
1172
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by setting exec_time, executes it through the queue executor
     * and stores the JSON encoded result in result_data. Signals are emitted before and after processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask, CLI_STATUS_POLLABLE_PROCESSED is set if a registered pollable instruction reported success; NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are scheduled for a process and not yet executed are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        // SignalSlotUtility::emitSignal() is deprecated, it is still used to notify listeners.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        $content = $result['content'] ?? null;
        if ($content === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string in the error case above, so it is only inspected when it is an array.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1269
1270
    /**
     * Inserts a new queue record from the given fields, executes it right away and
     * stores the JSON-encoded execution result on that record.
     *
     * @param array $field_array Queue field array; exec_time is overwritten with the current time
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for this item
     */
    public function readUrlFromArray($field_array)
    {
        // A non-zero exec_time marks the record as taken before it is even inserted.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Writing the result into the log also denotes the end of the processing of this entry.
        // The array is handed to the signal by reference so slots can amend what gets stored.
        $resultFields = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1306
1307
    /*****************************
1308
     *
1309
     * Compiling URLs to crawl - tools
1310
     *
1311
     *****************************/
1312
1313
    /**
     * Walks the page tree below a root page and collects the overview rows that
     * drawURLs_addRowsForPage() renders for every page found.
     *
     * The request options are stored on the controller first, because the row rendering
     * reads them from there. Pages recognised as mount points additionally contribute the
     * rows of their mounted tree.
     *
     * NOTE(review): the mount point check compares doktype strictly against the constant and the
     * mount page is read through index 0 of getPage()'s result - confirm both match what the
     * repositories actually return.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated table rows of all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request options; they are read back while the rows are rendered.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Every run starts with empty duplicate tracking and download lists.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Build the tree, restricted to pages the backend user may see.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $permsClause);

        // The root page itself is only listed when it is accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $tree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Branch beneath the root page.
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse the page tree and gather one fragment per rendered page.
        $fragments = [];
        foreach ($tree->tree as $data) {
            $this->MP = false;

            if ($data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountPage = $this->pageRepository->getPage($data['row']['uid']);
                $mountPid = $mountPage[0]['mount_pid'];

                // MP value used while the mounted pages are rendered: "<mount_pid>-<mount point uid>"
                $this->MP = $mountPid . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountData) {
                    $fragments[] = $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                if ($mountPage[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mount point page is replaced by the mounted page.
                    $data['row']['uid'] = $mountPid;
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself.
                    $this->MP = false;
                }
            }

            $fragments[] = $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return implode('', $fragments);
    }
1405
1406
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries of the form "<pid>" (that page only),
     * "<pid>+" (the page plus its tree, up to 99 levels) or "<pid>+<depth>".
     * Expanded lists are memoised per exclude string and fetched trees per pid/depth in
     * function-static caches.
     *
     * @param string $excludeString Exclude string
     * @return array Integer page ids, de-duplicated with array_unique()
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // Callers may pass null for "no exclude configured"; normalise so it is a valid cache key.
        $excludeString = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // Pad to two elements: entries without "+" would otherwise read an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"; a "+" without a usable depth means 99 levels
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    // The tree API and the resulting list work on integers.
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        // isset() instead of empty(): a branch without sub pages is cached as well.
                        if (! isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1457
1458
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per crawler configuration that applies to the page; the page
     * title cell spans all of them. Pages listed in a configuration's exclude string get a
     * "No entries" row for that configuration, and pages without any configuration get a single
     * "No entries" row carrying the skip message.
     *
     * @param array $pageRow Page record; its uid is matched against the expanded exclude lists
     * @param string $pageTitle Ready-to-output HTML for the title cell (inserted unescaped)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // The exclude lists contain integers and are searched strictly, so compare with an integer uid.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Numeric configuration keys arrive as integers; htmlspecialchars() needs a string
                // under strict_types.
                $confKeyHtml = htmlspecialchars((string) $confKey);
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries the page title, spanning all rows.
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageUid, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters: one row per GET variable plus the number of combinations.
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: configuration values are escaped before they are put into the markup.
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1561
1562
    /*****************************
1563
     *
1564
     * CLI functions
1565
     *
1566
     *****************************/
1567
1568
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the deprecated cli_hooks, cleans up the queue, claims up to $countInARun queue
     * entries for the current process id and reads them one after another.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Microseconds passed to usleep() after each entry
     * @param int $sleepAfterFinish Seconds passed to sleep() once the fetched entries are done
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $processedItems = 0;

        // First, run hooks:
        $hookReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookReferences as $hookReference) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookReference);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($rows)) {
            // Nothing fetched, nothing processed: the status stays 0.
            return $result;
        }

        $queueIds = [];
        foreach ($rows as $row) {
            $queueIds[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Save the number of assigned queue entries to determine how many have been processed later.
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Abort when not every fetched entry could be assigned to this process.
        if ($assignedCount !== count($queueIds)) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $processedItems++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler was disabled since the run started, return right away
            // and do NOT mark the process as ended.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        if ($processedItems > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1641
1642
    /**
     * Instantiates every class registered under EXTCONF/crawler/cli_hooks and calls its
     * crawler_init() with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookReferences as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1655
1656
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * A process row is inserted when fewer active processes with an unexpired ttl exist than
     * the "processLimit" extension setting allows. Active processes whose ttl has passed are
     * handed to CLI_releaseProcesses() in either case.
     *
     * @param string $id identification string for the process
     * @return boolean true when the process row was inserted; false when the system process id
     *                 could not be determined or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $currentTime = null;
        $runningProcesses = 0;
        $expiredProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                // ttl passed: treat as orphan and release it below
                $expiredProcessIds[] = $process->getProcessId();
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1707
1708
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Accept a single id as well as a list of ids.
        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($releaseIds)) {
            // nothing to release
            return false;
        }

        // Some kind of 2nd chance algorithm - this way you need at least 2 processes to have a real cleanup.
        // This ensures that a single process can't mess up the entire process table.

        // Step 1: detach queue entries from processes that are no longer active.
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where('q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // The builder is reused for the second statement, so the SET part of the first one is dropped.
        // FIXME: Not entirely sure that this is equivalent to the previous version
        $queryBuilder->resetQueryPart('set');

        // Step 2: clear the system process id of inactive processes that still have waiting queue entries.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: finally release the requested processes themselves.
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1758
1759
    /**
     * Returns the id of the current process, generating it from the current microtime
     * on first use and reusing the stored value afterwards.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1771
1772
    /**
     * Echoes a message followed by a newline and flushes the output, but only when the
     * "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1786
1787
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than "cleanUpProcessedAge" days
     * - entries scheduled more than "cleanUpScheduledAge" days ago
     * Both ages are read from the extension settings.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // The settings are given in days; 86400 = 24*60*60 seconds per day.
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * 86400;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * 86400;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1804
1805
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, a SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once per
     * distinct set_id among them, carrying the matching qid/set_id rows.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        // Fall back to a match-all condition when no filter was given.
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // The builder instance is reused for the per-set queries and for the final delete.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $setIdRows = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        if (is_array($setIdRows)) {
            foreach ($setIdRows as $setIdRow) {
                $entriesOfSet = $queryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $queryBuilder->expr()->eq('set_id', $setIdRow['set_id'])
                    )
                    ->execute()
                    ->fetchAll();

                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    ['subSet' => $entriesOfSet]
                );
            }
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1850
1851
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, any entry scheduled up to now (or, with the
     * "enableTimeslot" setting, within 100 seconds around now) is considered.
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     * Only entries without exec_time and without process_id are taken into account.
     *
     * @param int $tstamp
     * @param array $fieldArray Must provide "page_id" and "parameters_hash"
     *
     * @return array qids of the duplicate queue entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $duplicateQueueIds = [];
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // This entry is scheduled with "now".
            if ($this->extensionSettings['enableTimeslot']) {
                $windowStart = $currentTime - 100;
                $windowEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $windowStart . ' AND ' . $windowEnd)
                    ->orWhere($queryBuilder->expr()->lte('scheduled', $currentTime));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $currentTime));
            }
        } elseif ($tstamp > $currentTime) {
            // Entries with a timestamp in the future need to have the same schedule time.
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        // Restrict to unprocessed, unassigned entries of the same page and parameter set.
        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere(
            $queryBuilder->expr()->eq(
                'page_id',
                $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)
            )
        );
        $queryBuilder->andWhere(
            $queryBuilder->expr()->eq(
                'parameters_hash',
                $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)
            )
        );

        $statement = $queryBuilder->execute();
        while ($row = $statement->fetch()) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1912
1913
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1924
1925
    /**
     * Build a URL from a Page and the Query String by delegating to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1941
1942 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2,
     * exchanging the two when necessary. All other elements are left untouched.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        // First is larger than last: exchange both values.
        [$reg[2], $reg[1]] = [$reg[1], $reg[2]];

        return $reg;
    }
1953
1954 9
    /**
     * Accessor for the configured $maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $limit = $this->maximumUrlsToCompile;

        return $limit;
    }
1958
1959
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1971
1972
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1981
}
1982