Passed
Push — tests/adding-tests ( 71f9ed...f80e1c )
by Tomas Norre
08:38
created

CrawlerController   F

Complexity

Total Complexity 201

Size/Duplication

Total Lines 1901
Duplicated Lines 0 %

Test Coverage

Coverage 73.95%

Importance

Changes 15
Bugs 0 Features 0
Metric Value
eloc 813
c 15
b 0
f 0
dl 0
loc 1901
ccs 545
cts 737
cp 0.7395
rs 1.787
wmc 201

44 Methods

Rating   Name   Duplication   Size   Complexity  
A CLI_runHooks() 0 6 3
A getCurrentTime() 0 3 1
A addQueueEntry_callBack() 0 17 3
B getLogEntriesForSetId() 0 40 6
A getLogEntriesForPageId() 0 36 5
B getPageTreeAndUrls() 0 80 7
B expandExcludeString() 0 44 9
A readUrlFromArray() 0 28 1
B addUrl() 0 84 6
A swapIfFirstIsLargerThanSecond() 0 10 2
A getMaximumUrlsToCompile() 0 3 1
A hasGroupAccess() 0 11 4
A getUrlFromPageAndQueryParameters() 0 4 1
A CLI_debug() 0 5 2
A getConfigurationHash() 0 5 1
B checkIfPageShouldBeSkipped() 0 33 9
A getDisabled() 0 3 1
A getQueryBuilder() 0 3 1
A getAccessMode() 0 3 1
A __construct() 0 26 3
A setMaximumUrlsToCompile() 0 3 1
C getUrlsForPageId() 0 93 16
A setDisabled() 0 6 3
A getUrlsForPageRow() 0 17 3
A getPageTSconfigForId() 0 21 4
A compileUrls() 0 18 6
A getBackendUser() 0 8 2
A drawURLs_PIfilter() 0 12 4
A getDuplicateRowsIfExist() 0 49 5
A getProcessFilename() 0 3 1
A setAccessMode() 0 3 1
A setProcessFilename() 0 3 1
A cleanUpOldQueueEntries() 0 9 1
A setExtensionSettings() 0 3 1
B getConfigurationsForBranch() 0 38 8
F expandParameters() 0 129 25
B urlListFromUrlArray() 0 68 8
A flushQueue() 0 37 4
B drawURLs_addRowsForPage() 0 98 9
B CLI_run() 0 69 10
A CLI_buildProcessId() 0 6 2
A CLI_releaseProcesses() 0 43 3
A CLI_checkAndAcquireNewProcess() 0 43 5
C readUrl() 0 89 11

How to fix   Complexity   

Complex Class

Complex classes like CrawlerController often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CrawlerController, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
69
use TYPO3\CMS\Core\Utility\DebugUtility;
70
use TYPO3\CMS\Core\Utility\GeneralUtility;
71
use TYPO3\CMS\Core\Utility\MathUtility;
72
use TYPO3\CMS\Extbase\Object\ObjectManager;
73
74
/**
75
 * Class CrawlerController
76
 *
77
 * @package AOE\Crawler\Controller
78
 */
79
class CrawlerController implements LoggerAwareInterface
80
{
81
    use LoggerAwareTrait;
82
    use PublicMethodDeprecationTrait;
83
    use PublicPropertyDeprecationTrait;
84
85
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
86
87
    //queue not empty
88
    public const CLI_STATUS_REMAIN = 1;
89
90
    //(some) queue items were processed
91
    public const CLI_STATUS_PROCESSED = 2;
92
93
    //instance didn't finish
94
    public const CLI_STATUS_ABORTED = 4;
95
96
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
97
98
    /**
99
     * @var integer
100
     */
101
    public $setID = 0;
102
103
    /**
104
     * @var string
105
     */
106
    public $processID = '';
107
108
    /**
109
     * @var array
110
     */
111
    public $duplicateTrack = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $downloadUrls = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingProcInstructions = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $incomingConfigurationSelection = [];
127
128
    /**
129
     * @var bool
130
     */
131
    public $registerQueueEntriesInternallyOnly = false;
132
133
    /**
134
     * @var array
135
     */
136
    public $queueEntries = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $urlList = [];
142
143
    /**
144
     * @var array
145
     */
146
    public $extensionSettings = [];
147
148
    /**
149
     * Mount Point
150
     *
151
     * @var bool
152
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
153
     */
154
    public $MP = false;
155
156
    /**
157
     * @var string
158
     * @deprecated
159
     */
160
    protected $processFilename;
161
162
    /**
163
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
164
     *
165
     * @var string
166
     * @deprecated
167
     */
168
    protected $accessMode;
169
170
    /**
171
     * @var QueueRepository
172
     */
173
    protected $queueRepository;
174
175
    /**
176
     * @var ProcessRepository
177
     */
178
    protected $processRepository;
179
180
    /**
181
     * @var ConfigurationRepository
182
     */
183
    protected $configurationRepository;
184
185
    /**
186
     * @var string
187
     */
188
    protected $tableName = 'tx_crawler_queue';
189
190
    /**
191
     * @var QueueExecutor
192
     */
193
    protected $queueExecutor;
194
195
    /**
196
     * @var int
197
     */
198
    protected $maximumUrlsToCompile = 10000;
199
200
    /**
201
     * @var IconFactory
202
     */
203
    protected $iconFactory;
204
205
    /**
     * Deprecation messages keyed by the name of the deprecated public method.
     *
     * NOTE(review): not referenced directly in this class; presumably read by
     * PublicMethodDeprecationTrait, which is why static analysis reports it as
     * unused — confirm before removing.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be removed in v11.x',
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // Message previously named getLogEntriesForPageId() by mistake.
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be removed in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be removed in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
    ];
224
225
    /**
     * Deprecation messages keyed by the name of the deprecated property.
     *
     * NOTE(review): presumably read by PublicPropertyDeprecationTrait — confirm.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // Message previously named ->accessMode by mistake.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
232
233
    /**
234
     * @var BackendUserAuthentication|null
235
     */
236
    private $backendUser;
237
238
    /**
239
     * @var integer
240
     */
241
    private $scheduledTime = 0;
242
243
    /**
244
     * @var integer
245
     */
246
    private $reqMinute = 0;
247
248
    /**
249
     * @var bool
250
     */
251
    private $submitCrawlUrls = false;
252
253
    /**
254
     * @var bool
255
     */
256
    private $downloadCrawlUrls = false;
257
258
    /**
259
     * @var PageRepository
260
     */
261
    private $pageRepository;
262
263
    /**
264
     * @var Crawler
265
     */
266
    private $crawler;
267
268
    /************************************
269
     *
270
     * Getting URLs based on Page TSconfig
271
     *
272
     ************************************/
273
274 41
    /**
     * Instantiates the repositories and services the controller works with,
     * sets the default process lock file and loads the extension settings.
     *
     * The settings 'countInARun', 'processLimit' and 'maxCompileUrls' are
     * normalised here; a missing key is treated like an invalid value and
     * falls back to the same default as before instead of raising an
     * "undefined index" notice.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // "?? null" (not "?? 0") on purpose: a missing value must not count as a
        // valid integer, otherwise forceIntegerInRange() would clamp it to the
        // minimum instead of applying the default passed as fourth argument.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? null,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? null,
                1,
                1000000000,
                10000
            )
        );
    }
301
302 45
    /**
     * Stores the value returned later by the maximumUrlsToCompile accessor.
     *
     * @param int $maximumUrlsToCompile New limit; stored as given, no range check is applied here.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
306
307
    /**
     * Returns the current access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
317
318
    /**
     * Sets the access mode; the value is stored without validation.
     *
     * @param string $accessMode One of 'gui', 'cli' or 'cli_im'
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
326
327
    /**
     * Switches the "disabled" marker file on or off.
     *
     * A truthy $disabled writes the string 'disabled' to the process file;
     * false or null removes that file if it exists.
     *
     * @param bool|null $disabled Whether the marker file should exist afterwards
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($markerFile, 'disabled');
            return;
        }

        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
339
340
    /**
     * Reports the disabled status: true exactly when the process file exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
348
349
    /**
     * Overrides the path of the process file used by setDisabled()/getDisabled().
     *
     * @param string $filenameWithPath Path including the file name
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
357
358
    /**
     * Returns the path of the process file currently in use.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
366
367
    /**
     * Replaces the extension settings wholesale (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The defaults applied in the constructor are not re-applied here.
     *
     * @param array $extensionSettings Complete settings array
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
374
375
    /**
     * Check if the given page should be crawled.
     *
     * Checks, in this order: hidden flag (unless 'crawlHiddenPages' is set in
     * the extension settings), the fixed doktype deny list, the doktype lists
     * registered under EXTCONF/crawler/excludeDoktype and finally the veto
     * hooks registered under EXTCONF/crawler/pageVeto.
     *
     * @param array $pageRow Page record; the 'hidden' and 'doktype' columns are read
     * @return false|string false if the page should be crawled, otherwise the reason (skip message) why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        // empty() keeps the truthiness semantics but tolerates missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks registered with a numeric key yield an int $key; with
            // strict_types enabled htmlspecialchars() would reject it, so cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
414
415
    /**
     * Wrapper around getUrlsForPageId() that first decides whether the page
     * may be crawled at all. Returns configurations, not URLs.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the reason when the page is skipped, '' otherwise
     * @return array Configurations for the page, or [] when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];

        // Only a real integer uid is accepted; numeric strings are rejected too.
        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
442
443
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the keys 'URLs' and 'subCfg' are read.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to make sure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        // The filter depends only on the configuration, not on the single URL,
        // so it is evaluated once. A rejected configuration yields no entries
        // and leaves $duplicateTrack untouched.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two consecutive requests. Guarded so that a
        // $reqMinute of 0 (or a non-numeric value) no longer divides by zero.
        $secondsPerRequest = is_numeric($reqMinute) && $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                // NOTE(review): the queue entry is added with $scheduledTime, not the
                // interleaved $schTime shown in the list — confirm this is intended.
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
527
528
    /**
     * Tells whether a configuration passes the processing instruction filter.
     *
     * With no incoming processing instructions every configuration passes;
     * otherwise at least one of them must be an item of the comma-separated
     * list $piString.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing requested means nothing to filter on.
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $requestedInstruction) {
            if (GeneralUtility::inList($piString, $requestedInstruction)) {
                return true;
            }
        }

        return false;
    }
548
549 9
    /**
     * Returns the page TSconfig for a page, after running the hooks
     * registered under EXTCONF/crawler/getPageTSconfigForId.
     *
     * When $this->MP is set, the TSconfig is read for the id found in the
     * second '-'-separated segment of $this->MP instead of $id.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly altered by the hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but treated as a "<x>-<y>" string here.
            // Cast explicitly (explode() rejects a bool under strict_types) and
            // fall back to $id when there is no second segment to read.
            // TODO: Clarify the real type of $this->MP.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = isset($mountPointParts[1]) ? (int) $mountPointParts[1] : $id;
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // Passed by reference so hooks can modify the configuration in place.
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
571
572
    /**
573
     * This methods returns an array of configurations.
574
     * Adds no urls!
575
     */
576 7
    public function getUrlsForPageId(int $pageId): array
577
    {
578
        // Get page TSconfig for page ID
579 7
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
580
581 7
        $res = [];
582
583
        // Fetch Crawler Configuration from pageTSconfig
584 7
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
585 7
        foreach ($crawlerCfg as $key => $values) {
586 6
            if (! is_array($values)) {
587 6
                continue;
588
            }
589 6
            $key = str_replace('.', '', $key);
590
            // Sub configuration for a single configuration string:
591 6
            $subCfg = (array) $crawlerCfg[$key . '.'];
592 6
            $subCfg['key'] = $key;
593
594 6
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
595 6
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
596
            }
597 6
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
598
599
            // process configuration if it is not page-specific or if the specific page is the current page:
600
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
601 6
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
602
603
                // Explode, process etc.:
604 6
                $res[$key] = [];
605 6
                $res[$key]['subCfg'] = $subCfg;
606 6
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
607 6
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
608 6
                $res[$key]['origin'] = 'pagets';
609
610
                // recognize MP value
611 6
                if (! $this->MP) {
612 6
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
613
                } else {
614
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

614
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
615
                }
616
            }
617
        }
618
619
        // Get configuration from tx_crawler_configuration records up the rootline
620 7
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
621 7
        foreach ($crawlerConfigurations as $configurationRecord) {
622
623
            // check access to the configuration record
624 1
            if (empty($configurationRecord['begroups']) || $this->getBackendUser()->isAdmin() || UserService::hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
625 1
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
626
627
                // process configuration if it is not page-specific or if the specific page is the current page:
628
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
629 1
                if (! strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
630 1
                    $key = $configurationRecord['name'];
631
632
                    // don't overwrite previously defined paramSets
633 1
                    if (! isset($res[$key])) {
634
635
                        /* @var $TSparserObject TypoScriptParser */
636 1
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
637 1
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
638
639
                        $subCfg = [
640 1
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
641 1
                            'procInstrParams.' => $TSparserObject->setup,
642 1
                            'baseUrl' => $configurationRecord['base_url'],
643 1
                            'force_ssl' => (int) $configurationRecord['force_ssl'],
644 1
                            'userGroups' => $configurationRecord['fegroups'],
645 1
                            'exclude' => $configurationRecord['exclude'],
646 1
                            'key' => $key,
647
                        ];
648
649 1
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
650 1
                            $res[$key] = [];
651 1
                            $res[$key]['subCfg'] = $subCfg;
652 1
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
653 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
654 1
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
655 1
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
656
                        }
657
                    }
658
                }
659
            }
660
        }
661
662 7
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
663
            $params = [
664
                'res' => &$res,
665
            ];
666
            GeneralUtility::callUserFunction($func, $params, $this);
667
        }
668 7
        return $res;
669
    }
670
671
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Two sources are combined, in this order:
     *  1. array-valued entries of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid
     *     (a trailing dot is stripped from the key),
     *  2. the "name" of every tx_crawler_configuration record whose pid is part of the rootline
     *     of $rootid or of the page tree below $rootid, limited to $depth levels.
     *
     * Names are not de-duplicated.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page uid the branch starts at
     * @param int $depth Number of tree levels below $rootid to include
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) Configurations declared through page TSconfig.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $name = $setKey;
                if (substr($name, -1) === '.') {
                    $name = substr($name, 0, -1);
                }
                $names[] = $name;
            }
        }

        // 2) Page uids to look up configuration records on: first the rootline ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... then the pages below $rootid, filtered by the "show page" permission clause of the backend user.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $treeClause = '';
        if (! empty($permissionClause)) {
            $treeClause = 'AND ' . $permissionClause;
        }
        $pageTree->init($treeClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageUids[] = $treeEntry['row']['uid'];
        }

        // Configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $records = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute()
            ->fetchAll();

        foreach ($records as $record) {
            $names[] = $record['name'];
        }

        return $names;
    }
714
715
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is accessible for everybody.
        if (empty($accessList)) {
            return true;
        }

        foreach (GeneralUtility::intExplode(',', (string) $groupList) as $groupUid) {
            // intExplode() yields integers while inList() works on strings; cast explicitly,
            // like the other inList() calls in this class do with strval().
            if (GeneralUtility::inList((string) $accessList, (string) $groupUid)) {
                return true;
            }
        }

        return false;
    }
738
739
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub parts read by the code: _RECURSIVE (tree depth below the PID), _PIDFIELD (default "pid"),
     *        _FIELD (column to select, default "uid"), _WHERE (additional where clause), _ADDTABLE (additional "from" part)
     *         - Default: Literal value
     *
     * After every part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Same keys as $paramArray, each value replaced by the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range! At most 1000 values are added per range.
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters ("_KEY:value" pairs separated by ";"):
                        $subpartParams = [];
                        foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                            // Pad to two elements so that a sub part without ":" results in a null value
                            // instead of an undefined-offset warning.
                            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }
                        $table = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$table])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): $where and $addTable are handed to the query builder as raw SQL fragments;
                            // presumably they only ever come from trusted crawler configuration - confirm.
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // All of the following keys are optional, so they are checked with empty()
                            // rather than read directly.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($table);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction is applied to the lookup.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($table)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';

                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by the selected field so every value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($hooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($hooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values; ksort() then orders the whole parameter array by parameter name.
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
886
887
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped per recursion level at getMaximumUrlsToCompile().
     *
     * Every value is appended to every URL as "&name=value" (both rawurlencode()d); an empty-string
     * value leaves the URL unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Detach the first parameter with unset() instead of array_shift(): array_shift() renumbers
        // integer keys, which would replace numeric GET variable names by 0, 1, ... in the recursion.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Array keys may be integers, rawurlencode() expects a string.
        $encodedVarName = rawurlencode((string) $varName);
        $maximumUrls = $this->getMaximumUrlsToCompile();

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached, nothing more can be added on this level.
                    break 2;
                }
                $val = (string) $val;
                $newUrls[] = $val === '' ? $url : $url . '&' . $encodedVarName . '=' . rawurlencode($val);
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
914
915
    /************************************
916
     *
917
     * Crawler log
918
     *
919
     ************************************/
920
921
    /**
     * Return array of records from crawler queue for input page ID
     *
     * Entries are ordered by "scheduled", newest first.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter "pending" restricts to entries with exec_time = 0, "finished" to entries with exec_time > 0; any other value applies no restriction
     * @param bool $doFlush If TRUE, $this->queueRepository->flushQueue($queueFilter) is called before the entries are selected
     * @param bool $doFullFlush Not used by this method; kept so existing callers keep working
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; a value <= 0 disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the plain filter names
        // (presumably through QueueFilter::__toString() - confirm).
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
969
970
    /**
     * Fetches the crawler queue records that belong to one set, newest "scheduled" first.
     *
     * When $doFlush is set nothing is fetched: flushQueue() is called instead and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => all entries
     * @param bool $doFlush If TRUE, the entries are flushed instead of selected
     * @param bool $doFullFlush Only evaluated together with $doFlush: if TRUE, flushQueue() is called with an empty condition
     * @param int $itemsPerPage Maximum number of entries to return (default 10); a value <= 0 disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where(
                $selectQuery->expr()->eq('set_id', $selectQuery->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The AND-composite collects the same conditions a second time; it is only handed to flushQueue().
        $expressions = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConditions = $expressions->andX();

        if ($filter === 'pending') {
            $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
            $flushConditions->add($expressions->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
            $flushConditions->add($expressions->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushWhere = $flushConditions->add($expressions->eq('set_id', $set_id));
            if ($doFullFlush) {
                $flushWhere = '';
            }
            $this->flushQueue($flushWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults($itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
1022
1023
    /**
     * Inserts a call back entry into tx_crawler_queue (called from hooks typically, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored in the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1052
1053
    /************************************
1054
     *
1055
     * URL setting
1056
     *
1057
     ************************************/
1058
1059
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and, unless queue entries are only registered internally,
     * inserts it into tx_crawler_queue when no duplicate queue item is found. A signal is emitted
     * both for an added URL and for a detected duplicate.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and are all optional
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, FALSE otherwise (duplicate found or internal registration only)
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (missing "userGroups" is treated like an empty list):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1155
1156
    /**
     * Gives the current time as reported by time().
     *
     * @return int Unix timestamp in seconds
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1165
1166
    /************************************
1167
     *
1168
     * URL reading
1169
     *
1170
     ************************************/
1171
1172
    /**
1173
     * Read URL for single queue entry
1174
     *
1175
     * @param integer $queueId
1176
     * @param boolean $force If set, will process even if exec_time has been set!
1177
     * @return integer
1178
     */
1179 2
    public function readUrl($queueId, $force = false)
1180
    {
1181 2
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
1182 2
        $ret = 0;
1183 2
        $this->logger->debug('crawler-readurl start ' . microtime(true));
1184
        // Get entry:
1185
        $queryBuilder
1186 2
            ->select('*')
1187 2
            ->from('tx_crawler_queue')
1188 2
            ->where(
1189 2
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
1190
            );
1191 2
        if (! $force) {
1192
            $queryBuilder
1193 2
                ->andWhere('exec_time = 0')
1194 2
                ->andWhere('process_scheduled > 0');
1195
        }
1196 2
        $queueRec = $queryBuilder->execute()->fetch();
1197
1198 2
        if (! is_array($queueRec)) {
1199
            return;
1200
        }
1201
1202 2
        SignalSlotUtility::emitSignal(
0 ignored issues
show
Deprecated Code introduced by
The function AOE\Crawler\Utility\Sign...otUtility::emitSignal() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1202
        /** @scrutinizer ignore-deprecated */ SignalSlotUtility::emitSignal(
Loading history...
1203 2
            self::class,
1204 2
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
1205 2
            [$queueId, &$queueRec]
1206
        );
1207
1208
        // Set exec_time to lock record:
1209 2
        $field_array = ['exec_time' => $this->getCurrentTime()];
1210
1211 2
        if (isset($this->processID)) {
1212
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
1213 2
            $field_array['process_id_completed'] = $this->processID;
1214
        }
1215
1216 2
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
1217 2
            ->update(
1218 2
                'tx_crawler_queue',
1219
                $field_array,
1220 2
                ['qid' => (int) $queueId]
1221
            );
1222
1223 2
        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
1224 2
        if ($result['content'] === null) {
1225
            $resultData = 'An errors happened';
1226
        } else {
1227
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
1228 2
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
1229 2
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
1230
        }
1231
1232
        //atm there's no need to point to specific pollable extensions
1233 2
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])) {
1234
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
1235
                // only check the success value if the instruction is running
1236
                // it is important to name the pollSuccess key same as the procInstructions key
1237
                if (is_array($resultData['parameters']['procInstructions'])
1238
                    && in_array(
1239
                        $pollable,
1240
                        $resultData['parameters']['procInstructions'], true
1241
                    )
1242
                ) {
1243
                    if (! empty($resultData['success'][$pollable]) && $resultData['success'][$pollable]) {
1244
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
1245
                    }
1246
                }
1247
            }
1248
        }
1249
1250
        // Set result in log which also denotes the end of the processing of this entry.
1251 2
        $field_array = ['result_data' => json_encode($result)];
1252
1253 2
        SignalSlotUtility::emitSignal(
0 ignored issues
show
Deprecated Code introduced by
The function AOE\Crawler\Utility\Sign...otUtility::emitSignal() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1253
        /** @scrutinizer ignore-deprecated */ SignalSlotUtility::emitSignal(
Loading history...
1254 2
            self::class,
1255 2
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
1256 2
            [$queueId, &$field_array]
1257
        );
1258
1259 2
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
1260 2
            ->update(
1261 2
                'tx_crawler_queue',
1262
                $field_array,
1263 2
                ['qid' => (int) $queueId]
1264
            );
1265
1266 2
        $this->logger->debug('crawler-readurl stop ' . microtime(true));
1267 2
        return $ret;
1268
    }
1269
1270
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given field array as a new queue row (with exec_time set to the
     * current time), runs the queue executor on it, emits the post-process signal
     * and finally writes the JSON encoded result into the row's result_data field.
     *
     * @param array $field_array Queue field array to insert and execute
     *
     * @return array|bool|mixed|string The value returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $connectionForCrawlerQueue->insert(
            $this->tableName,
            $field_array
        );
        $queueId = $connectionForCrawlerQueue->lastInsertId($this->tableName, 'qid');
        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        // Listeners receive $field_array by reference and may extend or alter it.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        // Use an integer identifier for qid, the same way readUrl() does.
        $connectionForCrawlerQueue->update(
            $this->tableName,
            $field_array,
            ['qid' => (int) $queueId]
        );

        return $result;
    }
1306
1307
    /*****************************
1308
     *
1309
     * Compiling URLs to crawl - tools
1310
     *
1311
     *****************************/
1312
1313
    /**
     * Compiles the output rows for a page and, optionally, the branch beneath it.
     *
     * The given options are stored on the controller, the duplicate and download
     * trackers are reset, the page tree visible to the current backend user is
     * built, and the rows returned by drawURLs_addRowsForPage() are concatenated
     * for every page in that tree. Mount point pages additionally contribute the
     * rows of the tree found below their mount_pid.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The cast makes the strict comparison work when the row carries doktype
            // as a numeric string (which depends on the database driver in use).
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is read via index 0 below - confirm that
                // getPage() really returns a list of rows here.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1405
1406
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of parts of the form "pid" or
     * "pid+depth". A part without "+" excludes only that page; a part containing
     * "+" with an empty depth expands to depth 99. For depths greater than zero
     * the uids of the page tree below the pid are added as well. Results are
     * memoised per exclude string (and per pid/depth) for the current request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique integer page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // Normalise so that null (no exclude configured) is a valid cache key.
        $excludeString = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements: a part without "+" has no depth segment and
                    // would otherwise trigger an "undefined offset" error on destructuring.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    // getTree() expects integers; also keeps the cache keys uniform.
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1457
1458
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per configuration that applies to the page. Pages
     * listed in a configuration's "exclude" setting get a "No entries" row for
     * that configuration; pages without any configuration get a single
     * "No entries" row carrying the skip message, if there is one.
     *
     * @param array $pageRow Page record; its "uid" is matched against the exclude list
     * @param string $pageTitle Markup for the title cell (inserted as is, not escaped here)
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            // expandExcludeString() returns integers and the lookup below is strict,
            // so the uid has to be an integer as well for the exclusion to match.
            $pageUid = (int) $pageRow['uid'];

            foreach ($configurations as $confKey => $confArray) {

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageUid, $this->expandExcludeString($confArray['subCfg']['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both settings are optional, hence the empty() guards)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1561
1562
    /*****************************
1563
     *
1564
     * CLI functions
1565
     *
1566
     *****************************/
1567
1568
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the deprecated cli_hooks, cleans up the queue, fetches up to
     * $countInARun entries, assigns them to this process and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Value passed to usleep() after each processed entry
     * @param int $sleepAfterFinish Value passed to sleep() once the batch is done
     * @return int Bitmask built from the CLI_STATUS_* constants and the readUrl() results
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedItems = 0;

        // First, run hooks:
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookClasses as $hookClass) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($queueRows)) {
            // Nothing fetched, so nothing was processed and no status bit is set.
            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedItems++;

            // Just to relax the system
            usleep($sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                continue;
            }

            //possible timeout
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
            $status |= self::CLI_STATUS_ABORTED;
            break;
        }

        sleep($sleepAfterFinish);

        if ($processedItems > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1641
1642
    /**
     * Activate hooks
     *
     * Instantiates every class registered under the crawler "cli_hooks"
     * configuration key and calls crawler_init() on it with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1655
1656
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies before the current time are treated as
     * orphans and released; the remaining ones count against the configured
     * processLimit. A new process row is inserted only when that limit has not
     * been reached.
     *
     * @param string $id identification string for the process
     * @return bool true if a process row was inserted, false otherwise
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        $runningProcesses = 0;
        $expiredProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() >= $now) {
                $runningProcesses++;
                continue;
            }

            $expiredProcessIds[] = $process->getProcessId();
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $processRow = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRow);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1707
1708
    /**
     * Release a process and the required resources
     *
     * Resets the process assignment of queue entries that belong to inactive
     * processes, resets system_process_id on inactive processes that still have
     * unexecuted queue entries, and then hands the given ids to the process and
     * queue repositories.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($processIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $queryBuilder->update($this->tableName, 'q');
        $queryBuilder->where(
            'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
        );
        $queryBuilder->set('q.process_scheduled', 0);
        $queryBuilder->set('q.process_id', '');
        $queryBuilder->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused below, so the SET part of the first statement is dropped first.
        $queryBuilder->resetQueryPart('set');

        $queryBuilder->update('tx_crawler_process');
        $queryBuilder->where(
            $queryBuilder->expr()->eq('active', 0),
            'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
        );
        $queryBuilder->set('system_process_id', 0);
        $queryBuilder->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1758
1759
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and then kept
     * in $this->processID, so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1771
1772
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is taken from the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1786
1787
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries that are over 1.5 days in age
     * - scheduled entries that are over 7 days old
     *
     * The actual ages are read, in days, from the "cleanUpProcessedAge" and
     * "cleanUpScheduledAge" extension settings.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1804
1805
    /**
     * Removes queue entries
     *
     * Before deleting, one SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted per distinct
     * set_id among the matching entries; its payload holds the qid/set_id rows of
     * that set under the key "subSet".
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value matches all entries. It is used as raw SQL.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Each statement gets its own query builder so that no query part
        // (select list, from, where, parameters) leaks from one statement into the next.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bind the value instead of concatenating it into the SQL
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter((int) $group['set_id'], PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1850
1851
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries matching "NOT exec_time" and "NOT process_id" with the same
     * page_id and parameters_hash as $fieldArray are considered.
     * NOTE(review): "NOT process_id" relies on how the database casts the column
     * value to a boolean - confirm this matches every unassigned entry and nothing else.
     *
     * @param int $tstamp
     * @param array $fieldArray Must contain the keys "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates found
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid');
        $queryBuilder->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            //if this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeslotStart = $currentTime - 100;
                $timeslotEnd = $currentTime + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $timeslotStart . ' AND ' . $timeslotEnd);
                $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $currentTime));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $currentTime));
            }
        } elseif ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);

        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter));
        $queryBuilder->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $duplicateQueueIds = [];
        foreach ($queryBuilder->execute()->fetchAll() as $row) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1912
1913
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration Configuration to hash
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashedPart = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashedPart));
    }
1924
1925
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that forwards all arguments to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1941
1942 1
    /**
     * Ensures that the entries at index 1 and index 2 are in ascending order.
     *
     * When the value at index 1 is larger than the value at index 2 the two are
     * exchanged; every other entry of the array is left as it is.
     *
     * @param array $reg array holding the two values to compare at indexes 1 and 2
     * @return array the array with indexes 1 and 2 ordered so that [1] <= [2]
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        // Already in order (or equal): nothing to do.
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        // Exchange both values in one step; the right-hand side is evaluated first.
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1953
1954 9
    /**
     * Returns the value currently stored in $this->maximumUrlsToCompile.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximumUrls = $this->maximumUrlsToCompile;

        return $maximumUrls;
    }
1958
1959
    /**
     * Returns the backend user, caching it on this instance after the first lookup.
     *
     * Backend authentication is initialized on every call; the cached user is only
     * populated from $GLOBALS['BE_USER'] while no user has been stored yet.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        // A user was stored earlier: reuse it.
        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1971
1972
    /**
     * Obtains a query builder for the given table from the ConnectionPool.
     *
     * @param string $table name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1981
}
1982