Passed
Push — test/php73 ( c5d465...4c9ff7 )
by Tomas Norre
06:45
created

CrawlerController::CLI_run()   B

Complexity

Conditions 10
Paths 33

Size

Total Lines 69
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 10.2918

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 10
eloc 34
c 2
b 0
f 0
nc 33
nop 3
dl 0
loc 69
ccs 30
cts 35
cp 0.8571
crap 10.2918
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
69
use TYPO3\CMS\Core\Utility\DebugUtility;
70
use TYPO3\CMS\Core\Utility\GeneralUtility;
71
use TYPO3\CMS\Core\Utility\MathUtility;
72
use TYPO3\CMS\Extbase\Object\ObjectManager;
73
74
/**
75
 * Class CrawlerController
76
 *
77
 * @package AOE\Crawler\Controller
78
 */
79
class CrawlerController implements LoggerAwareInterface
80
{
81
    use LoggerAwareTrait;
82
    use PublicMethodDeprecationTrait;
83
    use PublicPropertyDeprecationTrait;
84
85
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
86
87
    //queue not empty
88
    public const CLI_STATUS_REMAIN = 1;
89
90
    //(some) queue items were processed
91
    public const CLI_STATUS_PROCESSED = 2;
92
93
    //instance didn't finish
94
    public const CLI_STATUS_ABORTED = 4;
95
96
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
97
98
    /**
99
     * @var integer
100
     */
101
    public $setID = 0;
102
103
    /**
104
     * @var string
105
     */
106
    public $processID = '';
107
108
    /**
109
     * @var array
110
     */
111
    public $duplicateTrack = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $downloadUrls = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingProcInstructions = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $incomingConfigurationSelection = [];
127
128
    /**
129
     * @var bool
130
     */
131
    public $registerQueueEntriesInternallyOnly = false;
132
133
    /**
134
     * @var array
135
     */
136
    public $queueEntries = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $urlList = [];
142
143
    /**
144
     * @var array
145
     */
146
    public $extensionSettings = [];
147
148
    /**
149
     * Mount Point
150
     *
151
     * @var bool
152
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
153
     */
154
    public $MP = false;
155
156
    /**
157
     * @var string
158
     * @deprecated
159
     */
160
    protected $processFilename;
161
162
    /**
163
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
164
     *
165
     * @var string
166
     * @deprecated
167
     */
168
    protected $accessMode;
169
170
    /**
171
     * @var QueueRepository
172
     */
173
    protected $queueRepository;
174
175
    /**
176
     * @var ProcessRepository
177
     */
178
    protected $processRepository;
179
180
    /**
181
     * @var ConfigurationRepository
182
     */
183
    protected $configurationRepository;
184
185
    /**
186
     * @var string
187
     */
188
    protected $tableName = 'tx_crawler_queue';
189
190
    /**
191
     * @var QueueExecutor
192
     */
193
    protected $queueExecutor;
194
195
    /**
196
     * @var int
197
     */
198
    protected $maximumUrlsToCompile = 10000;
199
200
    /**
201
     * @var IconFactory
202
     */
203
    protected $iconFactory;
204
205
    /**
     * Deprecation messages keyed by the name of the deprecated method.
     *
     * NOTE(review): presumably consumed by PublicMethodDeprecationTrait, which this
     * class uses — confirm before treating the property as unused.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be removed in v11.x',
        'getLogEntriesForSetId' => 'Using CrawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be removed in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be removed in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
    ];
225
226
    /**
     * Deprecation messages keyed by the name of the deprecated property.
     *
     * NOTE(review): presumably consumed by PublicPropertyDeprecationTrait, which this
     * class uses — confirm.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
233
234
    /**
235
     * @var BackendUserAuthentication|null
236
     */
237
    private $backendUser;
238
239
    /**
240
     * @var integer
241
     */
242
    private $scheduledTime = 0;
243
244
    /**
245
     * @var integer
246
     */
247
    private $reqMinute = 0;
248
249
    /**
250
     * @var bool
251
     */
252
    private $submitCrawlUrls = false;
253
254
    /**
255
     * @var bool
256
     */
257
    private $downloadCrawlUrls = false;
258
259
    /**
260
     * @var PageRepository
261
     */
262
    private $pageRepository;
263
264
    /**
265
     * @var Crawler
266
     */
267
    private $crawler;
268
269
    /************************************
270
     *
271
     * Getting URLs based on Page TSconfig
272
     *
273
     ************************************/
274
275 41
    /**
     * Instantiates the repositories and services this controller works with and
     * loads the extension settings, substituting defaults for values that are
     * missing or out of range.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Every key is read with "?? 0" because the provider may hand
        // back an empty or partial array; 0 makes each helper fall back to its default.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 0, 1, 1000000000, 10000)
        );
    }
302
303 45
    /**
     * Replaces the value stored in $maximumUrlsToCompile.
     *
     * @param int $maximumUrlsToCompile New limit
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
307
308
    /**
     * Returns the current access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return /** @scrutinizer ignore-deprecated */ $this->accessMode;
    }
318
319
    /**
     * Stores the access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        /** @scrutinizer ignore-deprecated */ $this->accessMode = $accessMode;
    }
327
328
    /**
     * Writes the process file (content "disabled") when $disabled is truthy,
     * otherwise deletes that file if it exists.
     *
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        $lockFile = /** @scrutinizer ignore-deprecated */ $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($lockFile, 'disabled');
            return;
        }

        if (is_file($lockFile)) {
            unlink($lockFile);
        }
    }
340
341
    /**
     * Reports whether the process file currently exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $lockFile = /** @scrutinizer ignore-deprecated */ $this->processFilename;

        return is_file($lockFile);
    }
349
350
    /**
     * Overrides the path of the process file.
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        /** @scrutinizer ignore-deprecated */ $this->processFilename = $filenameWithPath;
    }
358
359
    /**
     * Returns the path of the process file.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return /** @scrutinizer ignore-deprecated */ $this->processFilename;
    }
367
368
    /**
     * Replaces the extension settings (the unserialized counterpart of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) wholesale; no defaults
     * are re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
375
376
    /**
     * Decides whether the given page must be left out of the crawl.
     *
     * A page is skipped when it is hidden (unless the "crawlHiddenPages" setting is
     * enabled), when its doktype is one of 3, 4, 199, 254 or 255, when its doktype
     * is listed in an "excludeDoktype" registration, or when a "pageVeto" hook
     * returns anything other than false.
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page should be crawled, otherwise the reason for skipping it
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        // inList() works on strings, so normalise once instead of relying on implicit conversion.
        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks may be registered under integer keys; with strict_types enabled
            // htmlspecialchars() rejects an int, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
415
416
    /**
     * Wrapper around getUrlsForPageId() that first checks whether the page may be crawled.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set to the reason the page was skipped, or to '' when it was not
     * @return array Configurations for the page; empty when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];
        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
443
444
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the "URLs" and "subCfg" entries are read.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        // The filter depends only on the configuration, never on the individual URL,
        // so a mismatch means that no URL of this configuration is listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two consecutive requests. A rate of zero (or less) would
        // divide by zero, so those requests are simply not interleaved.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is already registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the
                    // interleaved $schTime shown in the list — confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
528
529
    /**
     * Tells whether at least one incoming processing instruction occurs in the
     * comma separated list $piString. With no incoming instructions the answer
     * is always true.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // An empty instruction set accepts everything.
        $accepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
549
550 9
    /**
     * Returns the page TSconfig of the given page — or, when $this->MP is set, of the
     * page id found after the "-" in $this->MP — after passing it through the
     * registered "getPageTSconfigForId" hooks.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly modified by hooks (they receive it by reference)
     */
    public function getPageTSconfigForId(int $id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but is split here like a "<x>-<y>" string.
            // The cast avoids a TypeError under strict_types when it really is `true`;
            // a value without "-" yields page id 0 instead of an undefined offset.
            // TODO: Please check what $this->MP is meant to contain.
            $mountPointParts = explode('-', (string) $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig((int) ($mountPointParts[1] ?? 0));
        }

        // Call a hook to alter configuration; the registration is optional.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
572
573
    /**
     * Collects the crawler configurations that apply to a page.
     *
     * Configurations are read from the "tx_crawler.crawlerCfg.paramSets" page TSconfig
     * first and then from the configuration records found along the rootline; a record
     * never replaces a paramSet of the same name. The result is finally handed, by
     * reference, to the "processUrls" hooks.
     * This method returns an array of configurations and adds no urls!
     *
     * @param int $pageId Page to collect the configurations for
     * @return array Configurations keyed by name; each entry holds "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin"
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional, so it must not be read unguarded.
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration only if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // A paramSet may consist of the "key." sub array only, without a "key" value.
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . /** @scrutinizer ignore-type */ $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // The cast keeps strcmp-style comparison safe under strict_types if the column is null.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration only if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
        return $res;
    }
671
672
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * The result combines two sources:
     * - the paramSets defined in the page TSconfig of the root page (key without its trailing dot), and
     * - the "name" of every tx_crawler_configuration record stored on a page of the root line of
     *   $rootid or on a page of the tree below $rootid (limited to $depth levels and to pages the
     *   backend user may see).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Number of tree levels handed to PageTreeView::getTree()
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) paramSets from page TSconfig: only array values (the "key." sub arrays) count.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // 2) Collect the page ids of the root line ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the sub tree visible to the current backend user.
        /** @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // 3) Names of all configuration records stored on one of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
715
716
    /**
     * Checks whether a user may access an item, based on group membership.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is in the access list
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        if (! empty($accessList)) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    return true;
                }
            }

            return false;
        }

        return true;
    }
739
740
    /**
     * Expands the parameter configuration to individual values. This follows a certain syntax of the value of each parameter.
     *
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[KEY:value];..." = Look up of table records, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Optional sub parts:
     *             _PID:        page id to look up records for (default: current page)
     *             _RECURSIVE:  depth handed to QueryGenerator::getTreeList() to include sub pages (default: 0 = only the lookup page)
     *             _PIDFIELD:   field that is matched against the page id(s) (default: "pid")
     *             _FIELD:      field whose values are collected (default: "uid"); must be "uid" or a configured TCA column
     *             _WHERE:      additional constraint handed to the query as-is
     *             _ADDTABLE:   additional value added to the FROM part of the query
     *             _ENABLELANG: if set, picks only original records without their language overlays
     *         - Default: Literal value
     * - After each part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Same keys as the input; every value is an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values; the brake limits the size of the range.
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse the "KEY:value" sub parts:
                        $subpartParams = [];
                        foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                            // Pad the result so a sub part without ":" yields a null value
                            // instead of triggering an undefined offset.
                            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }

                        $tableName = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$tableName])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): _WHERE is handed to the query as a raw constraint; it must only
                            // ever come from trusted configuration.
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // All of the following keys are optional, so they are read without
                            // assuming that they exist.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only deleted records are filtered out; all other default restrictions are dropped.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }

                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by the selected field so every value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParameterHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParameterHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParameterHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make the collected values unique, then sort the parameter array by parameter name.
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
887
888
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * Every call consumes the first parameter of $paramArray, appends each of its values to each
     * URL collected so far and recurses with the remaining parameters. The number of URLs is
     * therefore the product of the number of values of each parameter, capped per recursion level
     * at getMaximumUrlsToCompile(). Empty values do not add a query parameter to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Array keys that look like integers are stored as int, so cast the name before URL-encoding it.
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // The limit is constant for the whole traversal, so look it up once.
        $maximumUrls = $this->getMaximumUrlsToCompile();

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached: nothing more can be added, so stop both loops.
                    break 2;
                }
                $value = (string) $val;
                $newUrls[] = $value === ''
                    ? $url
                    : $url . '&' . rawurlencode($varName) . '=' . rawurlencode($value);
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
915
916
    /************************************
917
     *
918
     * Crawler log
919
     *
920
     ************************************/
921
922
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by "scheduled" (newest first). If $doFlush is set, the queue
     * repository is asked to flush the entries matching $queueFilter before the records are read.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter compared against "pending" (exec_time = 0) and "finished" (exec_time > 0); any other value selects all entries
     * @param bool $doFlush If TRUE, QueueRepository::flushQueue() is called with $queueFilter before selecting
     * @param bool $doFullFlush Unused; only kept so the signature stays compatible with existing callers
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the known filter names.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
970
971
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, the entries are flushed via flushQueue() instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (and $doFlush is set), flushQueue() is called with an empty constraint instead of the set/filter constraint
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-combined constraint that is handed to flushQueue(); it collects the same
        // exec_time condition that is applied to the select query.
        $flushConstraint = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConstraint->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConstraint->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushConstraint->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1023
1024
    /**
     * Adds a call back entry to the crawler queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the entry's parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means "now" (current time)
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callbackParameters = is_array($params) ? $params : [];
        $callbackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callbackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRow);
    }
1053
1054
    /************************************
1055
     *
1056
     * URL setting
1057
     *
1058
     ************************************/
1059
1060
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL and either registers it internally only (when
     * $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless the queue repository reports an equal entry already. A signal is emitted for
     * both the "added" and the "duplicate" case.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the sub configuration keys are optional, hence the fallbacks):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1156
1157
    /**
     * Returns the current system time as reported by time().
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1166
1167
    /************************************
1168
     *
1169
     * URL reading
1170
     *
1171
     ************************************/
1172
1173
    /**
     * Read URL for single queue entry
     *
     * Loads the queue entry, sets its exec_time (and the completing process id, if known),
     * lets the queue executor process it and finally stores the JSON-encoded result in
     * result_data. Signals are emitted before and after processing.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     *
     * @return int|null NULL if no matching queue entry was found; otherwise a bit mask that
     *                  contains self::CLI_STATUS_POLLABLE_PROCESSED if a registered "pollSuccess"
     *                  instruction reported success, or 0
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // NOTE(review): the executor's return type is not visible here, so the result is only
        // treated as an array if it actually is one - confirm against QueueExecutor.
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        $resultData = null;
        if ($content !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            if (is_array($procInstructions)) {
                foreach ($pollables as $pollable) {
                    // Only check the success value if the instruction is running;
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1271
1272
    /**
     * Executes a queue entry that has not been stored in the queue table yet.
     *
     * The entry is stamped with the current time as "exec_time", inserted into the queue table,
     * run through the queue executor and afterwards updated with the JSON encoded result.
     * The post-process signal receives the fields to be written by reference
     * (presumably so that slots can adjust them before the update - confirm against the slots).
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        // Setting exec_time locks the record.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Writing the result to the log also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$insertedQueueId, &$field_array]
        );

        $queueConnection->update($this->tableName, $field_array, ['qid' => $insertedQueueId]);

        return $result;
    }
1308
1309
    /*****************************
1310
     *
1311
     * Compiling URLs to crawl - tools
1312
     *
1313
     *****************************/
1314
1315
    /**
     * Compiles the crawler URLs for a page and, depending on the depth, for the pages beneath it.
     *
     * The arguments are stored on the controller before the tree is traversed; every page of the
     * tree is then handed to drawURLs_addRowsForPage(). For mount point pages the mounted tree is
     * processed as well.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all processed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The row value is cast because database drivers may deliver
            // numeric columns as strings, in which case a strict comparison against the integer
            // constant would never match.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result of getPage() is indexed with [0] below - confirm that
                // the repository returns a list of rows here and not a single page row.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1407
1408
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of "<pid>" or "<pid>+<depth>" parts.
     * A part without "+" excludes only the page itself, "<pid>+" (no depth given) uses a
     * depth of 99, and "<pid>+<depth>" additionally excludes the pages of the tree below
     * the page down to the given depth. Results are cached per exclude string in static
     * variables.
     *
     * @param string $excludeString Exclude string
     * @return int[] Unique page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // A part without "+" yields a single element; pad the list so that the
                    // destructuring never reads a missing index.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');
                    // The tree view and the resulting pid list work with integer page ids.
                    $pid = (int) $pid;

                    // default is "page only" = "depth=0"; a "+" without a depth means depth 99
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1459
1460
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record the rows are created for
     * @param string $pageTitle Markup for the title column; it is inserted into the output as is
     * @return string The table rows (HTML) for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            // expandExcludeString() returns integer page ids, so the uid of the row is cast
            // before it is used in the strict in_array() checks below.
            $pageUid = (int) $pageRow['uid'];

            foreach ($configurations as $confKey => $confArray) {
                // Title column: only the first row of a page carries the title cell
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (! in_array($pageUid, $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    // The configuration key is cast because numeric array keys are integers,
                    // which htmlspecialchars() rejects under strict_types.
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1563
1564
    /*****************************
1565
     *
1566
     * CLI functions
1567
     *
1568
     *****************************/
1569
1570
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the (deprecated) cli hooks, cleans up the queue, fetches up to $countInARun queue
     * entries, assigns them to the current process id and reads them one after the other.
     *
     * @param int $countInARun Number of queue entries to fetch for this run
     * @param int $sleepTime Value handed to usleep() after each processed entry
     * @param int $sleepAfterFinish Value handed to sleep() once the fetched entries were handled
     * @return int Bit mask built from the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // First, run hooks:
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookClasses as $hookClass) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClass);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($queueRows)) {
            // Nothing was fetched, so nothing was processed either.
            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedEntries = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedEntries, $processId);

        if (count($queueIds) !== $assignedEntries) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedEntries++;

            // Just to relax the system
            usleep($sleepTime);

            // If the cli has been disabled between the start and the current read url,
            // return from the function without marking the process as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                continue;
            }

            // possible timeout
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
            $status |= self::CLI_STATUS_ABORTED;
            break;
        }

        sleep($sleepAfterFinish);

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1643
1644
    /**
     * Activate hooks
     *
     * Instantiates every class registered under EXTCONF/crawler/cli_hooks and calls its
     * crawler_init() method with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1657
1658
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies before the current time are treated as orphans: they are
     * marked as not active and their queue entries are released. A new process record is only
     * inserted while the number of remaining active processes is below the "processLimit" setting.
     *
     * @param string $id identification string for the process
     * @return bool false if the system process id is unavailable or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        $runningProcesses = 0;
        $orphanProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() >= $now) {
                $runningProcesses++;
                continue;
            }

            $orphanProcessIds[] = $process->getProcessId();
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($orphanProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcessIds);

        return $acquired;
    }
1710
1711
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given processes, two clean-up statements are executed first: queue
     * entries that belong to inactive processes are detached from their process, and inactive
     * processes that still have unprocessed queue entries get their system process id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     * @deprecated
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if ($processIds === []) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $queueEntriesOfInactiveProcesses = 'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)';
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where($queueEntriesOfInactiveProcesses)
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder instance is reused, so the "set" part of the first statement is dropped.
        $queryBuilder->resetQueryPart('set');

        $processesWithPendingEntries = 'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)';
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                $processesWithPendingEntries
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1762
1763
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on the first call and kept on the
     * controller, so subsequent calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // microtime(true) returns a float; convert it explicitly, because this file declares
            // strict_types and must not rely on implicit float-to-string coercion of the argument.
            $this->processID = GeneralUtility::shortMD5((string) microtime(true));
        }

        return $this->processID;
    }
1775
1776
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is read from the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1790
1791
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1808
1809
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, the flush signal is emitted once per distinct set_id
     * found among the matching entries, carrying the qid/set_id pairs of that set.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        // One builder instance is used for all statements of this method.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $setIdRows = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($condition)
            ->execute()
            ->fetchAll();

        if (is_array($setIdRows)) {
            foreach ($setIdRows as $setIdRow) {
                $entriesOfSet = $queryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $condition,
                        $queryBuilder->expr()->eq('set_id', $setIdRow['set_id'])
                    )
                    ->execute()
                    ->fetchAll();

                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    ['subSet' => $entriesOfSet]
                );
            }
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($condition)
            ->execute();
    }
1854
1855
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and process_id that match the page_id and
     * parameters_hash of the given field array are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray
     *
     * @return array The qid values of the duplicate entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        } elseif ($tstamp <= $currentTime) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder->where(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
            }
        }

        $pageIdConstraint = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)
        );
        $parametersHashConstraint = $queryBuilder->expr()->eq(
            'parameters_hash',
            $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)
        );

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($pageIdConstraint)
            ->andWhere($parametersHashConstraint);

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($row = $statement->fetch()) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1916
1917
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashedConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashedConfiguration));
    }
1928
1929
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * This method only forwards its arguments to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1945
1946 1
    /**
     * Makes sure that the entry at index 1 is not larger than the entry at index 2.
     *
     * @param array $reg Array whose entries at index 1 and 2 are compared
     * @return array The array with both entries exchanged if the first one was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Swap if first is larger than last:
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1957
1958 9
    /**
     * Returns the value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximumUrls = $this->maximumUrlsToCompile;

        return $maximumUrls;
    }
1962
1963
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1975
1976
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1985
}
1986