Passed
Push — feature/switch-to-configuratio... ( 74ae66 )
by Tomas Norre
07:20
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 68
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 34
CRAP Score 8.1348

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 39
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 68
ccs 34
cts 39
cp 0.8718
crap 8.1348
rs 8.0515

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

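Commonly applied refactorings include Extract Method; when many parameters or temporary variables get in the way of extracting, Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object can help.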

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists, such as Replace Parameter with Method Call, Introduce Parameter Object, and Preserve Whole Object.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Configuration;
36
use AOE\Crawler\Domain\Model\Process;
37
use AOE\Crawler\Domain\Model\Queue;
38
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
39
use AOE\Crawler\Domain\Repository\ProcessRepository;
40
use AOE\Crawler\Domain\Repository\QueueRepository;
41
use AOE\Crawler\QueueExecutor;
42
use AOE\Crawler\Service\UrlService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use Psr\Http\Message\UriInterface;
46
use Psr\Log\LoggerAwareInterface;
47
use Psr\Log\LoggerAwareTrait;
48
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
49
use TYPO3\CMS\Backend\Utility\BackendUtility;
50
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
51
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
52
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
53
use TYPO3\CMS\Core\Core\Bootstrap;
54
use TYPO3\CMS\Core\Core\Environment;
55
use TYPO3\CMS\Core\Database\Connection;
56
use TYPO3\CMS\Core\Database\ConnectionPool;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Imaging\Icon;
59
use TYPO3\CMS\Core\Imaging\IconFactory;
60
use TYPO3\CMS\Core\Site\Entity\Site;
61
use TYPO3\CMS\Core\Type\Bitmask\Permission;
62
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
63
use TYPO3\CMS\Core\Utility\DebugUtility;
64
use TYPO3\CMS\Core\Utility\GeneralUtility;
65
use TYPO3\CMS\Core\Utility\MathUtility;
66
use TYPO3\CMS\Extbase\Object\ObjectManager;
67
use TYPO3\CMS\Frontend\Page\PageRepository;
68
69
/**
70
 * Class CrawlerController
71
 *
72
 * @package AOE\Crawler\Controller
73
 */
74
class CrawlerController implements LoggerAwareInterface
75
{
76
    use LoggerAwareTrait;
77
    use PublicMethodDeprecationTrait;
78
    use PublicPropertyDeprecationTrait;
79
80
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
81
82
    //queue not empty
83
    public const CLI_STATUS_REMAIN = 1;
84
85
    //(some) queue items were processed
86
    public const CLI_STATUS_PROCESSED = 2;
87
88
    //instance didn't finish
89
    public const CLI_STATUS_ABORTED = 4;
90
91
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
92
93
    /**
94
     * @var integer
95
     */
96
    public $setID = 0;
97
98
    /**
99
     * @var string
100
     */
101
    public $processID = '';
102
103
    /**
104
     * @var array
105
     */
106
    public $duplicateTrack = [];
107
108
    /**
109
     * @var array
110
     */
111
    public $downloadUrls = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $incomingProcInstructions = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingConfigurationSelection = [];
122
123
    /**
124
     * @var bool
125
     */
126
    public $registerQueueEntriesInternallyOnly = false;
127
128
    /**
129
     * @var array
130
     */
131
    public $queueEntries = [];
132
133
    /**
134
     * @var array
135
     */
136
    public $urlList = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $extensionSettings = [];
142
143
    /**
144
     * Mount Point
145
     *
146
     * @var bool
147
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
148
     */
149
    public $MP = false;
150
151
    /**
152
     * @var string
153
     * @deprecated
154
     */
155
    protected $processFilename;
156
157
    /**
158
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
159
     *
160
     * @var string
161
     * @deprecated
162
     */
163
    protected $accessMode;
164
165
    /**
166
     * @var QueueRepository
167
     */
168
    protected $queueRepository;
169
170
    /**
171
     * @var ProcessRepository
172
     */
173
    protected $processRepository;
174
175
    /**
176
     * @var ConfigurationRepository
177
     */
178
    protected $configurationRepository;
179
180
    /**
181
     * @var string
182
     */
183
    protected $tableName = 'tx_crawler_queue';
184
185
    /**
186
     * @var QueueExecutor
187
     */
188
    protected $queueExecutor;
189
190
    /**
191
     * @var int
192
     */
193
    protected $maximumUrlsToCompile = 10000;
194
195
    /**
196
     * @var IconFactory
197
     */
198
    protected $iconFactory;
199
200
    /**
201
     * @var string[]
202
     */
203
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
204
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
205
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
206
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
207
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
208
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
209
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
211
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
212
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
213
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
214
215
    ];
216
217
    /**
     * Deprecation messages for formerly public properties, keyed by property name.
     *
     * NOTE(review): presumably read by PublicPropertyDeprecationTrait (used by this class)
     * when one of these properties is accessed from outside the class - confirm against the trait.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message must name the property that was actually accessed (it used to say "accessMode").
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
224
225
    /**
226
     * @var BackendUserAuthentication|null
227
     */
228
    private $backendUser;
229
230
    /**
231
     * @var integer
232
     */
233
    private $scheduledTime = 0;
234
235
    /**
236
     * @var integer
237
     */
238
    private $reqMinute = 0;
239
240
    /**
241
     * @var bool
242
     */
243
    private $submitCrawlUrls = false;
244
245
    /**
246
     * @var bool
247
     */
248
    private $downloadCrawlUrls = false;
249
250
    /**
251
     * @var PageRepository
252
     */
253
    private $pageRepository;
254
255
    /**
256
     * @var Crawler
257
     */
258
    private $crawler;
259
260
    /************************************
261
     *
262
     * Getting URLs based on Page TSconfig
263
     *
264
     ************************************/
265
266 35
    /**
     * Instantiates the repositories and helper services used by the controller and
     * loads the extension configuration.
     *
     * Configuration keys that are absent are treated like empty values, so the
     * defaults below apply without raising "undefined index" notices:
     *  - countInARun: 100 when missing or not a positive integer
     *  - processLimit: forced into 1..99, default 1
     *  - maxCompileUrls: forced into 1..1000000000, default 10000
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 10000,
            1,
            1000000000,
            10000
        );
    }
293
294
    /**
     * Returns the internal access mode as last stored via setAccessMode().
     * The documented modes are 'gui', 'cli' and 'cli_im'.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
304
305
    /**
     * Stores the internal access mode; the value is kept as given, without validation.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
313
314
    /**
     * Switches the disabled state by creating or removing the process file:
     * the file's presence is what getDisabled() reports as "disabled".
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            // An empty marker file is enough, only its existence is checked.
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
330
331
    /**
     * Reports the disabled state, i.e. whether the process file currently exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
341
342
    /**
     * Overrides the path of the process file used by setDisabled() and getDisabled().
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
350
351
    /**
     * Returns the path of the process file used by setDisabled() and getDisabled().
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
359
360
    /**
     * Replaces the extension settings held by this controller
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
367
368
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match decides:
     *  1. hidden pages, unless the "crawlHiddenPages" extension setting is enabled
     *  2. the fixed doktype list 3, 4, 199, 254, 255
     *  3. doktype lists registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     *  4. veto hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the message telling why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // A missing setting or column counts as "not set" instead of raising an undefined index notice.
        $crawlHiddenPages = $this->extensionSettings['crawlHiddenPages'] ?? false;
        if (! $crawlHiddenPages && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks registered without an explicit key have integer keys; under strict_types
            // htmlspecialchars() only accepts strings, hence the cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
427
428
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); configurations are
     * only collected when that check does not skip the page.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: receives the skip reason, or '' when the page is not skipped
     * @return array Configurations from getUrlsForPageId(), or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses strict_types,
        // so a uid delivered as string must be cast (same as urlListFromUrlArray() does).
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
450
451
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the "URLs" and "subCfg" entries are read.
     * @param array $pageRow Page row; only "uid" is read
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); a value <= 0 disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains one key per URL / user groups / processing instruction filter combination, to ensure duplicates are not crawled
     * @param array $downloadUrls Array which is passed by reference and will be filled with URLs for download if the flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs joined by "<br>" (meant for display in backend module), or an error text if $vv holds no URL list
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Optional sub configuration values fall back to empty values instead of raising notices.
        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        // The filter does not depend on the single URL, so it is evaluated once for all of them.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return implode('<br>', $urlLog);
        }

        // Seconds between two requests; guards against a division by zero for $reqMinute = 0.
        $requestsPerMinute = (float) $reqMinute;
        $secondsPerUrl = $requestsPerMinute > 0 ? 60 / $requestsPerMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs tracked so far and cut down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerUrl);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is submitted while the
                    // interleaved $schTime is only displayed - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
535
536
    /**
     * Tells whether a configuration passes the processing instruction filter.
     *
     * Passes when no processing instructions are requested at all, or when at least
     * one requested instruction is contained in the comma separated list $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing requested means there is nothing to filter on.
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
556
557 4
    /**
     * Returns the page TSconfig for the given page, after passing it through the
     * hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'].
     *
     * When $this->MP is set, the TSconfig is read for the id taken from the second
     * "-" separated part of $this->MP instead of for $id.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly altered by the hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: $this->MP is documented as bool but handled as string here; the casts keep
            // this branch from failing under strict_types until the property type is sorted out.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
579
580
    /**
581
     * This methods returns an array of configurations.
582
     * Adds no urls!
583
     */
584 3
    public function getUrlsForPageId(int $pageId): array
585
    {
586
        // Get page TSconfig for page ID
587 3
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
588
589 3
        $res = [];
590
591
        // Fetch Crawler Configuration from pageTSconfig
592 3
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
593 3
        foreach ($crawlerCfg as $key => $values) {
594 3
            if (! is_array($values)) {
595 3
                continue;
596
            }
597 3
            $key = str_replace('.', '', $key);
598
            // Sub configuration for a single configuration string:
599 3
            $subCfg = (array) $crawlerCfg[$key . '.'];
600 3
            $subCfg['key'] = $key;
601
602 3
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
603 3
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
604
            }
605 3
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
606
607
            // process configuration if it is not page-specific or if the specific page is the current page:
608
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
609 3
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
610
611
                // Explode, process etc.:
612 3
                $res[$key] = [];
613 3
                $res[$key]['subCfg'] = $subCfg;
614 3
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
615 3
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
616 3
                $res[$key]['origin'] = 'pagets';
617
618
                // recognize MP value
619 3
                if (! $this->MP) {
620 3
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
621
                } else {
622
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

622
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
623
                }
624
            }
625
        }
626
627
        // Get configuration from tx_crawler_configuration records up the rootline
628 3
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
629
        /** @var Configuration $configurationRecord */
630 3
        foreach ($crawlerConfigurations as $configurationRecord) {
631
632
            // check access to the configuration record
633
            if (empty($configurationRecord->getBeGroups()) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord->getBeGroups())) {
634
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord->getPidsOnly(), true));
635
636
                // process configuration if it is not page-specific or if the specific page is the current page:
637
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
638
                if (! strcmp($configurationRecord->getPidsOnly(), '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
639
                    $key = $configurationRecord->getName();
640
641
                    // don't overwrite previously defined paramSets
642
                    if (! isset($res[$key])) {
643
644
                        /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
645
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
646
                        $TSparserObject->parse($configurationRecord->getProcessingInstructionParameters());
647
648
                        $subCfg = [
649
                            'procInstrFilter' => $configurationRecord->getProcessingInstructionFilter(),
650
                            'procInstrParams.' => $TSparserObject->setup,
651
                            'baseUrl' => $configurationRecord->getBaseUrl(),
652
                            'force_ssl' => (int) $configurationRecord->isForceSsl(),
653
                            'userGroups' => $configurationRecord->getFeGroups(),
654
                            'exclude' => $configurationRecord->getExclude(),
655
                            'key' => $key,
656
                        ];
657
658
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
659
                            $res[$key] = [];
660
                            $res[$key]['subCfg'] = $subCfg;
661
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord->getConfiguration());
662
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
663
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
664
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord->getUid();
665
                        }
666
                    }
667
                }
668
            }
669
        }
670
671 3
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
672
            $params = [
673
                'res' => &$res,
674
            ];
675
            GeneralUtility::callUserFunction($func, $params, $this);
676
        }
677 3
        return $res;
678
    }
679
680
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Two sources are combined, in this order:
     * 1. the "paramSets." entries found in the page TSconfig of the root page,
     * 2. the names of all tx_crawler_configuration records stored on a page of the
     *    root line of $rootid or on a page returned by the page tree below $rootid.
     *
     * Duplicate names are not removed.
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page at which the branch starts
     * @param int $depth Depth handed to the page tree traversal
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Source 1: page TSconfig. Only sub-arrays are parameter sets, scalar entries are skipped.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                // A trailing dot marks a TypoScript sub-array and is not part of the name.
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Source 2: collect the uids of all pages whose configuration records are relevant.
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // Fetch the names of all configuration records located on one of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $onCollectedPages = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($onCollectedPages)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
723
724
    /**
     * Checks whether a user may access an item that is restricted to certain groups.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (! GeneralUtility::inList($accessList, $userGroupUid)) {
                continue;
            }

            // The first matching group is sufficient.
            return true;
        }

        return false;
    }
745
746
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the rules below, otherwise the value is taken literally.
     * - The configuration is split by "|"; the parts are processed individually and their results are added together.
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, both ends included (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parameters read by this method: _RECURSIVE (tree depth used to collect page ids below the PID),
     *           _PIDFIELD (field compared against the page ids, default "pid"), _WHERE (additional where clause),
     *           _ADDTABLE (additional FROM part) and _FIELD (selected field, default "uid").
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * Every sub-parameter is split at its first ":" only, so the value itself may contain colons.
     * Hooks registered in $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called once for every configuration part and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Same keys as the input; every value is an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded, everything else is literal.
            if (! (strpos($v, '[') === 0 && substr($v, -1) === ']')) {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse parameters ("_KEY:value" pairs separated by ";"):
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Limit 2 keeps colons inside the value; a sub-part without ":" gets a null value.
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV, false, 2);
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // _FIELD is optional: guard the read instead of accessing a possibly missing key.
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only the "deleted" restriction is applied to the lookup.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($subpartParams['_TABLE'])
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every table is translatable, so the ctrl key may be missing.
                            $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'] ?? '';
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that every value is collected only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
893
894
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * Each call consumes the first parameter of $paramArray, appends every one of its values to
     * every URL collected so far and recurses with the remaining parameters. Without a limit the
     * number of URLs is the product of the number of values of all parameters; each recursion
     * level produces at most $this->maximumUrlsToCompile URLs.
     *
     * Empty values do not add a query parameter, the URL is carried over unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift first off stack. The key is cast because numeric parameter names are
        // integer array keys, which rawurlencode() rejects under strict_types.
        reset($paramArray);
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; leaving only the inner loop
                // would let every further URL add another entry.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $value === ''
                    ? $url
                    : $url . '&' . rawurlencode($varName) . '=' . rawurlencode($value);
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
925
926
    /************************************
927
     *
928
     * Crawler log
929
     *
930
     ************************************/
931
932
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The entries are ordered by "scheduled" (descending). When $doFlush is set, the queue is
     * flushed through the queue repository with the given filter BEFORE the entries are selected;
     * the select is still executed afterwards and its result returned.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter "pending" limits to entries with exec_time = 0, "finished" to entries with exec_time > 0; any other value applies no restriction
     * @param bool $doFlush If TRUE, the queue is flushed with $queueFilter before selecting
     * @param bool $doFullFlush Has no effect: the full and the partial flush issue the same repository call
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // NOTE(review): the loose switch comparison presumably relies on QueueFilter being
        // convertible to string - confirm against the QueueFilter value object.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
982
983
    /**
     * Return array of records from crawler queue for input set ID
     *
     * In flush mode nothing is selected: the queue is flushed and an empty array is returned.
     * A full flush passes an empty condition to flushQueue(), otherwise the condition consists of
     * the filter constraint (if any) and the set id.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (and $doFlush is set), the flush is not restricted by filter and set id
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // The AND-composite accumulates the constraints that are handed to flushQueue();
        // the select query receives the same filter constraint separately.
        $flushConstraints = $expressionBuilder->andX();
        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            // Whatever add() returns is what gets passed on as the flush condition.
            $flushWhere = $flushConstraints->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1035
1036
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
     *
     * The call back reference is stored under the key "_CALLBACKOBJ" inside the JSON encoded
     * parameters of the new tx_crawler_queue row.
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no schedule time was given.
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueEntry);
    }
1065
1066
    /************************************
1067
     *
1068
     * URL setting
1069
     *
1070
     ************************************/
1071
1072
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL and either registers it internally only
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless the duplication check finds an equal entry. A signal is emitted for both the
     * "added" and the "duplicate" case.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param Configuration|array $subCfg Sub configuration array (from TS config); an array is converted
     *                                    into a Configuration object, a missing or empty name becomes "Config without name"
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // Creates a Configuration Object from array
        if (is_array($subCfg)) {
            // empty() rather than a bare read: the array may not contain a 'name' key at all.
            $subCfg['name'] = empty($subCfg['name']) ? 'Config without name' : $subCfg['name'];
            $subCfg = Configuration::fromArray($subCfg);
        }

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg->getFeGroups(), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg->getProcessingInstructionFilter());
        if (is_array($subCfg->getProcessingInstructionParameters())) {
            $parameters['procInstrParams'] = $subCfg->getProcessingInstructionParameters();
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1171
1172
    /**
     * Provides the current system time as Unix timestamp.
     *
     * @return int Seconds since the Unix epoch, as reported by time()
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1181
1182
    /************************************
1183
     *
1184
     * URL reading
1185
     *
1186
     ************************************/
1187
1188
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, sets its exec_time (which locks the record), executes it through the
     * queue executor and finally stores the JSON encoded result in the record. Signals are emitted
     * before and after processing.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bit set of CLI status flags (0 if nothing applies); NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are not yet executed but assigned to a process are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // At the moment there is no need to point to specific pollable extensions.
        // The registry is optional, and the result data is only inspected when it is an array:
        // in the error case above it is a plain string which must not be indexed with string keys.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            if (is_array($procInstructions)) {
                foreach ($pollables as $pollable) {
                    // Only check the success value if the instruction is running.
                    // It is important to name the pollSuccess key same as the procInstructions key.
                    if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1285
1286
    /**
     * Inserts the given field array as a new queue record, executes it right away
     * and writes the execution result back to that record.
     *
     * @param array $field_array Queue field array; its "exec_time" is overwritten with the current time
     *
     * @return array|bool|mixed|string The value returned by the queue executor for this item
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result in the log also denotes the end of the processing of this entry.
        // The array is handed to the signal by reference, so slots may alter what gets stored.
        $resultFields = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1322
1323
    /*****************************
1324
     *
1325
     * Compiling URLs to crawl - tools
1326
     *
1327
     *****************************/
1328
1329
    /**
     * Builds the page tree below a root page and compiles the URL rows for every page in it.
     *
     * The passed settings are stored on the controller before the tree is traversed;
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings and start with empty tracking lists.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Set up the tree, limited to pages the backend user may see.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry, if it is accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $tree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Pages beneath the root are only collected for a non-zero depth.
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        $rows = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            if ($data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // Mount point: render the mounted pages first, with the MP parameter set.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);
                $mountPid = $mountpage[0]['mount_pid'];

                $this->MP = $mountPid . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountData) {
                    $mountTitle = $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true);
                    $rows .= $this->drawURLs_addRowsForPage($mountData['row'], $mountTitle);
                }

                if ($mountpage[0]['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $data['row']['uid'] = $mountPid;
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true);
            $rows .= $this->drawURLs_addRowsForPage($data['row'], $pageTitle);
        }

        return $rows;
    }
1421
1422
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts, each being "<pid>" or "<pid>+<depth>":
     * - "<pid>" covers only that page (depth 0),
     * - "<pid>+" (a "+" with an empty or zero depth) covers the page and up to 99 levels below it,
     * - "<pid>+<depth>" covers the page and its subtree down to the given depth.
     *
     * Expanded strings and fetched subtrees are kept in function-static caches.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" yields a single element only, so the depth has to be
                    // read defensively instead of destructuring a possibly missing index.
                    $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) $pidAndDepth[0];
                    $depth = $pidAndDepth[1] ?? '';

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            // The tree API documents integer arguments; the parts come from a string.
                            $tree->getTree($pid, (int) $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1473
1474
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Content of the title cell; inserted into the markup as-is
     * @return string Table rows (<tr>…</tr>) for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // ($skipMessage is presumably filled by reference when the page is skipped — confirm in getUrlsForPageRow())
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        $content = '';

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';

            return $content;
        }

        // Traverse parameter combinations:
        $c = 0;
        foreach ($configurations as $confKey => $confArray) {
            // Title column: only the first row of a page carries the (row-spanning) title cell
            $titleClm = $c === 0
                ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                : '';

            // expandExcludeString() returns integers, so the uid has to be an integer as well
            // for the strict comparison to match; the exclude setting itself is optional.
            $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

            if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    // if empty the urls won't be filtered by processing instructions
                    $this->incomingProcInstructions
                );

                // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = [];
                $calcRes = 1;
                foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                    $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                    $calcRes *= count($gVal);
                    $calcAccu[] = count($gVal);
                }
                $paramExpanded = '<table>' . $paramExpanded . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                // Options (both settings are optional and come from configuration, hence escaped)
                $optionValues = '';
                if (! empty($confArray['subCfg']['userGroups'])) {
                    $optionValues .= 'User Groups: ' . htmlspecialchars((string) $confArray['subCfg']['userGroups']) . '<br/>';
                }
                if (! empty($confArray['subCfg']['procInstrFilter'])) {
                    $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $confArray['subCfg']['procInstrFilter']) . '<br/>';
                }

                // Compile row:
                $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
            } else {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            }

            $c++;
        }

        return $content;
    }
1585
1586
    /*****************************
1587
     *
1588
     * CLI functions
1589
     *
1590
     *****************************/
1591
1592
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to the queue repository when fetching the records to crawl
     * @param int $sleepTime Pause after each processed entry, handed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, handed to sleep()
     * @return int Combination of the readUrl() results and the self::CLI_STATUS_* flags set here
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Hooks first, then queue clean up, then pick the entries for this run.
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $result;
        }

        $quidList = [];
        foreach ($rows as $row) {
            $quidList[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
        $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

        // Fewer rows assigned than selected: the run is given up without processing anything.
        if ($numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $result | self::CLI_STATUS_ABORTED;
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $counter++;

            // Just to relax the system
            usleep((int) $sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                //possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep((int) $sleepAfterFinish);

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . ' (' . $this->CLI_buildProcessId() . ')');

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1662
1663
    /**
     * Instantiates every class registered under the "cli_hooks" key of the crawler
     * extension configuration and calls crawler_init() on it with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1675
1676
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies before the current time count as orphans: they do not
     * count towards the process limit and are released at the end of this call.
     *
     * @param string $id identification string for the process
     * @return bool true if the process record was inserted, false if the system process id
     *              is unavailable or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningCount = 0;
        $orphanProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcessIds[] = $process->getProcessId();
                continue;
            }

            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < (int) $this->extensionSettings['processLimit'];

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . (int) $this->extensionSettings['processLimit'] . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningCount) . '/' . (int) $this->extensionSettings['processLimit'] . ')');
        }

        // Clean up runs in both cases.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1730
1731
    /**
     * Release a process and the required resources
     *
     * Besides releasing the requested ids through the process and queue repositories,
     * two general clean-up statements are executed first (see the inline comments).
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if an empty array was passed (nothing to release), true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        // One query builder instance is reused for both UPDATE statements below.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // A single id is wrapped, so the rest of the method only deals with arrays.
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Step 1: detach queue entries from processes that are no longer active, by resetting
        // their process_scheduled and process_id fields.
        // NOTE(review): the previous comment here spoke of marking processes as deleted which have
        // no "waiting" queue entries; the statement itself only updates the queue table — confirm intent.

        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The SET part of step 1 is dropped so that it does not end up in the next statement.
        $queryBuilder->resetQueryPart('set');

        // Step 2: for inactive processes that are still referenced by queue entries with
        // exec_time = 0, reset the stored system process id.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: release the requested processes and their queue entries via the repositories.
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1781
1782
    /**
     * Returns the id of the current process; it is generated from the current
     * microtime on first use and stored on the controller afterwards.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1794
1795
    /**
     * Echoes a message followed by a newline and flushes the output,
     * but only if the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     * @deprecated
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1808
1809
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1826
1827
    /**
     * Removes queue entries
     *
     * Before deleting, the affected entries are collected per set_id and announced
     * through the SIGNAL_QUEUE_ENTRY_FLUSH signal.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        // The same builder instance serves all statements of this method.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $setIdRows = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($condition)
            ->execute()
            ->fetchAll();

        if (is_array($setIdRows)) {
            foreach ($setIdRows as $setIdRow) {
                // Entries of this set that match the filter
                $queryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $condition,
                        $queryBuilder->expr()->eq('set_id', $setIdRow['set_id'])
                    );
                $entriesOfSet = $queryBuilder->execute()->fetchAll();

                $payLoad = ['subSet' => $entriesOfSet];
                SignalSlotUtility::emitSignal(self::class, SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH, $payLoad);
            }
        }

        $queryBuilder->delete($this->tableName);
        $queryBuilder->where($condition);
        $queryBuilder->execute();
    }
1872
1873
    /**
     * Looks up queue entries that duplicate an entry with the same page id and parameters hash.
     *
     * Only entries without exec_time and without process_id are considered. For a timestamp
     * that is not in the future, entries scheduled up to now match (with "enableTimeslot" the
     * window of 100 seconds around now is part of the condition as well). For a timestamp in
     * the future, the entry has to be scheduled for exactly that timestamp.
     *
     * @param int $tstamp
     * @param array $fieldArray Uses the keys "page_id" and "parameters_hash"
     *
     * @return array qid values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $duplicateQueueIds = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid');
        $queryBuilder->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        } elseif ($tstamp <= $currentTime) {
            //if this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . '');
                $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $currentTime));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $currentTime));
            }
        }

        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter));

        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $statement = $queryBuilder->execute();

        while ($row = $statement->fetch()) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1933
1934
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     * The keys "paramExpanded" and "URLs" do not take part in the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevant = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevant));
    }
1945
1946
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * This method only forwards to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1961
1962 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2
     * by exchanging the two where necessary; all other elements stay untouched.
     *
     * @param array $reg Array with the two values to order at the indexes 1 and 2
     * @return array The array with the indexes 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        // Swap if first is larger than last:
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1973
1974
    /**
     * Returns the backend user, taken from $GLOBALS['BE_USER'] on first use
     * and kept on the controller afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1986
1987
    /**
     * Creates a query builder for the given table via the connection pool.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1996
}
1997